10# include <cuda_runtime_api.h>
17 static constexpr char name[] =
"Cuda";
21 using DeviceAttr_t = ::cudaDeviceAttr;
22 using PointerAttr_t = ::cudaPointerAttributes;
23 using Memory_t = ::cudaMemoryType;
24 using DeviceProp_t = ::cudaDeviceProp;
25 using Error_t = ::cudaError_t;
26 using Event_t = ::cudaEvent_t;
27 using Extent_t = ::cudaExtent;
28 using Flag_t =
unsigned int;
29 using FuncAttributes_t = ::cudaFuncAttributes;
30 using HostFn_t = void (*)(
void*
data);
31 using Limit_t = ::cudaLimit;
32 using Memcpy3DParms_t = ::cudaMemcpy3DParms;
33 using MemcpyKind_t = ::cudaMemcpyKind;
34 using PitchedPtr_t = ::cudaPitchedPtr;
35 using Pos_t = ::cudaPos;
36 using Stream_t = ::cudaStream_t;
39 static constexpr Error_t success = ::cudaSuccess;
40 static constexpr Error_t errorNotReady = ::cudaErrorNotReady;
41 static constexpr Error_t errorHostMemoryAlreadyRegistered = ::cudaErrorHostMemoryAlreadyRegistered;
42 static constexpr Error_t errorHostMemoryNotRegistered = ::cudaErrorHostMemoryNotRegistered;
43 static constexpr Error_t errorUnsupportedLimit = ::cudaErrorUnsupportedLimit;
44 static constexpr Error_t errorUnknown = ::cudaErrorUnknown;
46 static constexpr Flag_t eventDefault = cudaEventDefault;
47 static constexpr Flag_t eventBlockingSync = cudaEventBlockingSync;
48 static constexpr Flag_t eventDisableTiming = cudaEventDisableTiming;
49 static constexpr Flag_t eventInterprocess = cudaEventInterprocess;
51 static constexpr Flag_t hostMallocDefault = cudaHostAllocDefault;
52 static constexpr Flag_t hostMallocMapped = cudaHostAllocMapped;
53 static constexpr Flag_t hostMallocPortable = cudaHostAllocPortable;
54 static constexpr Flag_t hostMallocWriteCombined = cudaHostAllocWriteCombined;
55 static constexpr Flag_t hostMallocCoherent = cudaHostAllocDefault;
56 static constexpr Flag_t hostMallocNonCoherent = cudaHostAllocDefault;
58 static constexpr Flag_t hostRegisterDefault = cudaHostRegisterDefault;
59 static constexpr Flag_t hostRegisterPortable = cudaHostRegisterPortable;
60 static constexpr Flag_t hostRegisterMapped = cudaHostRegisterMapped;
61 static constexpr Flag_t hostRegisterIoMemory = cudaHostRegisterIoMemory;
63 static constexpr MemcpyKind_t memcpyDefault = ::cudaMemcpyDefault;
64 static constexpr MemcpyKind_t memcpyDeviceToDevice = ::cudaMemcpyDeviceToDevice;
65 static constexpr MemcpyKind_t memcpyDeviceToHost = ::cudaMemcpyDeviceToHost;
66 static constexpr MemcpyKind_t memcpyHostToDevice = ::cudaMemcpyHostToDevice;
67 static constexpr MemcpyKind_t memcpyHostToHost = ::cudaMemcpyHostToHost;
69 static constexpr Flag_t streamDefault = cudaStreamDefault;
70 static constexpr Flag_t streamNonBlocking = cudaStreamNonBlocking;
72 static constexpr DeviceAttr_t deviceAttributeMaxBlockDimX = ::cudaDevAttrMaxBlockDimX;
73 static constexpr DeviceAttr_t deviceAttributeMaxBlockDimY = ::cudaDevAttrMaxBlockDimY;
74 static constexpr DeviceAttr_t deviceAttributeMaxBlockDimZ = ::cudaDevAttrMaxBlockDimZ;
75 static constexpr DeviceAttr_t deviceAttributeMaxGridDimX = ::cudaDevAttrMaxGridDimX;
76 static constexpr DeviceAttr_t deviceAttributeMaxGridDimY = ::cudaDevAttrMaxGridDimY;
77 static constexpr DeviceAttr_t deviceAttributeMaxGridDimZ = ::cudaDevAttrMaxGridDimZ;
78 static constexpr DeviceAttr_t deviceAttributeMaxSharedMemoryPerBlock = ::cudaDevAttrMaxSharedMemoryPerBlock;
79 static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::cudaDevAttrMaxThreadsPerBlock;
80 static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::cudaDevAttrMultiProcessorCount;
81 static constexpr DeviceAttr_t deviceAttributeWarpSize = ::cudaDevAttrWarpSize;
83 static constexpr Memory_t memoryTypeUnregistered = ::cudaMemoryTypeUnregistered;
84 static constexpr Memory_t memoryTypeHost = ::cudaMemoryTypeHost;
85 static constexpr Memory_t memoryTypeDevice = ::cudaMemoryTypeDevice;
86 static constexpr Memory_t memoryTypeManaged = ::cudaMemoryTypeManaged;
88 static constexpr Limit_t limitPrintfFifoSize = ::cudaLimitPrintfFifoSize;
89 static constexpr Limit_t limitMallocHeapSize = ::cudaLimitMallocHeapSize;
99 static void CUDART_CB hostFunction(
void* data)
101 auto ptr =
reinterpret_cast<HostFnAdaptor*
>(
data);
102 ptr->func_(ptr->data_);
106 static void CUDART_CB streamCallback(Stream_t, Error_t,
void* data)
108 auto ptr =
reinterpret_cast<HostFnAdaptor*
>(
data);
109 ptr->func_(ptr->data_);
115 static inline Error_t deviceGetAttribute(
int* value, DeviceAttr_t attr,
int device)
117 return ::cudaDeviceGetAttribute(value, attr, device);
120 static inline Error_t pointerGetAttributes(PointerAttr_t* attr,
void const* ptr)
122 return ::cudaPointerGetAttributes(attr, ptr);
125 static inline Error_t deviceGetLimit(
size_t* pValue, Limit_t limit)
127 return ::cudaDeviceGetLimit(pValue, limit);
130 static inline Error_t deviceReset()
132 return ::cudaDeviceReset();
135 static inline Error_t deviceSetLimit(Limit_t limit,
size_t value)
137 return ::cudaDeviceSetLimit(limit, value);
140 static inline Error_t deviceSynchronize()
142 return ::cudaDeviceSynchronize();
145 static inline Error_t eventCreate(Event_t* event)
147 return ::cudaEventCreate(event);
150 static inline Error_t eventCreateWithFlags(Event_t* event, Flag_t flags)
152 return ::cudaEventCreateWithFlags(event, flags);
155 static inline Error_t eventDestroy(Event_t event)
157 return ::cudaEventDestroy(event);
160 static inline Error_t eventQuery(Event_t event)
162 return ::cudaEventQuery(event);
165 static inline Error_t eventRecord(Event_t event, Stream_t stream)
167 return ::cudaEventRecord(event, stream);
170 static inline Error_t eventSynchronize(Event_t event)
172 return ::cudaEventSynchronize(event);
175 static inline Error_t free(
void* devPtr)
177 return ::cudaFree(devPtr);
180 static inline Error_t freeAsync(
void* devPtr, Stream_t stream)
182 return ::cudaFreeAsync(devPtr, stream);
185 static inline Error_t funcGetAttributes(FuncAttributes_t* attr,
void const* func)
187 return ::cudaFuncGetAttributes(attr, func);
191 static inline Error_t funcGetAttributes(FuncAttributes_t* attr, T* func)
194# pragma GCC diagnostic push
195# pragma GCC diagnostic ignored "-Wconditionally-supported"
197 return ::cudaFuncGetAttributes(attr,
reinterpret_cast<void const*
>(func));
199# pragma GCC diagnostic pop
203 static inline Error_t getDeviceCount(
int* count)
205 return ::cudaGetDeviceCount(count);
210 return ::cudaGetDeviceProperties(prop, device);
213 static inline char const* getErrorName(Error_t error)
215 return ::cudaGetErrorName(error);
218 static inline char const* getErrorString(Error_t error)
220 return ::cudaGetErrorString(error);
223 static inline Error_t getLastError()
225 return ::cudaGetLastError();
228 static inline Error_t getSymbolAddress(
void** devPtr,
void const* symbol)
230 return ::cudaGetSymbolAddress(devPtr, symbol);
234 static inline Error_t getSymbolAddress(
void** devPtr, T
const& symbol)
236 return ::cudaGetSymbolAddress(devPtr, symbol);
239 static inline Error_t hostGetDevicePointer(
void** pDevice,
void* pHost, Flag_t flags)
241 return ::cudaHostGetDevicePointer(pDevice, pHost, flags);
244 static inline Error_t hostFree(
void* ptr)
246 return ::cudaFreeHost(ptr);
249 static inline Error_t hostMalloc(
void** ptr,
size_t size, Flag_t flags)
251 return ::cudaHostAlloc(ptr, size, flags);
254 static inline Error_t hostRegister(
void* ptr,
size_t size, Flag_t flags)
256 return ::cudaHostRegister(ptr, size, flags);
259 static inline Error_t hostUnregister(
void* ptr)
261 return ::cudaHostUnregister(ptr);
264 static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn,
void* userData)
266# if CUDART_VERSION >= 10000
268 return ::cudaLaunchHostFunc(stream, HostFnAdaptor::hostFunction,
new HostFnAdaptor{fn, userData});
271 return ::cudaStreamAddCallback(stream, HostFnAdaptor::streamCallback,
new HostFnAdaptor{fn, userData}, 0);
275 static inline Error_t malloc(
void** devPtr,
size_t size)
277 return ::cudaMalloc(devPtr, size);
280 static inline Error_t mallocManaged(
void** devPtr,
size_t size)
282 return ::cudaMallocManaged(devPtr, size);
285 static inline Error_t malloc3D(PitchedPtr_t* pitchedDevPtr, Extent_t extent)
287 return ::cudaMalloc3D(pitchedDevPtr, extent);
290 static inline Error_t mallocAsync(
291 [[maybe_unused]]
void** devPtr,
292 [[maybe_unused]]
size_t size,
293 [[maybe_unused]] Stream_t stream)
295# if CUDART_VERSION >= 11020
296 return ::cudaMallocAsync(devPtr, size, stream);
303 static inline Error_t mallocPitch(
void** devPtr,
size_t* pitch,
size_t width,
size_t height)
305 return ::cudaMallocPitch(devPtr, pitch, width, height);
308 static inline Error_t memGetInfo(
size_t* free,
size_t* total)
310 return ::cudaMemGetInfo(free, total);
313 static inline Error_t
memcpy(
void* dst,
void const* src,
size_t count, MemcpyKind_t kind)
315 return ::cudaMemcpy(dst, src, count, kind);
318 static inline Error_t memcpy2DAsync(
328 return ::cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
331 static inline Error_t memcpy3DAsync(Memcpy3DParms_t
const* p, Stream_t stream)
333 return ::cudaMemcpy3DAsync(p, stream);
336 static inline Error_t memcpyAsync(
void* dst,
void const* src,
size_t count, MemcpyKind_t kind, Stream_t stream)
338 return ::cudaMemcpyAsync(dst, src, count, kind, stream);
341 static inline Error_t memset2DAsync(
349 return ::cudaMemset2DAsync(devPtr, pitch, value, width, height, stream);
352 static inline Error_t memset3DAsync(PitchedPtr_t pitchedDevPtr,
int value, Extent_t extent, Stream_t stream)
354 return ::cudaMemset3DAsync(pitchedDevPtr, value, extent, stream);
357 static inline Error_t memsetAsync(
void* devPtr,
int value,
size_t count, Stream_t stream)
359 return ::cudaMemsetAsync(devPtr, value, count, stream);
362 static inline Error_t setDevice(
int device)
364 return ::cudaSetDevice(device);
367 static inline Error_t streamCreate(Stream_t* pStream)
369 return ::cudaStreamCreate(pStream);
372 static inline Error_t streamCreateWithFlags(Stream_t* pStream, Flag_t flags)
374 return ::cudaStreamCreateWithFlags(pStream, flags);
377 static inline Error_t streamDestroy(Stream_t stream)
379 return ::cudaStreamDestroy(stream);
382 static inline Error_t streamQuery(Stream_t stream)
384 return ::cudaStreamQuery(stream);
387 static inline Error_t streamSynchronize(Stream_t stream)
389 return ::cudaStreamSynchronize(stream);
392 static inline Error_t streamWaitEvent(Stream_t stream, Event_t event, Flag_t flags)
394 return ::cudaStreamWaitEvent(stream, event, flags);
397 static inline PitchedPtr_t makePitchedPtr(
void* d,
size_t p,
size_t xsz,
size_t ysz)
399 return ::make_cudaPitchedPtr(d, p, xsz, ysz);
402 static inline Pos_t makePos(
size_t x,
size_t y,
size_t z)
404 return ::make_cudaPos(x, y, z);
407 static inline Extent_t makeExtent(
size_t w,
size_t h,
size_t d)
409 return ::make_cudaExtent(w, h, d);
DeviceProperties getDeviceProperties(auto const &platform, uint32_t idx)
decltype(auto) data(auto &&any)
Returns a pointer to the data of an object.
void memcpy(Queue< T_Device, T_QueueKind > const &queue, auto &&dest, auto const &source)
Copies data byte-wise from one container to another.