alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
ApiCudaRt.hpp
Go to the documentation of this file.
1/* Copyright 2022 Andrea Bocci
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
8
9#if ALPAKA_LANG_CUDA
10# include <cuda_runtime_api.h>
11
12namespace alpaka
13{
14 struct ApiCudaRt
15 {
16 // Names
17 static constexpr char name[] = "Cuda";
18 static constexpr auto version = ALPAKA_LANG_CUDA;
19
20 // Types
21 using DeviceAttr_t = ::cudaDeviceAttr;
22 using PointerAttr_t = ::cudaPointerAttributes;
23 using Memory_t = ::cudaMemoryType;
24 using DeviceProp_t = ::cudaDeviceProp;
25 using Error_t = ::cudaError_t;
26 using Event_t = ::cudaEvent_t;
27 using Extent_t = ::cudaExtent;
28 using Flag_t = unsigned int;
29 using FuncAttributes_t = ::cudaFuncAttributes;
30 using HostFn_t = void (*)(void* data); // same as cudaHostFn_t, without the CUDART_CB calling convention
31 using Limit_t = ::cudaLimit;
32 using Memcpy3DParms_t = ::cudaMemcpy3DParms;
33 using MemcpyKind_t = ::cudaMemcpyKind;
34 using PitchedPtr_t = ::cudaPitchedPtr;
35 using Pos_t = ::cudaPos;
36 using Stream_t = ::cudaStream_t;
37
38 // Constants
39 static constexpr Error_t success = ::cudaSuccess;
40 static constexpr Error_t errorNotReady = ::cudaErrorNotReady;
41 static constexpr Error_t errorHostMemoryAlreadyRegistered = ::cudaErrorHostMemoryAlreadyRegistered;
42 static constexpr Error_t errorHostMemoryNotRegistered = ::cudaErrorHostMemoryNotRegistered;
43 static constexpr Error_t errorUnsupportedLimit = ::cudaErrorUnsupportedLimit;
44 static constexpr Error_t errorUnknown = ::cudaErrorUnknown;
45
46 static constexpr Flag_t eventDefault = cudaEventDefault;
47 static constexpr Flag_t eventBlockingSync = cudaEventBlockingSync;
48 static constexpr Flag_t eventDisableTiming = cudaEventDisableTiming;
49 static constexpr Flag_t eventInterprocess = cudaEventInterprocess;
50
51 static constexpr Flag_t hostMallocDefault = cudaHostAllocDefault;
52 static constexpr Flag_t hostMallocMapped = cudaHostAllocMapped;
53 static constexpr Flag_t hostMallocPortable = cudaHostAllocPortable;
54 static constexpr Flag_t hostMallocWriteCombined = cudaHostAllocWriteCombined;
55 static constexpr Flag_t hostMallocCoherent = cudaHostAllocDefault; // Not supported.
56 static constexpr Flag_t hostMallocNonCoherent = cudaHostAllocDefault; // Not supported.
57
58 static constexpr Flag_t hostRegisterDefault = cudaHostRegisterDefault;
59 static constexpr Flag_t hostRegisterPortable = cudaHostRegisterPortable;
60 static constexpr Flag_t hostRegisterMapped = cudaHostRegisterMapped;
61 static constexpr Flag_t hostRegisterIoMemory = cudaHostRegisterIoMemory;
62
63 static constexpr MemcpyKind_t memcpyDefault = ::cudaMemcpyDefault;
64 static constexpr MemcpyKind_t memcpyDeviceToDevice = ::cudaMemcpyDeviceToDevice;
65 static constexpr MemcpyKind_t memcpyDeviceToHost = ::cudaMemcpyDeviceToHost;
66 static constexpr MemcpyKind_t memcpyHostToDevice = ::cudaMemcpyHostToDevice;
67 static constexpr MemcpyKind_t memcpyHostToHost = ::cudaMemcpyHostToHost;
68
69 static constexpr Flag_t streamDefault = cudaStreamDefault;
70 static constexpr Flag_t streamNonBlocking = cudaStreamNonBlocking;
71
72 static constexpr DeviceAttr_t deviceAttributeMaxBlockDimX = ::cudaDevAttrMaxBlockDimX;
73 static constexpr DeviceAttr_t deviceAttributeMaxBlockDimY = ::cudaDevAttrMaxBlockDimY;
74 static constexpr DeviceAttr_t deviceAttributeMaxBlockDimZ = ::cudaDevAttrMaxBlockDimZ;
75 static constexpr DeviceAttr_t deviceAttributeMaxGridDimX = ::cudaDevAttrMaxGridDimX;
76 static constexpr DeviceAttr_t deviceAttributeMaxGridDimY = ::cudaDevAttrMaxGridDimY;
77 static constexpr DeviceAttr_t deviceAttributeMaxGridDimZ = ::cudaDevAttrMaxGridDimZ;
78 static constexpr DeviceAttr_t deviceAttributeMaxSharedMemoryPerBlock = ::cudaDevAttrMaxSharedMemoryPerBlock;
79 static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::cudaDevAttrMaxThreadsPerBlock;
80 static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::cudaDevAttrMultiProcessorCount;
81 static constexpr DeviceAttr_t deviceAttributeWarpSize = ::cudaDevAttrWarpSize;
82
83 static constexpr Memory_t memoryTypeUnregistered = ::cudaMemoryTypeUnregistered;
84 static constexpr Memory_t memoryTypeHost = ::cudaMemoryTypeHost;
85 static constexpr Memory_t memoryTypeDevice = ::cudaMemoryTypeDevice;
86 static constexpr Memory_t memoryTypeManaged = ::cudaMemoryTypeManaged;
87
88 static constexpr Limit_t limitPrintfFifoSize = ::cudaLimitPrintfFifoSize;
89 static constexpr Limit_t limitMallocHeapSize = ::cudaLimitMallocHeapSize;
90
91 // Host function helper
92 // Encapsulates the different function signatures used by cudaStreamAddCallback and cudaLaunchHostFn, and the
93 // different calling conventions used by CUDA (__stdcall on Win32) and HIP (standard).
94 struct HostFnAdaptor
95 {
96 HostFn_t func_;
97 void* data_;
98
99 static void CUDART_CB hostFunction(void* data)
100 {
101 auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
102 ptr->func_(ptr->data_);
103 delete ptr;
104 }
105
106 static void CUDART_CB streamCallback(Stream_t, Error_t, void* data)
107 {
108 auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
109 ptr->func_(ptr->data_);
110 delete ptr;
111 }
112 };
113
114 // Runtime API
115 static inline Error_t deviceGetAttribute(int* value, DeviceAttr_t attr, int device)
116 {
117 return ::cudaDeviceGetAttribute(value, attr, device);
118 }
119
120 static inline Error_t pointerGetAttributes(PointerAttr_t* attr, void const* ptr)
121 {
122 return ::cudaPointerGetAttributes(attr, ptr);
123 }
124
125 static inline Error_t deviceGetLimit(size_t* pValue, Limit_t limit)
126 {
127 return ::cudaDeviceGetLimit(pValue, limit);
128 }
129
130 static inline Error_t deviceReset()
131 {
132 return ::cudaDeviceReset();
133 }
134
135 static inline Error_t deviceSetLimit(Limit_t limit, size_t value)
136 {
137 return ::cudaDeviceSetLimit(limit, value);
138 }
139
140 static inline Error_t deviceSynchronize()
141 {
142 return ::cudaDeviceSynchronize();
143 }
144
145 static inline Error_t eventCreate(Event_t* event)
146 {
147 return ::cudaEventCreate(event);
148 }
149
150 static inline Error_t eventCreateWithFlags(Event_t* event, Flag_t flags)
151 {
152 return ::cudaEventCreateWithFlags(event, flags);
153 }
154
155 static inline Error_t eventDestroy(Event_t event)
156 {
157 return ::cudaEventDestroy(event);
158 }
159
160 static inline Error_t eventQuery(Event_t event)
161 {
162 return ::cudaEventQuery(event);
163 }
164
165 static inline Error_t eventRecord(Event_t event, Stream_t stream)
166 {
167 return ::cudaEventRecord(event, stream);
168 }
169
170 static inline Error_t eventSynchronize(Event_t event)
171 {
172 return ::cudaEventSynchronize(event);
173 }
174
175 static inline Error_t free(void* devPtr)
176 {
177 return ::cudaFree(devPtr);
178 }
179
180 static inline Error_t freeAsync(void* devPtr, Stream_t stream)
181 {
182 return ::cudaFreeAsync(devPtr, stream);
183 }
184
185 static inline Error_t funcGetAttributes(FuncAttributes_t* attr, void const* func)
186 {
187 return ::cudaFuncGetAttributes(attr, func);
188 }
189
190 template<typename T>
191 static inline Error_t funcGetAttributes(FuncAttributes_t* attr, T* func)
192 {
193# if ALPAKA_COMP_GNUC
194# pragma GCC diagnostic push
195# pragma GCC diagnostic ignored "-Wconditionally-supported"
196# endif
197 return ::cudaFuncGetAttributes(attr, reinterpret_cast<void const*>(func));
198# if ALPAKA_COMP_GNUC
199# pragma GCC diagnostic pop
200# endif
201 }
202
203 static inline Error_t getDeviceCount(int* count)
204 {
205 return ::cudaGetDeviceCount(count);
206 }
207
208 static inline Error_t getDeviceProperties(DeviceProp_t* prop, int device)
209 {
210 return ::cudaGetDeviceProperties(prop, device);
211 }
212
213 static inline char const* getErrorName(Error_t error)
214 {
215 return ::cudaGetErrorName(error);
216 }
217
218 static inline char const* getErrorString(Error_t error)
219 {
220 return ::cudaGetErrorString(error);
221 }
222
223 static inline Error_t getLastError()
224 {
225 return ::cudaGetLastError();
226 }
227
228 static inline Error_t getSymbolAddress(void** devPtr, void const* symbol)
229 {
230 return ::cudaGetSymbolAddress(devPtr, symbol);
231 }
232
233 template<class T>
234 static inline Error_t getSymbolAddress(void** devPtr, T const& symbol)
235 {
236 return ::cudaGetSymbolAddress(devPtr, symbol);
237 }
238
239 static inline Error_t hostGetDevicePointer(void** pDevice, void* pHost, Flag_t flags)
240 {
241 return ::cudaHostGetDevicePointer(pDevice, pHost, flags);
242 }
243
244 static inline Error_t hostFree(void* ptr)
245 {
246 return ::cudaFreeHost(ptr);
247 }
248
249 static inline Error_t hostMalloc(void** ptr, size_t size, Flag_t flags)
250 {
251 return ::cudaHostAlloc(ptr, size, flags);
252 }
253
254 static inline Error_t hostRegister(void* ptr, size_t size, Flag_t flags)
255 {
256 return ::cudaHostRegister(ptr, size, flags);
257 }
258
259 static inline Error_t hostUnregister(void* ptr)
260 {
261 return ::cudaHostUnregister(ptr);
262 }
263
264 static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData)
265 {
266# if CUDART_VERSION >= 10000
267 // Wrap the host function using the proper calling convention
268 return ::cudaLaunchHostFunc(stream, HostFnAdaptor::hostFunction, new HostFnAdaptor{fn, userData});
269# else
270 // Emulate cudaLaunchHostFunc using cudaStreamAddCallback with a callback adaptor.
271 return ::cudaStreamAddCallback(stream, HostFnAdaptor::streamCallback, new HostFnAdaptor{fn, userData}, 0);
272# endif
273 }
274
275 static inline Error_t malloc(void** devPtr, size_t size)
276 {
277 return ::cudaMalloc(devPtr, size);
278 }
279
280 static inline Error_t mallocManaged(void** devPtr, size_t size)
281 {
282 return ::cudaMallocManaged(devPtr, size);
283 }
284
285 static inline Error_t malloc3D(PitchedPtr_t* pitchedDevPtr, Extent_t extent)
286 {
287 return ::cudaMalloc3D(pitchedDevPtr, extent);
288 }
289
290 static inline Error_t mallocAsync(
291 [[maybe_unused]] void** devPtr,
292 [[maybe_unused]] size_t size,
293 [[maybe_unused]] Stream_t stream)
294 {
295# if CUDART_VERSION >= 11020
296 return ::cudaMallocAsync(devPtr, size, stream);
297# else
298 // Not implemented.
299 return errorUnknown;
300# endif
301 }
302
303 static inline Error_t mallocPitch(void** devPtr, size_t* pitch, size_t width, size_t height)
304 {
305 return ::cudaMallocPitch(devPtr, pitch, width, height);
306 }
307
308 static inline Error_t memGetInfo(size_t* free, size_t* total)
309 {
310 return ::cudaMemGetInfo(free, total);
311 }
312
313 static inline Error_t memcpy(void* dst, void const* src, size_t count, MemcpyKind_t kind)
314 {
315 return ::cudaMemcpy(dst, src, count, kind);
316 }
317
318 static inline Error_t memcpy2DAsync(
319 void* dst,
320 size_t dpitch,
321 void const* src,
322 size_t spitch,
323 size_t width,
324 size_t height,
325 MemcpyKind_t kind,
326 Stream_t stream)
327 {
328 return ::cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
329 }
330
331 static inline Error_t memcpy3DAsync(Memcpy3DParms_t const* p, Stream_t stream)
332 {
333 return ::cudaMemcpy3DAsync(p, stream);
334 }
335
336 static inline Error_t memcpyAsync(void* dst, void const* src, size_t count, MemcpyKind_t kind, Stream_t stream)
337 {
338 return ::cudaMemcpyAsync(dst, src, count, kind, stream);
339 }
340
341 static inline Error_t memset2DAsync(
342 void* devPtr,
343 size_t pitch,
344 int value,
345 size_t width,
346 size_t height,
347 Stream_t stream)
348 {
349 return ::cudaMemset2DAsync(devPtr, pitch, value, width, height, stream);
350 }
351
352 static inline Error_t memset3DAsync(PitchedPtr_t pitchedDevPtr, int value, Extent_t extent, Stream_t stream)
353 {
354 return ::cudaMemset3DAsync(pitchedDevPtr, value, extent, stream);
355 }
356
357 static inline Error_t memsetAsync(void* devPtr, int value, size_t count, Stream_t stream)
358 {
359 return ::cudaMemsetAsync(devPtr, value, count, stream);
360 }
361
362 static inline Error_t setDevice(int device)
363 {
364 return ::cudaSetDevice(device);
365 }
366
367 static inline Error_t streamCreate(Stream_t* pStream)
368 {
369 return ::cudaStreamCreate(pStream);
370 }
371
372 static inline Error_t streamCreateWithFlags(Stream_t* pStream, Flag_t flags)
373 {
374 return ::cudaStreamCreateWithFlags(pStream, flags);
375 }
376
377 static inline Error_t streamDestroy(Stream_t stream)
378 {
379 return ::cudaStreamDestroy(stream);
380 }
381
382 static inline Error_t streamQuery(Stream_t stream)
383 {
384 return ::cudaStreamQuery(stream);
385 }
386
387 static inline Error_t streamSynchronize(Stream_t stream)
388 {
389 return ::cudaStreamSynchronize(stream);
390 }
391
392 static inline Error_t streamWaitEvent(Stream_t stream, Event_t event, Flag_t flags)
393 {
394 return ::cudaStreamWaitEvent(stream, event, flags);
395 }
396
397 static inline PitchedPtr_t makePitchedPtr(void* d, size_t p, size_t xsz, size_t ysz)
398 {
399 return ::make_cudaPitchedPtr(d, p, xsz, ysz);
400 }
401
402 static inline Pos_t makePos(size_t x, size_t y, size_t z)
403 {
404 return ::make_cudaPos(x, y, z);
405 }
406
407 static inline Extent_t makeExtent(size_t w, size_t h, size_t d)
408 {
409 return ::make_cudaExtent(w, h, d);
410 }
411 };
412
413} // namespace alpaka
414
415#endif
#define ALPAKA_LANG_CUDA
Definition config.hpp:258
decltype(auto) data(auto &&any)
pointer to data of an object
void memcpy(Queue< T_Device, T_QueueKind > const &queue, auto &&dest, auto const &source)
copy data byte wise from one to another container
Definition Queue.hpp:237
main alpaka namespace.
Definition alpaka.hpp:76