alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
interface.hpp
Go to the documentation of this file.
1/* Copyright 2024 René Widera, Tim Hanel
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
14#include "alpaka/tag.hpp"
15
16namespace alpaka::onAcc::internal
17{
18 // forward declaration to avoid cyclic includes
19 template<typename T_Storage, typename T_Type>
20 struct GlobalDeviceMemoryWrapper;
21} // namespace alpaka::onAcc::internal
22
23namespace alpaka::onHost
24{
25 namespace internal
26 {
27 struct MakePlatform
28 {
29 template<typename T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
30 struct Op
31 {
32 auto operator()(T_Api api, T_DeviceKind deviceType) const;
33 };
34 };
35
36 static auto makePlatform(auto api, alpaka::concepts::DeviceKind auto deviceType)
37 {
38 return MakePlatform::Op<ALPAKA_TYPEOF(api), ALPAKA_TYPEOF(deviceType)>{}(api, deviceType);
39 }
40
/** Functor family that queries the number of devices of a platform. */
struct GetDeviceCount
{
    template<typename T_Platform>
    struct Op
    {
        /** @param platform object providing getDeviceCount()
         *  @return the value reported by platform.getDeviceCount()
         */
        auto operator()(T_Platform& platform) const -> uint32_t
        {
            return platform.getDeviceCount();
        }
    };
};
52
/** Functor family that creates a device from a platform. */
struct MakeDevice
{
    template<typename T_Platform>
    struct Op
    {
        /** @param platform object providing makeDevice(uint32_t)
         *  @param idx index handed to platform.makeDevice()
         *  @return the result of platform.makeDevice(idx)
         */
        auto operator()(auto& platform, uint32_t idx) const
        {
            return platform.makeDevice(idx);
        }
    };
};
64
/** Functor family that retrieves the device of an object. */
struct GetDevice
{
    template<typename T_Any>
    struct Op
    {
        /** @param any object providing a const-callable getDevice()
         *  @return the result of any.getDevice()
         */
        auto operator()(T_Any const& any) const
        {
            return any.getDevice();
        }
    };
};
76
77 inline constexpr auto getDevice(auto&& any)
78 {
79 return GetDevice::Op<ALPAKA_TYPEOF(any)>{}(any);
80 }
81
/** Functor family that retrieves the native handle of an object. */
struct GetNativeHandle
{
    template<typename T_Any>
    struct Op
    {
        /** @param any object providing a const-callable getNativeHandle()
         *  @return the result of any.getNativeHandle()
         */
        auto operator()(T_Any const& any) const
        {
            return any.getNativeHandle();
        }
    };
};
93
94 inline auto getNativeHandle(auto&& any)
95 {
96 return GetNativeHandle::Op<ALPAKA_TYPEOF(any)>{}(any);
97 }
98
99 struct MakeQueue
100 {
101 template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
102 struct Op
103 {
104 auto operator()(T_Device& device, T_QueueKind) const
105 {
106 return device.makeQueue(T_QueueKind{});
107 }
108 };
109 };
110
/** Functor family that creates an event on a device. */
struct MakeEvent
{
    template<typename T_Device>
    struct Op
    {
        /** @param device object providing makeEvent()
         *  @return the result of device.makeEvent()
         */
        auto operator()(T_Device& device) const
        {
            return device.makeEvent();
        }
    };
};
122
/** Functor family that waits on an object. */
struct Wait
{
    template<typename T_Any>
    struct Op
    {
        /** Calls any.wait().
         *
         * The call operator is const-qualified like the other functors in this file, so the
         * stateless functor can also be invoked through a const object.
         *
         * @param any object providing wait()
         */
        void operator()(T_Any& any) const
        {
            any.wait();
        }
    };
};
134
135 inline void wait(auto&& any)
136 {
137 Wait::Op<ALPAKA_TYPEOF(any)>{}(any);
138 }
139
/** Functor family that lets a queue wait for an event. */
struct WaitFor
{
    template<typename T_Queue, typename T_Event>
    struct Op
    {
        /** Calls queue.waitFor(event).
         *
         * The call operator is const-qualified like the other functors in this file, so the
         * stateless functor can also be invoked through a const object.
         *
         * @param queue object providing waitFor(T_Event&)
         * @param event event handed to the queue
         */
        void operator()(T_Queue& queue, T_Event& event) const
        {
            queue.waitFor(event);
        }
    };
};
151
152 inline void waitFor(auto& queue, auto& event)
153 {
154 WaitFor::Op<ALPAKA_TYPEOF(queue), ALPAKA_TYPEOF(event)>{}(queue, event);
155 }
156
/** Functor family that queries whether an event is complete. */
struct IsEventComplete
{
    template<typename T_Any>
    struct Op
    {
        /** The call operator is const-qualified like the other functors in this file, so the
         * stateless functor can also be invoked through a const object.
         *
         * @param any object providing isEventComplete()
         * @return the result of any.isEventComplete()
         */
        bool operator()(T_Any& any) const
        {
            return any.isEventComplete();
        }
    };
};
168
169 inline bool isEventComplete(auto&& any)
170 {
171 return IsEventComplete::Op<ALPAKA_TYPEOF(any)>{}(any);
172 }
173
/** Functor family that queries whether a queue is empty. */
struct IsQueueEmpty
{
    template<typename T_Queue>
    struct Op
    {
        /** The call operator is const-qualified like the other functors in this file, so the
         * stateless functor can also be invoked through a const object.
         *
         * @param queue object providing isQueueEmpty()
         * @return the result of queue.isQueueEmpty()
         */
        bool operator()(T_Queue& queue) const
        {
            return queue.isQueueEmpty();
        }
    };
};
185
186 inline bool isQueueEmpty(auto& queue)
187 {
188 return IsQueueEmpty::Op<ALPAKA_TYPEOF(queue)>{}(queue);
189 }
190
191 struct Enqueue
192 {
193 template<
194 typename T_Queue,
195 onHost::concepts::ThreadOrFrameSpec T_LaunchCfg,
196 alpaka::concepts::KernelBundle T_KernelBundle>
197 struct Kernel
198 {
199 void operator()(T_Queue& queue, T_LaunchCfg const& launchCfg, T_KernelBundle const& kernelBundle) const
200 {
201 queue.enqueue(launchCfg, kernelBundle);
202 }
203 };
204
205 template<typename T_Queue, typename T_Task>
206 struct HostTask
207 {
208 void operator()(T_Queue& queue, T_Task const& task) const
209 {
210 queue.enqueueHostFn(task);
211 }
212 };
213
214 template<typename T_Queue, typename T_Task>
215 struct HostTaskDeferred
216 {
217 void operator()(T_Queue& queue, T_Task const& task) const
218 {
219 queue.enqueueHostFnDeferred(task);
220 }
221 };
222
223 template<typename T_Queue, typename T_Event>
224 struct Event
225 {
226 void operator()(T_Queue& queue, T_Event& event) const
227 {
228 queue.enqueue(event);
229 }
230 };
231 };
232
233 inline void enqueueHostFn(auto& queue, auto const& task)
234 {
235 Enqueue::HostTask<ALPAKA_TYPEOF(queue), ALPAKA_TYPEOF(task)>{}(queue, task);
236 }
237
238 inline void enqueueHostFnDeferred(auto& queue, auto const& task)
239 {
240 Enqueue::HostTaskDeferred<ALPAKA_TYPEOF(queue), ALPAKA_TYPEOF(task)>{}(queue, task);
241 }
242
243 template<typename TKernelFn, typename... TArgs>
244 inline void enqueue(
245 auto& queue,
246 onHost::concepts::ThreadOrFrameSpec auto const& launchCfg,
247 KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
248 {
249 Enqueue::Kernel<ALPAKA_TYPEOF(queue), ALPAKA_TYPEOF(launchCfg), KernelBundle<TKernelFn, TArgs...>>{}(
250 queue,
251 launchCfg,
252 kernelBundle);
253 }
254
255 struct AdjustThreadSpec
256 {
257 template<
258 typename T_Device,
259 onHost::concepts::FrameSpec T_FrameSpec,
260 alpaka::concepts::KernelBundle T_KernelBundle>
261 struct Op
262 {
263 auto operator()(
264 T_Device const& device,
265 T_FrameSpec const& frameSpec,
266 T_KernelBundle const& kernelBundle) const
267 {
268 alpaka::unused(device, frameSpec.getExecutor(), kernelBundle);
269 return ThreadSpec{frameSpec.getNumFrames(), frameSpec.getFrameExtents(), frameSpec.getExecutor()};
270 }
271 };
272 };
273
274 template<typename TKernelFn, typename... TArgs>
275 static auto adjustThreadSpec(
276 auto const& device,
277 onHost::concepts::FrameSpec auto const& frameSpec,
278 KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
279 {
280 return AdjustThreadSpec::
281 Op<ALPAKA_TYPEOF(device), ALPAKA_TYPEOF(frameSpec), KernelBundle<TKernelFn, TArgs...>>{}(
282 device,
283 frameSpec,
284 kernelBundle);
285 }
286
287 struct Data
288 {
289 template<typename T_Any>
290 struct Op
291 {
292 decltype(auto) operator()(auto&& any) const
293 {
294 return std::data(any);
295 }
296 };
297
298 static decltype(auto) data(auto&& any)
299 {
300 return Op<ALPAKA_TYPEOF(any)>{}(any);
301 }
302
303 template<typename T_Any>
304 static decltype(auto) data(Handle<T_Any>&& anyHandle)
305 {
306 return Op<ALPAKA_TYPEOF(*anyHandle.get())>{}(*anyHandle.get());
307 }
308 };
309
/** Functor family for memory allocation.
 *
 * Only the call operator is declared here; its definition is not part of this file.
 */
struct Alloc
{
    template<typename T_Type, typename T_Any, typename T_Extents>
    struct Op
    {
        /** @param any object the allocation is requested from
         *  @param extents argument of type T_Extents (unnamed in the original declaration)
         */
        void operator()(T_Any& any, T_Extents const& extents) const;
    };
};
318
/** Functor family for deferred memory allocation.
 *
 * Only the call operator is declared here; its definition is not part of this file.
 */
struct AllocDeferred
{
    template<typename T_Type, typename T_Any, typename T_Extents>
    struct Op
    {
        /** @param any object the allocation is requested from
         *  @param extents argument of type T_Extents (unnamed in the original declaration)
         */
        void operator()(T_Any& any, T_Extents const& extents) const;
    };
};
327
/** Functor family for unified memory allocation.
 *
 * Only the call operator is declared here; its definition is not part of this file.
 */
struct AllocUnified
{
    template<typename T_Type, typename T_Any, typename T_Extents>
    struct Op
    {
        /** @param any object the allocation is requested from
         *  @param extents argument of type T_Extents (unnamed in the original declaration)
         */
        void operator()(T_Any& any, T_Extents const& extents) const;
    };
};
336
/** Functor family for mapped memory allocation.
 *
 * Only the call operator is declared here; its definition is not part of this file.
 */
struct AllocMapped
{
    template<typename T_Type, typename T_Any, typename T_Extents>
    struct Op
    {
        /** @param any object the allocation is requested from
         *  @param extents argument of type T_Extents (unnamed in the original declaration)
         */
        void operator()(T_Any& any, T_Extents const& extents) const;
    };
};
345
346 /** checks if a view can be accessed from the given device
347 *
348 * There are two paths to check if a view is accessible:
349 * - first: Try to validate the view in the scope of the device.
350 * - second: Try to validate based on soft criteria in the scope of the view's API.
351 * This path is required because the host API does not know about view data locations.
352 * The second path is optionally and will return always false if not specialized.
353 */
354 struct IsDataAccessible
355 {
356 template<typename T_Device, typename T_Any>
357 struct FirstPath
358 {
359 bool operator()(T_Device& device, T_Any const& any) const;
360 };
361
362 template<typename T_DataApi, alpaka::concepts::DeviceKind T_DeviceKind, typename T_Any>
363 struct SecondPath
364 {
365 bool operator()(T_DataApi, T_DeviceKind, T_Any const&) const
366 {
367 return false;
368 }
369 };
370 };
371
/** Functor family for memory copies.
 *
 * Only the call operator is declared here; its definition is not part of this file.
 */
struct Memcpy
{
    template<typename T_Queue, typename T_Dest, typename T_Source, typename T_Extents>
    struct Op
    {
        /** The names of the last three parameters are descriptive only; they are unnamed in the
         * original declaration and chosen to mirror the template parameter names.
         */
        void operator()(T_Queue& queue, auto&& dest, T_Source const& source, T_Extents const& extents) const;
    };
};
380
/** Functor family for copies that involve device global memory.
 *
 * Only the call operator is declared here; its definition is not part of this file.
 */
struct MemcpyDeviceGlobal
{
    template<typename T_Queue, typename T_Dest, typename T_Source>
    struct Op
    {
        /** Copy data from or to the device global memory.
         *
         * It is only allowed to copy data from or to the host.
         * Copying from a device global variable to another device global variable is not supported.
         * The host data is allowed to be a host accessible pointer.
         */
        void operator()(T_Queue& queue, T_Dest&& dest, T_Source&& source) const;
    };
};
395
/** Functor family for byte-wise memory initialization.
 *
 * Only the call operator is declared here; its definition is not part of this file.
 */
struct Memset
{
    template<typename T_Queue, typename T_Dest, typename T_Extents>
    struct Op
    {
        /** The names of the last three parameters are descriptive only; they are unnamed in the
         * original declaration.
         */
        void operator()(T_Queue& queue, auto&& dest, uint8_t value, T_Extents const& extents) const;
    };
};
404
/** Functor family for filling memory with a value.
 *
 * Only the call operator is declared here; its definition is not part of this file.
 */
struct Fill
{
    template<typename T_Queue, typename T_Dest, typename T_Value, typename T_Extents>
    struct Op
    {
        /** The names of the last three parameters are descriptive only; they are unnamed in the
         * original declaration.
         */
        void operator()(T_Queue& queue, auto&& dest, T_Value value, T_Extents const& extents) const;
    };
};
413
414 struct GetDeviceProperties
415 {
416 template<typename T_Any>
417 struct Op
418 {
419 DeviceProperties operator()(auto const& platform, uint32_t idx) const;
420
421 DeviceProperties operator()(auto const& device) const;
422 };
423 };
424
/** Functor family that queries the free global memory of a device. */
struct GetFreeGlobalMemBytes
{
    template<typename T_Any>
    struct Op
    {
        /** @param device object providing a const-callable getFreeGlobalMemBytes()
         *  @return the result of device.getFreeGlobalMemBytes()
         */
        auto operator()(auto const& device) const -> size_t
        {
            return device.getFreeGlobalMemBytes();
        }
    };
};
436
437 inline DeviceProperties getDeviceProperties(auto const& platform, uint32_t idx)
438 {
439 return GetDeviceProperties::Op<ALPAKA_TYPEOF(platform)>{}(platform, idx);
440 }
441
/** Functor family that queries the extents of an object. */
struct GetExtents
{
    template<typename T_Any>
    struct Op
    {
        /** @param any object providing getExtents()
         *  @return the result of any.getExtents(), reference-ness preserved
         */
        decltype(auto) operator()(auto&& any) const
        {
            return any.getExtents();
        }
    };
};
453
454 inline auto getExtents(auto&& any)
455 {
456 return GetExtents::Op<ALPAKA_TYPEOF(any)>{}(any);
457 }
458
459 template<typename T_Any>
460 inline auto getExtents(Handle<T_Any>&& any)
461 {
462 return GetExtents::Op<ALPAKA_TYPEOF(*any.get())>{}(*any.get());
463 }
464
/** Functor family that queries the pitches of an object. */
struct GetPitches
{
    template<typename T_Any>
    struct Op
    {
        /** @param any object providing getPitches()
         *  @return the result of any.getPitches(), reference-ness preserved
         */
        decltype(auto) operator()(auto&& any) const
        {
            return any.getPitches();
        }
    };
};
476
477 inline auto getPitches(auto&& any)
478 {
479 return GetPitches::Op<ALPAKA_TYPEOF(any)>{}(any);
480 }
481
482 template<typename T_Any>
483 inline auto getPitches(Handle<T_Any>&& any)
484 {
485 return GetPitches::Op<ALPAKA_TYPEOF(*any.get())>{}(*any.get());
486 }
487
488 /** Implementation to get a SIMD optimized frame specification.
489 *
 * Starting from a frame extent of one in every dimension, the frame is grown by powers of two
 * towards a target of 512 elements per frame without exceeding the given extents.
 *
 * @tparam T_DataType forwarded to getNumElemPerThread<T_DataType>()
490 * @param internalDevice must be an alpaka internal device implementation
 * @param executor stored unchanged in the returned frame specification
 * @param extents vector or scalar extents; every component must be greater than zero (asserted)
 * @return FrameSpec built from the computed number of frames, the frame extents and the executor
491 */
492 template<typename T_DataType>
493 inline constexpr auto getFrameSpec(
494 auto const& internalDevice,
495 alpaka::concepts::Executor auto executor,
496 alpaka::concepts::VectorOrScalar auto const& extents)
497 {
 // normalize the extents to a vector type
498 Vec extentMd = extents;
499 auto deviceKind = alpaka::internal::getDeviceKind(internalDevice);
500 auto deviceApi = alpaka::internal::getApi(internalDevice);
501 using ExtentVecType = ALPAKA_TYPEOF(extentMd);
502 // check that all extent dimensions are greater than zero
503 ALPAKA_ASSERT((extentMd > ExtentVecType::fill(0u)).reduce(std::logical_and{}));
 // NOTE(review): `IndexType` is not declared in the visible listing (its embedded numbering skips 504);
 // presumably it is the value type of ExtentVecType - confirm against the original header.
505 auto props = internal::GetDeviceProperties::Op<ALPAKA_TYPEOF(internalDevice)>{}(internalDevice);
506 IndexType warpSize = static_cast<IndexType>(props.warpSize);
507 // try to create a specification with a frame size of 512 elements
508 IndexType numFrameElements = 512;
509 // avoid non-power of two values
510 auto fastDimensionValue = roundDownToPowerOfTwo(std::min(warpSize, extentMd.x()));
 // presumably rAssign() writes the value into the x (fast) component - TODO confirm
511 auto frameExtents = ExtentVecType::fill(1).rAssign(fastDimensionValue);
 // elements still to be distributed after the x component has been chosen
512 numFrameElements /= frameExtents.x();
513 // distribute remainder frame elements
514 while(numFrameElements > IndexType{1})
515 {
 // find the dimension in which the frame can still be doubled most often
516 uint32_t maxIdx = ExtentVecType::dim() - 1u;
517 IndexType maxValue = 0;
518 for(auto i = 0u; i < ExtentVecType::dim(); ++i)
519 {
520 auto v = extentMd[i] / frameExtents[i] / IndexType{2};
521 if(maxValue < v)
522 {
523 maxIdx = i;
524 maxValue = v;
525 }
526 }
527 // apply the change only if we do not oversubscribe the extents
528 auto v = extentMd[maxIdx] / frameExtents[maxIdx] / IndexType{2};
529 if(v >= IndexType{1})
530 frameExtents[maxIdx] *= IndexType{2};
531 else
532 break;
 // one doubling consumes a factor of two of the remaining element budget
533 numFrameElements /= IndexType{2};
534 }
535 IndexType elementsPerFrameItem
536 = static_cast<IndexType>(getNumElemPerThread<T_DataType>(deviceApi, deviceKind));
 // the divisor is the frame extents scaled by the elements per frame item (scaling goes through rAssign())
537 alpaka::concepts::Vector auto numFrames
538 = divExZero(extentMd, frameExtents * frameExtents.fill(1).rAssign(elementsPerFrameItem));
539 // The frame specification is not required to be a multiple of the extent, it can be smaller.
540 auto frameSpec = FrameSpec{numFrames, frameExtents, executor};
541 return frameSpec;
542 }
543 } // namespace internal
544} // namespace alpaka::onHost
#define ALPAKA_ASSERT(...)
The assert can be explicitly disabled by defining NDEBUG.
Definition Assert.hpp:14
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:153
constexpr WarpSize warpSize
Definition tag.hpp:44
constexpr DeviceKind deviceKind
Definition tag.hpp:30
constexpr Api api
Definition tag.hpp:24
constexpr Device device
Definition scope.hpp:70
constexpr bool any(alpaka::onAcc::concepts::Acc auto const &acc, int32_t predicate)
Evaluates predicate for all active threads of the warp.
Definition warp.hpp:83
constexpr auto queue
Definition lvl.hpp:127
constexpr auto event
Definition lvl.hpp:97
Functionality which is usable on the host CPU controller thread.
Definition api.hpp:40
FrameSpec(T_NumFrames const &, T_FrameExtents const &) -> FrameSpec< alpaka::trait::getVec_t< T_NumFrames >, alpaka::trait::getVec_t< T_FrameExtents >, alpaka::exec::AnyExecutor >
std::shared_ptr< T > Handle
Definition Handle.hpp:30
ThreadSpec(T_NumBlocks const &, T_NumThreads const &) -> ThreadSpec< alpaka::trait::getVec_t< T_NumBlocks >, alpaka::trait::getVec_t< T_NumThreads > >
void reduce(Queue< T_Device, T_QueueKind > const &queue, alpaka::concepts::Executor auto const exec, DataType const &neutralElement, alpaka::concepts::IMdSpan auto out, auto &&binaryReduceFn, auto &&in)
accumulate the results into a scalar value.
Definition reduce.hpp:29
typename GetValueType< T >::type GetValueType_t
Definition trait.hpp:65
ALPAKA_FN_HOST_ACC constexpr auto divExZero(Integral a, Integral b) -> Integral
Returns the max(a / b, 1) as integer.
Definition utility.hpp:41
consteval uint32_t getNumElemPerThread(concepts::Api auto const api, alpaka::concepts::DeviceKind auto const deviceType)
Get the number of elements to compute per thread.
Definition trait.hpp:177
constexpr T roundDownToPowerOfTwo(T value)
Round down to the nearest power of two that is less than or equal to the value.
Definition utility.hpp:88
ALPAKA_FN_HOST_ACC Vec(T_1, T_Args...) -> Vec< T_1, uint32_t(sizeof...(T_Args)+1u), ArrayStorage< T_1, uint32_t(sizeof...(T_Args)+1u)> >
ALPAKA_FN_HOST KernelBundle(TKernelFn const &, TArgs &&...) -> KernelBundle< TKernelFn, TArgs... >
User defined deduction guide with trailing return type. For CTAD during the construction.