16namespace alpaka::onAcc::internal
19 template<
typename T_Storage,
typename T_Type>
20 struct GlobalDeviceMemoryWrapper;
29 template<
typename T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
32 auto operator()(T_Api api, T_DeviceKind deviceType)
const;
36 static auto makePlatform(
auto api, alpaka::concepts::DeviceKind
auto deviceType)
43 template<
typename T_Platform>
46 uint32_t operator()(T_Platform& platform)
const
48 return platform.getDeviceCount();
55 template<
typename T_Platform>
58 auto operator()(
auto& platform, uint32_t idx)
const
60 return platform.makeDevice(idx);
67 template<
typename T_Any>
70 auto operator()(T_Any
const& any)
const
72 return any.getDevice();
77 inline constexpr auto getDevice(
auto&& any)
82 struct GetNativeHandle
84 template<
typename T_Any>
87 auto operator()(T_Any
const& any)
const
89 return any.getNativeHandle();
94 inline auto getNativeHandle(
auto&& any)
101 template<
typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
104 auto operator()(T_Device& device, T_QueueKind)
const
106 return device.makeQueue(T_QueueKind{});
113 template<
typename T_Device>
116 auto operator()(T_Device& device)
const
118 return device.makeEvent();
125 template<
typename T_Any>
128 void operator()(T_Any& any)
135 inline void wait(
auto&& any)
142 template<
typename T_Queue,
typename T_Event>
145 void operator()(T_Queue& queue, T_Event& event)
147 queue.waitFor(event);
152 inline void waitFor(
auto& queue,
auto& event)
157 struct IsEventComplete
159 template<
typename T_Any>
162 bool operator()(T_Any& any)
164 return any.isEventComplete();
169 inline bool isEventComplete(
auto&& any)
176 template<
typename T_Queue>
179 bool operator()(T_Queue& queue)
181 return queue.isQueueEmpty();
186 inline bool isQueueEmpty(
auto& queue)
195 onHost::concepts::ThreadOrFrameSpec T_LaunchCfg,
196 alpaka::concepts::KernelBundle T_KernelBundle>
199 void operator()(T_Queue& queue, T_LaunchCfg
const& launchCfg, T_KernelBundle
const& kernelBundle)
const
201 queue.enqueue(launchCfg, kernelBundle);
205 template<
typename T_Queue,
typename T_Task>
208 void operator()(T_Queue& queue, T_Task
const& task)
const
210 queue.enqueueHostFn(task);
214 template<
typename T_Queue,
typename T_Task>
215 struct HostTaskDeferred
217 void operator()(T_Queue& queue, T_Task
const& task)
const
219 queue.enqueueHostFnDeferred(task);
223 template<
typename T_Queue,
typename T_Event>
226 void operator()(T_Queue& queue, T_Event& event)
const
228 queue.enqueue(event);
233 inline void enqueueHostFn(
auto& queue,
auto const& task)
238 inline void enqueueHostFnDeferred(
auto& queue,
auto const& task)
243 template<
typename TKernelFn,
typename... TArgs>
246 onHost::concepts::ThreadOrFrameSpec
auto const& launchCfg,
255 struct AdjustThreadSpec
259 onHost::concepts::FrameSpec T_FrameSpec,
260 alpaka::concepts::KernelBundle T_KernelBundle>
264 T_Device
const& device,
265 T_FrameSpec
const& frameSpec,
266 T_KernelBundle
const& kernelBundle)
const
268 alpaka::unused(device, frameSpec.getExecutor(), kernelBundle);
269 return ThreadSpec{frameSpec.getNumFrames(), frameSpec.getFrameExtents(), frameSpec.getExecutor()};
274 template<
typename TKernelFn,
typename... TArgs>
275 static auto adjustThreadSpec(
277 onHost::concepts::FrameSpec
auto const& frameSpec,
280 return AdjustThreadSpec::
289 template<
typename T_Any>
292 decltype(
auto)
operator()(
auto&&
any)
const
294 return std::data(any);
298 static decltype(
auto) data(
auto&& any)
303 template<
typename T_Any>
306 return Op<
ALPAKA_TYPEOF(*anyHandle.get())>{}(*anyHandle.get());
312 template<
typename T_Type,
typename T_Any,
typename T_Extents>
315 void operator()(T_Any& any, T_Extents
const&)
const;
321 template<
typename T_Type,
typename T_Any,
typename T_Extents>
324 void operator()(T_Any& any, T_Extents
const&)
const;
330 template<
typename T_Type,
typename T_Any,
typename T_Extents>
333 void operator()(T_Any& any, T_Extents
const&)
const;
339 template<
typename T_Type,
typename T_Any,
typename T_Extents>
342 void operator()(T_Any& any, T_Extents
const&)
const;
354 struct IsDataAccessible
356 template<
typename T_Device,
typename T_Any>
359 bool operator()(T_Device& device, T_Any
const& any)
const;
362 template<
typename T_DataApi, alpaka::concepts::DeviceKind T_DeviceKind,
typename T_Any>
365 bool operator()(T_DataApi, T_DeviceKind, T_Any
const&)
const
374 template<
typename T_Queue,
typename T_Dest,
typename T_Source,
typename T_Extents>
377 void operator()(T_Queue& queue,
auto&&, T_Source
const&, T_Extents
const&)
const;
381 struct MemcpyDeviceGlobal
383 template<
typename T_Queue,
typename T_Dest,
typename T_Source>
392 void operator()(T_Queue& queue, T_Dest&&, T_Source&&)
const;
398 template<
typename T_Queue,
typename T_Dest,
typename T_Extents>
401 void operator()(T_Queue& queue,
auto&&, uint8_t, T_Extents
const&)
const;
407 template<
typename T_Queue,
typename T_Dest,
typename T_Value,
typename T_Extents>
410 void operator()(T_Queue& queue,
auto&&, T_Value, T_Extents
const&)
const;
414 struct GetDeviceProperties
416 template<
typename T_Any>
419 DeviceProperties operator()(
auto const& platform, uint32_t idx)
const;
421 DeviceProperties operator()(
auto const& device)
const;
425 struct GetFreeGlobalMemBytes
427 template<
typename T_Any>
430 size_t operator()(
auto const& device)
const
432 return device.getFreeGlobalMemBytes();
437 inline DeviceProperties getDeviceProperties(
auto const& platform, uint32_t idx)
439 return GetDeviceProperties::Op<
ALPAKA_TYPEOF(platform)>{}(platform, idx);
444 template<
typename T_Any>
447 decltype(
auto)
operator()(
auto&&
any)
const
449 return any.getExtents();
454 inline auto getExtents(
auto&& any)
459 template<
typename T_Any>
467 template<
typename T_Any>
470 decltype(
auto)
operator()(
auto&&
any)
const
472 return any.getPitches();
477 inline auto getPitches(
auto&& any)
482 template<
typename T_Any>
492 template<
typename T_DataType>
493 inline constexpr auto getFrameSpec(
494 auto const& internalDevice,
495 alpaka::concepts::Executor
auto executor,
496 alpaka::concepts::VectorOrScalar
auto const& extents)
498 Vec extentMd = extents;
499 auto deviceKind = alpaka::internal::getDeviceKind(internalDevice);
500 auto deviceApi = alpaka::internal::getApi(internalDevice);
505 auto props = internal::GetDeviceProperties::Op<
ALPAKA_TYPEOF(internalDevice)>{}(internalDevice);
506 IndexType
warpSize =
static_cast<IndexType
>(props.warpSize);
508 IndexType numFrameElements = 512;
511 auto frameExtents = ExtentVecType::fill(1).rAssign(fastDimensionValue);
512 numFrameElements /= frameExtents.x();
514 while(numFrameElements > IndexType{1})
516 uint32_t maxIdx = ExtentVecType::dim() - 1u;
517 IndexType maxValue = 0;
518 for(
auto i = 0u; i < ExtentVecType::dim(); ++i)
520 auto v = extentMd[i] / frameExtents[i] / IndexType{2};
528 auto v = extentMd[maxIdx] / frameExtents[maxIdx] / IndexType{2};
529 if(v >= IndexType{1})
530 frameExtents[maxIdx] *= IndexType{2};
533 numFrameElements /= IndexType{2};
535 IndexType elementsPerFrameItem
537 alpaka::concepts::Vector
auto numFrames
538 =
divExZero(extentMd, frameExtents * frameExtents.fill(1).rAssign(elementsPerFrameItem));
540 auto frameSpec =
FrameSpec{numFrames, frameExtents, executor};
#define ALPAKA_ASSERT(...)
The assert can be explicitly disabled by defining NDEBUG.
#define ALPAKA_TYPEOF(...)
Get the type of an instance.
constexpr WarpSize warpSize
constexpr DeviceKind deviceKind
constexpr bool any(alpaka::onAcc::concepts::Acc auto const &acc, int32_t predicate)
Evaluates predicate for all active threads of the warp.
Functionality which is usable on the host CPU controller thread.
FrameSpec(T_NumFrames const &, T_FrameExtents const &) -> FrameSpec< alpaka::trait::getVec_t< T_NumFrames >, alpaka::trait::getVec_t< T_FrameExtents >, alpaka::exec::AnyExecutor >
std::shared_ptr< T > Handle
ThreadSpec(T_NumBlocks const &, T_NumThreads const &) -> ThreadSpec< alpaka::trait::getVec_t< T_NumBlocks >, alpaka::trait::getVec_t< T_NumThreads > >
void reduce(Queue< T_Device, T_QueueKind > const &queue, alpaka::concepts::Executor auto const exec, DataType const &neutralElement, alpaka::concepts::IMdSpan auto out, auto &&binaryReduceFn, auto &&in)
Accumulates the results into a scalar value.
typename GetValueType< T >::type GetValueType_t
ALPAKA_FN_HOST_ACC constexpr auto divExZero(Integral a, Integral b) -> Integral
Returns max(a / b, 1) as an integer.
consteval uint32_t getNumElemPerThread(concepts::Api auto const api, alpaka::concepts::DeviceKind auto const deviceType)
Get the number of elements to compute per thread.
constexpr T roundDownToPowerOfTwo(T value)
Rounds down to the nearest power of two that is less than or equal to the value.
ALPAKA_FN_HOST_ACC Vec(T_1, T_Args...) -> Vec< T_1, uint32_t(sizeof...(T_Args)+1u), ArrayStorage< T_1, uint32_t(sizeof...(T_Args)+1u)> >
ALPAKA_FN_HOST KernelBundle(TKernelFn const &, TArgs &&...) -> KernelBundle< TKernelFn, TArgs... >
User defined deduction guide with trailing return type. For CTAD during the construction.