29#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
42 template<
typename T_Device>
43 struct Queue : std::enable_shared_from_this<Queue<T_Device>>
45 using ApiInterface =
typename T_Device::ApiInterface;
48 Queue(internal::concepts::DeviceHandle
auto device, uint32_t
const idx,
bool isBlocking)
49 : m_device(std::move(
device))
51 , m_isBlocking(isBlocking)
54 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
56 ApiInterface::setDevice(onHost::getNativeHandle(m_device)));
57 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
59 ApiInterface::streamCreateWithFlags(&m_UniformCudaHipQueue, ApiInterface::streamNonBlocking));
65 onHost::internal::wait(*
this);
66 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(
71 Queue(Queue
const&) =
delete;
72 Queue& operator=(Queue
const&) =
delete;
74 Queue(Queue&&) =
delete;
75 Queue& operator=(Queue&&) =
delete;
79 return m_idx == other.m_idx && m_device == other.m_device;
84 return !(*
this == other);
90 static_assert(internal::concepts::Queue<Queue>);
93 Handle<T_Device> m_device;
95 typename ApiInterface::Stream_t m_UniformCudaHipQueue;
96 core::CallbackThread m_callBackThread;
97 bool m_isBlocking{
false};
106 void conditionalWait() const noexcept
109 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::streamSynchronize(
getNativeHandle()));
112 friend struct alpaka::internal::GetName;
116 return std::string(
"unifiedCudaHip::Queue id=") + std::to_string(m_idx);
119 friend struct onHost::internal::GetNativeHandle;
123 return m_UniformCudaHipQueue;
126 friend struct onHost::internal::Enqueue;
127 friend struct onHost::internal::Wait;
131 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::streamSynchronize(
getNativeHandle()));
134 friend struct alpaka::internal::GetDeviceType;
146 std::shared_ptr<Queue> getSharedPtr()
148 return this->shared_from_this();
151 friend struct alpaka::onHost::internal::WaitFor;
153 void waitFor(unifiedCudaHip::Event<T_Device>& event)
155 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
157 ApiInterface::streamWaitEvent(
getNativeHandle(), internal::getNativeHandle(event), 0));
162 friend struct internal::IsQueueEmpty;
168 typename ApiInterface::Error_t ret = ApiInterface::success;
169 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
172 ApiInterface::errorNotReady);
173 return (ret == ApiInterface::success);
176 friend struct onHost::internal::GetDevice;
178 friend struct alpaka::internal::GetApi;
179 friend struct onHost::internal::Memcpy;
180 friend struct onHost::internal::MemcpyDeviceGlobal;
181 friend struct onHost::internal::Memset;
182 friend struct onHost::internal::AllocDeferred;
183 friend struct CallKernel;
187 alpaka::concepts::Api T_Api,
188 alpaka::concepts::DeviceKind T_DeviceKind,
189 alpaka::concepts::Executor T_Executor,
190 bool launchedWidthFrameSpec,
191 typename TKernelBundle,
192 typename T_OptimizedThreadSpec>
193 __global__
void gpuKernel(TKernelBundle
const kernelBundle, T_OptimizedThreadSpec
const optimizedThreadSpec)
195 constexpr auto warpSizeValue = alpaka::onAcc::unifiedCudaHip::internal::WarpSize::Get<T_DeviceKind>{}();
196 auto acc = onAcc::Acc{
198 DictEntry(
layer::block, onAcc::unifiedCudaHip::BlockLayer{optimizedThreadSpec}),
199 DictEntry(
layer::thread, onAcc::unifiedCudaHip::ThreadLayer{optimizedThreadSpec}),
200 DictEntry(object::launchedWidthFrameSpec, std::bool_constant<launchedWidthFrameSpec>{}),
201 DictEntry(action::threadBlockSync, onAcc::unifiedCudaHip::Sync{}),
204 DictEntry(object::exec, T_Executor{}),
210 ALPAKA_FN_HOST auto convertVecToUniformCudaHipDim(alpaka::concepts::Vector
auto const& vec) -> dim3
214 if constexpr(vecDim >= 1u)
215 dim.x =
static_cast<unsigned>(vec[vecDim - 1u]);
216 if constexpr(vecDim >= 2u)
217 dim.y =
static_cast<unsigned>(vec[vecDim - 2u]);
218 if constexpr(vecDim >= 3u)
219 dim.z =
static_cast<unsigned>(vec[vecDim - 3u]);
226 template<alpaka::concepts::Vector T_NumBlocks, alpaka::concepts::Vector T_NumThreads>
227 struct OptimizedThreadSpec
229 using NumBlocksVecType =
typename T_NumBlocks::UniVec;
230 using NumThreadsVecType = T_NumThreads;
232 static consteval uint32_t dim()
234 return T_NumThreads::dim();
237 constexpr OptimizedThreadSpec(T_NumBlocks
const&, T_NumThreads
const&)
243 bool launchedWidthFrameSpec,
245 alpaka::concepts::Vector T_NumBlocks,
246 alpaka::concepts::Vector T_NumThreads,
247 alpaka::concepts::Executor T_Executor,
248 typename T_KernelBundle>
250 unifiedCudaHip::Queue<T_Device>& queue,
251 ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor>
const& threadSpec,
252 T_KernelBundle
const& kernelBundle)
const
256 "'exec::anyExecutor' can not be used to enqueue an kernel.");
259 using ApiInterface =
typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
260 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
262 ApiInterface::setDevice(onHost::getNativeHandle(
queue.m_device)));
264 constexpr uint32_t dim = T_NumBlocks::dim();
266 constexpr uint32_t layerDim = dim >= 4u ? 1u : dim;
267 using IdxType =
typename T_NumBlocks::type;
269 Vec<IdxType, layerDim> numBlocks;
270 Vec<IdxType, layerDim> numThreadsPerBlock;
272 if constexpr(dim >= 4u)
274 numBlocks = threadSpec.getNumBlocks().product();
275 numThreadsPerBlock = threadSpec.getNumThreads().product();
279 numBlocks = threadSpec.getNumBlocks();
280 numThreadsPerBlock = threadSpec.getNumThreads();
283 using ThreadSpecType = std::conditional_t<
290 auto optimizedThreadSpec = ThreadSpecType(threadSpec.getNumBlocks(), threadSpec.getNumThreads());
292 auto kernelName = gpuKernel<
296 launchedWidthFrameSpec,
300 uint32_t blockDynSharedMemBytes = onHost::getDynSharedMemBytes(threadSpec, kernelBundle);
303 convertVecToUniformCudaHipDim(numBlocks),
304 convertVecToUniformCudaHipDim(numThreadsPerBlock),
305 static_cast<std::size_t
>(blockDynSharedMemBytes),
306 queue.getNativeHandle()>>>(kernelBundle, optimizedThreadSpec);
308 queue.conditionalWait();
316 template<
typename T_Device>
317 struct GetApi::Op<onHost::unifiedCudaHip::Queue<T_Device>>
319 inline constexpr auto operator()(
auto&& queue)
const
330 template<
typename T_Device,
typename T_Task>
341 unifiedCudaHip::Queue<T_Device>& q;
345 static void uniformCudaHipRtHostFunc(
void* arg)
347 auto data = std::unique_ptr<HostFuncData>(
reinterpret_cast<HostFuncData*
>(arg));
349 auto f =
queue.m_callBackThread.submit([d = std::move(
data)] { d->t(); });
353 void operator()(unifiedCudaHip::Queue<T_Device>& queue, T_Task
const& task)
const
356 using ApiInterface =
typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
358 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
360 ApiInterface::launchHostFunc(
361 queue.getNativeHandle(),
362 uniformCudaHipRtHostFunc,
363 new HostFuncData{queue, task}));
365 queue.conditionalWait();
369 template<
typename T_Device,
typename T_Task>
375 unifiedCudaHip::Queue<T_Device>& q;
379 static void uniformCudaHipRtHostFuncAsync(
void* arg)
381 auto data = std::unique_ptr<HostFuncData>(
reinterpret_cast<HostFuncData*
>(arg));
383 auto queueDependency =
queue.getSharedPtr();
384 queue.m_callBackThread.submit([d = std::move(
data), queueDependency] { d->t(); });
388 void operator()(unifiedCudaHip::Queue<T_Device>& queue, T_Task
const& task)
const
391 using ApiInterface =
typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
393 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
395 ApiInterface::launchHostFunc(
396 queue.getNativeHandle(),
397 uniformCudaHipRtHostFuncAsync,
398 new HostFuncData{queue, task}));
400 queue.conditionalWait();
404 template<
typename T_Device,
typename T_Event>
405 struct internal::Enqueue::Event<unifiedCudaHip::
Queue<T_Device>, T_Event>
407 void operator()(unifiedCudaHip::Queue<T_Device>& queue, T_Event& event)
const
410 using ApiInterface =
typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
411 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
413 ApiInterface::eventRecord(
event.getNativeHandle(),
queue.getNativeHandle()));
415 queue.conditionalWait();
421 alpaka::concepts::UnifiedCudaHipExecutor T_Executor,
422 alpaka::concepts::Vector T_NumBlocks,
423 alpaka::concepts::Vector T_NumThreads,
424 typename T_KernelBundle>
426 Kernel<unifiedCudaHip::Queue<T_Device>, ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor>, T_KernelBundle>
429 unifiedCudaHip::Queue<T_Device>& queue,
431 T_KernelBundle
const& kernelBundle)
const
434 unifiedCudaHip::CallKernel{}.template operator()<
false>(
queue, threadSpec, kernelBundle);
440 alpaka::concepts::UnifiedCudaHipExecutor T_Executor,
441 typename T_NumFrames,
442 typename T_FrameExtents,
443 typename T_KernelBundle>
445 Kernel<unifiedCudaHip::Queue<T_Device>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
448 unifiedCudaHip::Queue<T_Device>& queue,
449 FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>
const& frameSpec,
450 T_KernelBundle
const& kernelBundle)
const
454 "'exec::anyExecutor' can not be used to enqueue an kernel.");
457 unifiedCudaHip::CallKernel{}.template operator()<
true>(
queue, threadBlocking, kernelBundle);
461 template<
typename T_Device,
typename T_Dest,
typename T_Source,
typename T_Extents>
462 struct Memcpy::Op<unifiedCudaHip::
Queue<T_Device>, T_Dest, T_Source, T_Extents>
465 unifiedCudaHip::Queue<T_Device>& queue,
467 T_Source
const& source,
468 T_Extents
const& extents)
const requires std::same_as<
ALPAKA_TYPEOF(dest), T_Dest>
471 using ApiInterface =
typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
475 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
482 auto copyKind = unifiedCudaHip::MemcpyKind<
488 if constexpr(dim == 1u)
491 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
493 ApiInterface::memcpyAsync(
500 else if constexpr(dim == 2u)
502 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
504 ApiInterface::memcpy2DAsync(
506 dest.getPitches().y(),
508 source.getPitches().y(),
514 else if constexpr(dim >= 3u)
516 auto const extentMdNoXY = extentMd.eraseBack().eraseBack();
518 typename ApiInterface::Memcpy3DParms_t memCpy3DParms{};
520 memCpy3DParms.srcPtr = ApiInterface::makePitchedPtr(
522 const_cast<void*
>(srcPtr),
523 source.getPitches().y(),
524 source.getExtents().x(),
525 source.getExtents().y());
526 memCpy3DParms.dstPtr = ApiInterface::makePitchedPtr(
528 dest.getPitches().y(),
529 dest.getExtents().x(),
530 dest.getExtents().y());
531 memCpy3DParms.extent = ApiInterface::makeExtent(
534 extentMdNoXY.product());
535 memCpy3DParms.kind = copyKind;
537 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
542 queue.conditionalWait();
547 template<
typename T_Device,
typename T_Source,
typename T_Storage,
typename T>
548 struct internal::MemcpyDeviceGlobal::
549 Op<unifiedCudaHip::Queue<T_Device>, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>, T_Source>
552 unifiedCudaHip::Queue<T_Device>& queue,
553 onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> dest,
558 using ApiInterface =
typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
560 auto copyKind = unifiedCudaHip::
563 void* destPtr{
nullptr};
564 void const* srcPtr{
nullptr};
570 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
572 ApiInterface::getSymbolAddress(
reinterpret_cast<void**
>(&destPtr), dest.getHandle(queueApi)));
574 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
578 queue.conditionalWait();
583 template<
typename T_Device,
typename T_Dest,
typename T_Storage,
typename T>
584 struct internal::MemcpyDeviceGlobal::
585 Op<unifiedCudaHip::Queue<T_Device>, T_Dest, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>>
588 unifiedCudaHip::Queue<T_Device>& queue,
590 onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> source)
const
594 using ApiInterface =
typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
596 auto copyKind = unifiedCudaHip::
599 void* destPtr{
nullptr};
605 void* srcPtr{
nullptr};
607 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
609 ApiInterface::getSymbolAddress(
reinterpret_cast<void**
>(&srcPtr), source.getHandle(queueApi)));
611 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
615 queue.conditionalWait();
619 template<
typename T_Device,
typename T_Dest,
typename T_Extents>
626 unifiedCudaHip::Queue<T_Device>& queue,
629 T_Extents
const& extents)
const requires(std::is_same_v<
ALPAKA_TYPEOF(dest), T_Dest>)
632 using ApiInterface =
typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
635 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
642 if constexpr(dim == 1u)
644 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
646 ApiInterface::memsetAsync(
648 static_cast<int>(byteValue),
652 else if constexpr(dim == 2u)
654 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
656 ApiInterface::memset2DAsync(
658 dest.getPitches().y(),
659 static_cast<int>(byteValue),
664 else if constexpr(dim >= 3u)
666 typename ApiInterface::PitchedPtr_t
const pitchedPtrVal = ApiInterface::makePitchedPtr(
668 dest.getPitches().y(),
669 dest.getExtents().x(),
670 dest.getExtents().y());
672 auto const extentMdNoXY = extentMd.eraseBack().eraseBack();
673 typename ApiInterface::Extent_t
const extentVal = ApiInterface::makeExtent(
676 extentMdNoXY.product());
678 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
680 ApiInterface::memset3DAsync(
682 static_cast<int>(byteValue),
687 queue.conditionalWait();
691 template<
typename T_Device,
typename T_Dest,
typename T_Value,
typename T_Extents>
692 struct Fill::Op<unifiedCudaHip::
Queue<T_Device>, T_Dest, T_Value, T_Extents>
695 unifiedCudaHip::Queue<T_Device>& queue,
697 T_Value elementValue,
698 T_Extents
const& extents)
const
709 dataView.getSubView(extents),
717 template<
typename T_Type,
typename T_Device, alpaka::concepts::Vector T_Extents>
720 auto operator()(unifiedCudaHip::Queue<T_Device>& queue, T_Extents
const& extents)
const
723 using ApiInterface =
typename T_Device::ApiInterface;
730 constexpr uint32_t alignment = 128u;
733 T_Type* ptr =
nullptr;
734 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
736 ApiInterface::mallocAsync((
void**) &ptr, memSizeInByte,
queue.getNativeHandle()));
738 queue.conditionalWait();
742 auto queueDependency =
queue.getSharedPtr();
744 auto deleter = [ptr, queueDependency]()
746 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(
748 ApiInterface::freeAsync(
toVoidPtr(ptr), queueDependency->getNativeHandle()));
757 Alignment<alignment>{}};
#define ALPAKA_FN_HOST
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
#define ALPAKA_TYPEOF(...)
Get the type of instance.
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
#define ALPAKA_LOG_FUNCTION(logLvl)
Log the entry and exit of a scope.
auto emulatedAlignedMemDescription(uint32_t alignmentInByte, T_Extents extents)
provides a memory description to create multidimensional linewise aligned memory within a one dimensi...
constexpr bool operator!=(alpaka::concepts::Api auto lhs, alpaka::concepts::Api auto rhs)
constexpr bool operator==(alpaka::concepts::Api auto lhs, alpaka::concepts::Api auto rhs)
constexpr AnyExecutor anyExecutor
Automatic executor selection.
void fill(auto &internalQueue, auto executor, alpaka::concepts::IMdSpan< T_Value > auto &&dest, T_Value elementValue)
alpaka internal implementations.
constexpr auto getApi(auto &&any)
constexpr auto getDeviceKind(auto &&any)
constexpr WarpSize warpSize
constexpr DeviceKind deviceKind
static auto adjustThreadSpec(auto const &device, onHost::concepts::FrameSpec auto const &frameSpec, KernelBundle< TKernelFn, TArgs... > const &kernelBundle)
bool isQueueEmpty(auto &queue)
constexpr auto getDevice(auto &&any)
auto getNativeHandle(auto &&any)
void waitFor(auto &queue, auto &event)
Functionality which is usable on the host CPU controller thread.
auto getNativeHandle(auto const &handle)
Get the native handle of an handle.
constexpr auto defaultExecutor(internal::concepts::DeviceHandle auto deviceHandle)
Select a default executor for the given device.
SharedBuffer(T_Any const &, T_Type *, T_UserExtents const &, T_UserPitches const &, std::invocable<> auto, T_MemAlignment const) -> SharedBuffer< ALPAKA_TYPEOF(getApi(std::declval< T_Any >())), T_Type, typename T_UserPitches::UniVec, T_MemAlignment >
decltype(auto) data(auto &&any)
pointer to data of an object
std::convertible_to< std::string > auto getName(auto &&any)
Runtime name for a given object.
ThreadSpec(T_NumBlocks const &, T_NumThreads const &) -> ThreadSpec< alpaka::trait::getVec_t< T_NumBlocks >, alpaka::trait::getVec_t< T_NumThreads > >
Device(Handle< T_Device > &&) -> Device< ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval< T_Device >())), ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval< T_Device >()))>
void wait(alpaka::concepts::HasGet auto &handle)
wait for all work to be finished
Queue(Handle< T_Queue > &&, T_QueueKind) -> Queue< Device< ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval< T_Queue >())), ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval< T_Queue >()))>, T_QueueKind >
typename GetValueType< T >::type GetValueType_t
constexpr uint32_t getDim_v
auto * toVoidPtr(T inPtr)
Cast a pointer that may or may not point to volatile memory to a (void*) or (void const*).
constexpr decltype(auto) getDeviceKind(auto &&any)
Get the device type of an object.
constexpr decltype(auto) getApi(auto &&any)
Get the API an object depends on.
constexpr auto makeView(auto &&anyWithApi, T_ValueType *pointer, concepts::Vector auto const &extents, T_MemAlignment const memAlignment=T_MemAlignment{})
ALPAKA_FN_HOST_ACC Dict(Tuple< DictEntry< T_Keys, T_Values >... > const &) -> Dict< DictEntry< T_Keys, T_Values >... >
constexpr decltype(auto) pCast(auto &&input)
Performs a static_cast on the storage type of combined data type.
constexpr auto operator()(auto &&any) const
void operator()(T_Any &any, T_Extents const &) const
void operator()(T_Queue &queue, T_Task const &task) const
void operator()(T_Queue &queue, T_Task const &task) const
void operator()(T_Queue &queue, auto &&, T_Value, T_Extents const &) const
void operator()(T_Queue &queue, auto &&, T_Source const &, T_Extents const &) const
void operator()(T_Queue &queue, auto &&, uint8_t, T_Extents const &) const