alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
Queue.hpp
Go to the documentation of this file.
1/* Copyright 2024 René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4#pragma once
5
16#include "alpaka/api/util.hpp"
21#include "alpaka/onAcc/Acc.hpp"
28
29#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
30
32
33# include <cstdint>
34# include <sstream>
35
36namespace alpaka::onHost
37{
38 namespace unifiedCudaHip
39 {
40 struct CallKernel;
41
42 template<typename T_Device>
43 struct Queue : std::enable_shared_from_this<Queue<T_Device>>
44 {
45 using ApiInterface = typename T_Device::ApiInterface;
46
47 public:
48 Queue(internal::concepts::DeviceHandle auto device, uint32_t const idx, bool isBlocking)
49 : m_device(std::move(device))
50 , m_idx(idx)
51 , m_isBlocking(isBlocking)
52 {
53 ALPAKA_LOG_FUNCTION(onHost::logger::queue);
54 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
55 ApiInterface,
56 ApiInterface::setDevice(onHost::getNativeHandle(m_device)));
57 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
58 ApiInterface,
59 ApiInterface::streamCreateWithFlags(&m_UniformCudaHipQueue, ApiInterface::streamNonBlocking));
60 }
61
62 ~Queue()
63 {
64 ALPAKA_LOG_FUNCTION(onHost::logger::queue);
65 onHost::internal::wait(*this);
66 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(
67 ApiInterface,
68 ApiInterface::streamDestroy(getNativeHandle()));
69 }
70
71 Queue(Queue const&) = delete;
72 Queue& operator=(Queue const&) = delete;
73
74 Queue(Queue&&) = delete;
75 Queue& operator=(Queue&&) = delete;
76
77 bool operator==(Queue const& other) const
78 {
79 return m_idx == other.m_idx && m_device == other.m_device;
80 }
81
82 bool operator!=(Queue const& other) const
83 {
84 return !(*this == other);
85 }
86
87 private:
88 void _()
89 {
90 static_assert(internal::concepts::Queue<Queue>);
91 }
92
93 Handle<T_Device> m_device;
94 uint32_t m_idx = 0u;
95 typename ApiInterface::Stream_t m_UniformCudaHipQueue;
96 core::CallbackThread m_callBackThread;
97 bool m_isBlocking{false};
98
99 /** Waits until all operations are finished depending on whether the queue is blocking or non-blocking.
100 *
101 * If the queue is a blocking queue the control flow will be blocked and the method is not returning until
102 * all work in the queue is processed. This method should be called after the task is enqueued into the
103 * native CUDA/HIP queue. There is no need to call this method before enqueuing because the queues are
104 * in-order queues and even if another thread is enqueued something before the order is guaranteed.
105 */
106 void conditionalWait() const noexcept
107 {
108 if(m_isBlocking)
109 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::streamSynchronize(getNativeHandle()));
110 }
111
112 friend struct alpaka::internal::GetName;
113
114 std::string getName() const
115 {
116 return std::string("unifiedCudaHip::Queue id=") + std::to_string(m_idx);
117 }
118
119 friend struct onHost::internal::GetNativeHandle;
120
121 [[nodiscard]] auto getNativeHandle() const noexcept
122 {
123 return m_UniformCudaHipQueue;
124 }
125
126 friend struct onHost::internal::Enqueue;
127 friend struct onHost::internal::Wait;
128
129 void wait() const
130 {
131 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::streamSynchronize(getNativeHandle()));
132 }
133
134 friend struct alpaka::internal::GetDeviceType;
135
136 auto getDeviceKind() const
137 {
138 return alpaka::internal::getDeviceKind(*m_device.get());
139 }
140
141 auto getDevice() const
142 {
143 return m_device;
144 }
145
146 std::shared_ptr<Queue> getSharedPtr()
147 {
148 return this->shared_from_this();
149 }
150
151 friend struct alpaka::onHost::internal::WaitFor;
152
153 void waitFor(unifiedCudaHip::Event<T_Device>& event)
154 {
155 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
156 ApiInterface,
157 ApiInterface::streamWaitEvent(getNativeHandle(), internal::getNativeHandle(event), 0));
158
159 conditionalWait();
160 }
161
162 friend struct internal::IsQueueEmpty;
163
164 bool isQueueEmpty() const
165 {
166 ALPAKA_LOG_FUNCTION(onHost::logger::queue);
167
168 typename ApiInterface::Error_t ret = ApiInterface::success;
169 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
170 ApiInterface,
171 ret = ApiInterface::streamQuery(getNativeHandle()),
172 ApiInterface::errorNotReady);
173 return (ret == ApiInterface::success);
174 }
175
176 friend struct onHost::internal::GetDevice;
177
178 friend struct alpaka::internal::GetApi;
179 friend struct onHost::internal::Memcpy;
180 friend struct onHost::internal::MemcpyDeviceGlobal;
181 friend struct onHost::internal::Memset;
182 friend struct onHost::internal::AllocDeferred;
183 friend struct CallKernel;
184 };
185
186 template<
187 alpaka::concepts::Api T_Api,
188 alpaka::concepts::DeviceKind T_DeviceKind,
189 alpaka::concepts::Executor T_Executor,
190 bool launchedWidthFrameSpec,
191 typename TKernelBundle,
192 typename T_OptimizedThreadSpec>
193 __global__ void gpuKernel(TKernelBundle const kernelBundle, T_OptimizedThreadSpec const optimizedThreadSpec)
194 {
195 constexpr auto warpSizeValue = alpaka::onAcc::unifiedCudaHip::internal::WarpSize::Get<T_DeviceKind>{}();
196 auto acc = onAcc::Acc{
197 Dict{
198 DictEntry(layer::block, onAcc::unifiedCudaHip::BlockLayer{optimizedThreadSpec}),
199 DictEntry(layer::thread, onAcc::unifiedCudaHip::ThreadLayer{optimizedThreadSpec}),
200 DictEntry(object::launchedWidthFrameSpec, std::bool_constant<launchedWidthFrameSpec>{}),
201 DictEntry(action::threadBlockSync, onAcc::unifiedCudaHip::Sync{}),
202 DictEntry(object::api, T_Api{}),
203 DictEntry(object::deviceKind, T_DeviceKind{}),
204 DictEntry(object::exec, T_Executor{}),
205 DictEntry(object::warpSize, warpSizeValue)},
206 };
207 kernelBundle(acc);
208 }
209
210 ALPAKA_FN_HOST auto convertVecToUniformCudaHipDim(alpaka::concepts::Vector auto const& vec) -> dim3
211 {
212 constexpr auto vecDim = ALPAKA_TYPEOF(vec)::dim();
213 dim3 dim(1, 1, 1);
214 if constexpr(vecDim >= 1u)
215 dim.x = static_cast<unsigned>(vec[vecDim - 1u]);
216 if constexpr(vecDim >= 2u)
217 dim.y = static_cast<unsigned>(vec[vecDim - 2u]);
218 if constexpr(vecDim >= 3u)
219 dim.z = static_cast<unsigned>(vec[vecDim - 3u]);
220
221 return dim;
222 }
223
224 struct CallKernel
225 {
226 template<alpaka::concepts::Vector T_NumBlocks, alpaka::concepts::Vector T_NumThreads>
227 struct OptimizedThreadSpec
228 {
229 using NumBlocksVecType = typename T_NumBlocks::UniVec;
230 using NumThreadsVecType = T_NumThreads;
231
232 static consteval uint32_t dim()
233 {
234 return T_NumThreads::dim();
235 }
236
237 constexpr OptimizedThreadSpec(T_NumBlocks const&, T_NumThreads const&)
238 {
239 }
240 };
241
242 template<
243 bool launchedWidthFrameSpec,
244 typename T_Device,
245 alpaka::concepts::Vector T_NumBlocks,
246 alpaka::concepts::Vector T_NumThreads,
247 alpaka::concepts::Executor T_Executor,
248 typename T_KernelBundle>
249 void operator()(
250 unifiedCudaHip::Queue<T_Device>& queue,
251 ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor> const& threadSpec,
252 T_KernelBundle const& kernelBundle) const
253 {
254 static_assert(
255 ALPAKA_TYPEOF(threadSpec)::getExecutor() != exec::anyExecutor,
256 "'exec::anyExecutor' can not be used to enqueue an kernel.");
257 ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::queue);
258
259 using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
260 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
261 ApiInterface,
262 ApiInterface::setDevice(onHost::getNativeHandle(queue.m_device)));
263
264 constexpr uint32_t dim = T_NumBlocks::dim();
265 // dimension of the cuda/hip layer
266 constexpr uint32_t layerDim = dim >= 4u ? 1u : dim;
267 using IdxType = typename T_NumBlocks::type;
268
269 Vec<IdxType, layerDim> numBlocks;
270 Vec<IdxType, layerDim> numThreadsPerBlock;
271
272 if constexpr(dim >= 4u)
273 {
274 numBlocks = threadSpec.getNumBlocks().product();
275 numThreadsPerBlock = threadSpec.getNumThreads().product();
276 }
277 else
278 {
279 numBlocks = threadSpec.getNumBlocks();
280 numThreadsPerBlock = threadSpec.getNumThreads();
281 }
282
283 using ThreadSpecType = std::conditional_t<
284 dim >= 4u,
285 ALPAKA_TYPEOF(threadSpec),
286 OptimizedThreadSpec<
287 typename ALPAKA_TYPEOF(threadSpec)::NumBlocksVecType,
288 typename ALPAKA_TYPEOF(threadSpec)::NumThreadsVecType>>;
289 // thread spec which is only holding data if the dimension is larger than 3u
290 auto optimizedThreadSpec = ThreadSpecType(threadSpec.getNumBlocks(), threadSpec.getNumThreads());
291
292 auto kernelName = gpuKernel<
293 ALPAKA_TYPEOF(getApi(queue)),
295 T_Executor,
296 launchedWidthFrameSpec,
297 T_KernelBundle,
298 ALPAKA_TYPEOF(optimizedThreadSpec)>;
299
300 uint32_t blockDynSharedMemBytes = onHost::getDynSharedMemBytes(threadSpec, kernelBundle);
301
302 kernelName<<<
303 convertVecToUniformCudaHipDim(numBlocks),
304 convertVecToUniformCudaHipDim(numThreadsPerBlock),
305 static_cast<std::size_t>(blockDynSharedMemBytes),
306 queue.getNativeHandle()>>>(kernelBundle, optimizedThreadSpec);
307
308 queue.conditionalWait();
309 }
310 };
311 } // namespace unifiedCudaHip
312} // namespace alpaka::onHost
313
314namespace alpaka::internal
315{
316 template<typename T_Device>
317 struct GetApi::Op<onHost::unifiedCudaHip::Queue<T_Device>>
318 {
319 inline constexpr auto operator()(auto&& queue) const
320 {
321 return getApi(queue.m_device);
322 }
323 };
324} // namespace alpaka::internal
325
326namespace alpaka::onHost
327{
328 namespace internal
329 {
330 template<typename T_Device, typename T_Task>
331 struct Enqueue::HostTask<unifiedCudaHip::Queue<T_Device>, T_Task>
332 {
333 struct HostFuncData
334 {
335 // We don't need to keep the queue alive, because in its dtor it will synchronize with the CUDA/HIP
336 // stream and wait until all host functions and the CallbackThread are done. It's actually an error to
337 // copy the queue into the host function. Destroying it here would call CUDA/HIP APIs from the host
338 // function. Passing it further to the Callback thread, would make the Callback thread hold a task
339 // containing the queue with the CallbackThread itself. Destroying the task if no other queue instance
340 // exists will make the CallbackThread join itself and crash.
341 unifiedCudaHip::Queue<T_Device>& q;
342 T_Task t;
343 };
344
345 static void uniformCudaHipRtHostFunc(void* arg)
346 {
347 auto data = std::unique_ptr<HostFuncData>(reinterpret_cast<HostFuncData*>(arg));
348 auto& queue = data->q;
349 auto f = queue.m_callBackThread.submit([d = std::move(data)] { d->t(); });
350 f.wait();
351 }
352
353 void operator()(unifiedCudaHip::Queue<T_Device>& queue, T_Task const& task) const
354 {
356 using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
357
358 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
359 ApiInterface,
360 ApiInterface::launchHostFunc(
361 queue.getNativeHandle(),
362 uniformCudaHipRtHostFunc,
363 new HostFuncData{queue, task}));
364
365 queue.conditionalWait();
366 }
367 };
368
369 template<typename T_Device, typename T_Task>
370 struct Enqueue::HostTaskDeferred<unifiedCudaHip::Queue<T_Device>, T_Task>
371 {
372 // same as for Enqueue::HostTask, but not waiting for the task to finish
373 struct HostFuncData
374 {
375 unifiedCudaHip::Queue<T_Device>& q;
376 T_Task t;
377 };
378
379 static void uniformCudaHipRtHostFuncAsync(void* arg)
380 {
381 auto data = std::unique_ptr<HostFuncData>(reinterpret_cast<HostFuncData*>(arg));
382 auto& queue = data->q;
383 auto queueDependency = queue.getSharedPtr();
384 queue.m_callBackThread.submit([d = std::move(data), queueDependency] { d->t(); });
385 // don't wait, we're async
386 }
387
388 void operator()(unifiedCudaHip::Queue<T_Device>& queue, T_Task const& task) const
389 {
391 using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
392
393 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
394 ApiInterface,
395 ApiInterface::launchHostFunc(
396 queue.getNativeHandle(),
397 uniformCudaHipRtHostFuncAsync,
398 new HostFuncData{queue, task}));
399
400 queue.conditionalWait();
401 }
402 };
403
404 template<typename T_Device, typename T_Event>
405 struct internal::Enqueue::Event<unifiedCudaHip::Queue<T_Device>, T_Event>
406 {
407 void operator()(unifiedCudaHip::Queue<T_Device>& queue, T_Event& event) const
408 {
409 ALPAKA_LOG_FUNCTION(onHost::logger::event + onHost::logger::queue);
410 using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
411 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
412 ApiInterface,
413 ApiInterface::eventRecord(event.getNativeHandle(), queue.getNativeHandle()));
414
415 queue.conditionalWait();
416 }
417 };
418
419 template<
420 typename T_Device,
421 alpaka::concepts::UnifiedCudaHipExecutor T_Executor,
422 alpaka::concepts::Vector T_NumBlocks,
423 alpaka::concepts::Vector T_NumThreads,
424 typename T_KernelBundle>
425 struct Enqueue::
426 Kernel<unifiedCudaHip::Queue<T_Device>, ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor>, T_KernelBundle>
427 {
428 void operator()(
429 unifiedCudaHip::Queue<T_Device>& queue,
431 T_KernelBundle const& kernelBundle) const
432 {
434 unifiedCudaHip::CallKernel{}.template operator()<false>(queue, threadSpec, kernelBundle);
435 }
436 };
437
438 template<
439 typename T_Device,
440 alpaka::concepts::UnifiedCudaHipExecutor T_Executor,
441 typename T_NumFrames,
442 typename T_FrameExtents,
443 typename T_KernelBundle>
444 struct Enqueue::
445 Kernel<unifiedCudaHip::Queue<T_Device>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
446 {
447 void operator()(
448 unifiedCudaHip::Queue<T_Device>& queue,
449 FrameSpec<T_NumFrames, T_FrameExtents, T_Executor> const& frameSpec,
450 T_KernelBundle const& kernelBundle) const
451 {
452 static_assert(
453 ALPAKA_TYPEOF(frameSpec)::getExecutor() != exec::anyExecutor,
454 "'exec::anyExecutor' can not be used to enqueue an kernel.");
456 auto threadBlocking = internal::adjustThreadSpec(*queue.m_device.get(), frameSpec, kernelBundle);
457 unifiedCudaHip::CallKernel{}.template operator()<true>(queue, threadBlocking, kernelBundle);
458 }
459 };
460
461 template<typename T_Device, typename T_Dest, typename T_Source, typename T_Extents>
462 struct Memcpy::Op<unifiedCudaHip::Queue<T_Device>, T_Dest, T_Source, T_Extents>
463 {
464 void operator()(
465 unifiedCudaHip::Queue<T_Device>& queue,
466 auto&& dest,
467 T_Source const& source,
468 T_Extents const& extents) const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
469 {
471 using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
472
473 auto extentMd = pCast<size_t>(extents);
474
475 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
476 ApiInterface,
477 ApiInterface::setDevice(onHost::getNativeHandle(queue.m_device)));
478
479 void* destPtr = toVoidPtr(onHost::data(dest));
480 void const* srcPtr = toVoidPtr(onHost::data(source));
481
482 auto copyKind = unifiedCudaHip::MemcpyKind<
483 ApiInterface,
484 ALPAKA_TYPEOF(alpaka::internal::getApi(dest)),
485 ALPAKA_TYPEOF(alpaka::internal::getApi(source))>::kind;
486
487 constexpr auto dim = alpaka::trait::getDim_v<T_Extents>;
488 if constexpr(dim == 1u)
489 {
490 // Initiate the memory copy.
491 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
492 ApiInterface,
493 ApiInterface::memcpyAsync(
494 destPtr,
495 srcPtr,
496 extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
497 copyKind,
498 internal::getNativeHandle(queue)));
499 }
500 else if constexpr(dim == 2u)
501 {
502 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
503 ApiInterface,
504 ApiInterface::memcpy2DAsync(
505 destPtr,
506 dest.getPitches().y(),
507 srcPtr,
508 source.getPitches().y(),
509 extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
510 extentMd.y(),
511 copyKind,
512 internal::getNativeHandle(queue)));
513 }
514 else if constexpr(dim >= 3u)
515 {
516 auto const extentMdNoXY = extentMd.eraseBack().eraseBack();
517 // zero-init required per CUDA documentation
518 typename ApiInterface::Memcpy3DParms_t memCpy3DParms{};
519
520 memCpy3DParms.srcPtr = ApiInterface::makePitchedPtr(
521 // CUDA/HIP does not support const for pitched pointer
522 const_cast<void*>(srcPtr),
523 source.getPitches().y(),
524 source.getExtents().x(),
525 source.getExtents().y());
526 memCpy3DParms.dstPtr = ApiInterface::makePitchedPtr(
527 destPtr,
528 dest.getPitches().y(),
529 dest.getExtents().x(),
530 dest.getExtents().y());
531 memCpy3DParms.extent = ApiInterface::makeExtent(
532 extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
533 extentMd.y(),
534 extentMdNoXY.product());
535 memCpy3DParms.kind = copyKind;
536
537 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
538 ApiInterface,
539 ApiInterface::memcpy3DAsync(&memCpy3DParms, internal::getNativeHandle(queue)));
540 }
541
542 queue.conditionalWait();
543 }
544 };
545
546 // copy to device global memory
547 template<typename T_Device, typename T_Source, typename T_Storage, typename T>
548 struct internal::MemcpyDeviceGlobal::
549 Op<unifiedCudaHip::Queue<T_Device>, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>, T_Source>
550 {
551 void operator()(
552 unifiedCudaHip::Queue<T_Device>& queue,
553 onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> dest,
554 auto&& source) const
555 {
557
558 using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
559 auto queueApi = alpaka::internal::getApi(queue);
560 auto copyKind = unifiedCudaHip::
561 MemcpyKind<ApiInterface, ALPAKA_TYPEOF(queueApi), ALPAKA_TYPEOF(api::host)>::kind;
562
563 void* destPtr{nullptr};
564 void const* srcPtr{nullptr};
565 if constexpr(std::is_pointer_v<ALPAKA_TYPEOF(source)>)
566 srcPtr = source;
567 else
569
570 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
571 ApiInterface,
572 ApiInterface::getSymbolAddress(reinterpret_cast<void**>(&destPtr), dest.getHandle(queueApi)));
573
574 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
575 ApiInterface,
576 ApiInterface::memcpyAsync(destPtr, srcPtr, sizeof(T), copyKind, internal::getNativeHandle(queue)));
577
578 queue.conditionalWait();
579 }
580 };
581
582 // copy from device global memory
583 template<typename T_Device, typename T_Dest, typename T_Storage, typename T>
584 struct internal::MemcpyDeviceGlobal::
585 Op<unifiedCudaHip::Queue<T_Device>, T_Dest, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>>
586 {
587 void operator()(
588 unifiedCudaHip::Queue<T_Device>& queue,
589 auto&& dest,
590 onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> source) const
591 {
593
594 using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
595 auto queueApi = alpaka::internal::getApi(queue);
596 auto copyKind = unifiedCudaHip::
597 MemcpyKind<ApiInterface, ALPAKA_TYPEOF(api::host), ALPAKA_TYPEOF(queueApi)>::kind;
598
599 void* destPtr{nullptr};
600 if constexpr(std::is_pointer_v<ALPAKA_TYPEOF(dest)>)
601 destPtr = dest;
602 else
604
605 void* srcPtr{nullptr};
606
607 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
608 ApiInterface,
609 ApiInterface::getSymbolAddress(reinterpret_cast<void**>(&srcPtr), source.getHandle(queueApi)));
610
611 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
612 ApiInterface,
613 ApiInterface::memcpyAsync(destPtr, srcPtr, sizeof(T), copyKind, internal::getNativeHandle(queue)));
614
615 queue.conditionalWait();
616 }
617 };
618
619 template<typename T_Device, typename T_Dest, typename T_Extents>
620 struct Memset::Op<unifiedCudaHip::Queue<T_Device>, T_Dest, T_Extents>
621 {
622 /** @attention Do not use `requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>` here else gcc 11.X
623 * (tested 11.4 and 11.3) will run into an internal compiler segfault during the evaluation of the
624 * constraints */
625 void operator()(
626 unifiedCudaHip::Queue<T_Device>& queue,
627 auto&& dest,
628 uint8_t byteValue,
629 T_Extents const& extents) const requires(std::is_same_v<ALPAKA_TYPEOF(dest), T_Dest>)
630 {
632 using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
633 auto extentMd = pCast<size_t>(extents);
634
635 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
636 ApiInterface,
637 ApiInterface::setDevice(onHost::getNativeHandle(queue.m_device)));
638
639 void* destPtr = toVoidPtr(onHost::data(dest));
640
641 constexpr auto dim = alpaka::trait::getDim_v<T_Extents>;
642 if constexpr(dim == 1u)
643 {
644 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
645 ApiInterface,
646 ApiInterface::memsetAsync(
647 destPtr,
648 static_cast<int>(byteValue),
649 extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
650 internal::getNativeHandle(queue)));
651 }
652 else if constexpr(dim == 2u)
653 {
654 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
655 ApiInterface,
656 ApiInterface::memset2DAsync(
657 destPtr,
658 dest.getPitches().y(),
659 static_cast<int>(byteValue),
660 extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
661 extentMd.y(),
662 internal::getNativeHandle(queue)));
663 }
664 else if constexpr(dim >= 3u)
665 {
666 typename ApiInterface::PitchedPtr_t const pitchedPtrVal = ApiInterface::makePitchedPtr(
667 destPtr,
668 dest.getPitches().y(),
669 dest.getExtents().x(),
670 dest.getExtents().y());
671
672 auto const extentMdNoXY = extentMd.eraseBack().eraseBack();
673 typename ApiInterface::Extent_t const extentVal = ApiInterface::makeExtent(
674 extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
675 extentMd.y(),
676 extentMdNoXY.product());
677
678 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
679 ApiInterface,
680 ApiInterface::memset3DAsync(
681 pitchedPtrVal,
682 static_cast<int>(byteValue),
683 extentVal,
684 internal::getNativeHandle(queue)));
685 }
686
687 queue.conditionalWait();
688 }
689 };
690
691 template<typename T_Device, typename T_Dest, typename T_Value, typename T_Extents>
692 struct Fill::Op<unifiedCudaHip::Queue<T_Device>, T_Dest, T_Value, T_Extents>
693 {
694 void operator()(
695 unifiedCudaHip::Queue<T_Device>& queue,
696 auto&& dest,
697 T_Value elementValue,
698 T_Extents const& extents) const
699 requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
700 && std::same_as<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(dest)>, T_Value>
701 {
703 // avoid that we pass a SharedBuffer and convert non alpaka data views
704 auto dataView = makeView(dest);
705
706 alpaka::internal::generic::fill(
707 queue,
708 defaultExecutor(getDevice(queue)),
709 dataView.getSubView(extents),
710 elementValue);
711 }
712 };
713
714 /** The code is a copy of the Alloc::Op with the difference that the memory is allocated and freed
715 * within a queue
716 */
717 template<typename T_Type, typename T_Device, alpaka::concepts::Vector T_Extents>
718 struct AllocDeferred::Op<T_Type, unifiedCudaHip::Queue<T_Device>, T_Extents>
719 {
720 auto operator()(unifiedCudaHip::Queue<T_Device>& queue, T_Extents const& extents) const
721 {
723 using ApiInterface = typename T_Device::ApiInterface;
724
725 /** Each CUDA/HIP allocation is aligned to at least 128 byte but typically to 256byte
726 *
727 * @todo check if this value can be derived from the device properties
728 * @todo validate if memory is always aligned to 256 byte
729 */
730 constexpr uint32_t alignment = 128u;
731 auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
732
733 T_Type* ptr = nullptr;
734 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
735 ApiInterface,
736 ApiInterface::mallocAsync((void**) &ptr, memSizeInByte, queue.getNativeHandle()));
737
738 queue.conditionalWait();
739
740 auto deviceDependency = onHost::Device{queue.getDevice()->getSharedPtr()};
741 // it is the shared pointer to the internal queue, NOT onHost::Queue
742 auto queueDependency = queue.getSharedPtr();
743
744 auto deleter = [ptr, queueDependency]()
745 {
746 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(
747 ApiInterface,
748 ApiInterface::freeAsync(toVoidPtr(ptr), queueDependency->getNativeHandle()));
749 };
750
751 auto sharedBuffer = onHost::SharedBuffer{
752 deviceDependency,
753 ptr,
754 extents,
755 pitches,
756 std::move(deleter),
757 Alignment<alignment>{}};
758 return sharedBuffer;
759 }
760 };
761 } // namespace internal
762} // namespace alpaka::onHost
763#endif
#define ALPAKA_FN_HOST
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
Definition common.hpp:32
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:153
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
Definition common.hpp:147
#define ALPAKA_LOG_FUNCTION(logLvl)
Log the entry and exit of a scope.
Definition logger.hpp:95
auto emulatedAlignedMemDescription(uint32_t alignmentInByte, T_Extents extents)
provides a memory description to create multidimensional linewise aligned memory within a one dimensi...
Definition util.hpp:100
constexpr auto host
Definition Api.hpp:39
constexpr bool operator!=(alpaka::concepts::Api auto lhs, alpaka::concepts::Api auto rhs)
Definition api.hpp:53
constexpr bool operator==(alpaka::concepts::Api auto lhs, alpaka::concepts::Api auto rhs)
Definition api.hpp:48
constexpr AnyExecutor anyExecutor
Automatic executor selection.
Definition executor.hpp:33
constexpr auto block
Definition tag.hpp:261
constexpr auto thread
Definition tag.hpp:255
constexpr WarpSize warpSize
Definition tag.hpp:44
constexpr DeviceKind deviceKind
Definition tag.hpp:30
constexpr Api api
Definition tag.hpp:24
constexpr Device device
Definition scope.hpp:70
constexpr auto queue
Definition lvl.hpp:127
constexpr auto kernel
Definition lvl.hpp:142
constexpr auto memory
Definition lvl.hpp:112
constexpr auto event
Definition lvl.hpp:97
Functionality which is usable on the host CPU controller thread.
Definition api.hpp:40
auto getNativeHandle(auto const &handle)
Get the native handle of an handle.
constexpr auto defaultExecutor(internal::concepts::DeviceHandle auto deviceHandle)
Select a default executor for the given device.
Definition trait.hpp:148
SharedBuffer(T_Any const &, T_Type *, T_UserExtents const &, T_UserPitches const &, std::invocable<> auto, T_MemAlignment const) -> SharedBuffer< ALPAKA_TYPEOF(getApi(std::declval< T_Any >())), T_Type, typename T_UserPitches::UniVec, T_MemAlignment >
decltype(auto) data(auto &&any)
pointer to data of an object
std::convertible_to< std::string > auto getName(auto &&any)
Runtime name for a given object.
Definition interface.hpp:96
ThreadSpec(T_NumBlocks const &, T_NumThreads const &) -> ThreadSpec< alpaka::trait::getVec_t< T_NumBlocks >, alpaka::trait::getVec_t< T_NumThreads > >
Device(Handle< T_Device > &&) -> Device< ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval< T_Device >())), ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval< T_Device >()))>
void wait(alpaka::concepts::HasGet auto &handle)
wait for all work to be finished
Queue(Handle< T_Queue > &&, T_QueueKind) -> Queue< Device< ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval< T_Queue >())), ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval< T_Queue >()))>, T_QueueKind >
typename GetValueType< T >::type GetValueType_t
Definition trait.hpp:65
constexpr uint32_t getDim_v
Definition trait.hpp:41
auto * toVoidPtr(T inPtr)
Cast a pointer that may or may not point to volatile memory to a (void*) or (void const*).
Definition util.hpp:34
constexpr decltype(auto) getDeviceKind(auto &&any)
Get the device type of an object.
Definition interface.hpp:52
constexpr decltype(auto) getApi(auto &&any)
Get the API an object depends on.
Definition interface.hpp:23
constexpr auto makeView(auto &&anyWithApi, T_ValueType *pointer, concepts::Vector auto const &extents, T_MemAlignment const memAlignment=T_MemAlignment{})
Definition View.hpp:37
ALPAKA_FN_HOST_ACC Dict(Tuple< DictEntry< T_Keys, T_Values >... > const &) -> Dict< DictEntry< T_Keys, T_Values >... >
constexpr decltype(auto) pCast(auto &&input)
Performs a static_cast on the storage type of combined data type.
Definition cast.hpp:48