35 template<
typename T_Device>
36 struct Queue : std::enable_shared_from_this<Queue<T_Device>>
68 return !(*
this == other);
103 template<
typename T_Fn>
109 std::lock_guard<std::mutex> lk(
m_mutex);
114#if defined(__GNUC__) && !defined(__clang__)
115# pragma GCC diagnostic push
116# pragma GCC diagnostic ignored "-Wtsan"
119 std::promise<void> p;
120 auto f = p.get_future();
122#if defined(__GNUC__) && !defined(__clang__)
123# pragma GCC diagnostic pop
138 return std::string(
"host::Queue id=") + std::to_string(
m_idx);
150 template<alpaka::onHost::concepts::ThreadSpec T_ThreadSpec>
151 void enqueue(T_ThreadSpec
const& threadSpec,
auto const& kernelBundle)
155 "'exec::anyExecutor' can not be used to enqueue an kernel.");
166 auto moreLayer =
Dict{
167 DictEntry(object::launchedWidthFrameSpec, std::false_type{}),
170 DictEntry(object::exec, threadSpec.getExecutor())};
172 acc(kernelBundle, moreLayer);
176 template<alpaka::onHost::concepts::FrameSpec T_FrameSpec>
177 void enqueue(T_FrameSpec
const& frameSpec,
auto const& kernelBundle)
181 "'exec::anyExecutor' can not be used to enqueue an kernel.");
193 auto moreLayer =
Dict{
194 DictEntry(object::launchedWidthFrameSpec, std::true_type{}),
197 DictEntry(object::exec, adjustedThreadSpec.getExecutor())};
199 acc(kernelBundle, moreLayer);
211 submit([task]() { task(); });
234 return this->shared_from_this();
275 template<
typename T_Device>
284 if(queue.isQueueEmpty() ==
false)
286 queue.submit([]() {}).
wait();
291 template<
typename T_Device,
typename T_Event>
300 std::lock_guard<std::mutex> lk(event.m_mutex);
302 ++
event.m_enqueueCount;
304 auto const enqueueCount =
event.m_enqueueCount;
309 if(queue.m_isBlocking)
312 if(enqueueCount == event.m_enqueueCount)
314 event.m_LastReadyEnqueueCount = std::max(enqueueCount, event.m_LastReadyEnqueueCount);
317 std::promise<void> p;
319 event.m_future = p.get_future();
323 auto sharedEvent =
event.getSharedPtr();
325 event.m_future = queue.submit(
326 [sharedEvent, enqueueCount]()
mutable
328 std::unique_lock<std::mutex> lk2(sharedEvent->m_mutex);
331 if(enqueueCount == sharedEvent->m_enqueueCount)
333 sharedEvent->m_LastReadyEnqueueCount
334 = std::max(enqueueCount, sharedEvent->m_LastReadyEnqueueCount);
342 template<
typename T_Device,
typename T_Event>
351 std::unique_lock<std::mutex> lk(event.m_mutex);
358 if(queue.m_isBlocking)
360 std::shared_future sFuture =
event.m_future;
366 auto sharedEvent =
event.getSharedPtr();
367 auto oldFuture =
event.m_future;
372 queue.submit([sharedEvent, oldFuture]() { oldFuture.get(); });
379 template<
typename T_Device,
typename T_Dest,
typename T_Source,
typename T_Extents>
394 if constexpr(dim == 1u)
397 [extents, destPtr, srcPtr]()
405 auto destPitchBytesWithoutColumn = dest.getPitches().eraseBack();
406 auto sourcePitchBytesWithoutColumn = source.getPitches().eraseBack();
409 [extents, destPtr, srcPtr, destPitchBytesWithoutColumn, sourcePitchBytesWithoutColumn]()
411 auto const dstExtentWithoutColumn = extents.eraseBack();
412 if(
static_cast<std::size_t
>(extents.product()) != 0u)
415 dstExtentWithoutColumn,
419 reinterpret_cast<std::uint8_t*
>(destPtr)
420 + (idx * destPitchBytesWithoutColumn).sum(),
421 reinterpret_cast<std::uint8_t const*
>(srcPtr)
422 + (idx * sourcePitchBytesWithoutColumn).sum(),
423 static_cast<size_t>(extents.back())
433 template<
typename T_Device,
typename T_Source,
typename T_Storage,
typename T>
434 struct internal::MemcpyDeviceGlobal::
435 Op<cpu::Queue<T_Device>, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>, T_Source>
444 void const* srcPtr{
nullptr};
449 queue.submit([destPtr, srcPtr]() { std::memcpy(destPtr, srcPtr,
sizeof(T)); });
454 template<
typename T_Device,
typename T_Dest,
typename T_Storage,
typename T>
455 struct internal::MemcpyDeviceGlobal::
456 Op<cpu::Queue<T_Device>, T_Dest, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>>
464 void* destPtr{
nullptr};
470 queue.submit([destPtr, srcPtr]() { std::memcpy(destPtr, srcPtr,
sizeof(T)); });
474 template<
typename T_Device,
typename T_Dest,
typename T_Extents>
488 if constexpr(dim == 1u)
491 [extents, destPtr, byteValue]()
502 auto destPitchBytesWithoutColumn = dest.getPitches().eraseBack();
504 [extents, destPtr, destPitchBytesWithoutColumn, byteValue]()
506 auto const dstExtentWithoutColumn = extents.eraseBack();
507 if(
static_cast<std::size_t
>(extents.product()) != 0u)
510 dstExtentWithoutColumn,
514 reinterpret_cast<std::uint8_t*
>(destPtr)
515 + (idx * destPitchBytesWithoutColumn).sum(),
517 static_cast<size_t>(extents.back())
526 template<
typename T_Device,
typename T_Dest,
typename T_Value,
typename T_Extents>
540 dataView.getSubView(extents),
548 template<
typename T_Type,
typename T_Device, alpaka::concepts::Vector T_Extents>
553 uint32_t result = 1u;
554 while((result << 1u) <= value)
564 auto device = queue.getDevice();
570 auto deviceDependency =
onHost::Device{queue.getDevice()->getSharedPtr()};
571 auto queueDependency = queue.getSharedPtr();
574 device->pinPointer(ptr, memSizeInByte);
577 auto deleter = [ptr, queueDep = std::move(queueDependency)]()
592 std::stringstream ss;
604 template<
typename T_Device>
A thread queue executing tasks asynchronously.
#define ALPAKA_TYPEOF(...)
Get the type of instance.
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
Interface concept for objects describing api-related multidimensional memory access.
#define ALPAKA_LOG_INFO(logLvl, callable)
Write a meta data message to the output.
#define ALPAKA_LOG_FUNCTION(logLvl)
Log the entry and exit of a scope.
auto emulatedAlignedMemDescription(uint32_t alignmentInByte, T_Extents extents)
provides a memory description to create multidimensional linewise aligned memory within a one dimensi...
constexpr auto simdOptimizedAlignment(auto api, alpaka::concepts::DeviceKind auto deviceKind)
Calculate the best alignment for SIMD optimized memory allocation.
ALPAKA_FN_INLINE ALPAKA_FN_HOST void alignedFree(size_t alignment, auto ptr)
ALPAKA_FN_INLINE ALPAKA_FN_HOST auto alignedAlloc(size_t alignment, size_t size) -> void *
constexpr AnyExecutor anyExecutor
Automatic executor selection.
alpaka's function interface
void fill(auto &internalQueue, auto executor, alpaka::concepts::IMdSpan< T_Value > auto &&dest, T_Value elementValue)
alpaka internal implementations.
constexpr auto getDeviceKind(auto &&any)
constexpr DeviceKind deviceKind
static auto adjustThreadSpec(auto const &device, onHost::concepts::FrameSpec auto const &frameSpec, KernelBundle< TKernelFn, TArgs... > const &kernelBundle)
constexpr auto getDevice(auto &&any)
Functionality which is usable on the host CPU controller thread.
constexpr auto defaultExecutor(internal::concepts::DeviceHandle auto deviceHandle)
Select a default executor for the given device.
std::shared_ptr< T > Handle
decltype(auto) data(auto &&any)
Pointer to the data of an object.
auto makeAcc(alpaka::onHost::concepts::ThreadSpec auto const &threadSpec, uint32_t numaIdx, bool setThreadAffinity)
typename GetValueType< T >::type GetValueType_t
constexpr uint32_t getDim_v
auto * toVoidPtr(T inPtr)
Cast a pointer that may or may not point to volatile memory to a (void*) or (void const*).
constexpr decltype(auto) getDeviceKind(auto &&any)
Get the device type of an object.
constexpr decltype(auto) getApi(auto &&any)
Get the API an object depends on.
constexpr auto makeView(auto &&anyWithApi, T_ValueType *pointer, concepts::Vector auto const &extents, T_MemAlignment const memAlignment=T_MemAlignment{})
Strongly typed and constexpr representation of a byte-alignment of memory.
constexpr auto operator()(auto &&queue) const
Helper class to provide access to device global memory variables.
constexpr decltype(auto) getHandle(T_Api api) const
Get the handle to call native API specific memcopy for global device memory operation.
Description of a specific device that one can schedule kernels on.
Life time managed buffer with contiguous data.
std::atomic< bool > m_isBlockingTaskExecuted
Flag to show if a blocking task is executed.
std::string getName() const
void enqueue(T_ThreadSpec const &threadSpec, auto const &kernelBundle)
std::shared_ptr< Queue > getSharedPtr()
bool operator!=(Queue const &other) const
Handle< T_Device > m_device
Queue & operator=(Queue const &)=delete
auto getDeviceKind() const
auto submit(T_Fn &&fn)
Submit a task to the queue.
bool operator==(Queue const &other) const
Queue & operator=(Queue &&)=delete
void enqueue(T_FrameSpec const &frameSpec, auto const &kernelBundle)
std::mutex m_mutex
Mutex to ensure sequential execution of tasks and operation if the queue is blocking.
core::CallbackThread m_workerThread
Queue(internal::concepts::DeviceHandle auto device, uint32_t const idx, uint32_t numIdx, bool isBlocking)
bool isQueueEmpty() const
Checks if the queue is empty.
void enqueueHostFnDeferred(auto const &task)
void enqueueHostFn(auto const &task)
Execute a task in the queue.
Queue(Queue const &)=delete
auto getNativeHandle() const noexcept
auto operator()(cpu::Queue< T_Device > &queue, T_Extents const &extents) const
static consteval uint32_t highestPowerOfTwo(uint32_t value)
void operator()(cpu::Queue< T_Device > &queue, T_Event &event) const
void operator()(cpu::Queue< T_Device > &queue, auto &&dest, T_Value elementValue, T_Extents const &extents) const
void operator()(cpu::Queue< T_Device > &queue, auto &&dest, T_Source const &source, T_Extents const &extents) const
void operator()(cpu::Queue< T_Device > &queue, auto &&dest, uint8_t byteValue, T_Extents const &extents) const
void operator()(cpu::Queue< T_Device > &queue, cpu::Event< T_Device > &event) const
void operator()(cpu::Queue< T_Device > &queue) const
void operator()(cpu::Queue< T_Device > &queue, auto &&dest, onAcc::internal::GlobalDeviceMemoryWrapper< T_Storage, T > source) const
void operator()(cpu::Queue< T_Device > &queue, onAcc::internal::GlobalDeviceMemoryWrapper< T_Storage, T > dest, auto &&source) const