alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
Queue.hpp
Go to the documentation of this file.
1/* Copyright 2025 Simeon Ehrig, René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
15
16#if ALPAKA_LANG_ONEAPI
17
18# ifndef ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS
19# define ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS 32u
20# endif
21
22# ifndef SYCL_EXT_ONEAPI_MEMCPY2D
23# error \
24 "SYCL_EXT_ONEAPI_MEMCPY2D is not defined. Extension https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_oneapi_memcpy2d.asciidoc is required!"
25# endif
26
28{
29 template<typename T_Device, typename T_Dest, typename T_Extents>
31 struct Memset::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Extents>
32 {
33 void operator()(syclGeneric::Queue<T_Device>& queue, auto&& dest, uint8_t byteValue, T_Extents const& extents)
34 const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
35 {
36 sycl::queue sycl_queue = queue.getNativeHandle();
37
38 auto extentMd = pCast<size_t>(extents);
39 auto const destPitchBytesWithoutColumn = dest.getPitches().eraseBack();
40 auto* destPtr = data(dest);
41
42 constexpr auto dim = alpaka::trait::getDim_v<T_Extents>;
43
44 sycl::event ev;
45
46 if constexpr(dim == 2u)
47 {
48 ev = sycl_queue.ext_oneapi_memset2d(
49 destPtr,
50 destPitchBytesWithoutColumn.back(),
51 byteValue,
52 extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
53 extentMd.y());
54 }
55 else if constexpr(dim >= 3u)
56 {
57 auto const dstExtentWithoutColumn = extentMd.eraseBack();
58 ev = sycl_queue.ext_oneapi_memset2d(
59 destPtr,
60 destPitchBytesWithoutColumn.back(),
61 byteValue,
62 extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
63 dstExtentWithoutColumn.product());
64 }
65
66 queue.setLastEvent(ev);
67 if(queue.isBlocking())
68 ev.wait_and_throw();
69 }
70 };
71
72 template<typename T_Device, typename T_Dest, typename T_Source, typename T_Extents>
74 struct internal::Memcpy::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Source, T_Extents>
75 {
76 void operator()(
77 syclGeneric::Queue<T_Device>& queue,
78 auto&& dest,
79 T_Source const& source,
80 T_Extents const& extents) const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
81 {
82 sycl::queue sycl_queue = queue.getNativeHandle();
83
84 auto extentMd = pCast<size_t>(extents);
85 auto const destPitchBytesWithoutColumn = dest.getPitches().eraseBack();
86 auto* destPtr = data(dest);
87 auto const sourcePitchBytesWithoutColumn = source.getPitches().eraseBack();
88 auto* sourcePtr = data(source);
89
90 constexpr auto dim = alpaka::trait::getDim_v<T_Extents>;
91
92 sycl::event ev;
93
94 if constexpr(dim == 2u)
95 {
96 ev = sycl_queue.ext_oneapi_memcpy2d(
97 destPtr,
98 destPitchBytesWithoutColumn.back(),
99 sourcePtr,
100 sourcePitchBytesWithoutColumn.back(),
101 extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
102 extentMd.y());
103 }
104 else if constexpr(dim >= 3u)
105 {
106 auto const dstExtentWithoutColumn = extentMd.eraseBack();
107 ev = sycl_queue.ext_oneapi_memcpy2d(
108 destPtr,
109 destPitchBytesWithoutColumn.back(),
110 sourcePtr,
111 sourcePitchBytesWithoutColumn.back(),
112 extentMd.x() * sizeof(alpaka::trait::GetValueType_t<T_Dest>),
113 dstExtentWithoutColumn.product());
114 }
115
116 queue.setLastEvent(ev);
117 if(queue.isBlocking())
118 ev.wait_and_throw();
119 }
120 };
121
122 // copy to device global memory
123 template<typename T_Device, typename T_Source, typename T_Storage, typename T>
124 struct internal::MemcpyDeviceGlobal::
125 Op<syclGeneric::Queue<T_Device>, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>, T_Source>
126 {
127 void operator()(
128 syclGeneric::Queue<T_Device>& queue,
129 onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> dest,
130 auto&& source) const
131 {
133 sycl::queue sycl_queue = queue.getNativeHandle();
134 void const* srcPtr{nullptr};
135 if constexpr(std::is_pointer_v<ALPAKA_TYPEOF(source)>)
136 srcPtr = source;
137 else
138 srcPtr = toVoidPtr(alpaka::onHost::data(source));
139 sycl::event ev = sycl_queue.memcpy(dest.getHandle(alpaka::api::oneApi), srcPtr);
140 queue.setLastEvent(ev);
141 if(queue.isBlocking())
142 ev.wait_and_throw();
143 }
144 };
145
146 // copy from device global memory
147 template<typename T_Device, typename T_Dest, typename T_Storage, typename T>
148 struct internal::MemcpyDeviceGlobal::
149 Op<syclGeneric::Queue<T_Device>, T_Dest, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>>
150 {
151 void operator()(
152 syclGeneric::Queue<T_Device>& queue,
153 auto&& dest,
154 onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> source) const
155 {
157 sycl::queue sycl_queue = queue.getNativeHandle();
158 void* destPtr{nullptr};
159 if constexpr(std::is_pointer_v<ALPAKA_TYPEOF(dest)>)
160 destPtr = dest;
161 else
162 destPtr = toVoidPtr(alpaka::onHost::data(dest));
163 sycl::event ev = sycl_queue.memcpy(destPtr, source.getHandle(alpaka::api::oneApi));
164 queue.setLastEvent(ev);
165 if(queue.isBlocking())
166 ev.wait_and_throw();
167 }
168 };
169
170 template<typename T_Device, typename T_Dest, typename T_Value, typename T_Extents>
172 struct internal::Fill::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Value, T_Extents>
173 {
174 void operator()(
175 syclGeneric::Queue<T_Device>& queue,
176 auto&& dest,
177 T_Value elementValue,
178 T_Extents const& extents) const
179 requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
180 && std::same_as<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(dest)>, T_Value>
181 {
182 // avoid that we pass a SharedBuffer and convert non alpaka data views
183 auto dataView = makeView(dest);
184
186 queue,
188 dataView.getSubView(extents),
189 elementValue);
190 }
191 };
192
193 namespace detail
194 {
195 template<alpaka::concepts::Vector TVec>
196 inline constexpr auto vecToSyclRange(TVec vec)
197 {
198 constexpr auto dim = std::decay_t<TVec>::dim();
199 return [&vec]<auto... I>(std::index_sequence<I...>)
200 // TODO: check if this is the correct order
201 { return sycl::range<dim>(vec[I]...); }(std::make_index_sequence<dim>{});
202 };
203
204 template<alpaka::concepts::Vector T_NumBlocks, alpaka::concepts::Vector T_NumThreads>
205 struct OptimizedThreadSpec
206 {
207 using NumBlocksVecType = typename T_NumBlocks::UniVec;
208 using NumThreadsVecType = T_NumThreads;
209
210 static consteval uint32_t dim()
211 {
212 return T_NumThreads::dim();
213 }
214
215 constexpr OptimizedThreadSpec(T_NumBlocks const&, T_NumThreads const&)
216 {
217 }
218 };
219
220 /** provides the sycl worker description
221 *
222 * @return A pair of the sycl nd range and an optimized thread spec. The thread spec is not holding any data
223 * for dimension smaller equal to 3u
224 */
225 template<onHost::concepts::ThreadSpec T_ThreadSpec>
226 inline constexpr auto getWorkerDescription(T_ThreadSpec const& threadSpec)
227 {
228 constexpr uint32_t dim = T_ThreadSpec::dim();
229 // dimension of the sycl nd range
230 constexpr uint32_t syclDim = dim >= 4u ? 1u : dim;
231
232 sycl::nd_range<syclDim> gridRange;
233
234 if constexpr(T_ThreadSpec::dim() >= 4u)
235 {
236 gridRange = sycl::nd_range<syclDim>{
237 (threadSpec.getNumBlocks() * threadSpec.getNumThreads()).product(),
238 threadSpec.getNumThreads().product()};
239 }
240 else
241 {
242 gridRange = sycl::nd_range<T_ThreadSpec::dim()>{
243 detail::vecToSyclRange(threadSpec.getNumBlocks() * threadSpec.getNumThreads()),
244 detail::vecToSyclRange(threadSpec.getNumThreads())};
245 }
246
247 using ThreadSpecType = std::conditional_t<
248 dim >= 4u,
249 ALPAKA_TYPEOF(threadSpec),
250 detail::OptimizedThreadSpec<
251 typename ALPAKA_TYPEOF(threadSpec)::NumBlocksVecType,
252 typename ALPAKA_TYPEOF(threadSpec)::NumThreadsVecType>>;
253 // thread spec which is only holding data if the dimension is larger than 3u
254 auto optimizedThreadSpec = ThreadSpecType(threadSpec.getNumBlocks(), threadSpec.getNumThreads());
255 return std::make_pair(gridRange, optimizedThreadSpec);
256 }
257
258 /** Generate the kernel with the given warp size.
259 *
260 * @tparam T_dim number of dimension of the kernel
261 * @tparam T_warpSize requested warp size
262 * @tparam T_isValid 0u means it is not valid, else it is valid and a kernel is generated
263 */
264 template<uint32_t T_dim, uint32_t T_warpSize, uint32_t T_isValid>
265 struct EnqueueKernelWithWarpSize
266 {
267 static void call(
268 sycl::handler& cgh,
269 auto gridRange,
270 auto const& kernelBundle,
271 auto const& st_shared_accessor,
272 auto const& dyn_shared_accessor,
273 auto const& optimizedThreadSpec,
274 auto... args)
275 {
276 cgh.parallel_for(
277 gridRange,
278 [kernelBundle, st_shared_accessor, dyn_shared_accessor, optimizedThreadSpec, args...](
279 sycl::nd_item<T_dim> work_item) [[sycl::reqd_sub_group_size(T_warpSize)]]
280 {
281 onAcc::oneApi::StaticSharedMemory ssm(st_shared_accessor);
282 onAcc::syclGeneric::DynamicSharedMemory dsm(dyn_shared_accessor);
283
284 static_assert(T_dim > 0);
285 static_assert(T_dim <= 3, "more the 3 dimensions are not supported");
286 auto acc = onAcc::Acc{Dict{
287 DictEntry(layer::block, onAcc::syclGeneric::BlockLayer{work_item, optimizedThreadSpec}),
288 DictEntry(layer::thread, onAcc::syclGeneric::ThreadLayer{work_item, optimizedThreadSpec}),
289 DictEntry(action::threadBlockSync, onAcc::syclGeneric::Sync{work_item}),
290 DictEntry(layer::shared, std::ref(ssm)),
291 DictEntry(layer::dynShared, std::ref(dsm)),
292 DictEntry(object::dynSharedMemBytes, dsm.byte_size()),
293 args...}};
294
295 kernelBundle(acc);
296 });
297 }
298 };
299
300 template<uint32_t T_dim, uint32_t T_warpSize>
301 struct EnqueueKernelWithWarpSize<T_dim, T_warpSize, 0u>
302 {
303 static void call(
304 [[maybe_unused]] sycl::handler& cgh,
305 [[maybe_unused]] auto gridRange,
306 [[maybe_unused]] auto const& kernelBundle,
307 [[maybe_unused]] auto const& st_shared_accessor,
308 [[maybe_unused]] auto const& dyn_shared_accessor,
309 [[maybe_unused]] auto const& optimizedThreadSpec,
310 [[maybe_unused]] auto... args)
311 {
312 printf(
313 "Dynamic evaluated warp size on host does not match the compile time warp size ( macro "
314 "ALPAKA_SYCL_SUBGROUP_SIZE) evaluated in the "
315 "kernel. Update the definition of ALPAKA_SYCL_SUBGROUP_SIZE section and check the trait "
316 "Warpsize::Dispatch<>.");
317 abort();
318 }
319 };
320 } // namespace detail
321
322 template<
323 typename T_Device,
325 alpaka::concepts::Vector T_NumBlocks,
326 alpaka::concepts::Vector T_NumThreads,
327 alpaka::concepts::KernelBundle T_KernelBundle>
328 struct Enqueue::
329 Kernel<syclGeneric::Queue<T_Device>, ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor>, T_KernelBundle>
330 {
331 void operator()(
332 syclGeneric::Queue<T_Device>& queue,
333 ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor> const& threadSpec,
334 T_KernelBundle const& kernelBundle) const
335 {
336 static_assert(
337 ALPAKA_TYPEOF(threadSpec)::getExecutor() != exec::anyExecutor,
338 "'exec::anyExecutor' can not be used to enqueue an kernel.");
339 ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::queue);
340
341 constexpr auto st_shared_mem_bytes = onAcc::oneApi::StaticSharedMemory::sizeLookupBufferInBytes(
342 ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS);
343 // allocate dynamic shared memory -- needs at least 1 byte to make the Xilinx Runtime happy
344 u_int32_t blockDynSharedMemBytes
345 = std::max(u_int32_t(1), onHost::getDynSharedMemBytes(threadSpec, kernelBundle));
346 assert(
347 st_shared_mem_bytes + blockDynSharedMemBytes
348 <= queue.m_device->getNativeHandle().first.template get_info<sycl::info::device::local_mem_size>());
349
350 sycl::event ev = queue.dispatchWarpSize(
351 [&](auto warpSize) requires std::same_as<
352 std::integral_constant<
353 typename ALPAKA_TYPEOF(warpSize)::value_type,
354 ALPAKA_TYPEOF(warpSize)::value>,
355 ALPAKA_TYPEOF(warpSize)>
356 {
357 return queue.m_queue.submit(
358 [warpSize, threadSpec, kernelBundle, blockDynSharedMemBytes](sycl::handler& cgh)
359 {
360 using ApiType = decltype(getApi(queue));
361 using DeviceKindType = ALPAKA_TYPEOF(getDeviceKind(queue));
362
363 auto st_shared_accessor
364 = sycl::local_accessor<std::byte>{sycl::range<1>{st_shared_mem_bytes}, cgh};
365
366 auto dyn_shared_accessor
367 = sycl::local_accessor<std::byte>{sycl::range<1>{blockDynSharedMemBytes}, cgh};
368
369 auto workerDesc = detail::getWorkerDescription(threadSpec);
370 auto optimizedThreadSpec = workerDesc.second;
371 constexpr uint32_t syclDim = workerDesc.first.dimensions;
372
373 constexpr uint32_t w = ALPAKA_TYPEOF(warpSize)::value;
374 detail::EnqueueKernelWithWarpSize<syclDim, w, ALPAKA_SYCL_SUBGROUP_SIZE & w>::call(
375 cgh,
376 workerDesc.first,
377 kernelBundle,
378 st_shared_accessor,
379 dyn_shared_accessor,
380 optimizedThreadSpec,
381 DictEntry(object::api, ApiType{}),
382 DictEntry(object::deviceKind, DeviceKindType{}),
383 DictEntry(object::exec, T_Executor{}),
384 DictEntry(object::launchedWidthFrameSpec, std::bool_constant<false>{}),
385 DictEntry(object::warpSize, warpSize));
386 });
387 });
388
389 queue.setLastEvent(ev);
390 if(queue.isBlocking())
391 ev.wait_and_throw();
392 }
393 };
394
395 template<
396 typename T_Device,
398 alpaka::concepts::Vector T_NumFrames,
399 alpaka::concepts::Vector T_FrameExtents,
400 alpaka::concepts::KernelBundle T_KernelBundle>
401 struct Enqueue::
402 Kernel<syclGeneric::Queue<T_Device>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
403 {
404 void operator()(
405 syclGeneric::Queue<T_Device>& queue,
406 FrameSpec<T_NumFrames, T_FrameExtents, T_Executor> const& frameSpec,
407 T_KernelBundle const& kernelBundle) const
408 {
409 static_assert(
410 ALPAKA_TYPEOF(frameSpec)::getExecutor() != exec::anyExecutor,
411 "'exec::anyExecutor' can not be used to enqueue an kernel.");
412 ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::queue);
413
414 auto const threadBlocking = internal::adjustThreadSpec(*queue.m_device.get(), frameSpec, kernelBundle);
415
416 constexpr auto st_shared_mem_bytes = onAcc::oneApi::StaticSharedMemory::sizeLookupBufferInBytes(
417 ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS);
418
419 // allocate dynamic shared memory -- needs at least 1 byte to make the Xilinx Runtime happy
420 u_int32_t blockDynSharedMemBytes
421 = std::max(u_int32_t(1), onHost::getDynSharedMemBytes(threadBlocking, kernelBundle));
422
423 assert(
424 st_shared_mem_bytes + blockDynSharedMemBytes
425 <= queue.m_device->getNativeHandle().first.template get_info<sycl::info::device::local_mem_size>());
426
427 sycl::event ev = queue.dispatchWarpSize(
428 [&](auto warpSize) requires std::same_as<
429 std::integral_constant<
430 typename ALPAKA_TYPEOF(warpSize)::value_type,
431 ALPAKA_TYPEOF(warpSize)::value>,
432 ALPAKA_TYPEOF(warpSize)>
433 {
434 return queue.m_queue.submit(
435 [warpSize, threadBlocking, kernelBundle, blockDynSharedMemBytes](sycl::handler& cgh)
436 {
437 using ApiType = decltype(getApi(queue));
438 using DeviceKindType = ALPAKA_TYPEOF(getDeviceKind(queue));
439 auto st_shared_accessor
440 = sycl::local_accessor<std::byte>{sycl::range<1>{st_shared_mem_bytes}, cgh};
441 auto dyn_shared_accessor
442 = sycl::local_accessor<std::byte>{sycl::range<1>{blockDynSharedMemBytes}, cgh};
443
444 auto workerDesc = detail::getWorkerDescription(threadBlocking);
445 auto optimizedThreadSpec = workerDesc.second;
446 constexpr uint32_t syclDim = workerDesc.first.dimensions;
447
448 constexpr uint32_t w = ALPAKA_TYPEOF(warpSize)::value;
449
450 detail::EnqueueKernelWithWarpSize<syclDim, w, ALPAKA_SYCL_SUBGROUP_SIZE & w>::call(
451 cgh,
452 workerDesc.first,
453 kernelBundle,
454 st_shared_accessor,
455 dyn_shared_accessor,
456 optimizedThreadSpec,
457 DictEntry(object::api, ApiType{}),
458 DictEntry(object::deviceKind, DeviceKindType{}),
459 DictEntry(object::exec, T_Executor{}),
460 DictEntry(object::launchedWidthFrameSpec, std::bool_constant<true>{}),
461 DictEntry(object::warpSize, warpSize));
462 });
463 });
464 if(queue.isBlocking())
465 ev.wait_and_throw();
466 }
467 };
468
469} // namespace alpaka::onHost::internal
470
471#endif
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:153
Concept to check for an executor.
Definition trait.hpp:133
Concept to check if a type is a KernelBundle.
Concept to check if a type is a vector.
Definition Vec.hpp:53
#define ALPAKA_LOG_FUNCTION(logLvl)
Log the entry and exit of a scope.
Definition logger.hpp:95
constexpr auto oneApi
Definition Api.hpp:29
constexpr AnyExecutor anyExecutor
Automatic executor selection.
Definition executor.hpp:33
void fill(auto &internalQueue, auto executor, alpaka::concepts::IMdSpan< T_Value > auto &&dest, T_Value elementValue)
Definition generic.hpp:63
constexpr auto getDevice(auto &&any)
Definition interface.hpp:77
constexpr auto queue
Definition lvl.hpp:127
constexpr auto memory
Definition lvl.hpp:112
constexpr auto defaultExecutor(internal::concepts::DeviceHandle auto deviceHandle)
Select a default executor for the given device.
Definition trait.hpp:148
decltype(auto) data(auto &&any)
pointer to data of an object
typename GetValueType< T >::type GetValueType_t
Definition trait.hpp:65
constexpr uint32_t getDim_v
Definition trait.hpp:41
auto * toVoidPtr(T inPtr)
Cast a pointer that may or may not point to volatile memory to a (void*) or (void const*).
Definition util.hpp:34
constexpr decltype(auto) getDeviceKind(auto &&any)
Get the device type of an object.
Definition interface.hpp:52
constexpr decltype(auto) getApi(auto &&any)
Get the API an object depends on.
Definition interface.hpp:23
constexpr auto makeView(auto &&anyWithApi, T_ValueType *pointer, concepts::Vector auto const &extents, T_MemAlignment const memAlignment=T_MemAlignment{})
Definition View.hpp:37
constexpr decltype(auto) pCast(auto &&input)
Performs a static_cast on the storage type of combined data type.
Definition cast.hpp:48