18# ifndef ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS
19# define ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS 32u
22# ifndef SYCL_EXT_ONEAPI_MEMCPY2D
24 "SYCL_EXT_ONEAPI_MEMCPY2D is not defined. Extension https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_oneapi_memcpy2d.asciidoc is required!"
29 template<
typename T_Device,
typename T_Dest,
typename T_Extents>
31 struct Memset::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Extents>
33 void operator()(syclGeneric::Queue<T_Device>& queue,
auto&& dest, uint8_t byteValue, T_Extents
const& extents)
36 sycl::queue sycl_queue =
queue.getNativeHandle();
39 auto const destPitchBytesWithoutColumn = dest.getPitches().eraseBack();
40 auto* destPtr =
data(dest);
46 if constexpr(dim == 2u)
48 ev = sycl_queue.ext_oneapi_memset2d(
50 destPitchBytesWithoutColumn.back(),
55 else if constexpr(dim >= 3u)
57 auto const dstExtentWithoutColumn = extentMd.eraseBack();
58 ev = sycl_queue.ext_oneapi_memset2d(
60 destPitchBytesWithoutColumn.back(),
63 dstExtentWithoutColumn.product());
66 queue.setLastEvent(ev);
67 if(
queue.isBlocking())
72 template<
typename T_Device,
typename T_Dest,
typename T_Source,
typename T_Extents>
74 struct internal::Memcpy::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Source, T_Extents>
77 syclGeneric::Queue<T_Device>& queue,
79 T_Source
const& source,
80 T_Extents
const& extents)
const requires std::same_as<
ALPAKA_TYPEOF(dest), T_Dest>
82 sycl::queue sycl_queue =
queue.getNativeHandle();
85 auto const destPitchBytesWithoutColumn = dest.getPitches().eraseBack();
86 auto* destPtr =
data(dest);
87 auto const sourcePitchBytesWithoutColumn = source.getPitches().eraseBack();
88 auto* sourcePtr =
data(source);
94 if constexpr(dim == 2u)
96 ev = sycl_queue.ext_oneapi_memcpy2d(
98 destPitchBytesWithoutColumn.back(),
100 sourcePitchBytesWithoutColumn.back(),
104 else if constexpr(dim >= 3u)
106 auto const dstExtentWithoutColumn = extentMd.eraseBack();
107 ev = sycl_queue.ext_oneapi_memcpy2d(
109 destPitchBytesWithoutColumn.back(),
111 sourcePitchBytesWithoutColumn.back(),
113 dstExtentWithoutColumn.product());
116 queue.setLastEvent(ev);
117 if(
queue.isBlocking())
123 template<
typename T_Device,
typename T_Source,
typename T_Storage,
typename T>
124 struct internal::MemcpyDeviceGlobal::
125 Op<syclGeneric::Queue<T_Device>, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>, T_Source>
128 syclGeneric::Queue<T_Device>& queue,
129 onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> dest,
133 sycl::queue sycl_queue =
queue.getNativeHandle();
134 void const* srcPtr{
nullptr};
140 queue.setLastEvent(ev);
141 if(
queue.isBlocking())
147 template<
typename T_Device,
typename T_Dest,
typename T_Storage,
typename T>
148 struct internal::MemcpyDeviceGlobal::
149 Op<syclGeneric::Queue<T_Device>, T_Dest, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>>
152 syclGeneric::Queue<T_Device>& queue,
154 onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> source)
const
157 sycl::queue sycl_queue =
queue.getNativeHandle();
158 void* destPtr{
nullptr};
164 queue.setLastEvent(ev);
165 if(
queue.isBlocking())
170 template<
typename T_Device,
typename T_Dest,
typename T_Value,
typename T_Extents>
172 struct internal::Fill::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Value, T_Extents>
175 syclGeneric::Queue<T_Device>& queue,
177 T_Value elementValue,
178 T_Extents
const& extents)
const
188 dataView.getSubView(extents),
195 template<alpaka::concepts::Vector TVec>
196 inline constexpr auto vecToSyclRange(TVec vec)
198 constexpr auto dim = std::decay_t<TVec>::dim();
199 return [&vec]<
auto... I>(std::index_sequence<I...>)
201 {
return sycl::range<dim>(vec[I]...); }(std::make_index_sequence<dim>{});
204 template<alpaka::concepts::Vector T_NumBlocks, alpaka::concepts::Vector T_NumThreads>
205 struct OptimizedThreadSpec
207 using NumBlocksVecType =
typename T_NumBlocks::UniVec;
208 using NumThreadsVecType = T_NumThreads;
210 static consteval uint32_t dim()
212 return T_NumThreads::dim();
215 constexpr OptimizedThreadSpec(T_NumBlocks
const&, T_NumThreads
const&)
225 template<onHost::concepts::ThreadSpec T_ThreadSpec>
226 inline constexpr auto getWorkerDescription(T_ThreadSpec
const& threadSpec)
228 constexpr uint32_t dim = T_ThreadSpec::dim();
230 constexpr uint32_t syclDim = dim >= 4u ? 1u : dim;
232 sycl::nd_range<syclDim> gridRange;
234 if constexpr(T_ThreadSpec::dim() >= 4u)
236 gridRange = sycl::nd_range<syclDim>{
237 (threadSpec.getNumBlocks() * threadSpec.getNumThreads()).product(),
238 threadSpec.getNumThreads().product()};
242 gridRange = sycl::nd_range<T_ThreadSpec::dim()>{
243 detail::vecToSyclRange(threadSpec.getNumBlocks() * threadSpec.getNumThreads()),
244 detail::vecToSyclRange(threadSpec.getNumThreads())};
247 using ThreadSpecType = std::conditional_t<
250 detail::OptimizedThreadSpec<
254 auto optimizedThreadSpec = ThreadSpecType(threadSpec.getNumBlocks(), threadSpec.getNumThreads());
255 return std::make_pair(gridRange, optimizedThreadSpec);
264 template<u
int32_t T_dim, u
int32_t T_warpSize, u
int32_t T_isVal
id>
265 struct EnqueueKernelWithWarpSize
270 auto const& kernelBundle,
271 auto const& st_shared_accessor,
272 auto const& dyn_shared_accessor,
273 auto const& optimizedThreadSpec,
278 [kernelBundle, st_shared_accessor, dyn_shared_accessor, optimizedThreadSpec, args...](
279 sycl::nd_item<T_dim> work_item) [[sycl::reqd_sub_group_size(T_warpSize)]]
281 onAcc::oneApi::StaticSharedMemory ssm(st_shared_accessor);
282 onAcc::syclGeneric::DynamicSharedMemory dsm(dyn_shared_accessor);
284 static_assert(T_dim > 0);
285 static_assert(T_dim <= 3,
"more the 3 dimensions are not supported");
286 auto acc = onAcc::Acc{Dict{
287 DictEntry(layer::block, onAcc::syclGeneric::BlockLayer{work_item, optimizedThreadSpec}),
288 DictEntry(layer::thread, onAcc::syclGeneric::ThreadLayer{work_item, optimizedThreadSpec}),
289 DictEntry(action::threadBlockSync, onAcc::syclGeneric::Sync{work_item}),
290 DictEntry(layer::shared, std::ref(ssm)),
291 DictEntry(layer::dynShared, std::ref(dsm)),
292 DictEntry(object::dynSharedMemBytes, dsm.byte_size()),
300 template<u
int32_t T_dim, u
int32_t T_warpSize>
301 struct EnqueueKernelWithWarpSize<T_dim, T_warpSize, 0u>
304 [[maybe_unused]] sycl::handler& cgh,
305 [[maybe_unused]]
auto gridRange,
306 [[maybe_unused]]
auto const& kernelBundle,
307 [[maybe_unused]]
auto const& st_shared_accessor,
308 [[maybe_unused]]
auto const& dyn_shared_accessor,
309 [[maybe_unused]]
auto const& optimizedThreadSpec,
310 [[maybe_unused]]
auto... args)
313 "Dynamic evaluated warp size on host does not match the compile time warp size ( macro "
314 "ALPAKA_SYCL_SUBGROUP_SIZE) evaluated in the "
315 "kernel. Update the definition of ALPAKA_SYCL_SUBGROUP_SIZE section and check the trait "
316 "Warpsize::Dispatch<>.");
329 Kernel<syclGeneric::Queue<T_Device>, ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor>, T_KernelBundle>
332 syclGeneric::Queue<T_Device>& queue,
333 ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor>
const& threadSpec,
334 T_KernelBundle
const& kernelBundle)
const
338 "'exec::anyExecutor' can not be used to enqueue an kernel.");
341 constexpr auto st_shared_mem_bytes = onAcc::oneApi::StaticSharedMemory::sizeLookupBufferInBytes(
342 ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS);
344 u_int32_t blockDynSharedMemBytes
345 = std::max(u_int32_t(1), onHost::getDynSharedMemBytes(threadSpec, kernelBundle));
347 st_shared_mem_bytes + blockDynSharedMemBytes
348 <=
queue.m_device->getNativeHandle().first.template get_info<sycl::info::device::local_mem_size>());
350 sycl::event ev =
queue.dispatchWarpSize(
351 [&](
auto warpSize)
requires std::same_as<
352 std::integral_constant<
357 return queue.m_queue.submit(
358 [warpSize, threadSpec, kernelBundle, blockDynSharedMemBytes](sycl::handler& cgh)
360 using ApiType =
decltype(
getApi(queue));
363 auto st_shared_accessor
364 = sycl::local_accessor<std::byte>{sycl::range<1>{st_shared_mem_bytes}, cgh};
366 auto dyn_shared_accessor
367 = sycl::local_accessor<std::byte>{sycl::range<1>{blockDynSharedMemBytes}, cgh};
369 auto workerDesc = detail::getWorkerDescription(threadSpec);
370 auto optimizedThreadSpec = workerDesc.second;
371 constexpr uint32_t syclDim = workerDesc.first.dimensions;
374 detail::EnqueueKernelWithWarpSize<syclDim, w, ALPAKA_SYCL_SUBGROUP_SIZE & w>::call(
381 DictEntry(object::api, ApiType{}),
382 DictEntry(object::deviceKind, DeviceKindType{}),
383 DictEntry(object::exec, T_Executor{}),
384 DictEntry(object::launchedWidthFrameSpec, std::bool_constant<false>{}),
385 DictEntry(object::warpSize, warpSize));
389 queue.setLastEvent(ev);
390 if(
queue.isBlocking())
402 Kernel<syclGeneric::Queue<T_Device>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
405 syclGeneric::Queue<T_Device>& queue,
406 FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>
const& frameSpec,
407 T_KernelBundle
const& kernelBundle)
const
411 "'exec::anyExecutor' can not be used to enqueue an kernel.");
414 auto const threadBlocking = internal::adjustThreadSpec(*
queue.m_device.get(), frameSpec, kernelBundle);
416 constexpr auto st_shared_mem_bytes = onAcc::oneApi::StaticSharedMemory::sizeLookupBufferInBytes(
417 ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS);
420 u_int32_t blockDynSharedMemBytes
421 = std::max(u_int32_t(1), onHost::getDynSharedMemBytes(threadBlocking, kernelBundle));
424 st_shared_mem_bytes + blockDynSharedMemBytes
425 <=
queue.m_device->getNativeHandle().first.template get_info<sycl::info::device::local_mem_size>());
427 sycl::event ev =
queue.dispatchWarpSize(
428 [&](
auto warpSize)
requires std::same_as<
429 std::integral_constant<
434 return queue.m_queue.submit(
435 [warpSize, threadBlocking, kernelBundle, blockDynSharedMemBytes](sycl::handler& cgh)
437 using ApiType =
decltype(
getApi(queue));
439 auto st_shared_accessor
440 = sycl::local_accessor<std::byte>{sycl::range<1>{st_shared_mem_bytes}, cgh};
441 auto dyn_shared_accessor
442 = sycl::local_accessor<std::byte>{sycl::range<1>{blockDynSharedMemBytes}, cgh};
444 auto workerDesc = detail::getWorkerDescription(threadBlocking);
445 auto optimizedThreadSpec = workerDesc.second;
446 constexpr uint32_t syclDim = workerDesc.first.dimensions;
450 detail::EnqueueKernelWithWarpSize<syclDim, w, ALPAKA_SYCL_SUBGROUP_SIZE & w>::call(
457 DictEntry(object::api, ApiType{}),
458 DictEntry(object::deviceKind, DeviceKindType{}),
459 DictEntry(object::exec, T_Executor{}),
460 DictEntry(object::launchedWidthFrameSpec, std::bool_constant<true>{}),
461 DictEntry(object::warpSize, warpSize));
464 if(
queue.isBlocking())
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Concept to check for an executor.
Concept to check if a type is a KernelBundle.
Concept to check if a type is a vector.
#define ALPAKA_LOG_FUNCTION(logLvl)
Log the entry and exit of a scope.
constexpr AnyExecutor anyExecutor
Automatic executor selection.
void fill(auto &internalQueue, auto executor, alpaka::concepts::IMdSpan< T_Value > auto &&dest, T_Value elementValue)
constexpr auto getDevice(auto &&any)
constexpr auto defaultExecutor(internal::concepts::DeviceHandle auto deviceHandle)
Select a default executor for the given device.
decltype(auto) data(auto &&any)
pointer to data of an object
typename GetValueType< T >::type GetValueType_t
constexpr uint32_t getDim_v
auto * toVoidPtr(T inPtr)
Cast a pointer that may or may not point to volatile memory to a (void*) or (void const*).
constexpr decltype(auto) getDeviceKind(auto &&any)
Get the device type of an object.
constexpr decltype(auto) getApi(auto &&any)
Get the API an object depends on.
constexpr auto makeView(auto &&anyWithApi, T_ValueType *pointer, concepts::Vector auto const &extents, T_MemAlignment const memAlignment=T_MemAlignment{})
constexpr decltype(auto) pCast(auto &&input)
Performs a static_cast on the storage type of combined data type.