22#include <shared_mutex>
28# include <sycl/sycl.hpp>
55 template<alpaka::concepts::DeviceKind T_DeviceKind>
58 auto operator()(T_DeviceKind deviceKind,
auto&& fn)
const;
63 struct Warpsize::Dispatch<
alpaka::deviceKind::Cpu>
65 auto operator()(alpaka::deviceKind::Cpu,
auto&& fn, uint32_t warpSize)
const
70 return fn(std::integral_constant<uint32_t, 1u>{});
72 return fn(std::integral_constant<uint32_t, 2u>{});
74 return fn(std::integral_constant<uint32_t, 4u>{});
76 return fn(std::integral_constant<uint32_t, 8u>{});
78 return fn(std::integral_constant<uint32_t, 16u>{});
80 return fn(std::integral_constant<uint32_t, 32u>{});
82 throw std::runtime_error(
83 std::string(
"Sycl warp size runtime dispatch, unsupported warpSize: ")
84 + std::to_string(warpSize));
85 return fn(std::integral_constant<uint32_t, 1u>{});
91 struct Warpsize::Dispatch<
alpaka::deviceKind::IntelGpu>
93 auto operator()(alpaka::deviceKind::IntelGpu,
auto&& fn, uint32_t warpSize)
const
98 return fn(std::integral_constant<uint32_t, 8u>{});
100 return fn(std::integral_constant<uint32_t, 16u>{});
102 return fn(std::integral_constant<uint32_t, 32u>{});
104 throw std::runtime_error(
105 std::string(
"Sycl warp size runtime dispatch, unsupported warpSize: ")
106 + std::to_string(warpSize));
107 return fn(std::integral_constant<uint32_t, 32u>{});
113 struct Warpsize::Dispatch<
alpaka::deviceKind::AmdGpu>
115 auto operator()(alpaka::deviceKind::AmdGpu,
auto&& fn, uint32_t warpSize)
const
120 return fn(std::integral_constant<uint32_t, 32u>{});
122 return fn(std::integral_constant<uint32_t, 64u>{});
124 throw std::runtime_error(
125 std::string(
"Sycl warp size runtime dispatch, unsupported warpSize: ")
126 + std::to_string(warpSize));
127 return fn(std::integral_constant<uint32_t, 32u>{});
133 struct Warpsize::Dispatch<
alpaka::deviceKind::NvidiaGpu>
135 auto operator()(alpaka::deviceKind::NvidiaGpu,
auto&& fn, uint32_t warpSize)
const
140 return fn(std::integral_constant<uint32_t, 32u>{});
142 throw std::runtime_error(
143 std::string(
"Sycl warp size runtime dispatch, unsupported warpSize: ")
144 + std::to_string(warpSize));
145 return fn(std::integral_constant<uint32_t, 32u>{});
150 template<
typename T_Device>
151 struct Queue : std::enable_shared_from_this<Queue<T_Device>>
154 friend struct alpaka::internal::GetApi;
156 template<alpaka::concepts::Vector TVec>
157 static constexpr auto vecToSyclRange(TVec vec)
159 constexpr auto dim = std::decay_t<TVec>::dim();
160 return [&vec]<
auto... I>(std::index_sequence<I...>)
162 {
return sycl::range<dim>(vec[I]...); }(std::make_index_sequence<dim>{});
165 inline constexpr auto dispatchWarpSize(
auto&& fn)
const
168 = internal::GetDeviceProperties::Op<
ALPAKA_TYPEOF(*m_device.get())>{}(*m_device.get()).warpSize;
178 Queue(internal::concepts::DeviceHandle
auto device, uint32_t
const idx,
bool isBlocking)
179 : m_device(std::move(
device))
184 {sycl::property::queue::in_order{}})
185 , m_isBlocking(isBlocking)
190 [[nodiscard]]
bool isBlocking() const noexcept
195 Queue(Queue
const&) =
delete;
196 Queue& operator=(Queue
const&) =
delete;
198 Queue(Queue&&) =
delete;
199 Queue& operator=(Queue&&) =
delete;
206 m_queue.wait_and_throw();
208 catch(sycl::exception
const& err)
210 std::cerr <<
"Caught SYCL exception while destructing a SYCL queue: " << err.what() <<
" ("
211 << err.code() <<
')' << std::endl;
213 catch(std::exception
const& err)
215 std::cerr <<
"The following runtime error(s) occurred while destructing a SYCL queue:"
216 << err.what() << std::endl;
220 std::shared_ptr<Queue> getSharedPtr()
222 return this->shared_from_this();
232 m_queue.wait_and_throw();
237 std::stringstream ss;
238 ss <<
"Queue<" <<
getApi(m_device).getName() <<
">";
239 ss <<
" id=" << m_idx;
244 friend struct alpaka::internal::GetDeviceType;
245 friend struct alpaka::onHost::internal::Enqueue;
246 friend struct onHost::internal::AllocDeferred;
250 return alpaka::internal::getDeviceKind(*m_device.get());
253 auto getDevice()
const
258 friend struct onHost::internal::GetDevice;
260 friend struct alpaka::onHost::internal::WaitFor;
262 void waitFor(syclGeneric::Event<T_Device>& event)
265 sycl::event sycl_event =
event.getNativeHandle();
266 sycl::event ev = m_queue.submit([sycl_event](sycl::handler& cgh) { cgh.depends_on(sycl_event); });
272 friend struct internal::IsQueueEmpty;
279 bool isQueueEmpty()
const
283 auto const status = getLastEvent().template get_info<sycl::info::event::command_execution_status>();
284 return status == sycl::info::event_command_status::complete;
288 sycl::event getLastEvent()
const
290 std::shared_lock<std::shared_mutex> lock{m_eventGuard};
298 void setLastEvent(sycl::event
const& ev)
const
300 std::unique_lock<std::shared_mutex> lock{m_eventGuard};
304 friend struct alpaka::onHost::internal::Memset;
305 friend struct alpaka::onHost::internal::Memcpy;
306 friend struct alpaka::onHost::internal::MemcpyDeviceGlobal;
307 friend struct alpaka::onHost::internal::Alloc;
308 friend struct alpaka::onHost::internal::AllocDeferred;
309 friend struct alpaka::onHost::internal::AllocMapped;
310 friend struct alpaka::onHost::internal::Fill;
312 Handle<T_Device> m_device;
316 mutable std::shared_mutex m_eventGuard;
324 mutable sycl::event m_lastEvent;
325 core::CallbackThread m_callBackThread;
326 bool m_isBlocking{
false};
331 template<
typename T_Device,
typename T_Task>
332 struct internal::Enqueue::HostTask<syclGeneric::
Queue<T_Device>, T_Task>
334 void operator()(syclGeneric::Queue<T_Device>& queue, T_Task
const& task)
const
343 sycl::event ev =
queue.m_queue.submit(
344 [&queue, task](sycl::handler& cgh)
349 auto f =
queue.m_callBackThread.submit([t = std::move(task)] { t(); });
353 queue.setLastEvent(ev);
354 if(
queue.isBlocking())
359 template<
typename T_Device,
typename T_Task>
360 struct internal::Enqueue::HostTaskDeferred<syclGeneric::
Queue<T_Device>, T_Task>
363 void operator()(syclGeneric::Queue<T_Device>& queue, T_Task
const& task)
const
372 sycl::event ev =
queue.m_queue.submit(
373 [&queue, task](sycl::handler& cgh)
375 cgh.host_task([&queue, task]() {
queue.m_callBackThread.submit([t = std::move(task)] { t(); }); });
377 queue.setLastEvent(ev);
378 if(
queue.isBlocking())
383 template<
typename T_Device,
typename T_Event>
384 struct internal::Enqueue::Event<syclGeneric::
Queue<T_Device>, T_Event>
386 void operator()(syclGeneric::Queue<T_Device>& queue, T_Event& event)
const
392 sycl::event emulatedEvent =
queue.m_queue.submit([](sycl::handler& cgh) { cgh.single_task([]() {}); });
393 event.setEvent(emulatedEvent);
394 if(
queue.isBlocking())
395 emulatedEvent.wait_and_throw();
399 template<
typename T_Device,
typename T_Dest,
typename T_Extents>
401 struct internal::Memset::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Extents>
403 void operator()(syclGeneric::Queue<T_Device>& queue,
auto&& dest, uint8_t byteValue, T_Extents
const& extents)
408 sycl::queue sycl_queue =
queue.getNativeHandle();
409 sycl::event ev = sycl_queue.memset(
410 internal::Data::data(dest),
413 queue.setLastEvent(ev);
414 if(
queue.isBlocking())
419 template<
typename T_Device,
typename T_Dest,
typename T_Source,
typename T_Extents>
421 struct internal::Memcpy::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Source, T_Extents>
424 syclGeneric::Queue<T_Device>& queue,
426 T_Source
const& source,
427 T_Extents
const& extents)
const requires std::same_as<
ALPAKA_TYPEOF(dest), T_Dest>
431 sycl::queue sycl_queue =
queue.getNativeHandle();
432 sycl::event ev = sycl_queue.memcpy(
436 queue.setLastEvent(ev);
437 if(
queue.isBlocking())
442 template<
typename T_Device,
typename T_Dest,
typename T_Value,
typename T_Extents>
444 struct internal::Fill::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Value, T_Extents>
447 syclGeneric::Queue<T_Device>& queue,
449 T_Value elementValue,
450 T_Extents
const& extents)
const
455 sycl::queue sycl_queue =
queue.getNativeHandle();
456 sycl::event ev = sycl_queue.fill(internal::Data::data(dest), elementValue, extents.x());
457 queue.setLastEvent(ev);
458 if(
queue.isBlocking())
466 template<
typename T_Type,
typename T_Device, alpaka::concepts::Vector T_Extents>
467 struct internal::AllocDeferred::Op<T_Type, syclGeneric::
Queue<T_Device>, T_Extents>
469 auto operator()(syclGeneric::Queue<T_Device>& queue, T_Extents
const& extents)
const
479 sycl::queue sycl_queue =
queue.getNativeHandle();
480 auto queueDependency =
queue.getSharedPtr();
483 T_Type* ptr =
reinterpret_cast<T_Type*
>(sycl::aligned_alloc_device(alignment, memSizeInByte, sycl_queue));
486 if(
queue.isBlocking())
487 sycl_queue.wait_and_throw();
489 auto deleter = [queueDep = std::move(queueDependency), ptr]()
491 sycl::queue sycl_queue = queueDep->getNativeHandle();
501 queueDep->m_callBackThread.submit(
502 [sycl_queue, ptr]()
mutable
504 sycl_queue.submit([&](sycl::handler& cgh)
505 { cgh.host_task([=]() { sycl::free(
toVoidPtr(ptr), sycl_queue); }); });
515 Alignment<alignment>{}};
521namespace alpaka::internal
524 template<
typename T_Device>
525 struct GetApi::Op<
alpaka::onHost::syclGeneric::Queue<T_Device>>
527 inline constexpr auto operator()(
auto&& queue)
const
#define ALPAKA_TYPEOF(...)
Get the type of instance.
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
#define ALPAKA_LOG_FUNCTION(logLvl)
Log the entry and exit of a scope.
auto emulatedAlignedMemDescription(uint32_t alignmentInByte, T_Extents extents)
provides a memory description to create multidimensional linewise aligned memory within a one dimensi...
constexpr auto simdOptimizedAlignment(auto api, alpaka::concepts::DeviceKind auto deviceKind)
Calculate the best alignment for SIMD optimized memory allocation.
constexpr WarpSize warpSize
Functionality which is usable on the host CPU controller thread.
auto getNativeHandle(auto const &handle)
Get the native handle of an handle.
SharedBuffer(T_Any const &, T_Type *, T_UserExtents const &, T_UserPitches const &, std::invocable<> auto, T_MemAlignment const) -> SharedBuffer< ALPAKA_TYPEOF(getApi(std::declval< T_Any >())), T_Type, typename T_UserPitches::UniVec, T_MemAlignment >
std::convertible_to< std::string > auto getName(auto &&any)
Runtime name for a given object.
Device(Handle< T_Device > &&) -> Device< ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval< T_Device >())), ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval< T_Device >()))>
void wait(alpaka::concepts::HasGet auto &handle)
wait for all work to be finished
Queue(Handle< T_Queue > &&, T_QueueKind) -> Queue< Device< ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval< T_Queue >())), ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval< T_Queue >()))>, T_QueueKind >
typename GetValueType< T >::type GetValueType_t
constexpr uint32_t getDim_v
auto * toVoidPtr(T inPtr)
Cast a pointer that may or may not point to volatile memory to a (void*) or (void const*).
constexpr decltype(auto) getDeviceKind(auto &&any)
Get the device type of an object.
constexpr decltype(auto) getApi(auto &&any)
Get the API an object depends on.