alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
Device.hpp
Go to the documentation of this file.
1/* Copyright 2025 Simeon Ehrig, René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
7#include "Queue.hpp"
8#include "alpaka/Vec.hpp"
11#include "alpaka/api/util.hpp"
14
15#if ALPAKA_LANG_SYCL
16
17# include <sycl/sycl.hpp>
18
19namespace alpaka::onHost
20{
21 namespace syclGeneric
22 {
23 template<typename T_Platform>
24 struct Device : std::enable_shared_from_this<Device<T_Platform>>
25 {
26 public:
27 Device(internal::concepts::PlatformHandle auto platform, auto const& dev, uint32_t const idx)
28 : m_platform(std::move(platform))
29 , m_idx(idx)
30 , m_sycl_dev(dev)
31 , m_properties{internal::getDeviceProperties(*m_platform.get(), m_idx)}
32 {
33 ALPAKA_LOG_FUNCTION(onHost::logger::device);
34 }
35
36 ~Device()
37 {
38 ALPAKA_LOG_FUNCTION(onHost::logger::device);
39 }
40
41 Device(Device const&) = delete;
42 Device& operator=(Device const&) = delete;
43
44 Device(Device&&) = delete;
45 Device& operator=(Device&&) = delete;
46
47 auto getName() const
48 {
49 return m_sycl_dev.get_info<sycl::info::device::name>();
50 }
51
52 std::shared_ptr<Device<T_Platform>> getSharedPtr()
53 {
54 return this->shared_from_this();
55 }
56
57 [[nodiscard]] Handle<syclGeneric::Queue<Device>> makeQueue(alpaka::concepts::QueueKind auto kind)
58 {
59 ALPAKA_LOG_FUNCTION(onHost::logger::queue + onHost::logger::device);
60 static_assert(
61 kind == queueKind::blocking || kind == queueKind::nonBlocking,
62 "Unsupported queue kind.");
63 auto thisHandle = this->getSharedPtr();
64 std::lock_guard<std::mutex> lk{m_writeGuard};
65
66 constexpr bool isBlocking = kind == queueKind::blocking;
67 auto newQueue
68 = std::make_shared<syclGeneric::Queue<Device>>(std::move(thisHandle), queues.size(), isBlocking);
69
70 queues.emplace_back(newQueue);
71 return newQueue;
72 }
73
74 [[nodiscard]] std::pair<sycl::device, sycl::context> getNativeHandle() const noexcept
75 {
76 return {m_sycl_dev, m_platform->getContext()};
77 }
78
79 void wait()
80 {
81 ALPAKA_LOG_FUNCTION(onHost::logger::device);
82 // Copy queue weak refs under lock then release to avoid blocking other operations while waiting.
83 std::vector<std::weak_ptr<syclGeneric::Queue<Device>>> tmpQueues;
84 {
85 std::lock_guard<std::mutex> lk{m_writeGuard};
86 tmpQueues = queues;
87 }
88 for(auto& weakQueue : tmpQueues)
89 {
90 if(auto queue = weakQueue.lock())
91 {
92 queue->wait();
93 }
94 }
95 }
96
97 private:
98 friend struct internal::MakeEvent;
99
100 Handle<syclGeneric::Event<Device>> makeEvent()
101 {
102 ALPAKA_LOG_FUNCTION(onHost::logger::event + onHost::logger::device);
103 auto thisHandle = this->getSharedPtr();
104 std::lock_guard<std::mutex> lk{m_writeGuard};
105 auto newEvent = std::make_shared<syclGeneric::Event<Device>>(std::move(thisHandle), events.size());
106
107 events.emplace_back(newEvent);
108 return newEvent;
109 }
110
111 void _()
112 {
113 static_assert(internal::concepts::Device<Device>);
114 }
115
116 friend struct alpaka::internal::GetDeviceType;
117
118 auto getDeviceKind() const
119 {
120 return alpaka::internal::getDeviceKind(*m_platform.get());
121 }
122
123 Handle<T_Platform> m_platform;
124 uint32_t m_idx = 0u;
125 sycl::device m_sycl_dev;
126
127 std::vector<std::weak_ptr<syclGeneric::Queue<Device>>> queues;
128 std::vector<std::weak_ptr<syclGeneric::Event<Device>>> events;
129 std::mutex m_writeGuard;
130
131 DeviceProperties m_properties;
132
133 friend struct alpaka::internal::GetApi;
134 friend struct internal::GetDeviceProperties;
135 friend struct internal::GetFreeGlobalMemBytes;
136 friend struct internal::AdjustThreadSpec;
137 friend struct onHost::internal::AllocDeferred;
138 friend struct onHost::internal::AllocUnified;
139 friend struct onHost::internal::AllocMapped;
140 friend struct onHost::internal::IsDataAccessible;
141 };
142 } // namespace syclGeneric
143
144 namespace internal
145 {
146
147 template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
148 struct Alloc::Op<T_Type, syclGeneric::Device<T_Platform>, T_Extents>
149 {
150 auto operator()(syclGeneric::Device<T_Platform>& device, T_Extents const& extents) const
151 {
153 constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
154 ALPAKA_TYPEOF(getApi(device)){},
155 ALPAKA_TYPEOF(getDeviceKind(device)){});
156 auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
157
158 auto deviceDependency = onHost::Device{device.getSharedPtr()};
159 auto [sycl_device, sycl_context] = device.getNativeHandle();
160
161 T_Type* ptr = reinterpret_cast<T_Type*>(
162 sycl::aligned_alloc_device(alignment, memSizeInByte, sycl_device, sycl_context));
163 auto deleter = [ctx = sycl_context, ptr]() { sycl::free(toVoidPtr(ptr), ctx); };
164
165 auto sharedBuffer = onHost::SharedBuffer{
166 deviceDependency,
167 ptr,
168 extents,
169 pitches,
170 std::move(deleter),
171 Alignment<alignment>{}};
172 return sharedBuffer;
173 }
174 };
175
176 template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
177 struct AllocUnified::Op<T_Type, syclGeneric::Device<T_Platform>, T_Extents>
178 {
179 auto operator()(syclGeneric::Device<T_Platform>& device, T_Extents const& extents) const
180 {
182 constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
183 ALPAKA_TYPEOF(getApi(device)){},
184 ALPAKA_TYPEOF(getDeviceKind(device)){});
185 auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
186
187 auto deviceDependency = onHost::Device{device.getSharedPtr()};
188 auto [sycl_device, sycl_context] = device.getNativeHandle();
189
190 bool isManagedMemorySupported = sycl_device.has(sycl::aspect::usm_shared_allocations);
191 if(!isManagedMemorySupported)
192 {
193 throw std::runtime_error("Sycl device does not support unified memory allocations.");
194 }
195
196 T_Type* ptr = reinterpret_cast<T_Type*>(
197 sycl::aligned_alloc_shared(alignment, memSizeInByte, sycl_device, sycl_context));
198 auto deleter = [ctx = sycl_context, ptr]() { sycl::free(toVoidPtr(ptr), ctx); };
199
200 auto sharedBuffer = onHost::SharedBuffer{
201 deviceDependency,
202 ptr,
203 extents,
204 pitches,
205 std::move(deleter),
206 Alignment<alignment>{}};
207 return sharedBuffer;
208 }
209 };
210
211 template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
212 struct AllocMapped::Op<T_Type, syclGeneric::Device<T_Platform>, T_Extents>
213 {
214 auto operator()(syclGeneric::Device<T_Platform>& device, T_Extents const& extents) const
215 {
217 constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
218 ALPAKA_TYPEOF(getApi(device)){},
219 ALPAKA_TYPEOF(getDeviceKind(device)){});
220 auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
221
222 auto deviceDependency = onHost::Device{device.getSharedPtr()};
223 auto [_, sycl_context] = device.getNativeHandle();
224
225 T_Type* ptr
226 = reinterpret_cast<T_Type*>(sycl::aligned_alloc_host(alignment, memSizeInByte, sycl_context));
227 auto deleter = [ctx = sycl_context, ptr]() { sycl::free(toVoidPtr(ptr), ctx); };
228
229 auto sharedBuffer = onHost::SharedBuffer{
230 deviceDependency,
231 ptr,
232 extents,
233 pitches,
234 std::move(deleter),
235 Alignment<alignment>{}};
236 return sharedBuffer;
237 }
238 };
239
240 template<typename T_Platform, typename T_Any>
241 struct IsDataAccessible::FirstPath<syclGeneric::Device<T_Platform>, T_Any>
242 {
243 bool operator()(syclGeneric::Device<T_Platform>& device, T_Any const& view) const
244 {
246 auto [sycl_device, sycl_context] = device.getNativeHandle();
247 auto sycl_alloc_type = sycl::get_pointer_type(data(view), sycl_context);
248
249 if(sycl_alloc_type != sycl::usm::alloc::unknown)
250 {
251 try
252 {
253 sycl::device deviceAssociatedWithData = sycl::get_pointer_device(data(view), sycl_context);
254 if(deviceAssociatedWithData == sycl_device)
255 {
256 // sycl device allocated the memory
257 return true;
258 }
259 }
260 catch(...)
261 {
262 }
263 }
264
265 if(sycl_alloc_type == sycl::usm::alloc::shared)
266 {
267 // is shared within the device context
268 return true;
269 }
270 else if(sycl_alloc_type == sycl::usm::alloc::unknown)
271 {
272 // assume that a sycl cpu device can always access host memory
273 if constexpr(
274 ALPAKA_TYPEOF(getApi(view)){} == api::host
277 return true;
278 }
279
280 return false;
281 }
282 };
283
284 template<typename T_Platform>
285 struct GetDeviceProperties::Op<syclGeneric::Device<T_Platform>>
286 {
287 DeviceProperties operator()(syclGeneric::Device<T_Platform> const& device) const
288 {
289 return device.m_properties;
290 }
291 };
292
293 template<
294 typename T_Platform,
295 alpaka::concepts::Executor T_Executor,
296 alpaka::concepts::Vector T_NumFrames,
297 alpaka::concepts::Vector T_FrameExtents,
298 alpaka::concepts::KernelBundle T_KernelBundle>
299 struct AdjustThreadSpec::
300 Op<syclGeneric::Device<T_Platform>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
301 {
302 using FrameSpecType = FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>;
303
304 auto operator()(
305 syclGeneric::Device<T_Platform> const& device,
306 FrameSpecType const& frameSpec,
307 T_KernelBundle const& kernelBundle) const requires alpaka::concepts::CVector<T_FrameExtents>
308 {
309 alpaka::unused(device, kernelBundle);
311 auto numThreads = frameSpec.getFrameExtents();
312
313 /** This limit is not exact but for typical GPUs, Intel, NVIDIA and AMD we can at least use 1024
314 * threads per block.
315 * @todo Check if this produces issues on FPGAs, in this case the deviceKind should be used and the
316 * limit should be different for each deviceKind.
317 */
318 constexpr typename ALPAKA_TYPEOF(numThreads)::type hardwareLimitThreadsPerBlock = 1024u;
319
320 constexpr auto result = api::util::adjustToLimit<hardwareLimitThreadsPerBlock, 0u, 1u>(numThreads);
321 return ThreadSpec{frameSpec.getNumFrames(), result};
322 }
323
324 auto operator()(
325 syclGeneric::Device<T_Platform> const& device,
326 FrameSpecType const& frameSpec,
327 T_KernelBundle const& kernelBundle) const
328 {
329 alpaka::unused(kernelBundle);
331 auto numThreadsPerBlocks = frameSpec.getFrameExtents();
332 auto const maxThreadsPerBlock = device.m_properties.maxThreadsPerBlock;
333
334 auto result = api::util::adjustToLimit(numThreadsPerBlocks, maxThreadsPerBlock);
335 return ThreadSpec{frameSpec.getNumFrames(), result};
336 }
337 };
338
339 } // namespace internal
340} // namespace alpaka::onHost
341
342namespace alpaka::internal
343{
344 template<typename T_Platform>
345 struct GetApi::Op<onHost::syclGeneric::Device<T_Platform>>
346 {
347 decltype(auto) operator()(auto&& device) const
348 {
349 return internal::getApi(*device.m_platform.get());
350 }
351 };
352} // namespace alpaka::internal
353
354#endif
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:153
#define ALPAKA_LOG_FUNCTION(logLvl)
Log the entry and exit of a scope.
Definition logger.hpp:95
consteval auto adjustToLimit(concepts::CVector auto const input)
adjust the input vector to a given limit by halving all components until the product of these is is b...
Definition util.hpp:63
auto emulatedAlignedMemDescription(uint32_t alignmentInByte, T_Extents extents)
provides a memory description to create multidimensional linewise aligned memory within a one dimensi...
Definition util.hpp:100
constexpr auto simdOptimizedAlignment(auto api, alpaka::concepts::DeviceKind auto deviceKind)
Calculate the best alignment for SIMD optimized memory allocation.
Definition util.hpp:140
constexpr auto host
Definition Api.hpp:39
constexpr auto cpu
Definition tag.hpp:170
constexpr auto numaCpu
Definition tag.hpp:180
alpaka internal implementations.
Definition generic.hpp:19
constexpr auto getApi(auto &&any)
Definition interface.hpp:62
constexpr auto getDeviceKind(auto &&any)
Definition interface.hpp:85
constexpr Device device
Definition scope.hpp:70
DeviceProperties getDeviceProperties(auto const &platform, uint32_t idx)
constexpr auto queue
Definition lvl.hpp:127
constexpr auto device
Definition lvl.hpp:82
constexpr auto kernel
Definition lvl.hpp:142
constexpr auto memory
Definition lvl.hpp:112
Functionality which is usable on the host CPU controller thread.
Definition api.hpp:40
auto getNativeHandle(auto const &handle)
Get the native handle of an handle.
SharedBuffer(T_Any const &, T_Type *, T_UserExtents const &, T_UserPitches const &, std::invocable<> auto, T_MemAlignment const) -> SharedBuffer< ALPAKA_TYPEOF(getApi(std::declval< T_Any >())), T_Type, typename T_UserPitches::UniVec, T_MemAlignment >
decltype(auto) data(auto &&any)
pointer to data of an object
std::convertible_to< std::string > auto getName(auto &&any)
Runtime name for a given object.
Definition interface.hpp:96
ThreadSpec(T_NumBlocks const &, T_NumThreads const &) -> ThreadSpec< alpaka::trait::getVec_t< T_NumBlocks >, alpaka::trait::getVec_t< T_NumThreads > >
Device(Handle< T_Device > &&) -> Device< ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval< T_Device >())), ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval< T_Device >()))>
void wait(alpaka::concepts::HasGet auto &handle)
wait for all work to be finished
auto * toVoidPtr(T inPtr)
Cast a pointer that may or may not point to volatile memory to a (void*) or (void const*).
Definition util.hpp:34
constexpr decltype(auto) getDeviceKind(auto &&any)
Get the device type of an object.
Definition interface.hpp:52
constexpr decltype(auto) getApi(auto &&any)
Get the API an object depends on.
Definition interface.hpp:23
constexpr decltype(auto) get(concepts::SpecializationOf< Dict > auto &t) noexcept
Definition Dict.hpp:151
void operator()(T_Any &any, T_Extents const &) const
void operator()(T_Any &any, T_Extents const &) const
void operator()(T_Any &any, T_Extents const &) const
DeviceProperties operator()(auto const &platform, uint32_t idx) const
bool operator()(T_Device &device, T_Any const &any) const