alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
Device.hpp
Go to the documentation of this file.
1/* Copyright 2024 René Widera, Mehmet Yusufoglu
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
12#include "alpaka/api/util.hpp"
20#include "alpaka/tag.hpp"
21#include "alpaka/utility.hpp"
22
23#include <cstdint>
24#include <memory>
25#include <sstream>
26
27namespace alpaka::onHost
28{
29 namespace cpu
30 {
31 template<typename T_Platform>
32 struct Device : std::enable_shared_from_this<Device<T_Platform>>
33 {
34 public:
35 Device(internal::concepts::PlatformHandle auto platform, uint32_t const idx, uint32_t numaIdx)
36 : m_platform(std::move(platform))
37 , m_idx(idx)
38 , m_numaIdx(numaIdx)
39 , m_properties{internal::getDeviceProperties(*m_platform.get(), m_idx)}
40 {
42 }
43
48
// Copying and moving a Device are both disabled.
// NOTE(review): presumably because instances are handed around as shared handles obtained via
// shared_from_this() (see getSharedPtr()) - confirm.
49 Device(Device const&) = delete;
50 Device& operator=(Device const&) = delete;
51
52 Device(Device&&) = delete;
53 Device& operator=(Device&&) = delete;
54
55 bool operator==(Device const& other) const
56 {
57 return m_idx == other.m_idx;
58 }
59
60 bool operator!=(Device const& other) const
61 {
62 return m_idx != other.m_idx;
63 }
64
65 void wait()
66 {
68 // Host device synchronization - wait on all queues associated with this device.
69 // IMPORTANT: Do not hold queuesGuard across potentially long waits; copy weak refs first.
70 std::vector<std::weak_ptr<cpu::Queue<Device>>> tmpQueues;
71 {
72 std::lock_guard<std::mutex> lk{queuesGuard};
73 tmpQueues = queues; // copy weak_ptr list
74 }
75 for(auto& weakQueue : tmpQueues)
76 {
77 if(auto queue = weakQueue.lock())
78 {
79 internal::wait(*queue);
80 }
81 }
82 }
83
84 private:
// Compile-time check that Device satisfies internal::concepts::Device.
// The assertion sits inside a member function body, where Device is a complete type.
// NOTE(review): no caller of this function is visible in this listing - it appears to exist only for the check.
85 void _()
86 {
87 static_assert(internal::concepts::Device<Device>);
88 }
89
// Handle to the platform this device was created from.
90 Handle<T_Platform> m_platform;
// Device index; compared by operator== and returned by getNativeHandle().
91 uint32_t m_idx = 0u;
// NUMA index forwarded to the hwloc helpers; allNumaDomains means "not NUMA aware" (see isNumaAware()).
92 uint32_t m_numaIdx = internal::hwloc::allNumaDomains;
// Device properties, queried once in the constructor.
93 DeviceProperties m_properties;
// Weak references to the queues created by makeQueue(); nothing in this listing removes expired entries.
94 std::vector<std::weak_ptr<cpu::Queue<Device>>> queues;
// Weak references to the events created by makeEvent().
95 std::vector<std::weak_ptr<cpu::Event<Device>>> events;
// Despite its name this mutex is locked for modifications of both `queues` and `events`.
96 std::mutex queuesGuard;
97
// Returns an owning handle to this device via std::enable_shared_from_this.
// The device must already be managed by a std::shared_ptr when this is called.
98 std::shared_ptr<Device> getSharedPtr()
99 {
100 return this->shared_from_this();
101 }
102
103 template<typename T_Device>
104 friend struct Queue;
105
// Forwards this device's NUMA index to internal::hwloc::setThreadAffinity().
// NOTE(review): presumably binds the calling thread to that NUMA domain - confirm in the hwloc helper.
106 void setThreadAffinity() const
107 {
108 internal::hwloc::setThreadAffinity(m_numaIdx);
109 }
110
// Forwards the memory range [ptr, ptr + bytes) together with this device's NUMA index to
// internal::hwloc::pinPointer().
// `bytes` is the size of the range in bytes (Alloc passes the allocation size).
111 template<typename T>
112 void pinPointer(T* const ptr, size_t bytes)
113 {
114 internal::hwloc::pinPointer(ptr, bytes, m_numaIdx);
115 }
116
// True when the device was created with a NUMA index other than internal::hwloc::allNumaDomains.
117 bool isNumaAware() const
118 {
119 return m_numaIdx != internal::hwloc::allNumaDomains;
120 }
121
122 friend struct alpaka::internal::GetName;
123
// Returns a copy of the device name stored in the cached device properties.
124 std::string getName() const
125 {
126 return m_properties.name;
127 }
128
129 friend struct internal::GetNativeHandle;
130
// The native handle of a CPU device is simply its device index.
131 [[nodiscard]] uint32_t getNativeHandle() const noexcept
132 {
133 return m_idx;
134 }
135
136 friend struct internal::MakeQueue;
137
// Creates a new queue bound to this device and registers a weak reference to it in `queues`.
// `kind` selects between a blocking and a non-blocking queue; it is evaluated at compile time.
// Returns the owning handle; the device itself only keeps a weak reference.
138 Handle<cpu::Queue<Device>> makeQueue(alpaka::concepts::QueueKind auto kind)
139 {
// NOTE(review): the condition of this static_assert (original line 142) is missing from this listing;
// only the message is visible. Restore it from the original file.
141 static_assert(
143 "Unsupported queue kind.");
// The owning handle is obtained before the lock is taken.
144 auto thisHandle = this->getSharedPtr();
145 std::lock_guard<std::mutex> lk{queuesGuard};
146
147 constexpr bool isBlocking = kind == queueKind::blocking;
// The number of queues registered so far (expired entries included) is passed as second argument,
// presumably the queue index - confirm against the cpu::Queue constructor.
148 auto newQueue = std::make_shared<cpu::Queue<Device>>(
149 std::move(thisHandle),
150 queues.size(),
151 m_numaIdx,
152 isBlocking);
153
154 queues.emplace_back(newQueue);
155 return newQueue;
156 }
157
158 friend struct internal::MakeEvent;
159
160 Handle<cpu::Event<Device>> makeEvent()
161 {
163 auto thisHandle = this->getSharedPtr();
164 std::lock_guard<std::mutex> lk{queuesGuard};
165 auto newEvent = std::make_shared<cpu::Event<Device>>(std::move(thisHandle), queues.size());
166
167 events.emplace_back(newEvent);
168 return newEvent;
169 }
170
171 friend struct alpaka::internal::GetDeviceType;
172
// The device kind is not stored in the device; it is queried from the owning platform.
173 auto getDeviceKind() const
174 {
175 return alpaka::internal::getDeviceKind(*m_platform.get());
176 }
177
// Reports the free global memory in bytes.
// With hwloc support and a NUMA aware device, the value for this device's NUMA domain is returned.
178 auto getFreeGlobalMemBytes() const
179 {
180#if ALPAKA_HAS_HWLOC
181 if(isNumaAware())
182 return internal::hwloc::getFreeGlobalMemBytes(m_numaIdx);
183#endif
// NOTE(review): original line 184 is missing from this listing. As shown, no value is returned when
// hwloc is disabled or the device is not NUMA aware; the fallback return must be restored from the original file.
185 }
186
187 friend struct internal::Alloc;
188 friend struct alpaka::internal::GetApi;
189 friend struct internal::GetDeviceProperties;
190 friend struct internal::GetFreeGlobalMemBytes;
191 friend struct internal::AdjustThreadSpec;
192 friend struct internal::AllocDeferred;
193 friend struct internal::AllocUnified;
194 friend struct internal::AllocMapped;
195 };
196 } // namespace cpu
197
198 namespace trait
199
200 {
// Executors a cpu::Device supports, expressed as std::true_type specialisations.
// The serial executor is always available.
201 template<typename T_Platform>
202 struct IsExecutorSupportedBy::Op<exec::CpuSerial, cpu::Device<T_Platform>> : std::true_type
203 {
204 };
// The OpenMP blocks executor is only declared as supported when ALPAKA_OMP is set.
205#if ALPAKA_OMP
206 template<typename T_Platform>
207 struct IsExecutorSupportedBy::Op<exec::CpuOmpBlocks, cpu::Device<T_Platform>> : std::true_type
208 {
209 };
210#endif
// The TBB blocks executor is only declared as supported when ALPAKA_TBB is set.
211#if ALPAKA_TBB
212 template<typename T_Platform>
213 struct IsExecutorSupportedBy::Op<exec::CpuTbbBlocks, cpu::Device<T_Platform>> : std::true_type
214 {
215 };
216#endif
217 } // namespace trait
218
219 namespace internal
220 {
// Allocates aligned host memory for elements of T_Type with the given extents and wraps it in an
// onHost::SharedBuffer which frees the memory through its deleter.
221 template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
222 struct Alloc::Op<T_Type, cpu::Device<T_Platform>, T_Extents>
223 {
224 auto operator()(cpu::Device<T_Platform>& device, T_Extents const& extents) const
225 {
// The alignment is a compile-time constant derived from the API and device kind types.
227 constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
228 ALPAKA_TYPEOF(getApi(device)){},
229 ALPAKA_TYPEOF(getDeviceKind(device)){});
// Total allocation size in bytes and the pitches for the requested extents.
230 auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
231
232 auto deviceDependency = onHost::Device{device.getSharedPtr()};
233
// NOTE(review): the result of alignedAlloc is not checked before it is passed on to pinPointer.
234 T_Type* ptr = reinterpret_cast<T_Type*>(alpaka::core::alignedAlloc(alignment, memSizeInByte));
235 device.pinPointer(ptr, memSizeInByte);
236 // deviceDependency is captured to keep the device alive until the memory is deleted
// `alignment` is constexpr and therefore usable in the lambda without being captured.
237 auto deleter = [ptr, deviceDependency]() { alpaka::core::alignedFree(alignment, ptr); };
238
239 auto sharedBuffer = onHost::SharedBuffer{
240 deviceDependency,
241 ptr,
242 extents,
243 pitches,
244 std::move(deleter),
245 Alignment<alignment>{}};
246
// NOTE(review): original lines 247-248 are missing from this listing, so the call that receives the
// lambda below is not visible (presumably a logging macro - confirm). The lambda renders the buffer as a string.
249 [&]()
250 {
251 std::stringstream ss;
252 ss << sharedBuffer;
253 return ss.str();
254 });
255 return sharedBuffer;
256 }
257 };
258
259 template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
260 struct AllocUnified::Op<T_Type, cpu::Device<T_Platform>, T_Extents>
261 {
262 auto operator()(cpu::Device<T_Platform>& device, T_Extents const& extents) const
263 {
265 return Alloc::Op<T_Type, cpu::Device<T_Platform>, T_Extents>{}(device, extents);
266 }
267 };
268
269 template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
270 struct AllocMapped::Op<T_Type, cpu::Device<T_Platform>, T_Extents>
271 {
272 auto operator()(cpu::Device<T_Platform>& device, T_Extents const& extents) const
273 {
275 return Alloc::Op<T_Type, cpu::Device<T_Platform>, T_Extents>{}(device, extents);
276 }
277 };
278
// Decides at compile time whether the data behind `view` is accessible from a CPU device.
// The decision only depends on the types involved; the result is a constant true or false.
279 template<typename T_Platform, typename T_Any>
280 struct IsDataAccessible::FirstPath<cpu::Device<T_Platform>, T_Any>
281 {
282 bool operator()(cpu::Device<T_Platform>& device, T_Any const& view) const
283 {
// NOTE(review): original lines 287-288 are missing from this listing, so the condition is truncated.
// Only the check that the view's API equals api::host is visible; restore the rest from the original file.
285 if constexpr(
286 ALPAKA_TYPEOF(getApi(view)){} == api::host
289 return true;
290 else
291 return false;
292 }
293 };
294
295 /** Set number of thread blocks and threads per block to one
296 *
297 * There is no need to emulate blocks if we have only one thread.
298 */
299 template<
300 typename T_Platform,
301 alpaka::concepts::Vector T_NumFrames,
302 alpaka::concepts::Vector T_FrameExtents,
303 alpaka::concepts::KernelBundle T_KernelBundle>
304 struct AdjustThreadSpec::
305 Op<cpu::Device<T_Platform>, FrameSpec<T_NumFrames, T_FrameExtents, exec::CpuSerial>, T_KernelBundle>
306 {
307 using FrameSpecType = FrameSpec<T_NumFrames, T_FrameExtents, exec::CpuSerial>;
308
309 auto operator()(
310 cpu::Device<T_Platform> const& device,
311 FrameSpecType const& frameSpec,
312 T_KernelBundle const& kernelBundle) const requires alpaka::concepts::CVector<T_FrameExtents>
313 {
314 alpaka::unused(device, kernelBundle);
316
317 /// @todo add shortcut to create a CVec with equal values
318 auto const allOne = ALPAKA_TYPEOF(
319 iotaCVec<typename T_FrameExtents::type, T_FrameExtents::dim()>())::template fill<1u>();
320 return ThreadSpec{allOne, allOne, frameSpec.getExecutor()};
321 }
322
323 auto operator()(
324 cpu::Device<T_Platform> const& device,
325 FrameSpecType const& frameSpec,
326 T_KernelBundle const& kernelBundle) const
327 {
328 alpaka::unused(device, kernelBundle);
330 /// @todo add shortcut to create a CVec with equal values
331 auto const allOne = ALPAKA_TYPEOF(
332 iotaCVec<typename T_FrameExtents::type, T_FrameExtents::dim()>())::template fill<1u>();
333 return ThreadSpec{allOne, allOne, frameSpec.getExecutor()};
334 }
335 };
336
// Thread specification for an arbitrary executor T_Executor on a CPU device:
// every frame becomes one thread block and every block runs a single thread per dimension.
337 template<
338 typename T_Platform,
339 alpaka::concepts::Executor T_Executor,
340 alpaka::concepts::Vector T_NumFrames,
341 alpaka::concepts::Vector T_FrameExtents,
342 alpaka::concepts::KernelBundle T_KernelBundle>
// NOTE(review): original line 343 is missing from this listing; a constraint on this specialisation
// may belong here - confirm against the original file.
344 struct AdjustThreadSpec::
345 Op<cpu::Device<T_Platform>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
346 {
347 using FrameSpecType = FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>;
348
// Overload for frame extents which satisfy the CVector concept: the all-one thread extent is
// created from T_FrameExtents itself and therefore stays a compile-time vector.
349 auto operator()(
350 cpu::Device<T_Platform> const& device,
351 FrameSpecType const& frameSpec,
352 T_KernelBundle const& kernelBundle) const requires alpaka::concepts::CVector<T_FrameExtents>
353 {
354 alpaka::unused(device, kernelBundle);
356
357 // map the number of frames to thread blocks
358 auto numThreadBlocks = frameSpec.getNumFrames();
359 return ThreadSpec{numThreadBlocks, T_FrameExtents::template fill<1u>(), frameSpec.getExecutor()};
360 }
361
// Overload for all other frame extents: the all-one thread extent is a Vec with the same
// value type and dimension as the frame extents.
362 auto operator()(
363 cpu::Device<T_Platform> const& device,
364 FrameSpecType const& frameSpec,
365 T_KernelBundle const& kernelBundle) const
366 {
367 alpaka::unused(device, kernelBundle);
369
370 // map the number of frames to thread blocks
371 auto numThreadBlocks = frameSpec.getNumFrames();
372 auto const numThreads = Vec<typename T_FrameExtents::type, T_FrameExtents::dim()>::fill(1);
373 return ThreadSpec{numThreadBlocks, numThreads, frameSpec.getExecutor()};
374 }
375 };
376
// Returns a copy of the properties the device cached in its constructor.
// Reads the private member m_properties; Device declares internal::GetDeviceProperties as a friend.
377 template<typename T_Platform>
378 struct GetDeviceProperties::Op<cpu::Device<T_Platform>>
379 {
380 DeviceProperties operator()(cpu::Device<T_Platform> const& device) const
381 {
382 return device.m_properties;
383 }
384 };
385 } // namespace internal
386} // namespace alpaka::onHost
387
388namespace alpaka::internal
389{
// The API of a CPU device is the API of the platform it belongs to.
// Reads the private member m_platform; Device declares alpaka::internal::GetApi as a friend.
390 template<typename T_Platform>
391 struct GetApi::Op<onHost::cpu::Device<T_Platform>>
392 {
393 inline constexpr auto operator()(auto&& device) const
394 {
395 return alpaka::getApi(device.m_platform);
396 }
397 };
398} // namespace alpaka::internal
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:153
#define ALPAKA_LOG_INFO(logLvl, callable)
Write a meta data message to the output.
Definition logger.hpp:106
#define ALPAKA_LOG_FUNCTION(logLvl)
Log the entry and exit of a scope.
Definition logger.hpp:95
auto emulatedAlignedMemDescription(uint32_t alignmentInByte, T_Extents extents)
provides a memory description to create multidimensional linewise aligned memory within a one dimensi...
Definition util.hpp:100
constexpr auto simdOptimizedAlignment(auto api, alpaka::concepts::DeviceKind auto deviceKind)
Calculate the best alignment for SIMD optimized memory allocation.
Definition util.hpp:140
constexpr auto host
Definition Api.hpp:39
ALPAKA_FN_INLINE ALPAKA_FN_HOST void alignedFree(size_t alignment, auto ptr)
ALPAKA_FN_INLINE ALPAKA_FN_HOST auto alignedAlloc(size_t alignment, size_t size) -> void *
constexpr auto cpu
Definition tag.hpp:170
constexpr auto numaCpu
Definition tag.hpp:180
constexpr bool isSeqExecutor_v
Definition tag.hpp:287
constexpr Device device
Definition scope.hpp:70
constexpr auto queue
Definition lvl.hpp:127
constexpr auto device
Definition lvl.hpp:82
constexpr auto kernel
Definition lvl.hpp:142
constexpr auto memory
Definition lvl.hpp:112
constexpr auto event
Definition lvl.hpp:97
Functionality which is usable on the host CPU controller thread.
Definition api.hpp:40
SharedBuffer(T_Any const &, T_Type *, T_UserExtents const &, T_UserPitches const &, std::invocable<> auto, T_MemAlignment const) -> SharedBuffer< ALPAKA_TYPEOF(getApi(std::declval< T_Any >())), T_Type, typename T_UserPitches::UniVec, T_MemAlignment >
void fill(Queue< T_Device, T_QueueKind > const &queue, auto &&dest, T_Value elementValue)
fill memory element wise
Definition Queue.hpp:346
std::shared_ptr< T > Handle
Definition Handle.hpp:30
auto getFreeGlobalMemBytes() -> std::size_t
Definition sysInfo.hpp:210
ThreadSpec(T_NumBlocks const &, T_NumThreads const &) -> ThreadSpec< alpaka::trait::getVec_t< T_NumBlocks >, alpaka::trait::getVec_t< T_NumThreads > >
Device(Handle< T_Device > &&) -> Device< ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval< T_Device >())), ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval< T_Device >()))>
constexpr auto blocking
Definition tag.hpp:101
constexpr auto nonBlocking
Definition tag.hpp:113
ALPAKA_FN_HOST_ACC Vec(T_1, T_Args...) -> Vec< T_1, uint32_t(sizeof...(T_Args)+1u), ArrayStorage< T_1, uint32_t(sizeof...(T_Args)+1u)> >
constexpr decltype(auto) getDeviceKind(auto &&any)
Get the device type of an object.
Definition interface.hpp:52
constexpr decltype(auto) getApi(auto &&any)
Get the API an object depends on.
Definition interface.hpp:23
consteval auto iotaCVec()
Create and return a CVector of the given length with values 1, 2, ...
Definition CVec.hpp:135
constexpr decltype(auto) get(concepts::SpecializationOf< Dict > auto &t) noexcept
Definition Dict.hpp:151
STL namespace.
std::string name
The name of the device.
bool operator!=(Device const &other) const
Definition Device.hpp:60
Device(internal::concepts::PlatformHandle auto platform, uint32_t const idx, uint32_t numaIdx)
Definition Device.hpp:35
Device & operator=(Device &&)=delete
Device(Device &&)=delete
Device(Device const &)=delete
Device & operator=(Device const &)=delete
bool operator==(Device const &other) const
Definition Device.hpp:55