alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
Device.hpp
Go to the documentation of this file.
1/* Copyright 2024 René Widera, Mehmet Yusufoglu
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
12#include "alpaka/api/util.hpp"
20#include "alpaka/tag.hpp"
21#include "alpaka/utility.hpp"
22
23#include <cstdint>
24#include <memory>
25#include <sstream>
26
27namespace alpaka::onHost
28{
29 namespace cpu
30 {
31 template<typename T_Platform>
32 struct Device : std::enable_shared_from_this<Device<T_Platform>>
33 {
34 public:
/// Construct a CPU device bound to a platform.
///
/// @param platform platform handle; moved into m_platform
/// @param idx      device index; stored in m_idx
/// @param numaIdx  stored in m_numaIdx
35 Device(internal::concepts::PlatformHandle auto platform, uint32_t const idx, uint32_t numaIdx)
36 : m_platform(std::move(platform))
37 , m_idx(idx)
38 , m_numaIdx(numaIdx)
// Reads the members m_platform and m_idx (not the constructor parameters), so both
// members must already be initialised when m_properties is initialised.
39 , m_properties{internal::getDeviceProperties(*m_platform.get(), m_idx)}
40 {
42 }
43
48
49 Device(Device const&) = delete;
50 Device& operator=(Device const&) = delete;
51
52 Device(Device&&) = delete;
53 Device& operator=(Device&&) = delete;
54
/// Two devices compare equal when their device indices (m_idx) match.
/// No other member takes part in the comparison.
55 bool operator==(Device const& other) const
56 {
57 return m_idx == other.m_idx;
58 }
59
/// Two devices compare unequal when their device indices (m_idx) differ.
/// No other member takes part in the comparison.
60 bool operator!=(Device const& other) const
61 {
62 return m_idx != other.m_idx;
63 }
64
/// Call internal::wait on every queue of this device that is still alive.
///
/// Only the `queues` list is visited; the `events` list is not touched here.
65 void wait()
66 {
68 // Host device synchronization - wait on all queues associated with this device.
69 // IMPORTANT: Do not hold queuesGuard across potentially long waits; copy weak refs first.
70 std::vector<std::weak_ptr<cpu::Queue<Device>>> tmpQueues;
71 {
// The lock is held only for the duration of the copy.
72 std::lock_guard<std::mutex> lk{queuesGuard};
73 tmpQueues = queues; // copy weak_ptr list
74 }
75 for(auto& weakQueue : tmpQueues)
76 {
// lock() yields an empty pointer for a queue that has already been destroyed; such entries are skipped.
77 if(auto queue = weakQueue.lock())
78 {
79 internal::wait(*queue);
80 }
81 }
82 }
83
84 private:
85 void _()
86 {
88 }
89
91 uint32_t m_idx = 0u;
94 std::vector<std::weak_ptr<cpu::Queue<Device>>> queues;
95 std::vector<std::weak_ptr<cpu::Event<Device>>> events;
96 std::mutex queuesGuard;
97
/// Return an owning std::shared_ptr to this device, obtained through
/// std::enable_shared_from_this.
///
/// Requires that this Device is already owned by a std::shared_ptr.
98 std::shared_ptr<Device> getSharedPtr()
99 {
100 return this->shared_from_this();
101 }
102
103 template<typename T_Device>
104 friend struct Queue;
105
110
111 template<typename T>
112 void pinPointer(T* const ptr, size_t bytes)
113 {
115 }
116
117 bool isNumaAware() const
118 {
120 }
121
123
/// Return a copy of the device name stored in m_properties.
124 std::string getName() const
125 {
126 return m_properties.name;
127 }
128
130
/// Return the native handle of this device, which for the CPU device is its index m_idx.
131 [[nodiscard]] uint32_t getNativeHandle() const noexcept
132 {
133 return m_idx;
134 }
135
136 friend struct internal::MakeQueue;
137
139 {
141 static_assert(
143 "Unsupported queue kind.");
144 auto thisHandle = this->getSharedPtr();
145 std::lock_guard<std::mutex> lk{queuesGuard};
146
147 constexpr bool isBlocking = kind == queueKind::blocking;
148 auto newQueue = std::make_shared<cpu::Queue<Device>>(
149 std::move(thisHandle),
150 queues.size(),
151 m_numaIdx,
152 isBlocking);
153
154 queues.emplace_back(newQueue);
155 return newQueue;
156 }
157
158 friend struct internal::MakeEvent;
159
161 {
// Creates a new cpu::Event that holds a shared handle to this device, records a
// weak reference to it in `events`, and returns the owning handle.
163 auto thisHandle = this->getSharedPtr();
// queuesGuard is the mutex taken here as well, so it also serialises access to `events`.
164 std::lock_guard<std::mutex> lk{queuesGuard};
// NOTE(review): the second constructor argument is queues.size(), although the new
// event is appended to `events` below. If that argument is meant to be the event's
// index, events.size() looks intended (possible copy-paste from the queue creation
// code) - confirm against the cpu::Event constructor.
165 auto newEvent = std::make_shared<cpu::Event<Device>>(std::move(thisHandle), queues.size());
166
167 events.emplace_back(newEvent);
168 return newEvent;
169 }
170
172
173 auto getDeviceKind() const
174 {
176 }
177
179 {
180#if ALPAKA_HAS_HWLOC
181 if(isNumaAware())
183#endif
185 }
186
187 friend struct internal::Alloc;
195 };
196 } // namespace cpu
197
// Executor support traits for the CPU device. Each specialization derives from
// std::true_type, marking the named executor as supported by cpu::Device<T_Platform>.
198 namespace trait
199
200 {
/// exec::CpuSerial is supported unconditionally (no preprocessor guard).
201 template<typename T_Platform>
202 struct IsExecutorSupportedBy::Op<exec::CpuSerial, cpu::Device<T_Platform>> : std::true_type
203 {
204 };
/// exec::CpuOmpBlocks is only declared as supported when ALPAKA_OMP is non-zero.
205#if ALPAKA_OMP
206 template<typename T_Platform>
207 struct IsExecutorSupportedBy::Op<exec::CpuOmpBlocks, cpu::Device<T_Platform>> : std::true_type
208 {
209 };
210#endif
/// exec::CpuTbbBlocks is only declared as supported when ALPAKA_TBB is non-zero.
211#if ALPAKA_TBB
212 template<typename T_Platform>
213 struct IsExecutorSupportedBy::Op<exec::CpuTbbBlocks, cpu::Device<T_Platform>> : std::true_type
214 {
215 };
216#endif
217 } // namespace trait
218
219 namespace internal
220 {
221 template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
222 struct Alloc::Op<T_Type, cpu::Device<T_Platform>, T_Extents>
223 {
224 auto operator()(cpu::Device<T_Platform>& device, T_Extents const& extents) const
225 {
227 constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
228 ALPAKA_TYPEOF(getApi(device)){},
229 ALPAKA_TYPEOF(getDeviceKind(device)){});
230 auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
231
232 auto deviceDependency = onHost::Device{device.getSharedPtr()};
233
234 T_Type* ptr = reinterpret_cast<T_Type*>(alpaka::core::alignedAlloc(alignment, memSizeInByte));
235 device.pinPointer(ptr, memSizeInByte);
236 // deviceDependency is captured to keep the device alive until the memory is deleted
237 auto deleter = [ptr, deviceDependency]() { alpaka::core::alignedFree(alignment, ptr); };
238
239 auto sharedBuffer = onHost::SharedBuffer{
240 deviceDependency,
241 ptr,
242 extents,
243 pitches,
244 std::move(deleter),
246
249 [&]()
250 {
251 std::stringstream ss;
252 ss << sharedBuffer;
253 return ss.str();
254 });
255 return sharedBuffer;
256 }
257 };
258
/// Unified-memory allocation for a CPU device.
///
/// The call is forwarded unchanged to Alloc::Op for the same element type, device
/// and extents type, so the result is whatever the regular CPU allocation returns.
259 template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
260 struct AllocUnified::Op<T_Type, cpu::Device<T_Platform>, T_Extents>
261 {
262 auto operator()(cpu::Device<T_Platform>& device, T_Extents const& extents) const
263 {
265 return Alloc::Op<T_Type, cpu::Device<T_Platform>, T_Extents>{}(device, extents);
266 }
267 };
268
/// Mapped-memory allocation for a CPU device.
///
/// The call is forwarded unchanged to Alloc::Op for the same element type, device
/// and extents type, so the result is whatever the regular CPU allocation returns.
269 template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
270 struct AllocMapped::Op<T_Type, cpu::Device<T_Platform>, T_Extents>
271 {
272 auto operator()(cpu::Device<T_Platform>& device, T_Extents const& extents) const
273 {
275 return Alloc::Op<T_Type, cpu::Device<T_Platform>, T_Extents>{}(device, extents);
276 }
277 };
278
279 template<typename T_Platform, typename T_Any>
280 struct IsDataAccessible::FirstPath<cpu::Device<T_Platform>, T_Any>
281 {
282 bool operator()(cpu::Device<T_Platform>& device, T_Any const& view) const
283 {
285 if constexpr(
286 ALPAKA_TYPEOF(getApi(view)){} == api::host
289 return true;
290 else
291 return false;
292 }
293 };
294
295 /** Set number of thread blocks and threads per block to one
296 *
297 * There is no need to emulate blocks if we have only one thread.
298 */
299 template<
300 typename T_Platform,
301 alpaka::concepts::Vector T_NumFrames,
302 alpaka::concepts::Vector T_FrameExtents,
303 alpaka::concepts::KernelBundle T_KernelBundle>
305 Op<cpu::Device<T_Platform>, FrameSpec<T_NumFrames, T_FrameExtents, exec::CpuSerial>, T_KernelBundle>
306 {
308
310 cpu::Device<T_Platform> const& device,
311 FrameSpecType const& frameSpec,
312 T_KernelBundle const& kernelBundle) const requires alpaka::concepts::CVector<T_FrameExtents>
313 {
314 alpaka::unused(device, kernelBundle);
316
317 /// @todo add shortcut to create a CVec with equal values
318 auto const allOne = ALPAKA_TYPEOF(
319 iotaCVec<typename T_FrameExtents::type, T_FrameExtents::dim()>())::template fill<1u>();
320 return ThreadSpec{allOne, allOne, frameSpec.getExecutor()};
321 }
322
324 cpu::Device<T_Platform> const& device,
325 FrameSpecType const& frameSpec,
326 T_KernelBundle const& kernelBundle) const
327 {
328 alpaka::unused(device, kernelBundle);
330 /// @todo add shortcut to create a CVec with equal values
331 auto const allOne = ALPAKA_TYPEOF(
332 iotaCVec<typename T_FrameExtents::type, T_FrameExtents::dim()>())::template fill<1u>();
333 return ThreadSpec{allOne, allOne, frameSpec.getExecutor()};
334 }
335 };
336
337 template<
338 typename T_Platform,
340 alpaka::concepts::Vector T_NumFrames,
341 alpaka::concepts::Vector T_FrameExtents,
342 alpaka::concepts::KernelBundle T_KernelBundle>
345 Op<cpu::Device<T_Platform>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
346 {
348
350 cpu::Device<T_Platform> const& device,
351 FrameSpecType const& frameSpec,
352 T_KernelBundle const& kernelBundle) const requires alpaka::concepts::CVector<T_FrameExtents>
353 {
354 alpaka::unused(device, kernelBundle);
356
357 // map the number of frames to thread blocks
358 auto numThreadBlocks = frameSpec.getNumFrames();
359 return ThreadSpec{numThreadBlocks, T_FrameExtents::template fill<1u>(), frameSpec.getExecutor()};
360 }
361
363 cpu::Device<T_Platform> const& device,
364 FrameSpecType const& frameSpec,
365 T_KernelBundle const& kernelBundle) const
366 {
367 alpaka::unused(device, kernelBundle);
369
370 // map the number of frames to thread blocks
371 auto numThreadBlocks = frameSpec.getNumFrames();
372 auto const numThreads = Vec<typename T_FrameExtents::type, T_FrameExtents::dim()>::fill(1);
373 return ThreadSpec{numThreadBlocks, numThreads, frameSpec.getExecutor()};
374 }
375 };
376
377 template<typename T_Platform>
378 struct GetDeviceProperties::Op<cpu::Device<T_Platform>>
379 {
381 {
382 return device.m_properties;
383 }
384 };
385 } // namespace internal
386} // namespace alpaka::onHost
387
388namespace alpaka::internal
389{
/// GetApi specialization for the CPU device.
///
/// The API of a device is the API of its platform: the call returns
/// alpaka::getApi applied to the device's m_platform member.
390 template<typename T_Platform>
391 struct GetApi::Op<onHost::cpu::Device<T_Platform>>
392 {
393 inline constexpr auto operator()(auto&& device) const
394 {
395 return alpaka::getApi(device.m_platform);
396 }
397 };
398} // namespace alpaka::internal
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:153
Concept to check if a type is a CVector.
Definition Vec.hpp:74
Concept to check for an executor.
Definition trait.hpp:133
Concept to check if a type is a KernelBundle.
Concept to check if a type is a queue kind.
Definition tag.hpp:76
Concept to check if a type is a vector.
Definition Vec.hpp:53
#define ALPAKA_LOG_INFO(logLvl, callable)
Write a meta data message to the output.
Definition logger.hpp:106
#define ALPAKA_LOG_FUNCTION(logLvl)
Log the entry and exit of a scope.
Definition logger.hpp:95
auto emulatedAlignedMemDescription(uint32_t alignmentInByte, T_Extents extents)
provides a memory description to create multidimensional linewise aligned memory within a one dimensi...
Definition util.hpp:100
constexpr auto simdOptimizedAlignment(auto api, alpaka::concepts::DeviceKind auto deviceKind)
Calculate the best alignment for SIMD optimized memory allocation.
Definition util.hpp:140
constexpr auto host
Definition Api.hpp:39
ALPAKA_FN_INLINE ALPAKA_FN_HOST void alignedFree(size_t alignment, auto ptr)
ALPAKA_FN_INLINE ALPAKA_FN_HOST auto alignedAlloc(size_t alignment, size_t size) -> void *
constexpr auto cpu
Definition tag.hpp:170
constexpr auto numaCpu
Definition tag.hpp:180
constexpr bool isSeqExecutor_v
Definition tag.hpp:287
alpaka internal implementations.
Definition generic.hpp:19
constexpr auto getDeviceKind(auto &&any)
Definition interface.hpp:85
constexpr uint32_t allNumaDomains
Constant to select all NUMA domains.
Definition utility.hpp:31
size_t getFreeGlobalMemBytes(uint32_t numaIdx)
Return the number of free bytes in the numa domain.
Definition utility.hpp:348
void setThreadAffinity(uint32_t numaIdx)
Set the affinity of the current thread to all cores of the NUMA domain.
Definition utility.hpp:180
void pinPointer(T *const ptr, size_t bytes, uint32_t numaIdx)
Set the NUMA domain for the memory range described by ptr and bytes.
Definition utility.hpp:234
void wait(auto &&any)
constexpr auto queue
Definition lvl.hpp:127
constexpr auto device
Definition lvl.hpp:82
constexpr auto kernel
Definition lvl.hpp:142
constexpr auto memory
Definition lvl.hpp:112
constexpr auto event
Definition lvl.hpp:97
Functionality which is usable on the host CPU controller thread.
Definition api.hpp:40
void fill(Queue< T_Device, T_QueueKind > const &queue, auto &&dest, T_Value elementValue)
Fill memory element-wise.
Definition Queue.hpp:346
std::shared_ptr< T > Handle
Definition Handle.hpp:30
auto getFreeGlobalMemBytes() -> std::size_t
Definition sysInfo.hpp:210
constexpr auto blocking
Definition tag.hpp:101
constexpr auto nonBlocking
Definition tag.hpp:113
constexpr decltype(auto) getDeviceKind(auto &&any)
Get the device type of an object.
Definition interface.hpp:52
constexpr decltype(auto) getApi(auto &&any)
Get the API an object depends on.
Definition interface.hpp:23
consteval auto iotaCVec()
Create and return a CVector of the given length with values 1, 2, ...
Definition CVec.hpp:135
constexpr decltype(auto) get(concepts::SpecializationOf< Dict > auto &t) noexcept
Definition Dict.hpp:151
STL namespace.
Strongly typed and constexpr representation of a byte-alignment of memory.
Definition Alignment.hpp:26
Description of a specific device that one can schedule kernels on.
Definition Device.hpp:32
Device/Api-agnostic description of the logical parallelism exposed to a kernel.
Definition FrameSpec.hpp:46
static constexpr T_Executor getExecutor() noexcept
Definition FrameSpec.hpp:67
constexpr NumFramesVecType const & getNumFrames() const noexcept
Definition FrameSpec.hpp:72
Life time managed buffer with contiguous data.
Backend-specific description of the actual block and thread launch shape.
bool operator!=(Device const &other) const
Definition Device.hpp:60
uint32_t getNativeHandle() const noexcept
Definition Device.hpp:131
std::vector< std::weak_ptr< cpu::Event< Device > > > events
Definition Device.hpp:95
void pinPointer(T *const ptr, size_t bytes)
Definition Device.hpp:112
Handle< cpu::Queue< Device > > makeQueue(alpaka::concepts::QueueKind auto kind)
Definition Device.hpp:138
Handle< cpu::Event< Device > > makeEvent()
Definition Device.hpp:160
Handle< T_Platform > m_platform
Definition Device.hpp:90
Device(internal::concepts::PlatformHandle auto platform, uint32_t const idx, uint32_t numaIdx)
Definition Device.hpp:35
Device & operator=(Device &&)=delete
std::vector< std::weak_ptr< cpu::Queue< Device > > > queues
Definition Device.hpp:94
Device(Device &&)=delete
DeviceProperties m_properties
Definition Device.hpp:93
std::shared_ptr< Device > getSharedPtr()
Definition Device.hpp:98
auto getFreeGlobalMemBytes() const
Definition Device.hpp:178
std::string getName() const
Definition Device.hpp:124
void setThreadAffinity() const
Definition Device.hpp:106
Device(Device const &)=delete
Device & operator=(Device const &)=delete
bool operator==(Device const &other) const
Definition Device.hpp:55
auto operator()(cpu::Device< T_Platform > const &device, FrameSpecType const &frameSpec, T_KernelBundle const &kernelBundle) const
Definition Device.hpp:349
auto operator()(cpu::Device< T_Platform > const &device, FrameSpecType const &frameSpec, T_KernelBundle const &kernelBundle) const
Definition Device.hpp:362
auto operator()(cpu::Device< T_Platform > const &device, FrameSpecType const &frameSpec, T_KernelBundle const &kernelBundle) const
Definition Device.hpp:323
auto operator()(cpu::Device< T_Platform > const &device, FrameSpecType const &frameSpec, T_KernelBundle const &kernelBundle) const
Definition Device.hpp:309
auto operator()(cpu::Device< T_Platform > &device, T_Extents const &extents) const
Definition Device.hpp:272
auto operator()(cpu::Device< T_Platform > &device, T_Extents const &extents) const
Definition Device.hpp:262
auto operator()(cpu::Device< T_Platform > &device, T_Extents const &extents) const
Definition Device.hpp:224
DeviceProperties operator()(cpu::Device< T_Platform > const &device) const
Definition Device.hpp:380
bool operator()(cpu::Device< T_Platform > &device, T_Any const &view) const
Definition Device.hpp:282