alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
Device.hpp
Go to the documentation of this file.
1/* Copyright 2024 René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
9#include "alpaka/api/util.hpp"
13
14#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
15
16
17# include <cstdint>
18# include <memory>
19# include <mutex>
20# include <sstream>
21# include <vector>
22
23namespace alpaka::onHost
24{
25 namespace unifiedCudaHip
26 {
27 template<typename T_Platform>
28 struct Device : std::enable_shared_from_this<Device<T_Platform>>
29 {
30 using ApiInterface = typename T_Platform::ApiInterface;
31
32 public:
33 Device(internal::concepts::PlatformHandle auto platform, uint32_t const idx)
34 : m_platform(std::move(platform))
35 , m_idx(idx)
36 , m_properties{internal::getDeviceProperties(*m_platform.get(), m_idx)}
37 {
38 ALPAKA_LOG_FUNCTION(onHost::logger::device);
39 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::setDevice(idx));
40 }
41
42 ~Device()
43 {
44 ALPAKA_LOG_FUNCTION(onHost::logger::device);
45 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::setDevice(getNativeHandle()));
46 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::deviceSynchronize());
47 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::deviceReset());
48 }
49
50 Device(Device const&) = delete;
51 Device& operator=(Device const&) = delete;
52
53 Device(Device&&) = delete;
54 Device& operator=(Device&&) = delete;
55
56 bool operator==(Device const& other) const
57 {
58 return m_idx == other.m_idx;
59 }
60
61 bool operator!=(Device const& other) const
62 {
63 return m_idx != other.m_idx;
64 }
65
66 void wait()
67 {
68 ALPAKA_LOG_FUNCTION(onHost::logger::device);
69 // Make sure this device is the current thread device (getNativeHandle returns device index)
70 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::setDevice(getNativeHandle()));
71 // Wait for all work queued on this device to finish
72 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::deviceSynchronize());
73 }
74
75 private:
76 void _()
77 {
78 static_assert(internal::concepts::Device<Device>);
79 }
80
81 Handle<T_Platform> m_platform;
82 uint32_t m_idx = 0u;
83 DeviceProperties m_properties;
84 std::vector<std::weak_ptr<unifiedCudaHip::Queue<Device>>> queues;
85 std::vector<std::weak_ptr<unifiedCudaHip::Event<Device>>> events;
86 std::mutex m_writeGuard;
87
88 std::shared_ptr<Device> getSharedPtr()
89 {
90 return this->shared_from_this();
91 }
92
93 friend struct alpaka::internal::GetName;
94
95 std::string getName() const
96 {
97 return m_properties.name;
98 }
99
100 friend struct onHost::internal::GetNativeHandle;
101
102 [[nodiscard]] int getNativeHandle() const noexcept
103 {
104 return m_idx;
105 }
106
107 friend struct onHost::internal::MakeQueue;
108
109 Handle<unifiedCudaHip::Queue<Device>> makeQueue(alpaka::concepts::QueueKind auto kind)
110 {
111 ALPAKA_LOG_FUNCTION(onHost::logger::queue);
112 static_assert(
113 kind == queueKind::blocking || kind == queueKind::nonBlocking,
114 "Unsupported queue kind.");
115 auto thisHandle = this->getSharedPtr();
116 std::lock_guard<std::mutex> lk{m_writeGuard};
117
118 constexpr bool isBlocking = kind == queueKind::blocking;
119 auto newQueue = std::make_shared<unifiedCudaHip::Queue<Device>>(
120 std::move(thisHandle),
121 queues.size(),
122 isBlocking);
123
124 queues.emplace_back(newQueue);
125 return newQueue;
126 }
127
128 friend struct onHost::internal::MakeEvent;
129
130 Handle<unifiedCudaHip::Event<Device>> makeEvent()
131 {
132 ALPAKA_LOG_FUNCTION(onHost::logger::event);
133 auto thisHandle = this->getSharedPtr();
134 std::lock_guard<std::mutex> lk{m_writeGuard};
135 auto newEvent = std::make_shared<unifiedCudaHip::Event<Device>>(std::move(thisHandle), events.size());
136
137 events.emplace_back(newEvent);
138 return newEvent;
139 }
140
141 friend struct alpaka::internal::GetDeviceType;
142
143 auto getDeviceKind() const
144 {
145 return alpaka::internal::getDeviceKind(*m_platform.get());
146 }
147
148 auto getFreeGlobalMemBytes() const
149 {
150 std::size_t freeGlobalMemBytes(0u);
151 std::size_t globalMemCapacityBytes(0u);
152 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
153 ApiInterface,
154 ApiInterface::memGetInfo(&freeGlobalMemBytes, &globalMemCapacityBytes));
155 return freeGlobalMemBytes;
156 }
157
158 friend struct onHost::internal::Alloc;
159 friend struct onHost::internal::AllocDeferred;
160 friend struct onHost::internal::AllocUnified;
161 friend struct onHost::internal::AllocMapped;
162 friend struct alpaka::internal::GetApi;
163 friend struct internal::GetDeviceProperties;
164 friend struct internal::GetFreeGlobalMemBytes;
165 friend struct internal::AdjustThreadSpec;
166 friend struct onHost::internal::IsDataAccessible;
167 };
168 } // namespace unifiedCudaHip
169} // namespace alpaka::onHost
170
171namespace alpaka::internal
172{
173 template<typename T_Platform>
174 struct GetApi::Op<onHost::unifiedCudaHip::Device<T_Platform>>
175 {
176 inline constexpr auto operator()(auto&& device) const
177 {
178 return getApi(device.m_platform);
179 }
180 };
181} // namespace alpaka::internal
182
183namespace alpaka::onHost
184{
185 namespace internal
186 {
187 template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
188 struct Alloc::Op<T_Type, unifiedCudaHip::Device<T_Platform>, T_Extents>
189 {
190 auto operator()(unifiedCudaHip::Device<T_Platform>& device, T_Extents const& extents) const
191 {
193 using ApiInterface = typename T_Platform::ApiInterface;
194
195 T_Type* ptr = nullptr;
196 auto pitches = typename T_Extents::UniVec{sizeof(T_Type)};
197
198 using Idx = typename T_Extents::type;
199
200 constexpr auto dim = T_Extents::dim();
201 if constexpr(dim == 1u)
202 {
203 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
204 ApiInterface,
205 ApiInterface::malloc((void**) &ptr, static_cast<std::size_t>(extents.x()) * sizeof(T_Type)));
206 }
207 else if constexpr(dim == 2u)
208 {
209 size_t rowPitchInBytes = 0u;
210 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
211 ApiInterface,
212 ApiInterface::mallocPitch(
213 (void**) &ptr,
214 &rowPitchInBytes,
215 static_cast<std::size_t>(extents.x()) * sizeof(T_Type),
216 static_cast<std::size_t>(extents.y())));
217
218 pitches = alpaka::calculatePitches<T_Type>(extents, static_cast<Idx>(rowPitchInBytes));
219 }
220 else if constexpr(dim >= 3u)
221 {
222 auto const extentsNoXY = pCast<size_t>(extents.eraseBack().eraseBack());
223 typename ApiInterface::Extent_t const extentVal = ApiInterface::makeExtent(
224 static_cast<std::size_t>(extents.x()) * sizeof(T_Type),
225 static_cast<std::size_t>(extents.y()),
226 pCast<std::size_t>(extentsNoXY).product());
227 typename ApiInterface::PitchedPtr_t pitchedPtrVal;
228 pitchedPtrVal.ptr = nullptr;
229 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::malloc3D(&pitchedPtrVal, extentVal));
230
231 ptr = reinterpret_cast<T_Type*>(pitchedPtrVal.ptr);
232 pitches = alpaka::calculatePitches<T_Type>(extents, static_cast<Idx>(pitchedPtrVal.pitch));
233 }
234
235 auto deviceDependency = onHost::Device{device.getSharedPtr()};
236
237 auto deleter = [ptr, deviceDependency]()
238 { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterface, ApiInterface::free(toVoidPtr(ptr))); };
239
240 /** Each CUDA/HIP allocation is aligned to at least 128 byte but typically to 256byte
241 *
242 * @todo check if this value can be derived from the device properties
243 * @todo validate if memory is always aligned to 256 byte
244 */
245 constexpr uint32_t alignment = 128u;
246
247 auto buffer = onHost::SharedBuffer{
248 deviceDependency,
249 ptr,
250 extents,
251 pitches,
252 std::move(deleter),
253 Alignment<alignment>{}};
254 return buffer;
255 }
256 };
257
258 template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
259 struct AllocUnified::Op<T_Type, unifiedCudaHip::Device<T_Platform>, T_Extents>
260 {
261 auto operator()(unifiedCudaHip::Device<T_Platform>& device, T_Extents const& extents) const
262 {
264 using ApiInterface = typename T_Platform::ApiInterface;
265
266 /** Each CUDA/HIP allocation is aligned to at least 128 byte but typically to 256byte
267 *
268 * @todo check if this value can be derived from the device properties
269 * @todo validate if memory is always aligned to 256 byte
270 */
271 constexpr uint32_t alignment = 128u;
272 auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
273
274 auto deviceDependency = onHost::Device{device.getSharedPtr()};
275
276 T_Type* ptr = nullptr;
277 // HIP is failing if zero byte unified memory is allocated, therefore we do not call the allocation
278 // method for HIP
279 bool isHipZeroByteAllocation = memSizeInByte == 0 && getApi(device) == api::hip;
280 if(!isHipZeroByteAllocation)
281 {
282 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
283 ApiInterface,
284 ApiInterface::mallocManaged((void**) &ptr, memSizeInByte));
285 }
286
287 auto deleter = [ptr, deviceDependency]()
288 { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterface, ApiInterface::free(toVoidPtr(ptr))); };
289
290 auto sharedBuffer = onHost::SharedBuffer{
291 deviceDependency,
292 ptr,
293 extents,
294 pitches,
295 std::move(deleter),
296 Alignment<alignment>{}};
297 return sharedBuffer;
298 }
299 };
300
301 template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
302 struct AllocMapped::Op<T_Type, unifiedCudaHip::Device<T_Platform>, T_Extents>
303 {
304 auto operator()(unifiedCudaHip::Device<T_Platform>& device, T_Extents const& extents) const
305 {
307 using ApiInterface = typename T_Platform::ApiInterface;
308
309 /** Each CUDA/HIP allocation is aligned to at least 128 byte but typically to 256byte
310 *
311 * @todo check if this value can be derived from the device properties
312 * @todo validate if memory is always aligned to 256 byte
313 */
314 constexpr uint32_t alignment = 128u;
315 auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);
316
317 auto deviceDependency = onHost::Device{device.getSharedPtr()};
318
319 T_Type* ptr = nullptr;
320 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
321 ApiInterface,
322 ApiInterface::hostMalloc(
323 (void**) &ptr,
324 memSizeInByte,
325 ApiInterface::hostMallocMapped | ApiInterface::hostMallocPortable));
326
327 auto deleter = [ptr, deviceDependency]()
328 { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterface, ApiInterface::hostFree(toVoidPtr(ptr))); };
329
330 auto sharedBuffer = onHost::SharedBuffer{
331 deviceDependency,
332 ptr,
333 extents,
334 pitches,
335 std::move(deleter),
336 Alignment<alignment>{}};
337 return sharedBuffer;
338 }
339 };
340
341 template<typename T_Platform, typename T_Any>
342 struct IsDataAccessible::FirstPath<unifiedCudaHip::Device<T_Platform>, T_Any>
343 {
344 bool operator()(unifiedCudaHip::Device<T_Platform>& device, T_Any const& view) const
345 {
347 using ApiInterface = typename T_Platform::ApiInterface;
348 typename ApiInterface::PointerAttr_t ptrAttributes;
349 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
350 ApiInterface,
351 ApiInterface::pointerGetAttributes(&ptrAttributes, onHost::data(view)));
352
353 auto deviceHandle = device.getNativeHandle();
354
355 // pointer is owned by the device itself
356 if(deviceHandle == ptrAttributes.device)
357 return true;
358 if(ptrAttributes.type == ApiInterface::memoryTypeManaged)
359 return true;
360
361 return false;
362 }
363 };
364
365 template<typename T_Platform>
366 struct GetDeviceProperties::Op<unifiedCudaHip::Device<T_Platform>>
367 {
368 DeviceProperties operator()(unifiedCudaHip::Device<T_Platform> const& device) const
369 {
370 return device.m_properties;
371 }
372 };
373
374 template<
375 typename T_Platform,
376 alpaka::concepts::UnifiedCudaHipExecutor T_Executor,
377 alpaka::concepts::Vector T_NumFrames,
378 alpaka::concepts::Vector T_FrameExtents,
379 alpaka::concepts::KernelBundle T_KernelBundle>
380 struct AdjustThreadSpec::
381 Op<unifiedCudaHip::Device<T_Platform>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
382 {
383 using FrameSpecType = FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>;
384
385 auto operator()(
386 unifiedCudaHip::Device<T_Platform> const&,
387 FrameSpecType const& frameSpec,
388 T_KernelBundle const&) const requires alpaka::concepts::CVector<T_FrameExtents>
389 {
391 auto numThreads = frameSpec.getFrameExtents();
392
393 /** All modern NVIDIA and AMD GPUs support at least 1014 threads.
394 * @attention: Due to lmem, shared memory or register usage the limit could be lower. In this case the
395 * kernel call will vail at runtime with invalid kernel configuration. We can not avoid this at compile
396 * time.
397 */
398 constexpr typename ALPAKA_TYPEOF(numThreads)::type hardwareLimitThreadsPerBlock = 1024u;
399
400 constexpr auto result = api::util::adjustToLimit<hardwareLimitThreadsPerBlock, 0u, 1u>(numThreads);
401 return ThreadSpec{frameSpec.getNumFrames(), result, frameSpec.getExecutor()};
402 }
403
404 auto operator()(
405 unifiedCudaHip::Device<T_Platform> const& device,
406 FrameSpecType const& frameSpec,
407 T_KernelBundle const&) const
408 {
410 auto numThreadsPerBlocks = frameSpec.getFrameExtents();
411 auto const maxThreadsPerBlock = device.m_properties.maxThreadsPerBlock;
412
413 auto result = api::util::adjustToLimit(numThreadsPerBlocks, maxThreadsPerBlock);
414 return ThreadSpec{frameSpec.getNumFrames(), result, frameSpec.getExecutor()};
415 }
416 };
417 } // namespace internal
418} // namespace alpaka::onHost
419
420#endif
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:153
#define ALPAKA_LOG_FUNCTION(logLvl)
Log the entry and exit of a scope.
Definition logger.hpp:95
consteval auto adjustToLimit(concepts::CVector auto const input)
adjust the input vector to a given limit by halving all components until the product of these is b...
Definition util.hpp:63
auto emulatedAlignedMemDescription(uint32_t alignmentInByte, T_Extents extents)
provides a memory description to create multidimensional linewise aligned memory within a one dimensi...
Definition util.hpp:100
constexpr auto hip
Definition Api.hpp:41
constexpr bool operator!=(alpaka::concepts::Api auto lhs, alpaka::concepts::Api auto rhs)
Definition api.hpp:53
constexpr bool operator==(alpaka::concepts::Api auto lhs, alpaka::concepts::Api auto rhs)
Definition api.hpp:48
constexpr Device device
Definition scope.hpp:70
constexpr auto device
Definition lvl.hpp:82
constexpr auto memory
Definition lvl.hpp:112
Functionality which is usable on the host CPU controller thread.
Definition api.hpp:40
auto getNativeHandle(auto const &handle)
Get the native handle of a handle.
SharedBuffer(T_Any const &, T_Type *, T_UserExtents const &, T_UserPitches const &, std::invocable<> auto, T_MemAlignment const) -> SharedBuffer< ALPAKA_TYPEOF(getApi(std::declval< T_Any >())), T_Type, typename T_UserPitches::UniVec, T_MemAlignment >
decltype(auto) data(auto &&any)
pointer to data of an object
std::convertible_to< std::string > auto getName(auto &&any)
Runtime name for a given object.
Definition interface.hpp:96
ThreadSpec(T_NumBlocks const &, T_NumThreads const &) -> ThreadSpec< alpaka::trait::getVec_t< T_NumBlocks >, alpaka::trait::getVec_t< T_NumThreads > >
Device(Handle< T_Device > &&) -> Device< ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval< T_Device >())), ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval< T_Device >()))>
void wait(alpaka::concepts::HasGet auto &handle)
wait for all work to be finished
constexpr auto calculatePitches(T_Vec const &extent, typename T_Vec::type const &rowPitchBytes)
Calculate the pitches purely from the extents.
auto * toVoidPtr(T inPtr)
Cast a pointer that may or may not point to volatile memory to a (void*) or (void const*).
Definition util.hpp:34
constexpr decltype(auto) getDeviceKind(auto &&any)
Get the device type of an object.
Definition interface.hpp:52
constexpr decltype(auto) getApi(auto &&any)
Get the API an object depends on.
Definition interface.hpp:23
constexpr decltype(auto) get(concepts::SpecializationOf< Dict > auto &t) noexcept
Definition Dict.hpp:151
constexpr decltype(auto) pCast(auto &&input)
Performs a static_cast on the storage type of combined data type.
Definition cast.hpp:48