14#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
// Device type of the unified CUDA/HIP backend, parameterised on the platform type.
// It derives from std::enable_shared_from_this so that shared_from_this() can be
// called on it (see getSharedPtr() in this struct).
27 template<
typename T_Platform>
28 struct Device : std::enable_shared_from_this<Device<T_Platform>>
// Runtime API wrapper type supplied by the platform; the runtime calls visible in
// this struct (setDevice, deviceSynchronize, deviceReset, memGetInfo) go through it.
30 using ApiInterface =
typename T_Platform::ApiInterface;
// Constructor: moves the platform handle into m_platform and calls
// ApiInterface::setDevice(idx), checked by the RT_CHECK macro.
// NOTE(review): the source lines between the initializer and the setDevice call are
// missing from this excerpt, so any further member initialisation is not visible here.
33 Device(internal::concepts::PlatformHandle
auto platform, uint32_t
const idx)
34 : m_platform(std::move(platform))
39 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::setDevice(idx));
// Sequence: select this device by its native handle, synchronize it, then reset it.
// NOTE(review): the enclosing function header is not part of this excerpt --
// presumably the destructor; confirm.
// NOTE(review): these calls use the plain RT_CHECK macro while the deleters later in
// this listing use the _NOEXCEPT variant; if this is a destructor, confirm that
// RT_CHECK cannot throw here.
45 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::setDevice(
getNativeHandle()));
46 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::deviceSynchronize());
47 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::deviceReset());
// Copy construction, copy assignment and move assignment are explicitly deleted.
// NOTE(review): a move-constructor declaration, if one exists, is not visible in
// this excerpt.
50 Device(Device
const&) =
delete;
51 Device& operator=(Device
const&) =
delete;
54 Device& operator=(Device&&) =
delete;
// Both comparisons look only at the device index m_idx.
// NOTE(review): the function headers are missing from this excerpt -- presumably
// the bodies of operator== and operator!=; confirm.
58 return m_idx == other.m_idx;
63 return m_idx != other.m_idx;
// Selects this device by its native handle, then calls deviceSynchronize.
// NOTE(review): the enclosing function header is not visible in this excerpt --
// presumably the device-wide wait; confirm.
70 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::setDevice(
getNativeHandle()));
72 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::deviceSynchronize());
// Compile-time check that this type satisfies the internal Device concept.
78 static_assert(internal::concepts::Device<Device>);
// Handle to the platform, set from the constructor argument.
81 Handle<T_Platform> m_platform;
// Device properties; this listing reads .name and .maxThreadsPerBlock from it.
83 DeviceProperties m_properties;
// Non-owning (weak_ptr) records of the queues and events created by makeQueue()
// and makeEvent().
84 std::vector<std::weak_ptr<unifiedCudaHip::Queue<Device>>> queues;
85 std::vector<std::weak_ptr<unifiedCudaHip::Event<Device>>> events;
// Mutex locked in makeQueue() and makeEvent() around updates of the two vectors above.
86 std::mutex m_writeGuard;
// Returns a shared_ptr to this device obtained from shared_from_this().
// NOTE(review): shared_from_this() requires the object to already be owned by a
// std::shared_ptr; how Device instances are created is not visible in this excerpt.
88 std::shared_ptr<Device> getSharedPtr()
90 return this->shared_from_this();
// Friend grant for the GetName trait.
93 friend struct alpaka::internal::GetName;
// Returns the name stored in the device properties.
// NOTE(review): the function header is missing from this excerpt -- presumably
// getName(); confirm.
97 return m_properties.name;
// Friend grant for the GetNativeHandle trait.
100 friend struct onHost::internal::GetNativeHandle;
// Friend grant for the MakeQueue trait.
107 friend struct onHost::internal::MakeQueue;
// Creates a queue of the requested kind. The new queue receives a shared handle to
// this device, and a weak reference to the queue is appended to `queues`.
109 Handle<unifiedCudaHip::Queue<Device>> makeQueue(alpaka::concepts::QueueKind
auto kind)
// Only blocking and non-blocking kinds are accepted.
// NOTE(review): the statement these two lines belong to is cut off in this
// excerpt -- presumably a static_assert; confirm.
113 kind == queueKind::blocking || kind == queueKind::nonBlocking,
114 "Unsupported queue kind.");
// The shared handle is obtained before m_writeGuard is locked.
115 auto thisHandle = this->getSharedPtr();
116 std::lock_guard<std::mutex> lk{m_writeGuard};
118 constexpr bool isBlocking = kind == queueKind::blocking;
// NOTE(review): the remaining constructor arguments are missing from this excerpt.
119 auto newQueue = std::make_shared<unifiedCudaHip::Queue<Device>>(
120 std::move(thisHandle),
// Stored as weak_ptr, so the registry does not keep the queue alive.
// NOTE(review): nothing in this excerpt removes expired entries from `queues`.
124 queues.emplace_back(newQueue);
// Friend grant for the MakeEvent trait.
128 friend struct onHost::internal::MakeEvent;
// Creates an event bound to this device and appends a weak reference to it to `events`.
130 Handle<unifiedCudaHip::Event<Device>> makeEvent()
// The shared handle is obtained before m_writeGuard is locked.
133 auto thisHandle = this->getSharedPtr();
134 std::lock_guard<std::mutex> lk{m_writeGuard};
// The current size of `events` is passed as the second constructor argument.
// NOTE(review): presumably this serves as the event's index/id -- confirm against
// the Event constructor.
135 auto newEvent = std::make_shared<unifiedCudaHip::Event<Device>>(std::move(thisHandle), events.size());
// Stored as weak_ptr, so the registry does not keep the event alive.
137 events.emplace_back(newEvent);
// Friend grant for the GetDeviceType trait.
141 friend struct alpaka::internal::GetDeviceType;
// Queries free and total memory through ApiInterface::memGetInfo and returns only
// the free byte count; the capacity value is fetched but not used in the visible lines.
// NOTE(review): the function header and the first macro argument are missing from
// this excerpt -- presumably this backs internal::GetFreeGlobalMemBytes; confirm.
150 std::size_t freeGlobalMemBytes(0u);
151 std::size_t globalMemCapacityBytes(0u);
152 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
154 ApiInterface::memGetInfo(&freeGlobalMemBytes, &globalMemCapacityBytes));
155 return freeGlobalMemBytes;
// Friend grants for the trait functors that operate on this device type.
// NOTE(review): access specifiers are not visible in this excerpt; presumably these
// grants let the Op specialisations read non-public members such as m_properties.
158 friend struct onHost::internal::Alloc;
159 friend struct onHost::internal::AllocDeferred;
160 friend struct onHost::internal::AllocUnified;
161 friend struct onHost::internal::AllocMapped;
162 friend struct alpaka::internal::GetApi;
163 friend struct internal::GetDeviceProperties;
164 friend struct internal::GetFreeGlobalMemBytes;
165 friend struct internal::AdjustThreadSpec;
166 friend struct onHost::internal::IsDataAccessible;
// GetApi specialisation for the unified CUDA/HIP device type.
// NOTE(review): the body of operator() is missing from this excerpt, so the returned
// value is not visible here.
173 template<
typename T_Platform>
174 struct GetApi::Op<onHost::unifiedCudaHip::Device<T_Platform>>
176 inline constexpr auto operator()(
auto&& device)
const
// Alloc specialisation for the unified CUDA/HIP device type.
// The allocation call is chosen at compile time from the dimensionality of T_Extents:
// malloc for 1D, mallocPitch for 2D, malloc3D for 3D and higher.
187 template<
typename T_Type,
typename T_Platform, alpaka::concepts::Vector T_Extents>
188 struct Alloc::Op<T_Type, unifiedCudaHip::
Device<T_Platform>, T_Extents>
190 auto operator()(unifiedCudaHip::Device<T_Platform>& device, T_Extents
const& extents)
const
193 using ApiInterface =
typename T_Platform::ApiInterface;
195 T_Type* ptr =
nullptr;
// Pitch vector, constructed from sizeof(T_Type).
// NOTE(review): where `pitches` is updated afterwards is not visible in this excerpt.
196 auto pitches =
typename T_Extents::UniVec{
sizeof(T_Type)};
198 using Idx =
typename T_Extents::type;
200 constexpr auto dim = T_Extents::dim();
// 1D: allocation of extents.x() * sizeof(T_Type) bytes.
201 if constexpr(dim == 1u)
203 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
205 ApiInterface::malloc((
void**) &ptr,
static_cast<std::size_t
>(extents.x()) *
sizeof(T_Type)));
// 2D: mallocPitch with the row width in bytes and extents.y() as the row count.
// NOTE(review): the leading mallocPitch arguments are missing from this excerpt.
207 else if constexpr(dim == 2u)
209 size_t rowPitchInBytes = 0u;
210 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
212 ApiInterface::mallocPitch(
215 static_cast<std::size_t
>(extents.x()) *
sizeof(T_Type),
216 static_cast<std::size_t
>(extents.y())));
// 3D and higher: malloc3D with an extent built from the row width in bytes and
// extents.y(). extentsNoXY holds the extents with the last two components erased.
// NOTE(review): the last makeExtent argument is missing from this excerpt.
220 else if constexpr(dim >= 3u)
222 auto const extentsNoXY =
pCast<size_t>(extents.eraseBack().eraseBack());
223 typename ApiInterface::Extent_t
const extentVal = ApiInterface::makeExtent(
224 static_cast<std::size_t
>(extents.x()) *
sizeof(T_Type),
225 static_cast<std::size_t
>(extents.y()),
227 typename ApiInterface::PitchedPtr_t pitchedPtrVal;
228 pitchedPtrVal.ptr =
nullptr;
229 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::malloc3D(&pitchedPtrVal, extentVal));
231 ptr =
reinterpret_cast<T_Type*
>(pitchedPtrVal.ptr);
// The deleter captures the pointer and deviceDependency by value and releases the
// memory with ApiInterface::free, checked by the _NOEXCEPT macro variant.
// NOTE(review): deviceDependency is defined on a line missing from this excerpt.
237 auto deleter = [ptr, deviceDependency]()
238 { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterface, ApiInterface::free(
toVoidPtr(ptr))); };
// Alignment passed on as Alignment<128> in the returned object.
// NOTE(review): the start of the return expression is missing from this excerpt.
245 constexpr uint32_t alignment = 128u;
253 Alignment<alignment>{}};
// Allocation functor that obtains memory through ApiInterface::mallocManaged.
// NOTE(review): the `struct ...::Op<...>` header line is missing from this excerpt --
// presumably the AllocUnified specialisation; confirm.
258 template<
typename T_Type,
typename T_Platform, alpaka::concepts::Vector T_Extents>
261 auto operator()(unifiedCudaHip::Device<T_Platform>& device, T_Extents
const& extents)
const
264 using ApiInterface =
typename T_Platform::ApiInterface;
271 constexpr uint32_t alignment = 128u;
276 T_Type* ptr =
nullptr;
// The runtime call is skipped when the request is zero bytes and the API is HIP;
// ptr keeps its nullptr initial value in that case.
// NOTE(review): memSizeInByte is computed on a line missing from this excerpt.
279 bool isHipZeroByteAllocation = memSizeInByte == 0 &&
getApi(device) ==
api::hip;
280 if(!isHipZeroByteAllocation)
282 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
284 ApiInterface::mallocManaged((
void**) &ptr, memSizeInByte));
// The deleter captures the pointer and deviceDependency by value and releases the
// memory with ApiInterface::free, checked by the _NOEXCEPT macro variant.
// NOTE(review): in the skipped zero-byte case free() receives nullptr -- confirm the
// runtime accepts that.
287 auto deleter = [ptr, deviceDependency]()
288 { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterface, ApiInterface::free(
toVoidPtr(ptr))); };
// NOTE(review): the start of the return expression is missing from this excerpt.
296 Alignment<alignment>{}};
// Allocation functor that obtains host memory through ApiInterface::hostMalloc with
// the hostMallocMapped and hostMallocPortable flags combined.
// NOTE(review): the `struct ...::Op<...>` header line is missing from this excerpt --
// presumably the AllocMapped specialisation; confirm.
301 template<
typename T_Type,
typename T_Platform, alpaka::concepts::Vector T_Extents>
304 auto operator()(unifiedCudaHip::Device<T_Platform>& device, T_Extents
const& extents)
const
307 using ApiInterface =
typename T_Platform::ApiInterface;
314 constexpr uint32_t alignment = 128u;
319 T_Type* ptr =
nullptr;
// NOTE(review): the pointer and size arguments of hostMalloc are missing from this
// excerpt; only the flags argument is visible.
320 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
322 ApiInterface::hostMalloc(
325 ApiInterface::hostMallocMapped | ApiInterface::hostMallocPortable));
// The deleter captures the pointer and deviceDependency by value and releases the
// memory with ApiInterface::hostFree, checked by the _NOEXCEPT macro variant.
327 auto deleter = [ptr, deviceDependency]()
328 { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterface, ApiInterface::hostFree(
toVoidPtr(ptr))); };
// NOTE(review): the start of the return expression is missing from this excerpt.
336 Alignment<alignment>{}};
// Functor returning bool for a (device, view) pair. It queries the runtime's pointer
// attributes for the view's data pointer and inspects them.
// NOTE(review): the `struct ...::Op<...>` header line is missing from this excerpt --
// presumably the IsDataAccessible specialisation; confirm.
341 template<
typename T_Platform,
typename T_Any>
344 bool operator()(unifiedCudaHip::Device<T_Platform>& device, T_Any
const& view)
const
347 using ApiInterface =
typename T_Platform::ApiInterface;
348 typename ApiInterface::PointerAttr_t ptrAttributes;
349 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
351 ApiInterface::pointerGetAttributes(&ptrAttributes,
onHost::data(view)));
353 auto deviceHandle =
device.getNativeHandle();
// Two checks are visible: the pointer's device equals this device's native handle,
// and the pointer's memory type equals memoryTypeManaged.
// NOTE(review): the values returned by each branch are missing from this excerpt.
356 if(deviceHandle == ptrAttributes.device)
358 if(ptrAttributes.type == ApiInterface::memoryTypeManaged)
// Returns the device's stored properties (m_properties) by value.
// NOTE(review): the `struct ...::Op<...>` header line is missing from this excerpt --
// presumably a GetDeviceProperties specialisation; confirm.
365 template<
typename T_Platform>
368 DeviceProperties
operator()(unifiedCudaHip::Device<T_Platform>
const& device)
const
370 return device.m_properties;
// Op specialisation converting a FrameSpec into a ThreadSpec for the unified CUDA/HIP
// device. Two operator() overloads are visible; both return a ThreadSpec built from
// the frame count, a `result` thread extent and the executor.
// NOTE(review): the start of the template header and the enclosing trait name are
// missing from this excerpt -- presumably AdjustThreadSpec; confirm. `result` is
// computed on lines that are also missing.
376 alpaka::concepts::UnifiedCudaHipExecutor T_Executor,
377 alpaka::concepts::Vector T_NumFrames,
378 alpaka::concepts::Vector T_FrameExtents,
379 alpaka::concepts::KernelBundle T_KernelBundle>
381 Op<unifiedCudaHip::Device<T_Platform>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
383 using FrameSpecType = FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>;
// Overload constrained to compile-time frame extents (CVector). The device and
// kernel-bundle parameters are unnamed, i.e. unused here, and the limit is the
// constant 1024 rather than a value read from the device.
386 unifiedCudaHip::Device<T_Platform>
const&,
387 FrameSpecType
const& frameSpec,
388 T_KernelBundle
const&)
const requires alpaka::concepts::CVector<T_FrameExtents>
391 auto numThreads = frameSpec.getFrameExtents();
398 constexpr typename ALPAKA_TYPEOF(numThreads)::type hardwareLimitThreadsPerBlock = 1024u;
401 return ThreadSpec{frameSpec.getNumFrames(), result, frameSpec.getExecutor()};
// Unconstrained overload: reads the limit from the device properties
// (maxThreadsPerBlock) at run time.
405 unifiedCudaHip::Device<T_Platform>
const& device,
406 FrameSpecType
const& frameSpec,
407 T_KernelBundle
const&)
const
410 auto numThreadsPerBlocks = frameSpec.getFrameExtents();
411 auto const maxThreadsPerBlock =
device.m_properties.maxThreadsPerBlock;
414 return ThreadSpec{frameSpec.getNumFrames(), result, frameSpec.getExecutor()};
#define ALPAKA_TYPEOF(...)
Get the type of instance.
#define ALPAKA_LOG_FUNCTION(logLvl)
Log the entry and exit of a scope.
consteval auto adjustToLimit(concepts::CVector auto const input)
Adjust the input vector to a given limit by halving all components until the product of these is b...
auto emulatedAlignedMemDescription(uint32_t alignmentInByte, T_Extents extents)
provides a memory description to create multidimensional linewise aligned memory within a one dimensi...
constexpr bool operator!=(alpaka::concepts::Api auto lhs, alpaka::concepts::Api auto rhs)
constexpr bool operator==(alpaka::concepts::Api auto lhs, alpaka::concepts::Api auto rhs)
alpaka internal implementations.
constexpr auto getApi(auto &&any)
constexpr auto getDeviceKind(auto &&any)
size_t getFreeGlobalMemBytes(uint32_t numaIdx)
Return the number of free bytes in the numa domain.
DeviceProperties getDeviceProperties(auto const &platform, uint32_t idx)
Functionality which is usable on the host CPU controller thread.
auto getNativeHandle(auto const &handle)
Get the native handle of a handle.
SharedBuffer(T_Any const &, T_Type *, T_UserExtents const &, T_UserPitches const &, std::invocable<> auto, T_MemAlignment const) -> SharedBuffer< ALPAKA_TYPEOF(getApi(std::declval< T_Any >())), T_Type, typename T_UserPitches::UniVec, T_MemAlignment >
decltype(auto) data(auto &&any)
pointer to data of an object
std::convertible_to< std::string > auto getName(auto &&any)
Runtime name for a given object.
ThreadSpec(T_NumBlocks const &, T_NumThreads const &) -> ThreadSpec< alpaka::trait::getVec_t< T_NumBlocks >, alpaka::trait::getVec_t< T_NumThreads > >
Device(Handle< T_Device > &&) -> Device< ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval< T_Device >())), ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval< T_Device >()))>
void wait(alpaka::concepts::HasGet auto &handle)
wait for all work to be finished
constexpr auto calculatePitches(T_Vec const &extent, typename T_Vec::type const &rowPitchBytes)
Calculate the pitches purely from the extents.
auto * toVoidPtr(T inPtr)
Cast a pointer that may or may not point to volatile memory to a (void*) or (void const*).
constexpr decltype(auto) getDeviceKind(auto &&any)
Get the device type of an object.
constexpr decltype(auto) getApi(auto &&any)
Get the API an object depends on.
constexpr decltype(auto) get(concepts::SpecializationOf< Dict > auto &t) noexcept
constexpr decltype(auto) pCast(auto &&input)
Performs a static_cast on the storage type of combined data type.
constexpr auto operator()(auto &&any) const
void operator()(T_Any &any, T_Extents const &) const
void operator()(T_Any &any, T_Extents const &) const
void operator()(T_Any &any, T_Extents const &) const
DeviceProperties operator()(auto const &platform, uint32_t idx) const
bool operator()(T_Device &device, T_Any const &any) const