alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
Platform.hpp
Go to the documentation of this file.
1/* Copyright 2024 René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5
6#pragma once
7
15
16#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
17
18
19# include <memory>
20# include <mutex>
21# include <sstream>
22# include <vector>
23
24namespace alpaka::onHost
25{
26 namespace unifiedCudaHip
27 {
28 template<typename T_ApiInterface, alpaka::concepts::DeviceKind T_DeviceKind>
29 struct Platform : std::enable_shared_from_this<Platform<T_ApiInterface, T_DeviceKind>>
30 {
31 using ApiInterface = T_ApiInterface;
32
33 public:
34 Platform() = default;
35
36 Platform(Platform const&) = delete;
37 Platform& operator=(Platform const&) = delete;
38
39 Platform(Platform&&) = delete;
40 Platform& operator=(Platform&&) = delete;
41
42 private:
43 void _()
44 {
45 static_assert(internal::concepts::Platform<Platform>);
46 }
47
48 std::vector<std::weak_ptr<unifiedCudaHip::Device<Platform>>> devices;
49 std::mutex deviceGuard;
50
51 std::shared_ptr<Platform> getSharedPtr()
52 {
53 return this->shared_from_this();
54 }
55
56 friend struct alpaka::internal::GetName;
57
58 std::string getName() const
59 {
60 return "unifiedCudaHip::Platform";
61 }
62
63 friend struct onHost::internal::GetDeviceCount;
64
65 uint32_t getDeviceCount()
66 {
68 constexpr bool isSupportedDev = trait::IsDeviceSupportedBy::
69 Op<T_DeviceKind, ALPAKA_TYPEOF(getApi(std::declval<Platform>()))>::value;
70 if constexpr(isSupportedDev)
71 {
72 int numDevices{0};
73 typename ApiInterface::Error_t error = ApiInterface::getDeviceCount(&numDevices);
74 if(error != ApiInterface::success)
75 numDevices = 0;
76
77 if(devices.size() < static_cast<size_t>(numDevices))
78 {
79 std::lock_guard<std::mutex> lk{deviceGuard};
80 devices.resize(numDevices);
81 }
82 return static_cast<uint32_t>(numDevices);
83 }
84
85 return 0;
86 }
87
88 friend struct onHost::internal::MakeDevice;
89
90 Handle<unifiedCudaHip::Device<Platform>> makeDevice(uint32_t const& idx)
91 {
93 uint32_t const numDevices = getDeviceCount();
94 if(idx >= numDevices)
95 {
96 std::stringstream ssErr;
97 ssErr << "Unable to return device handle for GPU (" << T_DeviceKind{}.getName()
98 << ") device with index " << idx << " because there are only " << numDevices << " devices!";
99 throw std::runtime_error(ssErr.str());
100 }
101 std::lock_guard<std::mutex> lk{deviceGuard};
102
103 if(auto sharedPtr = devices[idx].lock())
104 {
105 return sharedPtr;
106 }
107 auto thisHandle = getSharedPtr();
108 auto newDevice = std::make_shared<unifiedCudaHip::Device<Platform>>(std::move(thisHandle), idx);
109 devices[idx] = newDevice;
110 return newDevice;
111 }
112
113 friend struct internal::GetDeviceProperties;
114 };
115 } // namespace unifiedCudaHip
116
117 namespace internal
118 {
119 template<typename T_ApiInterface, alpaka::concepts::DeviceKind T_DeviceKind>
120 struct GetDeviceProperties::Op<unifiedCudaHip::Platform<T_ApiInterface, T_DeviceKind>>
121 {
122 DeviceProperties operator()(
123 unifiedCudaHip::Platform<T_ApiInterface, T_DeviceKind> const&,
124 uint32_t deviceIdx) const
125 {
127 using ApiInterface = typename unifiedCudaHip::Platform<T_ApiInterface, T_DeviceKind>::ApiInterface;
128 typename ApiInterface::DeviceProp_t devProp;
129 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::getDeviceProperties(&devProp, deviceIdx));
130
131 std::size_t freeGlobalMemBytes(0u);
132 std::size_t globalMemCapacityBytes(0u);
133 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
134 ApiInterface,
135 ApiInterface::memGetInfo(&freeGlobalMemBytes, &globalMemCapacityBytes));
136
137 auto prop = DeviceProperties{};
138 prop.name = devProp.name;
139 prop.warpSize = devProp.warpSize;
140 prop.multiProcessorCount = devProp.multiProcessorCount;
141 prop.globalMemCapacityBytes = globalMemCapacityBytes;
142 prop.sharedMemPerBlockBytes = devProp.sharedMemPerBlock;
143
144 prop.maxThreadsPerBlock = devProp.maxThreadsPerBlock;
145 // will be copied into the lampda and follows cuda index order
146 Vec<uint32_t, 3u> cudaMaxThreadsPerBlock{
147 devProp.maxThreadsDim[0u],
148 devProp.maxThreadsDim[1u],
149 devProp.maxThreadsDim[2u]};
150 prop.fnMaxThreadsPerBlock = [maxThreadsPerBlock = prop.maxThreadsPerBlock,
151 cudaMaxThreadsPerBlock](uint32_t* data, uint32_t numDims)
152 {
153 if(numDims <= 3u)
154 {
155 for(uint32_t d = 0u; d < numDims; ++d)
156 data[numDims - 1u - d] = cudaMaxThreadsPerBlock[d];
157 }
158 else
159 {
160 /* For more than 3 dimensions alpaka is linearizing to one dimension, therefore we use the
161 * maximum for each dimension. */
162 for(uint32_t d = 0u; d < numDims; ++d)
163 data[d] = maxThreadsPerBlock;
164 }
165 };
166
167 prop.maxBlocksPerGrid = std::numeric_limits<uint32_t>::max();
168 // will be copied into the lampda and follows cuda index order
169 Vec<uint32_t, 3u> cudaMaxBlocksPerGrid{
170 devProp.maxGridSize[0u],
171 devProp.maxGridSize[1u],
172 devProp.maxGridSize[2u]};
173 prop.fnMaxBlocksPerGrid =
174 [maxBlocksPerGrid = prop.maxBlocksPerGrid, cudaMaxBlocksPerGrid](uint32_t* data, uint32_t numDims)
175 {
176 if(numDims <= 3u)
177 {
178 for(uint32_t d = 0u; d < numDims; ++d)
179 data[numDims - 1u - d] = cudaMaxBlocksPerGrid[d];
180 }
181 else
182 {
183 /* For more than 3 dimensions alpaka is linearizing to one dimension, therefore we use the
184 * maximum for each dimension. */
185 for(uint32_t d = 0u; d < numDims; ++d)
186 data[d] = maxBlocksPerGrid;
187 }
188 };
189
190 return prop;
191 }
192 };
193 } // namespace internal
194} // namespace alpaka::onHost
195#endif
#define ALPAKA_LOG_FUNCTION(logLvl)
Log the entry and exit of a scope.
Definition logger.hpp:95
constexpr auto device
Definition lvl.hpp:82
Functionality which is usable on the host CPU controller thread.
Definition api.hpp:40
decltype(auto) data(auto &&any)
pointer to data of an object
std::convertible_to< std::string > auto getName(auto &&any)
Runtime name for a given object.
Definition interface.hpp:96
ALPAKA_FN_HOST_ACC Vec(T_1, T_Args...) -> Vec< T_1, uint32_t(sizeof...(T_Args)+1u), ArrayStorage< T_1, uint32_t(sizeof...(T_Args)+1u)> >
DeviceProperties operator()(auto const &platform, uint32_t idx) const