16#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
26 namespace unifiedCudaHip
// Platform type of the unified CUDA/HIP backend.
//
// T_ApiInterface is the runtime-API wrapper whose static members are called below
// (getDeviceCount, success, Error_t); T_DeviceKind is the device kind this platform serves.
// The struct derives from std::enable_shared_from_this so that it can pass a shared_ptr to
// itself into the devices it creates (getSharedPtr() / makeDevice()).
// NOTE(review): shared_from_this() is only valid when the object is already owned by a
// std::shared_ptr - confirm that every Platform instance is created that way.
28 template<
typename T_ApiInterface, alpaka::concepts::DeviceKind T_DeviceKind>
29 struct Platform : std::enable_shared_from_this<Platform<T_ApiInterface, T_DeviceKind>>
// Re-exports the API wrapper type so it can be reached through the Platform type.
31 using ApiInterface = T_ApiInterface;
// Copy construction, copy assignment, move construction and move assignment are all deleted:
// a Platform can be neither copied nor moved.
36 Platform(Platform
const&) =
delete;
37 Platform& operator=(Platform
const&) =
delete;
39 Platform(Platform&&) =
delete;
40 Platform& operator=(Platform&&) =
delete;
// Compile-time check that this type satisfies internal::concepts::Platform.
45 static_assert(internal::concepts::Platform<Platform>);
// One weak_ptr slot per device index. The slots are created by the resize() in
// getDeviceCount() and filled in makeDevice(). Being weak_ptrs, they do not keep a device alive.
48 std::vector<std::weak_ptr<unifiedCudaHip::Device<Platform>>> devices;
// Mutex locked around the resize of `devices` and around the lookup/creation in makeDevice().
49 std::mutex deviceGuard;
// Returns a shared_ptr to this platform obtained through shared_from_this().
51 std::shared_ptr<Platform> getSharedPtr()
53 return this->shared_from_this();
// alpaka::internal::GetName is a friend; the name returned for this platform is the fixed
// string below (the declaration of the returning function is not visible in this excerpt).
56 friend struct alpaka::internal::GetName;
60 return "unifiedCudaHip::Platform";
63 friend struct onHost::internal::GetDeviceCount;
// Queries the number of devices through ApiInterface::getDeviceCount() and returns it as
// uint32_t. As a side effect `devices` is grown so that it has at least one slot per device;
// it is never shrunk here because resize() only runs when the current size is smaller.
// The handling of `error != ApiInterface::success` and of the case where isSupportedDev is
// false is not visible in this excerpt.
65 uint32_t getDeviceCount()
// isSupportedDev: compile-time answer of trait::IsDeviceSupportedBy for T_DeviceKind and the
// type returned by getApi() for this platform.
68 constexpr bool isSupportedDev = trait::IsDeviceSupportedBy::
69 Op<T_DeviceKind, ALPAKA_TYPEOF(getApi(std::declval<Platform>()))>::value;
70 if constexpr(isSupportedDev)
73 typename ApiInterface::Error_t error = ApiInterface::getDeviceCount(&numDevices);
74 if(error != ApiInterface::success)
// NOTE(review): devices.size() is read here before deviceGuard is locked, while resize() below
// and the devices[idx] accesses in makeDevice() run under the lock. This looks like an
// unsynchronised read if two threads call into the platform concurrently - consider taking
// the lock before the comparison.
77 if(devices.size() <
static_cast<size_t>(numDevices))
79 std::lock_guard<std::mutex> lk{deviceGuard};
80 devices.resize(numDevices);
82 return static_cast<uint32_t
>(numDevices);
88 friend struct onHost::internal::MakeDevice;
// Produces the device with index `idx`.
// If the weak_ptr stored in devices[idx] can still be locked, that live device is reused;
// otherwise a new Device is created with std::make_shared and remembered in devices[idx].
// The return statements are not visible in this excerpt - presumably the locked pointer or
// the new device is returned; verify against the full file.
// Throws std::runtime_error with the message built below. The guarding condition is not
// visible here; the message text implies it fires when idx is not below the device count.
90 Handle<unifiedCudaHip::Device<Platform>> makeDevice(uint32_t
const& idx)
// getDeviceCount() is called before deviceGuard is locked below: it locks the same std::mutex
// itself, and std::mutex must not be locked twice by one thread.
93 uint32_t
const numDevices = getDeviceCount();
96 std::stringstream ssErr;
97 ssErr <<
"Unable to return device handle for GPU (" << T_DeviceKind{}.getName()
98 <<
") device with index " << idx <<
" because there are only " << numDevices <<
" devices!";
99 throw std::runtime_error(ssErr.str());
// From here on all accesses to `devices` happen under the lock.
101 std::lock_guard<std::mutex> lk{deviceGuard};
// lock() yields a non-empty shared_ptr only while some other owner still keeps the device alive.
103 if(
auto sharedPtr = devices[idx].lock())
// No live device for this index: create one, giving it a shared_ptr to this platform and its index.
107 auto thisHandle = getSharedPtr();
108 auto newDevice = std::make_shared<unifiedCudaHip::Device<Platform>>(std::move(thisHandle), idx);
// Only a weak reference is kept in the platform.
109 devices[idx] = newDevice;
113 friend struct internal::GetDeviceProperties;
// Specialisation of GetDeviceProperties::Op for the unified CUDA/HIP platform.
// The call operator fills a DeviceProperties object from the runtime API's device-property
// struct for the device with index `deviceIdx`. The platform argument itself is unnamed and unused.
// The final return statement is not visible in this excerpt - presumably `prop` is returned.
119 template<
typename T_ApiInterface, alpaka::concepts::DeviceKind T_DeviceKind>
120 struct GetDeviceProperties::Op<unifiedCudaHip::Platform<T_ApiInterface, T_DeviceKind>>
122 DeviceProperties operator()(
123 unifiedCudaHip::Platform<T_ApiInterface, T_DeviceKind>
const&,
124 uint32_t deviceIdx)
const
127 using ApiInterface =
typename unifiedCudaHip::Platform<T_ApiInterface, T_DeviceKind>::ApiInterface;
// Query the native property struct for exactly the requested device index.
128 typename ApiInterface::DeviceProp_t devProp;
129 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::getDeviceProperties(&devProp, deviceIdx));
// NOTE(review): memGetInfo receives no device index, unlike getDeviceProperties above. If it
// reports on the currently active device (as cudaMemGetInfo / hipMemGetInfo do), the capacity
// stored below may not belong to deviceIdx unless that device was selected before - confirm.
// NOTE(review): freeGlobalMemBytes is queried but not used in the lines visible here.
131 std::size_t freeGlobalMemBytes(0u);
132 std::size_t globalMemCapacityBytes(0u);
133 ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
135 ApiInterface::memGetInfo(&freeGlobalMemBytes, &globalMemCapacityBytes));
// Copy the scalar properties one to one.
137 auto prop = DeviceProperties{};
138 prop.name = devProp.name;
139 prop.warpSize = devProp.warpSize;
140 prop.multiProcessorCount = devProp.multiProcessorCount;
141 prop.globalMemCapacityBytes = globalMemCapacityBytes;
142 prop.sharedMemPerBlockBytes = devProp.sharedMemPerBlock;
144 prop.maxThreadsPerBlock = devProp.maxThreadsPerBlock;
// The three per-dimension thread limits; the declaration these initialise is not visible here,
// presumably the `cudaMaxThreadsPerBlock` captured by the lambda below.
147 devProp.maxThreadsDim[0u],
148 devProp.maxThreadsDim[1u],
149 devProp.maxThreadsDim[2u]};
// Callback that writes `numDims` thread limits into `data`. Both values are captured by copy,
// so the callback does not refer back to `devProp` or `prop`.
150 prop.fnMaxThreadsPerBlock = [maxThreadsPerBlock = prop.maxThreadsPerBlock,
151 cudaMaxThreadsPerBlock](uint32_t*
data, uint32_t numDims)
// First variant: per-dimension limits, written in reversed order (native index 0 lands in the
// last slot of `data`).
// NOTE(review): reads cudaMaxThreadsPerBlock[d] for every d < numDims although only three
// values are listed above; the condition selecting this loop is not visible - confirm it
// restricts numDims to at most 3.
155 for(uint32_t d = 0u; d < numDims; ++d)
156 data[numDims - 1u - d] = cudaMaxThreadsPerBlock[d];
// Second variant: every entry is set to the total per-block thread limit.
162 for(uint32_t d = 0u; d < numDims; ++d)
163 data[d] = maxThreadsPerBlock;
// The total block count per grid is set to the largest uint32_t value rather than read from devProp.
167 prop.maxBlocksPerGrid = std::numeric_limits<uint32_t>::max();
// The three per-dimension grid limits; presumably the `cudaMaxBlocksPerGrid` captured below.
170 devProp.maxGridSize[0u],
171 devProp.maxGridSize[1u],
172 devProp.maxGridSize[2u]};
// Callback that writes `numDims` block-count limits into `data`; same structure as
// fnMaxThreadsPerBlock above, with by-copy captures.
173 prop.fnMaxBlocksPerGrid =
174 [maxBlocksPerGrid = prop.maxBlocksPerGrid, cudaMaxBlocksPerGrid](uint32_t*
data, uint32_t numDims)
// First variant: per-dimension limits in reversed order (same caveat on numDims as above).
178 for(uint32_t d = 0u; d < numDims; ++d)
179 data[numDims - 1u - d] = cudaMaxBlocksPerGrid[d];
// Second variant: every entry is set to maxBlocksPerGrid.
185 for(uint32_t d = 0u; d < numDims; ++d)
186 data[d] = maxBlocksPerGrid;
#define ALPAKA_LOG_FUNCTION(logLvl)
Log the entry and exit of a scope.
Functionality which is usable on the host CPU controller thread.
decltype(auto) data(auto &&any)
pointer to data of an object
std::convertible_to< std::string > auto getName(auto &&any)
Runtime name for a given object.
ALPAKA_FN_HOST_ACC Vec(T_1, T_Args...) -> Vec< T_1, uint32_t(sizeof...(T_Args)+1u), ArrayStorage< T_1, uint32_t(sizeof...(T_Args)+1u)> >