25namespace alpaka::onHost::internal
// Chunk length used by scanBufferSize() in this file: the element count is repeatedly
// reduced with divCeil(..., chunkSize) there.
33 constexpr std::size_t chunkSize = 2048u;
// Compile-time upper bound for the mini-block size, returned as T_Idx.
// Three of the four visible return statements yield 8; the last one yields
// 32768 / sizeof(T_Data).
// NOTE(review): the conditions selecting between these returns are not visible in this
// excerpt — presumably they dispatch on TDeviceKind; confirm against the full source.
35 template<alpaka::concepts::DeviceKind TDeviceKind,
typename T_Idx,
typename T_Data>
36 consteval T_Idx maximumMiniBlockSize()
39 return static_cast<T_Idx
>(8);
41 return static_cast<T_Idx
>(8);
43 return static_cast<T_Idx
>(8);
// Fallback-looking branch: scales with the element size instead of using a fixed count.
45 return static_cast<T_Idx
>(32768) /
sizeof(T_Data);
// Translates an index n into the index that Scan_ScanBlocksKernel uses to address its
// `tmp` array. T_Acc must be supplied explicitly by the caller; T_Idx is deduced from n.
// NOTE(review): the mapping itself is not visible in this excerpt — presumably it pads
// indices to avoid memory bank conflicts; confirm against the full source.
52 template<
typename T_Acc,
typename T_Idx>
53 constexpr T_Idx conflictFreeAccess(T_Idx
const& n)
// In-place scan over the first extent.x() elements of `block`.
//
// Structure visible here: two loop nests that compute the same left/right index pair,
// separated by saving the last element as blockSum and overwriting it with T_Data{0}.
// NOTE(review): the statements combining block[left] and block[right], and the return
// statement, are not visible — presumably this is an up-sweep/down-sweep exclusive scan
// that returns blockSum; confirm. The halving/doubling index scheme also looks like it
// expects extent.x() to be a power of two — confirm.
61 template<
typename T_Idx,
typename T_Data>
62 ALPAKA_FN_ACC T_Data scanMiniBlock(T_Data* block, alpaka::concepts::CVector<T_Idx>
auto const& extent)
// First sweep: d halves each round while the stride `offset` doubles.
65 for(T_Idx d = extent.x() / T_Idx{2}, offset = T_Idx{1}; d > 0; d >>= 1, offset <<= 1)
67 for(
auto frameElem = T_Idx{0}; frameElem < T_Idx{2} * d; frameElem += T_Idx{2})
69 T_Idx left = offset * (frameElem + T_Idx{1}) - T_Idx{1};
70 T_Idx right = offset * (frameElem + T_Idx{2}) - T_Idx{1};
// Keep the value of the last element before it is reset to zero below.
76 T_Data blockSum =
block[extent.x() - T_Idx{1}];
79 block[extent.x() - T_Idx{1}] = T_Data{0};
// Second sweep: d doubles each round while the stride `offset` halves.
82 for(T_Idx d = 1, offset = extent.x() / T_Idx{2}; d < extent.x(); d <<= 1, offset >>= 1)
84 for(
auto frameElem = T_Idx{0}; frameElem < T_Idx{2} * d; frameElem += T_Idx{2})
86 T_Idx left = offset * (frameElem + T_Idx{1}) - T_Idx{1};
87 T_Idx right = offset * (frameElem + T_Idx{2}) - T_Idx{1};
// Adds blockSum to each of the first extent.x() elements of `block`.
// NOTE(review): the function name and the declaration of `block` are on lines that are
// not visible in this excerpt.
98 template<
typename T_Idx,
typename T_Data>
101 T_Data
const& blockSum,
102 alpaka::concepts::CVector<T_Idx>
auto const& extent)
104 for(
auto i = T_Idx{0}; i < extent.x(); ++i)
106 block[i] += blockSum;
// Kernel functor that scans inputVec chunk by chunk and writes the result to outputVec.
//
// SCAN_TYPE is evaluated at compile time when storing: EXCLUSIVE_SCAN writes regMem[i],
// INCLUSIVE_SCAN writes inputVec[...] + regMem[i].
// If at least one trailing `blockSums` argument is passed, the first one receives one
// value per chunk at index chunkIdx (taken from the last `tmp` entry before it is zeroed).
//
// NOTE(review): the declarations of acc, AccType, DeviceType, chunkIdx, frameElem, regMem,
// tmp, regView and miniBlockSum, as well as any block synchronisation, are on lines not
// visible in this excerpt.
114 template<ScanType SCAN_TYPE,
typename T_Idx,
typename T_Data>
115 class Scan_ScanBlocksKernel
// Parameters of the const member function implementing the kernel (its declaration line
// is not visible here).
120 alpaka::concepts::Vector
auto const numChunks,
121 alpaka::concepts::CVector
auto const largeChunkExtents,
122 alpaka::concepts::IDataSource
auto const& inputVec,
123 alpaka::concepts::IMdSpan
auto outputVec,
124 auto... blockSums)
const
// Per-thread workload, derived at compile time from the chunk extent and the thread count.
129 alpaka::concepts::CVector
auto numThreadsPerBlock = acc[
layer::thread].count();
130 constexpr std::integral
auto elsPerThread = largeChunkExtents.x() / numThreadsPerBlock.x();
// chunkExtent is elsPerThread * thread count, i.e. largeChunkExtents.x() rounded down to
// a multiple of the thread count (integer division above).
131 alpaka::concepts::CVector
auto chunkExtent =
CVec<T_Idx, elsPerThread * numThreadsPerBlock.x()>{};
132 alpaka::concepts::Vector
auto numElements = inputVec.getExtents();
// The mini-block size is bounded by both maximumMiniBlockSize() and elsPerThread.
134 constexpr std::integral
auto miniBlockSize
135 = std::min(maximumMiniBlockSize<DeviceType, T_Idx, T_Data>(), elsPerThread);
136 constexpr std::integral
auto miniBlocksPerThread = elsPerThread / miniBlockSize;
137 constexpr std::integral
auto miniBlocksPerChunk = chunkExtent.x() / miniBlockSize;
139 constexpr auto LocalArrayLength = miniBlocksPerThread * miniBlockSize;
140 using LocalArray = T_Data[LocalArrayLength];
// Number of valid elements in the last chunk; for numElements >= 1 this lies in
// [1, chunkExtent], so a completely filled last chunk yields chunkExtent rather than 0.
142 auto const validElementsInLastFrame = (numElements - T_Idx{1}) % chunkExtent + T_Idx{1};
152 bool const lastFrameFull = validElementsInLastFrame == chunkExtent;
153 bool const isLastFrame = chunkIdx == numChunks - T_Idx{1};
// One past the mapped index of the last mini-block.
// NOTE(review): presumably used as the length of `tmp`; its use is not visible here.
158 constexpr auto conflictFreeAdr = conflictFreeAccess<AccType>(miniBlocksPerChunk - T_Idx{1}) + T_Idx{1};
160 auto const frameOffset = chunkExtent * chunkIdx;
// Load phase. Element-wise, bounds-checked loads are used when this is a partially filled
// last chunk or when elsPerThread is not a multiple of 4.
168 if((!lastFrameFull && isLastFrame) || elsPerThread % T_Idx{4} != T_Idx{0})
171 for(
auto i = T_Idx{0}; i < elsPerThread; ++i)
173 if(frameOffset + frameElem + i < numElements)
174 regMem[i] = inputVec[frameOffset + frameElem + i];
// Otherwise (presumably the else branch — the `else` itself is not visible) the data is
// loaded in steps of 4 elements through SimdPtr views, without a per-element bounds check.
181 MdSpanArray<LocalArray, T_Idx, alpaka::Alignment<16>> regMemMd{regMem};
183 for(
auto i = T_Idx{0}; i < elsPerThread; i += T_Idx{4})
185 auto inputVecView = SimdPtr{
187 Vec{frameOffset + frameElem + i},
192 regView = inputVecView.load();
// For each mini-block of this thread, store its sum in `tmp` at the mapped mini-block index.
// NOTE(review): miniBlockSum is presumably the result of scanMiniBlock(); the call is not visible.
197 for(
auto miniBlockOffset = T_Idx{0}; miniBlockOffset < elsPerThread;
198 miniBlockOffset += miniBlockSize)
205 tmp[conflictFreeAccess<AccType>((frameElem + miniBlockOffset) / miniBlockSize)] = miniBlockSum;
// First sweep over `tmp`: d halves and offset doubles each round; every step adds
// tmp[left] into tmp[right] using mapped indices.
210 for(T_Idx d = miniBlocksPerChunk / T_Idx{2}, offset = T_Idx{1}; d > 0; d >>= 1, offset <<= 1)
218 T_Idx left = offset * (frameElem + T_Idx{1}).x() - T_Idx{1};
219 T_Idx right = offset * (frameElem + T_Idx{2}).x() - T_Idx{1};
220 left = conflictFreeAccess<AccType>(left);
221 right = conflictFreeAccess<AccType>(right);
222 tmp[right] += tmp[left];
227 for([[maybe_unused]]
auto frameElem :
// Only when a blockSums argument was supplied: export the last `tmp` entry for this chunk.
231 if constexpr(
sizeof...(blockSums))
233 auto _blockSums = std::get<0>(std::make_tuple(blockSums...));
234 _blockSums[chunkIdx] = tmp[conflictFreeAccess<AccType>(miniBlocksPerChunk - T_Idx{1})];
// Reset the last entry before the second sweep.
238 tmp[conflictFreeAccess<AccType>(miniBlocksPerChunk - T_Idx{1})] = 0;
// Second sweep over `tmp`: d doubles and offset halves each round.
242 for(T_Idx d = 1, offset = miniBlocksPerChunk / T_Idx{2}; d < miniBlocksPerChunk; d <<= 1, offset >>= 1)
250 T_Idx left = offset * (frameElem.x() + T_Idx{1}) - T_Idx{1};
251 T_Idx right = offset * (frameElem.x() + T_Idx{2}) - T_Idx{1};
252 left = conflictFreeAccess<AccType>(left);
253 right = conflictFreeAccess<AccType>(right);
// NOTE(review): the statements directly before and after this assignment are not visible;
// presumably tmp[left] is saved first and added to tmp[right] afterwards — confirm.
255 tmp[left] = tmp[right];
// Read the per-mini-block value back from `tmp` for every mini-block that starts inside
// the valid element range.
// NOTE(review): the left-hand side of the assignment below is not visible.
268 for(
auto miniBlockOffset = T_Idx{0}; miniBlockOffset < elsPerThread;
269 miniBlockOffset += miniBlockSize)
273 if(frameOffset + frameElem + miniBlockOffset < numElements)
276 = tmp[conflictFreeAccess<AccType>((frameElem.x() + miniBlockOffset) / miniBlockSize)];
// Store phase: same path selection as in the load phase.
283 if((!lastFrameFull && isLastFrame) || elsPerThread % T_Idx{4} != T_Idx{0})
286 for(
auto i = T_Idx{0}; i < elsPerThread; ++i)
288 if(frameOffset + frameElem + i < numElements)
// Exclusive result is regMem[i]; the inclusive result adds the input element on top.
290 if constexpr(SCAN_TYPE == EXCLUSIVE_SCAN)
291 outputVec[frameOffset + frameElem + i] = regMem[i];
292 else if constexpr(SCAN_TYPE == INCLUSIVE_SCAN)
293 outputVec[frameOffset + frameElem + i]
294 = inputVec[frameOffset + frameElem + i] + regMem[i];
// 4-element SimdPtr store path (presumably the else branch — the `else` is not visible).
300 MdSpanArray<LocalArray, T_Idx, alpaka::Alignment<16>> regMemMd{regMem};
302 for(
auto i = T_Idx{0}; i < elsPerThread; i += T_Idx{4})
304 auto outputVecView = SimdPtr{
306 Vec{frameOffset + frameElem + i},
310 if constexpr(SCAN_TYPE == EXCLUSIVE_SCAN)
311 outputVecView = regView.load();
312 else if constexpr(SCAN_TYPE == INCLUSIVE_SCAN)
// The input is loaded again here so it can be added to the values in regView.
314 auto inputVecView = SimdPtr{
316 Vec{frameOffset + frameElem + i},
319 outputVecView = inputVecView.load() + regView.load();
// Kernel functor that adds a per-chunk value from blockSums to the elements of outputVec.
//
// The visible lambda replaces each SIMD pack with its loaded value plus
// blockSums[simdOut.getIdx() / chunkExtent], i.e. the entry of the chunk the pack starts in.
// NOTE(review): the call that applies the lambda over outputVec, and the declaration of
// `acc`, are on lines not visible in this excerpt.
331 template<
typename T_Idx>
332 class Scan_AddIncrementsKernel
337 alpaka::concepts::CVector
auto const largeChunkExtents,
338 alpaka::concepts::IMdSpan
auto const& blockSums,
339 alpaka::concepts::IMdSpan
auto outputVec)
const
341 alpaka::concepts::Vector
auto numElements = outputVec.getExtents();
342 alpaka::concepts::CVector
auto numThreadsPerBlock = acc[
layer::thread].count();
// Same chunk-extent derivation as in Scan_ScanBlocksKernel, so chunk indices computed
// here line up with the chunks that kernel processed.
343 constexpr auto elsPerThread = largeChunkExtents.x() / numThreadsPerBlock.x();
344 alpaka::concepts::CVector
auto chunkExtent =
CVec<T_Idx, elsPerThread * numThreadsPerBlock.x()>{};
// The first lambda parameter is unused; simdOut is both read and written.
350 [&](
auto const&,
auto&& simdOut)
constexpr
351 { simdOut = simdOut.load() + blockSums[simdOut.getIdx() / chunkExtent]; },
// Size computation for a scan of `extent` elements of type T_Data: the return value is
// bufSize * sizeof(T_Data), i.e. a byte count if bufSize counts T_Data elements.
//
// `elements` starts at divCeil(extent, chunkSize) and is reduced by another
// divCeil(..., chunkSize) in every loop round until it is no longer greater than 1.
// NOTE(review): the definition of T_Idx and the statement that accumulates bufSize inside
// the loop are not visible here — presumably bufSize sums `elements` over all rounds; confirm.
356 template<
typename T_Data>
357 auto scanBufferSize(std::integral
auto const& extent)
360 auto elements =
divCeil(extent, T_Idx{chunkSize});
362 auto bufSize = T_Idx{0};
363 while(elements > T_Idx{1})
366 elements =
divCeil(elements, T_Idx{chunkSize});
369 return bufSize * T_Idx{
sizeof(T_Data)};
// Vector overload of scanBufferSize: accepts one-dimensional extents only (enforced by the
// static_assert) and returns the scalar overload's result for extents.x(), wrapped in a Vec.
372 template<
typename T_Data>
373 auto scanBufferSize(alpaka::concepts::Vector
auto const& extents)
375 static_assert(
ALPAKA_TYPEOF(extents)::dim() == 1,
"scan is only usable for one dimensional buffers");
376 return Vec{scanBufferSize<T_Data>(extents.x())};
// Scans inputVec into outputVec, using `buffer` as scratch space.
//
// With more than one frame:
//   1. a view `increments` with one T_Data entry per frame is carved from the front of `buffer`,
//   2. scanBlocks runs with `increments` as an additional argument,
//   3. `increments` is exclusive-scanned in place by a recursive call that gets the
//      remainder of `buffer` as its scratch space,
//   4. addIncrements runs over outputVec with the scanned `increments`.
// The final enqueue launches scanBlocks alone, without `increments` (presumably the else
// branch for a single frame — the `else` is not visible).
//
// NOTE(review): the function name line, the `queue` parameter, and the definitions of
// T_Idx, T_Data, numChunks, chunkExtent and frameSpec are on lines not visible here.
379 template<ScanType SCAN_TYPE>
382 alpaka::onHost::concepts::Device
auto& devAcc,
383 alpaka::concepts::Executor
auto& exec,
384 alpaka::concepts::IMdSpan
auto& buffer,
385 alpaka::concepts::IMdSpan
auto& outputVec,
386 alpaka::concepts::IDataSource
auto& inputVec)
// Compile-time check that input and output hold the same element type.
392 std::is_same_v<T_Data,
typename ALPAKA_TYPEOF(outputVec)::value_type>,
393 "output vector must have the same data type as input vector");
396 Scan_ScanBlocksKernel<SCAN_TYPE, T_Idx, T_Data> scanBlocks;
// Builds a text description of the launch configuration.
// NOTE(review): presumably emitted through ALPAKA_LOG_INFO; the call is not visible.
407 std::stringstream ss;
409 if(SCAN_TYPE == INCLUSIVE_SCAN)
410 ss <<
", scanType= INCLUSIVE_SCAN";
411 else if(SCAN_TYPE == EXCLUSIVE_SCAN)
412 ss <<
", scanType= EXCLUSIVE_SCAN";
413 ss <<
", numFrames= " << numChunks;
414 ss <<
", chunkExtent= " << chunkExtent;
420 if(frameSpec.getNumFrames() > T_Idx{1})
423 Scan_AddIncrementsKernel<T_Idx> addIncrements;
// Bytes needed for one T_Data entry per frame; the assert (debug builds only) checks that
// `buffer` is at least that large.
425 auto bufSizeBytes = frameSpec.getNumFrames() * T_Idx{
sizeof(T_Data)};
426 assert(buffer.getExtents() * T_Idx{sizeof(typename ALPAKA_TYPEOF(buffer)::value_type)} >= bufSizeBytes);
// Reinterpret the front of the scratch buffer as T_Data storage for the per-frame sums.
429 auto subBuf = buffer.getSubView(bufSizeBytes);
430 auto increments = MdSpan{
431 reinterpret_cast<T_Data*
>(subBuf.data()),
432 frameSpec.getNumFrames(),
// Remainder of the scratch buffer, handed to the recursive scan below.
// NOTE(review): bufSizeBytes is used here as an offset/extent in units of buffer elements,
// which matches bytes only if buffer's value_type is one byte wide — confirm.
436 auto bufferNext = buffer.getSubView(bufSizeBytes, buffer.getExtents() - bufSizeBytes);
441 KernelBundle{scanBlocks, numChunks, chunkExtent, inputVec, outputVec, increments});
// In-place exclusive scan of the per-frame sums (input and output are both `increments`).
444 scan<EXCLUSIVE_SCAN>(queue, devAcc, exec, bufferNext, increments, increments);
445 queue.enqueue(frameSpec,
KernelBundle{addIncrements, chunkExtent, increments, outputVec});
// Launch without `increments`: no per-frame sums are exported.
450 queue.enqueue(frameSpec,
KernelBundle{scanBlocks, numChunks, chunkExtent, inputVec, outputVec});
// Overload without a caller-supplied scratch buffer: forwards to the buffer-taking
// scan<SCAN_TYPE> with a local `buf`, then calls buf.keepAlive(queue).
// NOTE(review): the function name line, the `queue` parameter and the creation of `buf`
// are not visible here — presumably buf is allocated on devAcc with a size from
// scanBufferSize(), and keepAlive ties its lifetime to the enqueued work; confirm.
454 template<ScanType SCAN_TYPE>
457 alpaka::onHost::concepts::Device
auto& devAcc,
458 alpaka::concepts::Executor
auto& exec,
459 alpaka::concepts::IMdSpan
auto& outputVec,
460 alpaka::concepts::IDataSource
auto const& inputVec)
469 scan<SCAN_TYPE>(queue, devAcc, exec, buf, outputVec, inputVec);
471 buf.keepAlive(queue);
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC.
#define ALPAKA_TYPEOF(...)
Get the type of instance.
#define ALPAKA_LOG_INFO(logLvl, callable)
Write a meta data message to the output.
constexpr WarpSize warpSize
constexpr uint32_t getSize()
Return the warp size.
constexpr auto blocksInGrid
constexpr auto threadsInGrid
constexpr auto threadsInBlock
constexpr decltype(auto) declareSharedMdArray(concepts::Acc auto const &acc, alpaka::concepts::CVector auto const &extent)
creates an M-dimensional array
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(auto const &acc, auto const workGroup, auto const range, T_Traverse traverse=T_Traverse{}, T_IdxLayout idxLayout=T_IdxLayout{})
Creates an index container.
constexpr void syncBlockThreads(concepts::Acc auto const &acc)
Synchronize all threads within a thread block.
FrameSpec(T_NumFrames const &, T_FrameExtents const &) -> FrameSpec< alpaka::trait::getVec_t< T_NumFrames >, alpaka::trait::getVec_t< T_FrameExtents >, alpaka::exec::AnyExecutor >
constexpr auto demangledName()
auto alloc(concepts::Device auto const &device, alpaka::concepts::VectorOrScalar auto const &extents)
Allocate memory on the given device.
ALPAKA_FN_HOST_ACC constexpr auto divCeil(Integral a, Integral b) -> Integral
Returns the ceiling of a / b, as integer.
Vec< T, sizeof...(T_values), detail::CVec< T, T_values... > > CVec
A vector with compile-time known values.
ALPAKA_FN_HOST_ACC Vec(T_1, T_Args...) -> Vec< T_1, uint32_t(sizeof...(T_Args)+1u), ArrayStorage< T_1, uint32_t(sizeof...(T_Args)+1u)> >
ALPAKA_FN_HOST KernelBundle(TKernelFn const &, TArgs &&...) -> KernelBundle< TKernelFn, TArgs... >
User defined deduction guide with trailing return type. For CTAD during the construction.
ALPAKA_FN_HOST_ACC IdxRange(T_Extents const &) -> IdxRange< typename trait::getVec_t< T_Extents >::UniVec >