alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
scan.hpp
Go to the documentation of this file.
1/* Copyright 2025 Anton Reinhard
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
7
8#include "alpaka/CVec.hpp"
9#include "alpaka/Simd.hpp"
10#include "alpaka/Vec.hpp"
12#include "alpaka/onAcc/Acc.hpp"
14#include "alpaka/onAcc/warp.hpp"
17#include "alpaka/trait.hpp"
18
19#include <array> // std::array
20#include <cstddef> // std::size_t
21#include <tuple> // std::tuple
22#include <type_traits> // is_same_v
23#include <typeinfo>
24
25namespace alpaka::onHost::internal
26{
27 enum ScanType
28 {
29 EXCLUSIVE_SCAN,
30 INCLUSIVE_SCAN
31 };
32
33 constexpr std::size_t chunkSize = 2048u;
34
35 template<alpaka::concepts::DeviceKind TDeviceKind, typename T_Idx, typename T_Data>
36 consteval T_Idx maximumMiniBlockSize()
37 {
38 if constexpr(TDeviceKind{} == deviceKind::nvidiaGpu)
39 return static_cast<T_Idx>(8);
40 else if constexpr(TDeviceKind{} == deviceKind::amdGpu)
41 return static_cast<T_Idx>(8);
42 else if constexpr(TDeviceKind{} == deviceKind::intelGpu)
43 return static_cast<T_Idx>(8);
44 else
45 return static_cast<T_Idx>(32768) / sizeof(T_Data);
46 }
47
48 /* This function introduces padding to the shared memory accesses to reduce bank conflicts between threads. The
49 * template parameter is the device kind, which dictates how many memory banks are assumed. For CPU or
50 * unknown/unimplemented device kinds, infinite memory banks are assumed, i.e., no padding is used.
51 */
52 template<typename T_Acc, typename T_Idx>
53 constexpr T_Idx conflictFreeAccess(T_Idx const& n)
54 {
55 constexpr auto warpSize = static_cast<T_Idx>(onAcc::warp::getSize<T_Acc>());
56 return n + n / warpSize;
57 }
58
59 /* Do a muting exclusive scan on the given miniblock, and return the total sum.
60 */
61 template<typename T_Idx, typename T_Data>
62 ALPAKA_FN_ACC T_Data scanMiniBlock(T_Data* block, alpaka::concepts::CVector<T_Idx> auto const& extent)
63 {
64 // -- UP-SWEEP / REDUCE --
65 for(T_Idx d = extent.x() / T_Idx{2}, offset = T_Idx{1}; d > 0; d >>= 1, offset <<= 1)
66 {
67 for(auto frameElem = T_Idx{0}; frameElem < T_Idx{2} * d; frameElem += T_Idx{2})
68 {
69 T_Idx left = offset * (frameElem + T_Idx{1}) - T_Idx{1};
70 T_Idx right = offset * (frameElem + T_Idx{2}) - T_Idx{1};
71 block[right] += block[left];
72 }
73 }
74
75 // save total sum
76 T_Data blockSum = block[extent.x() - T_Idx{1}];
77
78 // set 0
79 block[extent.x() - T_Idx{1}] = T_Data{0};
80
81 // -- DOWN-SWEEP --
82 for(T_Idx d = 1, offset = extent.x() / T_Idx{2}; d < extent.x(); d <<= 1, offset >>= 1)
83 {
84 for(auto frameElem = T_Idx{0}; frameElem < T_Idx{2} * d; frameElem += T_Idx{2})
85 {
86 T_Idx left = offset * (frameElem + T_Idx{1}) - T_Idx{1};
87 T_Idx right = offset * (frameElem + T_Idx{2}) - T_Idx{1};
88 auto t = block[left];
89 block[left] = block[right];
90 block[right] += t;
91 }
92 }
93 return blockSum;
94 }
95
96 /* Do an add increment on the given miniblock, adding the given blockSum to each element.
97 */
98 template<typename T_Idx, typename T_Data>
99 ALPAKA_FN_ACC void addIncrements(
100 T_Data* block,
101 T_Data const& blockSum,
102 alpaka::concepts::CVector<T_Idx> auto const& extent)
103 {
104 for(auto i = T_Idx{0}; i < extent.x(); ++i)
105 {
106 block[i] += blockSum;
107 }
108 }
109
110 /* This kernel calculates an exclusive scan for each block individually. The algorithm is based on Blelloch, with
111 * the improvement from Lichterman, written up in the CUDA blog (see 39.2.5):
112 * https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
113 */
114 template<ScanType SCAN_TYPE, typename T_Idx, typename T_Data>
115 class Scan_ScanBlocksKernel
116 {
117 public:
118 ALPAKA_FN_ACC void operator()(
119 auto const& acc,
120 alpaka::concepts::Vector auto const numChunks,
121 alpaka::concepts::CVector auto const largeChunkExtents,
122 alpaka::concepts::IDataSource auto const& inputVec,
123 alpaka::concepts::IMdSpan auto outputVec,
124 auto... blockSums) const
125 {
126 using DeviceType = ALPAKA_TYPEOF(acc.getDeviceKind());
127 using AccType = ALPAKA_TYPEOF(acc);
128
129 alpaka::concepts::CVector auto numThreadsPerBlock = acc[layer::thread].count();
130 constexpr std::integral auto elsPerThread = largeChunkExtents.x() / numThreadsPerBlock.x();
131 alpaka::concepts::CVector auto chunkExtent = CVec<T_Idx, elsPerThread * numThreadsPerBlock.x()>{};
132 alpaka::concepts::Vector auto numElements = inputVec.getExtents();
133
134 constexpr std::integral auto miniBlockSize
135 = std::min(maximumMiniBlockSize<DeviceType, T_Idx, T_Data>(), elsPerThread);
136 constexpr std::integral auto miniBlocksPerThread = elsPerThread / miniBlockSize;
137 constexpr std::integral auto miniBlocksPerChunk = chunkExtent.x() / miniBlockSize;
138
139 constexpr auto LocalArrayLength = miniBlocksPerThread * miniBlockSize;
140 using LocalArray = T_Data[LocalArrayLength];
141
142 auto const validElementsInLastFrame = (numElements - T_Idx{1}) % chunkExtent + T_Idx{1};
143
144 /* This kernel is called with 1-dimensional frame extents.
145 *
146 * All thread blocks will be used to iterate over the frames. Each thread block will handle one or more
147 * frames.
148 */
149 for(auto chunkIdx :
151 {
152 bool const lastFrameFull = validElementsInLastFrame == chunkExtent;
153 bool const isLastFrame = chunkIdx == numChunks - T_Idx{1};
154
155 // allocate "per-thread" register memory to store all mini blocks of a thread persistently
156 LocalArray regMem;
157
158 constexpr auto conflictFreeAdr = conflictFreeAccess<AccType>(miniBlocksPerChunk - T_Idx{1}) + T_Idx{1};
160 auto const frameOffset = chunkExtent * chunkIdx;
161
162 for(auto frameElem : onAcc::makeIdxMap(
163 acc,
166 {
167 // -- COPY TO SHARED MEM --
168 if((!lastFrameFull && isLastFrame) || elsPerThread % T_Idx{4} != T_Idx{0})
169 {
170 // load into miniblocks buffer, from frameElem to frameElem + elsPerThread
171 for(auto i = T_Idx{0}; i < elsPerThread; ++i)
172 {
173 if(frameOffset + frameElem + i < numElements)
174 regMem[i] = inputVec[frameOffset + frameElem + i];
175 else
176 regMem[i] = 0;
177 }
178 }
179 else
180 {
181 MdSpanArray<LocalArray, T_Idx, alpaka::Alignment<16>> regMemMd{regMem};
182
183 for(auto i = T_Idx{0}; i < elsPerThread; i += T_Idx{4})
184 {
185 auto inputVecView = SimdPtr{
186 inputVec,
187 Vec{frameOffset + frameElem + i},
188 Alignment<16>{},
190 auto regView = SimdPtr{regMemMd, Vec{i}, Alignment<16>{}, CVec<T_Idx, 4>{}};
191
192 regView = inputVecView.load();
193 }
194 }
195
196 // -- HANDLE MINI BLOCKS OF THIS THREAD --
197 for(auto miniBlockOffset = T_Idx{0}; miniBlockOffset < elsPerThread;
198 miniBlockOffset += miniBlockSize)
199 {
200 // scan miniblock
201 auto miniBlockSum
202 = scanMiniBlock<T_Idx, T_Data>(regMem + miniBlockOffset, CVec<T_Idx, miniBlockSize>{});
203
204 // write miniblock sum into shared memory
205 tmp[conflictFreeAccess<AccType>((frameElem + miniBlockOffset) / miniBlockSize)] = miniBlockSum;
206 }
207 }
208
209 // -- UP-SWEEP / REDUCE --
210 for(T_Idx d = miniBlocksPerChunk / T_Idx{2}, offset = T_Idx{1}; d > 0; d >>= 1, offset <<= 1)
211 {
213 for(auto frameElem : onAcc::makeIdxMap(
214 acc,
216 IdxRange{CVec<T_Idx, 0>{}, Vec<T_Idx, 1>{T_Idx{2} * d}, T_Idx{2}}))
217 {
218 T_Idx left = offset * (frameElem + T_Idx{1}).x() - T_Idx{1};
219 T_Idx right = offset * (frameElem + T_Idx{2}).x() - T_Idx{1};
220 left = conflictFreeAccess<AccType>(left);
221 right = conflictFreeAccess<AccType>(right);
222 tmp[right] += tmp[left];
223 }
224 }
226
227 for([[maybe_unused]] auto frameElem :
229 {
230 // -- SAVE BLOCK SUMS --
231 if constexpr(sizeof...(blockSums))
232 {
233 auto _blockSums = std::get<0>(std::make_tuple(blockSums...));
234 _blockSums[chunkIdx] = tmp[conflictFreeAccess<AccType>(miniBlocksPerChunk - T_Idx{1})];
235 }
236
237 // -- SET 0 --
238 tmp[conflictFreeAccess<AccType>(miniBlocksPerChunk - T_Idx{1})] = 0;
239 }
240
241 // -- DOWN-SWEEP --
242 for(T_Idx d = 1, offset = miniBlocksPerChunk / T_Idx{2}; d < miniBlocksPerChunk; d <<= 1, offset >>= 1)
243 {
245 for(auto frameElem : onAcc::makeIdxMap(
246 acc,
248 IdxRange{CVec<T_Idx, 0>{}, Vec<T_Idx, 1>{T_Idx{2} * d}, T_Idx{2}}))
249 {
250 T_Idx left = offset * (frameElem.x() + T_Idx{1}) - T_Idx{1};
251 T_Idx right = offset * (frameElem.x() + T_Idx{2}) - T_Idx{1};
252 left = conflictFreeAccess<AccType>(left);
253 right = conflictFreeAccess<AccType>(right);
254 auto t = tmp[left];
255 tmp[left] = tmp[right];
256 tmp[right] += t;
257 }
258 }
260
261 // -- WRITE BACK --
262 for(auto frameElem : onAcc::makeIdxMap(
263 acc,
266 {
267 // -- HANDLE MINI BLOCKS OF THIS THREAD --
268 for(auto miniBlockOffset = T_Idx{0}; miniBlockOffset < elsPerThread;
269 miniBlockOffset += miniBlockSize)
270 {
271 // load block sum from shared memory
272 T_Data blockSum{0};
273 if(frameOffset + frameElem + miniBlockOffset < numElements)
274 {
275 blockSum
276 = tmp[conflictFreeAccess<AccType>((frameElem.x() + miniBlockOffset) / miniBlockSize)];
277 }
278
279 // add block sum to mini block
280 addIncrements<T_Idx>(regMem + miniBlockOffset, blockSum, CVec<T_Idx, miniBlockSize>{});
281 }
282
283 if((!lastFrameFull && isLastFrame) || elsPerThread % T_Idx{4} != T_Idx{0})
284 {
285 // write back to global mem, from frameElem to frameElem + elsPerThread
286 for(auto i = T_Idx{0}; i < elsPerThread; ++i)
287 {
288 if(frameOffset + frameElem + i < numElements)
289 {
290 if constexpr(SCAN_TYPE == EXCLUSIVE_SCAN)
291 outputVec[frameOffset + frameElem + i] = regMem[i];
292 else if constexpr(SCAN_TYPE == INCLUSIVE_SCAN)
293 outputVec[frameOffset + frameElem + i]
294 = inputVec[frameOffset + frameElem + i] + regMem[i];
295 }
296 }
297 }
298 else
299 {
300 MdSpanArray<LocalArray, T_Idx, alpaka::Alignment<16>> regMemMd{regMem};
301
302 for(auto i = T_Idx{0}; i < elsPerThread; i += T_Idx{4})
303 {
304 auto outputVecView = SimdPtr{
305 outputVec,
306 Vec{frameOffset + frameElem + i},
307 Alignment<16>{},
309 auto regView = SimdPtr{regMemMd, Vec{i}, Alignment<16>{}, CVec<T_Idx, 4>{}};
310 if constexpr(SCAN_TYPE == EXCLUSIVE_SCAN)
311 outputVecView = regView.load();
312 else if constexpr(SCAN_TYPE == INCLUSIVE_SCAN)
313 {
314 auto inputVecView = SimdPtr{
315 inputVec,
316 Vec{frameOffset + frameElem + i},
317 Alignment<16>{},
319 outputVecView = inputVecView.load() + regView.load();
320 }
321 }
322 }
323 }
325 }
326 }
327 };
328
329 /* Add prefix sum from previous blocks (blockSums) to all elements in each block.
330 */
331 template<typename T_Idx>
332 class Scan_AddIncrementsKernel
333 {
334 public:
335 ALPAKA_FN_ACC void operator()(
336 auto const& acc,
337 alpaka::concepts::CVector auto const largeChunkExtents,
338 alpaka::concepts::IMdSpan auto const& blockSums,
339 alpaka::concepts::IMdSpan auto outputVec) const
340 {
341 alpaka::concepts::Vector auto numElements = outputVec.getExtents();
342 alpaka::concepts::CVector auto numThreadsPerBlock = acc[layer::thread].count();
343 constexpr auto elsPerThread = largeChunkExtents.x() / numThreadsPerBlock.x();
344 alpaka::concepts::CVector auto chunkExtent = CVec<T_Idx, elsPerThread * numThreadsPerBlock.x()>{};
345
346 auto simdGrid = onAcc::SimdAlgo{onAcc::worker::threadsInGrid};
347 simdGrid.concurrent(
348 acc,
349 numElements,
350 [&](auto const&, auto&& simdOut) constexpr
351 { simdOut = simdOut.load() + blockSums[simdOut.getIdx() / chunkExtent]; },
352 outputVec);
353 }
354 };
355
356 template<typename T_Data>
357 auto scanBufferSize(std::integral auto const& extent)
358 {
359 using T_Idx = ALPAKA_TYPEOF(extent);
360 auto elements = divCeil(extent, T_Idx{chunkSize});
361
362 auto bufSize = T_Idx{0};
363 while(elements > T_Idx{1})
364 {
365 bufSize += elements;
366 elements = divCeil(elements, T_Idx{chunkSize});
367 }
368
369 return bufSize * T_Idx{sizeof(T_Data)};
370 }
371
372 template<typename T_Data>
373 auto scanBufferSize(alpaka::concepts::Vector auto const& extents)
374 {
375 static_assert(ALPAKA_TYPEOF(extents)::dim() == 1, "scan is only usable for one dimensional buffers");
376 return Vec{scanBufferSize<T_Data>(extents.x())};
377 }
378
379 template<ScanType SCAN_TYPE>
380 void scan(
381 auto& queue,
382 alpaka::onHost::concepts::Device auto& devAcc,
383 alpaka::concepts::Executor auto& exec,
384 alpaka::concepts::IMdSpan auto& buffer,
385 alpaka::concepts::IMdSpan auto& outputVec,
386 alpaka::concepts::IDataSource auto& inputVec)
387 {
388 using T_Data = typename ALPAKA_TYPEOF(inputVec)::value_type;
389 using T_Idx = typename ALPAKA_TYPEOF(inputVec)::index_type;
390
391 static_assert(
392 std::is_same_v<T_Data, typename ALPAKA_TYPEOF(outputVec)::value_type>,
393 "output vector must have the same data type as input vector");
394
395 // Instantiate the kernel function object with the given scan type
396 Scan_ScanBlocksKernel<SCAN_TYPE, T_Idx, T_Data> scanBlocks;
397
398 // Define chunkExtent
399 constexpr auto chunkExtent = CVec<T_Idx, chunkSize>{};
400 alpaka::Vec numChunks = divCeil(inputVec.getExtents(), chunkExtent);
401 auto const frameSpec = onHost::FrameSpec{numChunks, CVec<T_Idx, 256u>{}};
402
405 [&]()
406 {
407 std::stringstream ss;
408 ss << "scan: {";
409 if(SCAN_TYPE == INCLUSIVE_SCAN)
410 ss << ", scanType= INCLUSIVE_SCAN";
411 else if(SCAN_TYPE == EXCLUSIVE_SCAN)
412 ss << ", scanType= EXCLUSIVE_SCAN";
413 ss << ", numFrames= " << numChunks;
414 ss << ", chunkExtent= " << chunkExtent;
415 ss << ", value_type=" << onHost::demangledName<T_Data>();
416 ss << "}";
417 return ss.str();
418 });
419
420 if(frameSpec.getNumFrames() > T_Idx{1})
421 {
422 // problem does not fit in 1 frame, recurse
423 Scan_AddIncrementsKernel<T_Idx> addIncrements;
424
425 auto bufSizeBytes = frameSpec.getNumFrames() * T_Idx{sizeof(T_Data)};
426 assert(buffer.getExtents() * T_Idx{sizeof(typename ALPAKA_TYPEOF(buffer)::value_type)} >= bufSizeBytes);
427
428 // get the view to the necessary elements in the buffer for increments
429 auto subBuf = buffer.getSubView(bufSizeBytes);
430 auto increments = MdSpan{
431 reinterpret_cast<T_Data*>(subBuf.data()),
432 frameSpec.getNumFrames(),
433 Vec<T_Idx, 1>{sizeof(T_Data)}};
434
435 // the unused elements in the buffer are used for recursion to the next scan call
436 auto bufferNext = buffer.getSubView(bufSizeBytes, buffer.getExtents() - bufSizeBytes);
437
438 // enqueue the kernel execution tasks
439 queue.enqueue(
440 frameSpec,
441 KernelBundle{scanBlocks, numChunks, chunkExtent, inputVec, outputVec, increments});
442
443 // always recurse into exclusive scan
444 scan<EXCLUSIVE_SCAN>(queue, devAcc, exec, bufferNext, increments, increments);
445 queue.enqueue(frameSpec, KernelBundle{addIncrements, chunkExtent, increments, outputVec});
446 }
447 else
448 {
449 // problem fits within 1 frame
450 queue.enqueue(frameSpec, KernelBundle{scanBlocks, numChunks, chunkExtent, inputVec, outputVec});
451 }
452 }
453
454 template<ScanType SCAN_TYPE>
455 void scan(
456 auto& queue,
457 alpaka::onHost::concepts::Device auto& devAcc,
458 alpaka::concepts::Executor auto& exec,
459 alpaka::concepts::IMdSpan auto& outputVec,
460 alpaka::concepts::IDataSource auto const& inputVec)
461 {
462 using T_Data = ALPAKA_TYPEOF(inputVec)::value_type;
463
464 /* We do not use allocDeferred here since we measured up to a factor 40 higher latency compared to alloc for
465 * CUDA 12.8 on an A30 for the first call. The reason is the cuda per stream caching pool setup time.
466 */
467 auto buf = onHost::alloc<char>(devAcc, scanBufferSize<T_Data>(inputVec.getExtents()));
468
469 scan<SCAN_TYPE>(queue, devAcc, exec, buf, outputVec, inputVec);
470
471 buf.keepAlive(queue);
472 }
473
474} // namespace alpaka::onHost::internal
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
Definition common.hpp:30
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:153
#define ALPAKA_LOG_INFO(logLvl, callable)
Write a meta data message to the output.
Definition logger.hpp:106
constexpr auto intelGpu
Definition tag.hpp:210
constexpr auto amdGpu
Definition tag.hpp:190
constexpr auto nvidiaGpu
Definition tag.hpp:200
constexpr auto thread
Definition tag.hpp:255
constexpr WarpSize warpSize
Definition tag.hpp:44
constexpr Block block
Definition scope.hpp:51
constexpr uint32_t getSize()
Return the warp size.
Definition warp.hpp:122
constexpr auto blocksInGrid
constexpr auto threadsInGrid
constexpr auto threadsInBlock
constexpr decltype(auto) declareSharedMdArray(concepts::Acc auto const &acc, alpaka::concepts::CVector auto const &extent)
creates an M-dimensional array
Definition Acc.hpp:168
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(auto const &acc, auto const workGroup, auto const range, T_Traverse traverse=T_Traverse{}, T_IdxLayout idxLayout=T_IdxLayout{})
Creates an index container.
Definition interface.hpp:57
constexpr void syncBlockThreads(concepts::Acc auto const &acc)
Synchronize all threads within a thread block.
Definition Acc.hpp:127
constexpr auto queue
Definition lvl.hpp:127
constexpr auto memory
Definition lvl.hpp:112
FrameSpec(T_NumFrames const &, T_FrameExtents const &) -> FrameSpec< alpaka::trait::getVec_t< T_NumFrames >, alpaka::trait::getVec_t< T_FrameExtents >, alpaka::exec::AnyExecutor >
constexpr auto demangledName()
auto alloc(concepts::Device auto const &device, alpaka::concepts::VectorOrScalar auto const &extents)
Allocate memory on the given device.
Definition Device.hpp:180
ALPAKA_FN_HOST_ACC constexpr auto divCeil(Integral a, Integral b) -> Integral
Returns the ceiling of a / b, as integer.
Definition utility.hpp:34
Vec< T, sizeof...(T_values), detail::CVec< T, T_values... > > CVec
A vector with compile-time known values.
Definition CVec.hpp:31
ALPAKA_FN_HOST_ACC Vec(T_1, T_Args...) -> Vec< T_1, uint32_t(sizeof...(T_Args)+1u), ArrayStorage< T_1, uint32_t(sizeof...(T_Args)+1u)> >
ALPAKA_FN_HOST KernelBundle(TKernelFn const &, TArgs &&...) -> KernelBundle< TKernelFn, TArgs... >
User defined deduction guide with trailing return type. For CTAD during the construction.
ALPAKA_FN_HOST_ACC IdxRange(T_Extents const &) -> IdxRange< typename trait::getVec_t< T_Extents >::UniVec >