alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
transformReduce.hpp
Go to the documentation of this file.
1/* Copyright 2025 René Widera, Mehmet Yusufoglu
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
7#include "alpaka/Vec.hpp"
8#include "alpaka/api/util.hpp"
10#include "alpaka/functor.hpp"
11#include "alpaka/mem/MdSpan.hpp"
14#include "alpaka/onAcc/Acc.hpp"
19#include "alpaka/trait.hpp"
20
21namespace alpaka::onHost::internal
22{
23 struct SimdTransformReduceKernel
24 {
25 uint32_t dynSharedMemBytes = 0u;
26
27 template<typename T_DataType>
28 ALPAKA_FN_ACC void operator()(
29 onAcc::concepts::Acc auto const& acc,
30 alpaka::concepts::Vector auto const& numChunks,
31 alpaka::concepts::Vector auto const& chunkExtents,
32 alpaka::concepts::Vector auto const& extentMd,
33 T_DataType const& neutralElement,
34 alpaka::concepts::IMdSpan auto output,
35 auto const& reduceFunc,
36 auto const& transformFunc,
37 alpaka::concepts::IDataSource auto&&... inputs) const
38 {
39 static_assert(
40 std::is_same_v<ALPAKA_TYPEOF(neutralElement), alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(output)>>,
41 "The neutral element type must match the data output type.");
42
43
44 // Shared memory for block-wide reduction
45 T_DataType* dynS = onAcc::getDynSharedMem<T_DataType>(acc);
46 auto pitchMd = alpaka::calculatePitchesFromExtents<T_DataType>(chunkExtents);
47 auto tbSum = MdSpan{dynS, chunkExtents, pitchMd};
48
49 auto traverseInFrame = alpaka::onAcc::makeIdxMap(
50 acc,
52 alpaka::IdxRange{chunkExtents});
53
54 // Initialize shared memory by setting all elements to the neutral element or identity value
55 // for the reduction operation.
56 for(auto elemIdxInFrame : traverseInFrame)
57 {
58 tbSum[elemIdxInFrame] = neutralElement;
59 }
60
61 auto const chunkDataExtent = numChunks * chunkExtents;
62 auto traverseOverFrames = alpaka::onAcc::makeIdxMap(
63 acc,
65 alpaka::IdxRange{chunkDataExtent.fill(0), chunkDataExtent, chunkExtents});
66
67 for(auto chunkIdx : traverseOverFrames)
68 {
69 for(alpaka::concepts::Vector auto elemIdxInChunk : traverseInFrame)
70 {
71 auto allThreads = alpaka::onAcc::SimdAlgo{
72 alpaka::onAcc::WorkerGroup{chunkIdx + elemIdxInChunk, chunkDataExtent}};
73
74 // reduce functor with simd package support
75 auto reducedValue
77 .transformReduce(acc, extentMd, neutralElement, reduceFunc, transformFunc, inputs...);
78 auto& tbSumRef = tbSum[elemIdxInChunk];
79 tbSumRef = reduceFunc(tbSumRef, reducedValue);
80 }
81 }
82
83 auto const laneIdInBlock = linearize(acc[alpaka::layer::thread].count(), acc[alpaka::layer::thread].idx());
84 auto const blockSize = acc[alpaka::layer::thread].count().product();
85 // Synchronize threads before aggregation
87
88 // Aggregate shared memory slots
89 for(auto [linearSharedElemIdx] : alpaka::onAcc::makeIdxMap(
90 acc,
92 alpaka::IdxRange{blockSize, chunkExtents.product()}))
93 {
94 dynS[laneIdInBlock] = reduceFunc(dynS[laneIdInBlock], dynS[linearSharedElemIdx]);
95 }
96
98
99 // Perform a parallel reduction within the block
100 // This is a tree reduction algorithm
101 for(auto offset = blockSize / 2; offset > 0; offset /= 2)
102 {
104 if(laneIdInBlock < offset)
105 {
106 dynS[laneIdInBlock] = reduceFunc(dynS[laneIdInBlock], dynS[laneIdInBlock + offset]);
107 }
108 }
109
110 // Atomic update of the global result
111 if(laneIdInBlock == 0)
112 {
114 if constexpr(
115 alpaka::concepts::SpecializationOf<ALPAKA_TYPEOF(reduceFunc), ScalarFunc>
116 || alpaka::concepts::SpecializationOf<ALPAKA_TYPEOF(reduceFunc), StencilFunc>)
117 {
118 // Handle wrapped reduce functors e.g. ScalarFunc or StencilFunc
119 using ReduceFunctor = typename ALPAKA_TYPEOF(reduceFunc)::Functor;
121 static_cast<ReduceFunctor const&>(reduceFunc),
122 acc,
123 output.data(),
124 dynS[laneIdInBlock]);
125 }
126 else
127 atomicInvoke(reduceFunc, acc, output.data(), dynS[laneIdInBlock]);
128 }
129 }
130 };
131
132 template<typename T_DataType>
133 inline void transformReduce(
134 auto const& queue,
135 alpaka::concepts::Executor auto const exec,
136 T_DataType const& neutralElement,
137 alpaka::concepts::IMdSpan auto out,
138 auto&& reduceFn,
139 auto&& transformFn,
140 auto&& in0,
141 alpaka::concepts::IDataSource auto&&... in)
142 {
143 auto extentMd = onHost::getExtents(in0);
144 using IndexType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(extentMd)>;
145 auto frameSpec = getFrameSpec<T_DataType>(queue.getDevice(), exec, extentMd);
146
147 /* Adjust the launch parameters to not oversubscribe a device too much.
148 *
149 * @todo: This heuristic should be adjusted based on benchmarking different cases.
150 */
151 {
152 IndexType multiprocessorScaling = 1u;
153 if constexpr(!(ALPAKA_TYPEOF(queue.getDevice().getDeviceKind()){} == deviceKind::cpu))
154 {
155 // For non-CPU devices, we scale the number of frames based on an arbitrary number derived from
156 // testing with the dot kernel of the bablestream benchmark.
157 multiprocessorScaling = 32u;
158 }
159
160 auto const numMultiProcessors = queue.getDevice().getDeviceProperties().multiProcessorCount;
161 auto adjsutedNumFrames = alpaka::api::util::adjustToLimit(
162 frameSpec.getNumFrames(),
163 static_cast<IndexType>(numMultiProcessors * multiprocessorScaling));
164 frameSpec = FrameSpec{adjsutedNumFrames, frameSpec.getFrameExtents(), exec};
165 }
166
167 /* Derive the chunk size and number of chunks from the SIMD optimized frame specification.
168 * The chunking parameters influences the numerical precision because it provides the possibility to control
169 * the length of the accumulation chain of a single thread.
170 */
171 auto numChunks = frameSpec.getNumFrames();
172 auto chunkExtents = frameSpec.getFrameExtents();
173
174 auto kernelFn = SimdTransformReduceKernel{
175 static_cast<uint32_t>(frameSpec.getFrameExtents().product() * sizeof(T_DataType))};
176
179 [&]()
180 {
181 std::stringstream ss;
182 ss << "transformReduce{ extents=" << extentMd << ", value_type=" << onHost::demangledName<T_DataType>()
183 << ", " << frameSpec << ", reduceFn=" << onHost::demangledName(reduceFn)
184 << ", transformFn=" << onHost::demangledName(transformFn) << " }";
185 return ss.str();
186 });
187
188 onHost::fill(queue, out, neutralElement, out.getExtents().fill(1));
189 queue.enqueue(
190 frameSpec,
192 kernelFn,
193 numChunks,
194 chunkExtents,
195 extentMd,
196 neutralElement,
197 out,
198 ALPAKA_FORWARD(reduceFn),
199 ALPAKA_FORWARD(transformFn),
200 ALPAKA_FORWARD(in0),
201 ALPAKA_FORWARD(in)...});
202 }
203} // namespace alpaka::onHost::internal
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
Definition common.hpp:30
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:153
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
Definition common.hpp:147
#define ALPAKA_LOG_INFO(logLvl, callable)
Write a meta data message to the output.
Definition logger.hpp:106
consteval auto adjustToLimit(concepts::CVector auto const input)
adjust the input vector to a given limit by halving all components until the product of these is is b...
Definition util.hpp:63
constexpr auto cpu
Definition tag.hpp:170
constexpr auto thread
Definition tag.hpp:255
ALPAKA_FN_ACC void atomicInvoke(auto &&fn, concepts::Acc auto const &acc, auto *inOut, auto &&... args)
Defines the equivalent of an atomic invoke for user defined functors.
Definition atomic.hpp:199
constexpr auto blocksInGrid
constexpr auto allThreads
Represent the identity of the executor thread.
constexpr auto linearThreadsInBlock
constexpr auto threadsInBlock
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(auto const &acc, auto const workGroup, auto const range, T_Traverse traverse=T_Traverse{}, T_IdxLayout idxLayout=T_IdxLayout{})
Creates an index container.
Definition interface.hpp:57
constexpr auto getDynSharedMem(concepts::Acc auto const &acc) -> T *
Get block shared dynamic memory.
Definition Acc.hpp:192
constexpr void syncBlockThreads(concepts::Acc auto const &acc)
Synchronize all threads within a thread block.
Definition Acc.hpp:127
constexpr auto queue
Definition lvl.hpp:127
constexpr auto memory
Definition lvl.hpp:112
FrameSpec(T_NumFrames const &, T_FrameExtents const &) -> FrameSpec< alpaka::trait::getVec_t< T_NumFrames >, alpaka::trait::getVec_t< T_FrameExtents >, alpaka::exec::AnyExecutor >
void fill(Queue< T_Device, T_QueueKind > const &queue, auto &&dest, T_Value elementValue)
fill memory element wise
Definition Queue.hpp:346
constexpr auto demangledName()
decltype(auto) getExtents(auto &&any)
Object extents.
Definition interface.hpp:25
typename GetValueType< T >::type GetValueType_t
Definition trait.hpp:65
ALPAKA_FN_HOST_ACC StencilFunc(T_Func &&) -> StencilFunc< T_Func >
constexpr auto calculatePitchesFromExtents(T_Vec const &extent)
Calculate the pitches purely from the extents.
constexpr T_IntegralType linearize(Vec< T_IntegralType, T_dim - 1u, T_Storage > const &dim, Vec< T_IntegralType, T_dim, T_OtherStorage > const &idx)
Give the linear index of an N-dimensional index within an N-dimensional index space.
Definition Vec.hpp:832
ALPAKA_FN_HOST_ACC ScalarFunc(T_Func &&) -> ScalarFunc< T_Func >
ALPAKA_FN_HOST KernelBundle(TKernelFn const &, TArgs &&...) -> KernelBundle< TKernelFn, TArgs... >
User defined deduction guide with trailing return type. For CTAD during the construction.
ALPAKA_FN_HOST_ACC IdxRange(T_Extents const &) -> IdxRange< typename trait::getVec_t< T_Extents >::UniVec >