alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
SimdTransformReduce.hpp
Go to the documentation of this file.
1/* Copyright 2024 René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
7#include "alpaka/Simd.hpp"
8#include "alpaka/SimdPtr.hpp"
9#include "alpaka/Vec.hpp"
10#include "alpaka/api/trait.hpp"
12#include "alpaka/functor.hpp"
15#include "alpaka/onAcc/Acc.hpp"
19
20#include <cstdint>
21#include <new>
22
24{
25
26 /** concurrent reduce implementation */
27 template<typename T_Parent>
29 {
30 constexpr SimdTransformReduce() = default;
31
32 protected:
33 template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
35 concepts::Acc auto const& acc,
36 alpaka::concepts::Vector auto extents,
37 auto const& neutralElement,
38 auto&& reduceFunc,
39 auto&& func,
41 alpaka::concepts::IDataSource auto&&... dataN) const
42 {
43 auto numElements = typename ALPAKA_TYPEOF(extents)::UniVec{extents};
44 using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;
45 decltype(auto) transformFunc = wrapTransformFunc(ALPAKA_FORWARD(func));
46
47 constexpr auto simdCfg = T_Parent::template calcSimdPackConfig<ValueType>(
48 ALPAKA_TYPEOF(acc.getApi()){},
49 ALPAKA_TYPEOF(acc.getDeviceKind()){},
50 T_maxConcurrencyInByte);
51
52 constexpr uint32_t simdWidth = simdCfg.simdWidth;
53
54 if constexpr(simdWidth != 1u)
55 {
56 constexpr uint32_t numSimdPerFnCall = simdCfg.numSimdPacksPerFnCall;
58 acc,
59 numElements,
60 neutralElement,
61 ALPAKA_FORWARD(reduceFunc),
62 transformFunc,
63 ALPAKA_FORWARD(data0),
64 ALPAKA_FORWARD(dataN)...);
65 }
66
67 auto const workGroup = asParent().getWorkGroup();
68 // execute the algorithm with SIMD width one
70 acc,
71 workGroup,
72 IdxRange{numElements},
73 asParent().getTraversePolicy(),
74 asParent().getIdxLayoutPolicy());
75
76 using SimdOneReturnType = ALPAKA_TYPEOF(makeSimdized<1u>(neutralElement));
77 SimdOneReturnType simdizedReducedValue = makeSimdized<1u>(neutralElement);
78
79 for(auto idx : traverse)
80 {
81 simdizedReducedValue = reduceFunc(
82 simdizedReducedValue,
84 acc,
85 transformFunc,
86 SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
87 SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...));
88 }
89
90 auto result = neutralElement;
92 [](auto& lhs, alpaka::concepts::Simd auto const& rhs) { lhs = rhs[0]; },
93 result,
94 simdizedReducedValue);
95 return result;
96 }
97
98 private:
99 template<uint32_t... T_idx>
101 std::integer_sequence<uint32_t, T_idx...>,
102 auto&& op,
103 auto const& acc,
104 auto&& func,
105 auto&&... data)
106 {
107 return Simd{op(CVec<uint32_t, T_idx>{}, acc, ALPAKA_FORWARD(func), ALPAKA_FORWARD(data)...)...};
108 }
109
110 ALPAKA_FN_INLINE ALPAKA_FN_ACC static constexpr decltype(auto) wrapTransformFunc(auto&& transformFunc)
111 {
112 if constexpr(isSpecializationOf_v<ALPAKA_TYPEOF(transformFunc), StencilFunc>)
113 {
114 return ALPAKA_FORWARD(transformFunc);
115 }
116 else if constexpr(isSpecializationOf_v<ALPAKA_TYPEOF(transformFunc), ScalarFunc>)
117 {
118 return [transformFunc = ALPAKA_FORWARD(transformFunc)](
119 onAcc::concepts::Acc auto const& acc,
120 alpaka::concepts::SimdPtr auto&& inPtr0,
121 alpaka::concepts::SimdPtr auto const&... inPtr) constexpr
122 {
124 std::make_integer_sequence<uint32_t, ALPAKA_TYPEOF(inPtr0)::width()>{},
125 [](alpaka::concepts::CVector auto idx,
126 auto const& acc,
127 auto&& func,
128 alpaka::concepts::Simd auto const&... data) constexpr
129 { return callFunctor(acc, func, data[idx.x()]...); },
130 acc,
131 transformFunc,
132 inPtr0.load(),
133 inPtr.load()...);
134 };
135 }
136 else
137 {
138 return [transformFunc = ALPAKA_FORWARD(transformFunc)](
139 onAcc::concepts::Acc auto const& acc,
140 alpaka::concepts::SimdPtr auto&&... inPtr) constexpr
141 { return callFunctor(acc, transformFunc, inPtr.load()...); };
142 }
143 }
144
145 template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width>
147 concepts::Acc auto const& acc,
148 auto const& dataIdx,
149 auto&& func,
150 alpaka::concepts::IDataSource auto&&... data)
151 {
152 return callFunctor(acc, func, SimdPtr{data, dataIdx, T_MemAlignment{}, CVec<uint32_t, T_width>{}}...);
153 }
154
155 /** advance the iterator T_repeat times
156 *
157 * We do not check if the iterator points to a valid element, the caller must ensure that we can safely
158 * advance the iterator T_repeat time without jumping over iter.end().
159 *
160 * @tparam T_repeat Number of times the iterator should be advanced.
161 * @return Tuple with T_repeat times iterators.
162 */
163 template<uint32_t... T_repeat>
165 auto& iter,
166 std::integer_sequence<uint32_t, T_repeat...>)
167 {
168 // The ternary operator is used to allow using the folding expression on iter.
169 return std::make_tuple(*(T_repeat + 1 != 0u ? iter++ : iter++)...);
170 }
171
172 /** Calls the transform functor T_repeat times and reduces the results with the given reduce function.
173 *
174 * The calls to the functor are independent and compile time unrolled to support instruction parallelism.
175 * In contrast to executeReduceInto() the register footprint is larger because T_repeat temporary results will
176 * be held. This allows the compiler to use instruction level parallelism. Call this function if the result of
177 * reduceFunc is a SIMD pack.
178 *
179 * @param iter the caller must ensure that the iterator can be increased T_repeat times without jumping over
180 * iter.end()
181 * @return a single simdized pack
182 */
183 template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width, uint32_t... T_repeat>
184 ALPAKA_FN_INLINE static constexpr auto executeReduce(
185 concepts::Acc auto const& acc,
186 auto& iter,
187 std::integer_sequence<uint32_t, T_repeat...>,
188 auto&& reduceFunc,
189 auto&& func,
190 alpaka::concepts::IDataSource auto&&... data)
191 {
192 auto ids = makeAdvanceIterators(iter, std::integer_sequence<uint32_t, T_repeat...>{});
193 return std::apply(
194 [&](auto const&... dataIdx) constexpr
195 {
196 /* It is not possible to create a Simd{Simd} due to constructor issues. Therefore we need to define
197 * the type for the result explicit.
198 */
199 using ComponentType = ALPAKA_TYPEOF(
201 acc,
202 std::get<0>(std::make_tuple(dataIdx...)),
203 func,
204 data...));
205 auto results = Simd<ComponentType, std::tuple_size_v<ALPAKA_TYPEOF(ids)>>{
206 executeDoTransform<T_MemAlignment, T_width>(acc, dataIdx, func, data...)...};
207
208 return results.reduce(reduceFunc);
209 },
210 ids);
211 }
212
213 /** Reduce simdized packs into a single simdized pack with the given reduce function.
214 *
215 * In contrast to executeReduce() the register footprint is lower because all intermediate results are directly
216 * reduced into the result variable. Call this function if the type of result is a simdized pack is not a SIMD
217 * pack.
218 *
219 * @param result The results of reduceFn with the result of transformFn will be reduced into this simdized
220 * pack.
221 */
222 template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width, uint32_t... T_repeat>
223 ALPAKA_FN_INLINE static constexpr void executeReduceInto(
224 concepts::Acc auto const& acc,
225 auto& iter,
226 std::integer_sequence<uint32_t, T_repeat...>,
227 auto& result,
228 auto&& reduceFn,
229 auto&& transformFn,
230 alpaka::concepts::IDataSource auto&&... data)
231 {
232 auto ids = makeAdvanceIterators(iter, std::integer_sequence<uint32_t, T_repeat...>{});
233 std::apply(
234 [&](auto const&... dataIdx) constexpr
235 {
236 ((result = reduceFn(
237 result,
238 executeDoTransform<T_MemAlignment, T_width>(acc, dataIdx, transformFn, data...))),
239 ...);
240 },
241 ids);
242 }
243
244 /** Reduce T_numSimdPerFnCall simdized packs
245 *
246 */
247 template<uint32_t T_simdWidth, uint32_t T_numSimdPerFnCall, alpaka::concepts::Alignment T_MemAlignment>
249 auto const& acc,
250 auto& iter,
251 auto& tmpReturn,
252 auto&& reduceFn,
253 auto&& transformFn,
255 alpaka::concepts::IDataSource auto&&... dataN)
256 {
257 if constexpr(alpaka::concepts::Simd<std::remove_cvref_t<decltype(tmpReturn)>>)
258 {
259 tmpReturn = reduceFn(
260 tmpReturn,
262 acc,
263 iter,
264 std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
265 reduceFn,
266 transformFn,
267 data0,
268 dataN...));
269 }
270 else
271 {
273 acc,
274 iter,
275 std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
276 tmpReturn,
277 reduceFn,
278 transformFn,
279 data0,
280 dataN...);
281 }
282 }
283
284 template<onAcc::concepts::Acc T_Acc, typename T_ReduceOp>
286 {
287 // using a const reference here is fine because we control the lifetime
288 T_Acc const& m_acc;
289 T_ReduceOp const& m_reduceOp;
290
291 constexpr ScalarReducer(T_Acc const& acc, auto&& func) : m_acc(acc), m_reduceOp{ALPAKA_FORWARD(func)}
292 {
293 }
294
295 constexpr auto operator()(auto&& a, auto&& b) const
297 {
299 std::make_integer_sequence<uint32_t, ALPAKA_TYPEOF(a)::width()>{},
300 [this](
302 concepts::Acc auto const& acc,
303 auto&& func,
304 auto const&... data) constexpr
305 {
306 /* const& for data is used instead of && to enforce const evaluation of the operator[]
307 * std simd operator[] is returning a smart reference which is avoided if data is const
308 */
309 alpaka::unused(acc, func);
310 // recursively call until no Simd type is the result
311 return this->operator()(data[idx.x()]...);
312 },
313 m_acc,
316 ALPAKA_FORWARD(b));
317 }
318
319 constexpr auto operator()(auto&& a, auto&& b) const
321 {
323 }
324
325 private:
326 template<uint32_t... T_idx>
328 std::integer_sequence<uint32_t, T_idx...>,
329 auto&& op,
330 auto const& acc,
331 auto&& func,
332 auto&&... data)
333 {
334 return Simd{op(CVec<uint32_t, T_idx>{}, acc, ALPAKA_FORWARD(func), ALPAKA_FORWARD(data)...)...};
335 }
336 };
337
338 /** Get the reducer functor
339 *
340 * @return wrapped functor in case the input is @see ScalarFunc else the identity
341 */
342 ALPAKA_FN_INLINE constexpr auto getReducer(onAcc::concepts::Acc auto const&, auto&& reduceOp) const
343 requires(!isSpecializationOf_v<ALPAKA_TYPEOF(reduceOp), ScalarFunc>)
344 {
345 return reduceOp;
346 }
347
348 ALPAKA_FN_INLINE constexpr auto getReducer(onAcc::concepts::Acc auto const& acc, auto&& reduceOp) const
349 requires(isSpecializationOf_v<ALPAKA_TYPEOF(reduceOp), ScalarFunc>)
350 {
351 return ScalarReducer<ALPAKA_TYPEOF(acc), ALPAKA_TYPEOF(reduceOp)>{acc, reduceOp};
352 }
353
354 constexpr auto const& asParent() const
355 {
356 return static_cast<T_Parent const&>(*this);
357 }
358
359 template<uint32_t T_simdWidth, uint32_t T_numSimdPerFnCall, alpaka::concepts::Alignment T_MemAlignment>
361 auto const& acc,
362 alpaka::concepts::Vector auto numElements,
363 auto const& neutralElement,
364 auto&& userReduceFunc,
365 auto&& func,
367 alpaka::concepts::IDataSource auto&&... dataN) const
368 {
369 auto reduceFunc = getReducer(acc, userReduceFunc);
370
371 auto const workGroup = asParent().getWorkGroup();
372
373 // we SIMDfy only over the fast moving dimension (columns of memory)
374 auto const wSize = workGroup.size(acc).back();
375
376 /* Number of data elements processed per functor call. */
377 auto const numElementsPerFnCall = T_simdWidth * T_numSimdPerFnCall;
378 /** To avoid an overflow in the index range we divide first by the number of elements per
379 * function call and then by the number of workers.
380 */
381 auto const numSimdPackLoops = numElements.back() / numElementsPerFnCall / wSize;
382
383 // number of elements to jump over to start the remainder loop
384 auto const remainderBegin = numSimdPackLoops * numElementsPerFnCall * wSize;
385
386 // we SIMDfy only over the fast moving dimension (columns of memory)
387 auto domainSize = numElements.rAssign(remainderBegin);
388 auto stride = ALPAKA_TYPEOF(numElements)::fill(1).rAssign(T_simdWidth);
389
390 using IdxType = ALPAKA_TYPEOF(numElements);
391 auto simdIdxContainer = onAcc::makeIdxMap(
392 acc,
393 workGroup,
394 IdxRange{IdxType::fill(0), domainSize, stride},
395 asParent().getTraversePolicy(),
396 asParent().getIdxLayoutPolicy());
397
398 using SimdReturn = ALPAKA_TYPEOF(makeSimdized<T_simdWidth>(neutralElement));
399 SimdReturn simdizedReducedValue = makeSimdized<T_simdWidth>(neutralElement);
400
401 if constexpr(
402 domainSize.dim() > 1u && std::is_same_v<ALPAKA_TYPEOF(asParent().getTraversePolicy()), traverse::Flat>)
403 {
404 /* For cases where we traverse with the flat policy, we cannot assume that we can blindly increase the
405 * iterator later N times. This could happen in cases where we have enough concurrency. We evaluate for
406 * SIMD operations only the fast moving dimension but with the flat policy flattening the worker group
407 * and use all workers on a linear domain. The loop must therefore be split into iterating over all
408 * slow dimensions and an inner loop iterating over the fast moving dimension. For this we need to
409 * build our own groups out of the user-provided workgroup.
410 */
411 // build a worker group with slow-moving dimension threads for the outer loop
412 using index_type = typename IdxType::type;
413 auto wIdx = workGroup.idx(acc).rAssign(index_type{0});
414 auto wSize = workGroup.size(acc).rAssign(index_type{1});
415 auto domSize = domainSize.rAssign(index_type{1});
416
417 auto wOuter = WorkerGroup{wIdx, wSize};
418
419 for(auto rowIdx : onAcc::makeIdxMap(
420 acc,
421 wOuter,
422 IdxRange{domSize},
423 asParent().getTraversePolicy(),
424 asParent().getIdxLayoutPolicy()))
425 {
426 // build a worker group with fast-moving dimension threads for the inner loop
427 auto wIdxInner = ALPAKA_TYPEOF(domainSize)::fill(0).rAssign(workGroup.idx(acc).back());
428 auto wSizeInner = ALPAKA_TYPEOF(domainSize)::fill(1).rAssign(workGroup.size(acc).back());
429 auto wInner = WorkerGroup{wIdxInner, wSizeInner};
430
431 // iterate over the fast-moving dimension only
432 auto simdIdxContainerFastDim = onAcc::makeIdxMap(
433 acc,
434 wInner,
435 IdxRange{rowIdx, domainSize, stride},
436 asParent().getTraversePolicy(),
437 asParent().getIdxLayoutPolicy())[CVec<uint32_t, ALPAKA_TYPEOF(domainSize)::dim() - 1u>{}];
438
439 for(auto iter = simdIdxContainerFastDim.begin(); iter != simdIdxContainerFastDim.end();)
440 {
442 acc,
443 iter,
444 simdizedReducedValue,
445 ALPAKA_FORWARD(reduceFunc),
446 ALPAKA_FORWARD(func),
447 ALPAKA_FORWARD(data0),
448 ALPAKA_FORWARD(dataN)...);
449 }
450 }
451 }
452 else
453 {
454 for(auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
455 {
457 acc,
458 iter,
459 simdizedReducedValue,
460 ALPAKA_FORWARD(reduceFunc),
461 ALPAKA_FORWARD(func),
462 ALPAKA_FORWARD(data0),
463 ALPAKA_FORWARD(dataN)...);
464 }
465 }
466
467 ALPAKA_TYPEOF(numElements) remainderDomainSize = numElements.fill(0).rAssign(remainderBegin);
468
469 for(auto idx : onAcc::makeIdxMap(
470 acc,
471 workGroup,
472 IdxRange{remainderDomainSize, numElements},
473 asParent().getTraversePolicy(),
474 asParent().getIdxLayoutPolicy()))
475 {
476 auto transformResult = callFunctor(
477 acc,
478 func,
479 SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
480 SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...);
481
483 [reduceFunc](auto& lhs, alpaka::concepts::Simd auto const& rhs)
484 {
485 // std simd non-const operator[] is returning a smart reference, therefore we need
486 // std::as_const to enforce returning a copy of the value.
487 lhs[0] = reduceFunc(std::as_const(lhs)[0], rhs[0]);
488 },
489 simdizedReducedValue,
490 transformResult);
491 }
492
493 ALPAKA_TYPEOF(neutralElement) result;
495 [reduceFunc](auto& lhs, alpaka::concepts::Simd auto const& rhs) { lhs = rhs.reduce(reduceFunc); },
496 result,
497 simdizedReducedValue);
498 return result;
499 }
500 };
501} // namespace alpaka::onAcc::internal
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
Definition common.hpp:30
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:153
#define ALPAKA_FN_INLINE
Macro defining the inline function attribute.
Definition common.hpp:87
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
Definition common.hpp:147
Concept to check for an alignment object.
Definition Alignment.hpp:89
Concept to check if a type is a CVector.
Definition Vec.hpp:74
Concept to check if a type is a SIMD pointer.
Definition SimdPtr.hpp:41
Concept to check if a type is a vector.
Definition Vec.hpp:53
Concept to check if a type is an accelerator.
Definition Acc.hpp:114
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(auto const &acc, auto const workGroup, auto const range, T_Traverse traverse=T_Traverse{}, T_IdxLayout idxLayout=T_IdxLayout{})
Creates an index container.
Definition interface.hpp:57
typename GetValueType< T >::type GetValueType_t
Definition trait.hpp:65
Vec< T, sizeof...(T_values), detail::CVec< T, T_values... > > CVec
A vector with compile-time known values.
Definition CVec.hpp:31
constexpr void simdizedInvoke(auto &&fn, auto &&... args)
Invokes the callable object fn with the parameters args.
Definition simdized.hpp:81
constexpr bool isSpecializationOf_v
checks if T is an instance of U
Definition utility.hpp:103
constexpr auto callFunctor(T_Acc const &acc, T_Functor &&functor, T_Args &&... args)
Execute the functor with or without an accelerator as first argument.
Definition functor.hpp:58
constexpr auto makeSimdized(auto &&value)
Transform a type into a SIMD-optimized data structure.
Definition simdized.hpp:50
On some constexpr function signatures ALPAKA_FN_HOST_ACC is required for CUDA; otherwise a __host__ f...
Marks a functor that can only be executed with scalar types and not SIMD packages.
Definition functor.hpp:39
pointer to a SIMD pack with the width T_SimdWidth
Definition SimdPtr.hpp:62
Simd vector.
Definition Simd.hpp:78
constexpr auto reduce(auto &&reduceFunc) const -> decltype(reduceFunc(std::declval< type >(), std::declval< type >()))
reduce all elements to a single value
Definition Simd.hpp:426
Marks a functor which supports SimdPtr as arguments.
Definition functor.hpp:21
ALPAKA_FN_INLINE static ALPAKA_FN_ACC constexpr auto loadAncExecuteScalarOp(std::integer_sequence< uint32_t, T_idx... >, auto &&op, auto const &acc, auto &&func, auto &&... data)
ALPAKA_FN_INLINE static ALPAKA_FN_ACC constexpr auto loadAncExecuteScalarOp(std::integer_sequence< uint32_t, T_idx... >, auto &&op, auto const &acc, auto &&func, auto &&... data)
static ALPAKA_FN_INLINE constexpr void reduceNextSimdized(auto const &acc, auto &iter, auto &tmpReturn, auto &&reduceFn, auto &&transformFn, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN)
Reduce T_numSimdPerFnCall simdized packs.
static ALPAKA_FN_INLINE constexpr void executeReduceInto(concepts::Acc auto const &acc, auto &iter, std::integer_sequence< uint32_t, T_repeat... >, auto &result, auto &&reduceFn, auto &&transformFn, alpaka::concepts::IDataSource auto &&... data)
Reduce simdized packs into a single simdized pack with the given reduce function.
static ALPAKA_FN_INLINE constexpr auto executeReduce(concepts::Acc auto const &acc, auto &iter, std::integer_sequence< uint32_t, T_repeat... >, auto &&reduceFunc, auto &&func, alpaka::concepts::IDataSource auto &&... data)
Calls the transform functor T_repeat times and reduces the results with the given reduce function.
ALPAKA_FN_INLINE constexpr auto getReducer(onAcc::concepts::Acc auto const &, auto &&reduceOp) const
Get the reducer functor.
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto reduceSimdPackExecution(auto const &acc, alpaka::concepts::Vector auto numElements, auto const &neutralElement, auto &&userReduceFunc, auto &&func, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(concepts::Acc auto const &acc, alpaka::concepts::Vector auto extents, auto const &neutralElement, auto &&reduceFunc, auto &&func, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
static ALPAKA_FN_INLINE constexpr auto executeDoTransform(concepts::Acc auto const &acc, auto const &dataIdx, auto &&func, alpaka::concepts::IDataSource auto &&... data)
static ALPAKA_FN_INLINE constexpr auto makeAdvanceIterators(auto &iter, std::integer_sequence< uint32_t, T_repeat... >)
advance the iterator T_repeat times
ALPAKA_FN_INLINE static ALPAKA_FN_ACC constexpr decltype(auto) wrapTransformFunc(auto &&transformFunc)
ALPAKA_FN_INLINE constexpr auto getReducer(onAcc::concepts::Acc auto const &acc, auto &&reduceOp) const
Linearize the index domain for traversing.
Definition traverse.hpp:22