23namespace alpaka::onAcc::internal
27 template<
typename T_Parent>
28 struct SimdTransformReduce
30 constexpr SimdTransformReduce() =
default;
33 template<u
int32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
35 concepts::Acc
auto const& acc,
36 alpaka::concepts::Vector
auto extents,
37 auto const& neutralElement,
40 alpaka::concepts::IDataSource
auto&& data0,
41 alpaka::concepts::IDataSource
auto&&... dataN)
const
43 auto numElements =
typename ALPAKA_TYPEOF(extents)::UniVec{extents};
45 decltype(
auto) transformFunc = wrapTransformFunc(
ALPAKA_FORWARD(func));
47 constexpr auto simdCfg = T_Parent::template calcSimdPackConfig<ValueType>(
50 T_maxConcurrencyInByte);
52 constexpr uint32_t simdWidth = simdCfg.simdWidth;
54 if constexpr(simdWidth != 1u)
56 constexpr uint32_t numSimdPerFnCall = simdCfg.numSimdPacksPerFnCall;
57 return reduceSimdPackExecution<simdWidth, numSimdPerFnCall, T_MemAlignment>(
67 auto const workGroup = asParent().getWorkGroup();
73 asParent().getTraversePolicy(),
74 asParent().getIdxLayoutPolicy());
79 for(
auto idx : traverse)
81 simdizedReducedValue = reduceFunc(
90 auto result = neutralElement;
92 [](
auto& lhs, alpaka::concepts::Simd
auto const& rhs) { lhs = rhs[0]; },
94 simdizedReducedValue);
99 template<uint32_t... T_idx>
101 std::integer_sequence<uint32_t, T_idx...>,
119 onAcc::concepts::Acc
auto const& acc,
120 alpaka::concepts::SimdPtr
auto&& inPtr0,
121 alpaka::concepts::SimdPtr
auto const&... inPtr)
constexpr
123 return loadAncExecuteScalarOp(
124 std::make_integer_sequence<uint32_t,
ALPAKA_TYPEOF(inPtr0)::width()>{},
125 [](alpaka::concepts::CVector
auto idx,
128 alpaka::concepts::Simd
auto const&...
data)
constexpr
129 {
return callFunctor(acc, func, data[idx.x()]...); },
139 onAcc::concepts::Acc
auto const& acc,
140 alpaka::concepts::SimdPtr
auto&&... inPtr)
constexpr
141 {
return callFunctor(acc, transformFunc, inPtr.load()...); };
145 template<alpaka::concepts::Alignment T_MemAlignment, u
int32_t T_w
idth>
147 concepts::Acc
auto const& acc,
150 alpaka::concepts::IDataSource
auto&&... data)
163 template<uint32_t... T_repeat>
166 std::integer_sequence<uint32_t, T_repeat...>)
169 return std::make_tuple(*(T_repeat + 1 != 0u ? iter++ : iter++)...);
183 template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width, uint32_t... T_repeat>
185 concepts::Acc
auto const& acc,
187 std::integer_sequence<uint32_t, T_repeat...>,
190 alpaka::concepts::IDataSource
auto&&... data)
192 auto ids = makeAdvanceIterators(iter, std::integer_sequence<uint32_t, T_repeat...>{});
194 [&](
auto const&... dataIdx)
constexpr
200 executeDoTransform<T_MemAlignment, T_width>(
202 std::get<0>(std::make_tuple(dataIdx...)),
206 executeDoTransform<T_MemAlignment, T_width>(acc, dataIdx, func,
data...)...};
208 return results.reduce(reduceFunc);
222 template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width, uint32_t... T_repeat>
224 concepts::Acc
auto const& acc,
226 std::integer_sequence<uint32_t, T_repeat...>,
230 alpaka::concepts::IDataSource
auto&&... data)
232 auto ids = makeAdvanceIterators(iter, std::integer_sequence<uint32_t, T_repeat...>{});
234 [&](
auto const&... dataIdx)
constexpr
238 executeDoTransform<T_MemAlignment, T_width>(acc, dataIdx, transformFn,
data...))),
247 template<u
int32_t T_simdW
idth, u
int32_t T_numSimdPerFnCall, alpaka::concepts::Alignment T_MemAlignment>
254 alpaka::concepts::IDataSource
auto&& data0,
255 alpaka::concepts::IDataSource
auto&&... dataN)
257 if constexpr(alpaka::concepts::Simd<std::remove_cvref_t<
decltype(tmpReturn)>>)
259 tmpReturn = reduceFn(
261 executeReduce<T_MemAlignment, T_simdWidth>(
264 std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
272 executeReduceInto<T_MemAlignment, T_simdWidth>(
275 std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
284 template<onAcc::concepts::Acc T_Acc,
typename T_ReduceOp>
289 T_ReduceOp
const& m_reduceOp;
291 constexpr ScalarReducer(T_Acc
const& acc,
auto&& func) : m_acc(acc), m_reduceOp{
ALPAKA_FORWARD(func)}
295 constexpr auto operator()(
auto&& a,
auto&& b)
const
298 return loadAncExecuteScalarOp(
299 std::make_integer_sequence<uint32_t,
ALPAKA_TYPEOF(a)::width()>{},
301 alpaka::concepts::CVector
auto idx,
302 concepts::Acc
auto const& acc,
304 auto const&...
data)
constexpr
309 alpaka::unused(acc, func);
311 return this->operator()(data[idx.x()]...);
319 constexpr auto operator()(
auto&& a,
auto&& b)
const
326 template<uint32_t... T_idx>
328 std::integer_sequence<uint32_t, T_idx...>,
342 ALPAKA_FN_INLINE constexpr auto getReducer(onAcc::concepts::Acc
auto const&,
auto&& reduceOp)
const
348 ALPAKA_FN_INLINE constexpr auto getReducer(onAcc::concepts::Acc
auto const& acc,
auto&& reduceOp)
const
354 constexpr auto const& asParent()
const
356 return static_cast<T_Parent const&
>(*this);
359 template<u
int32_t T_simdW
idth, u
int32_t T_numSimdPerFnCall, alpaka::concepts::Alignment T_MemAlignment>
362 alpaka::concepts::Vector
auto numElements,
363 auto const& neutralElement,
364 auto&& userReduceFunc,
366 alpaka::concepts::IDataSource
auto&& data0,
367 alpaka::concepts::IDataSource
auto&&... dataN)
const
369 auto reduceFunc = getReducer(acc, userReduceFunc);
371 auto const workGroup = asParent().getWorkGroup();
374 auto const wSize = workGroup.size(acc).back();
377 auto const numElementsPerFnCall = T_simdWidth * T_numSimdPerFnCall;
381 auto const numSimdPackLoops = numElements.back() / numElementsPerFnCall / wSize;
384 auto const remainderBegin = numSimdPackLoops * numElementsPerFnCall * wSize;
387 auto domainSize = numElements.rAssign(remainderBegin);
388 auto stride =
ALPAKA_TYPEOF(numElements)::fill(1).rAssign(T_simdWidth);
394 IdxRange{IdxType::fill(0), domainSize, stride},
395 asParent().getTraversePolicy(),
396 asParent().getIdxLayoutPolicy());
402 domainSize.dim() > 1u && std::is_same_v<
ALPAKA_TYPEOF(asParent().getTraversePolicy()), traverse::Flat>)
412 using index_type =
typename IdxType::type;
413 auto wIdx = workGroup.idx(acc).rAssign(index_type{0});
414 auto wSize = workGroup.size(acc).rAssign(index_type{1});
415 auto domSize = domainSize.rAssign(index_type{1});
417 auto wOuter = WorkerGroup{wIdx, wSize};
423 asParent().getTraversePolicy(),
424 asParent().getIdxLayoutPolicy()))
427 auto wIdxInner =
ALPAKA_TYPEOF(domainSize)::fill(0).rAssign(workGroup.idx(acc).back());
428 auto wSizeInner =
ALPAKA_TYPEOF(domainSize)::fill(1).rAssign(workGroup.size(acc).back());
429 auto wInner = WorkerGroup{wIdxInner, wSizeInner};
435 IdxRange{rowIdx, domainSize, stride},
436 asParent().getTraversePolicy(),
437 asParent().getIdxLayoutPolicy())[
CVec<uint32_t,
ALPAKA_TYPEOF(domainSize)::dim() - 1u>{}];
439 for(
auto iter = simdIdxContainerFastDim.begin(); iter != simdIdxContainerFastDim.end();)
441 reduceNextSimdized<T_simdWidth, T_numSimdPerFnCall, T_MemAlignment>(
444 simdizedReducedValue,
454 for(
auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
456 reduceNextSimdized<T_simdWidth, T_numSimdPerFnCall, T_MemAlignment>(
459 simdizedReducedValue,
467 ALPAKA_TYPEOF(numElements) remainderDomainSize = numElements.fill(0).rAssign(remainderBegin);
469 for(
auto idx : onAcc::makeIdxMap(
472 IdxRange{remainderDomainSize, numElements},
473 asParent().getTraversePolicy(),
474 asParent().getIdxLayoutPolicy()))
479 SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
480 SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...);
487 lhs[0] = reduceFunc(std::as_const(lhs)[0], rhs[0]);
489 simdizedReducedValue,
497 simdizedReducedValue);
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC.
#define ALPAKA_TYPEOF(...)
Get the type of instance.
#define ALPAKA_FN_INLINE
Macro defining the inline function attribute.
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(auto const &acc, auto const workGroup, auto const range, T_Traverse traverse=T_Traverse{}, T_IdxLayout idxLayout=T_IdxLayout{})
Creates an index container.
decltype(auto) data(auto &&any)
Pointer to the data of an object.
typename GetValueType< T >::type GetValueType_t
ALPAKA_FN_HOST_ACC StencilFunc(T_Func &&) -> StencilFunc< T_Func >
Vec< T, sizeof...(T_values), detail::CVec< T, T_values... > > CVec
A vector with compile-time known values.
ALPAKA_FN_HOST_ACC ScalarFunc(T_Func &&) -> ScalarFunc< T_Func >
constexpr void simdizedInvoke(auto &&fn, auto &&... args)
Invokes the callable object fn with the parameters args.
ALPAKA_FN_HOST_ACC IdxRange(T_Extents const &) -> IdxRange< typename trait::getVec_t< T_Extents >::UniVec >
constexpr bool isSpecializationOf_v
Checks if T is an instance of U.
constexpr auto callFunctor(T_Acc const &acc, T_Functor &&functor, T_Args &&... args)
Execute the functor with or without an accelerator as first argument.
ALPAKA_FN_HOST_ACC Simd(T_1, T_Args...) -> Simd< T_1, uint32_t(sizeof...(T_Args)+1u)>
constexpr auto makeSimdized(auto &&value)
Transform a type into a SIMD-optimized data structure.
On some constexpr function signatures ALPAKA_FN_HOST_ACC is required for CUDA; otherwise a __host__ f...