27 template<
typename T_Parent>
33 template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
37 auto const& neutralElement,
43 auto numElements =
typename ALPAKA_TYPEOF(extents)::UniVec{extents};
47 constexpr auto simdCfg = T_Parent::template calcSimdPackConfig<ValueType>(
50 T_maxConcurrencyInByte);
52 constexpr uint32_t simdWidth = simdCfg.simdWidth;
54 if constexpr(simdWidth != 1u)
56 constexpr uint32_t numSimdPerFnCall = simdCfg.numSimdPacksPerFnCall;
67 auto const workGroup =
asParent().getWorkGroup();
81 simdizedReducedValue = reduceFunc(
90 auto result = neutralElement;
94 simdizedReducedValue);
99 template<uint32_t... T_idx>
101 std::integer_sequence<uint32_t, T_idx...>,
124 std::make_integer_sequence<uint32_t,
ALPAKA_TYPEOF(inPtr0)::width()>{},
129 {
return callFunctor(acc, func, data[idx.x()]...); },
141 {
return callFunctor(acc, transformFunc, inPtr.load()...); };
145 template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width>
163 template<uint32_t... T_repeat>
166 std::integer_sequence<uint32_t, T_repeat...>)
169 return std::make_tuple(*(T_repeat + 1 != 0u ? iter++ : iter++)...);
187 std::integer_sequence<uint32_t, T_repeat...>,
194 [&](
auto const&... dataIdx)
constexpr
202 std::get<0>(std::make_tuple(dataIdx...)),
208 return results.
reduce(reduceFunc);
226 std::integer_sequence<uint32_t, T_repeat...>,
234 [&](
auto const&... dataIdx)
constexpr
247 template<uint32_t T_simdWidth, uint32_t T_numSimdPerFnCall, alpaka::concepts::Alignment T_MemAlignment>
259 tmpReturn = reduceFn(
264 std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
275 std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
284 template<onAcc::concepts::Acc T_Acc,
typename T_ReduceOp>
299 std::make_integer_sequence<uint32_t,
ALPAKA_TYPEOF(a)::width()>{},
304 auto const&... data)
constexpr
309 alpaka::unused(acc, func);
326 template<uint32_t... T_idx>
328 std::integer_sequence<uint32_t, T_idx...>,
356 return static_cast<T_Parent const&
>(*this);
359 template<uint32_t T_simdWidth, uint32_t T_numSimdPerFnCall, alpaka::concepts::Alignment T_MemAlignment>
363 auto const& neutralElement,
364 auto&& userReduceFunc,
369 auto reduceFunc =
getReducer(acc, userReduceFunc);
371 auto const workGroup =
asParent().getWorkGroup();
374 auto const wSize = workGroup.size(acc).back();
377 auto const numElementsPerFnCall = T_simdWidth * T_numSimdPerFnCall;
381 auto const numSimdPackLoops = numElements.back() / numElementsPerFnCall / wSize;
384 auto const remainderBegin = numSimdPackLoops * numElementsPerFnCall * wSize;
387 auto domainSize = numElements.rAssign(remainderBegin);
388 auto stride =
ALPAKA_TYPEOF(numElements)::fill(1).rAssign(T_simdWidth);
394 IdxRange{IdxType::fill(0), domainSize, stride},
412 using index_type =
typename IdxType::type;
413 auto wIdx = workGroup.idx(acc).rAssign(index_type{0});
414 auto wSize = workGroup.size(acc).rAssign(index_type{1});
415 auto domSize = domainSize.rAssign(index_type{1});
427 auto wIdxInner =
ALPAKA_TYPEOF(domainSize)::fill(0).rAssign(workGroup.idx(acc).back());
428 auto wSizeInner =
ALPAKA_TYPEOF(domainSize)::fill(1).rAssign(workGroup.size(acc).back());
435 IdxRange{rowIdx, domainSize, stride},
439 for(
auto iter = simdIdxContainerFastDim.begin(); iter != simdIdxContainerFastDim.end();)
444 simdizedReducedValue,
454 for(
auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
459 simdizedReducedValue,
467 ALPAKA_TYPEOF(numElements) remainderDomainSize = numElements.fill(0).rAssign(remainderBegin);
472 IdxRange{remainderDomainSize, numElements},
487 lhs[0] = reduceFunc(std::as_const(lhs)[0], rhs[0]);
489 simdizedReducedValue,
497 simdizedReducedValue);
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC.
#define ALPAKA_TYPEOF(...)
Get the type of an instance.
#define ALPAKA_FN_INLINE
Macro defining the inline function attribute.
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
Concept to check for an alignment object.
Concept to check if a type is a CVector.
Concept to check if a type is a SIMD pointer.
Concept to check if a type is a vector.
Concept to check if a type is an accelerator.
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(auto const &acc, auto const workGroup, auto const range, T_Traverse traverse=T_Traverse{}, T_IdxLayout idxLayout=T_IdxLayout{})
Creates an index container.
typename GetValueType< T >::type GetValueType_t
Vec< T, sizeof...(T_values), detail::CVec< T, T_values... > > CVec
A vector with compile-time known values.
constexpr void simdizedInvoke(auto &&fn, auto &&... args)
Invokes the callable object fn with the parameters args.
constexpr bool isSpecializationOf_v
Checks if T is an instance of U.
constexpr auto callFunctor(T_Acc const &acc, T_Functor &&functor, T_Args &&... args)
Execute the functor with or without an accelerator as first argument.
constexpr auto makeSimdized(auto &&value)
Transform a type into a SIMD-optimized data structure.
On some constexpr function signatures ALPAKA_FN_HOST_ACC is required for CUDA; otherwise a __host__ f...
Marks a functor that can only be executed with scalar types and not SIMD packages.
Pointer to a SIMD pack with the width T_SimdWidth.
constexpr auto reduce(auto &&reduceFunc) const -> decltype(reduceFunc(std::declval< type >(), std::declval< type >()))
Reduce all elements to a single value.
Marks a functor which supports SimdPtr as arguments.
Linearize the index domain for traversing.