37 auto numElements =
typename ALPAKA_TYPEOF(extents)::UniVec{extents};
40 constexpr auto simdCfg = T_Parent::template calcSimdPackConfig<ValueType>(
43 T_maxConcurrencyInByte);
45 constexpr uint32_t simdWidth = simdCfg.simdWidth;
47 if constexpr(simdWidth != 1u)
49 constexpr uint32_t numSimdPerFnCall = simdCfg.numSimdPacksPerFnCall;
129 auto const workGroup =
asParent().getWorkGroup();
132 auto const wSize = workGroup.size(acc).back();
135 auto const numElementsPerFnCall = T_simdWidth * T_numSimdPerFnCall;
139 auto const numSimdPackLoops = numElements.back() / numElementsPerFnCall / wSize;
142 auto const remainderBegin = numSimdPackLoops * numElementsPerFnCall * wSize;
145 auto domainSize = numElements.rAssign(remainderBegin);
146 auto stride =
ALPAKA_TYPEOF(numElements)::fill(1).rAssign(T_simdWidth);
160 using index_type =
typename IdxType::type;
161 auto wIdx = workGroup.idx(acc).rAssign(index_type{0});
162 auto wSize = workGroup.size(acc).rAssign(index_type{1});
163 auto domSize = domainSize.rAssign(index_type{1});
175 auto wIdxInner =
ALPAKA_TYPEOF(domainSize)::fill(0).rAssign(workGroup.idx(acc).back());
176 auto wSizeInner =
ALPAKA_TYPEOF(domainSize)::fill(1).rAssign(workGroup.size(acc).back());
183 IdxRange{rowIdx, domainSize, stride},
187 for(
auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
192 std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
204 IdxRange{IdxType::fill(0), domainSize, stride},
208 for(
auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
213 std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
220 ALPAKA_TYPEOF(numElements) remainderDomainSize = numElements.fill(0).rAssign(remainderBegin);
225 IdxRange{remainderDomainSize, numElements},