20namespace alpaka::onAcc::internal
23 template<
typename T_Parent>
26 constexpr SimdConcurrent() =
default;
29 template<u
int32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment>
32 alpaka::concepts::Vector
auto extents,
34 alpaka::concepts::IDataSource
auto&& data0,
35 alpaka::concepts::IDataSource
auto&&... dataN)
const
37 auto numElements =
typename ALPAKA_TYPEOF(extents)::UniVec{extents};
40 constexpr auto simdCfg = T_Parent::template calcSimdPackConfig<ValueType>(
43 T_maxConcurrencyInByte);
45 constexpr uint32_t simdWidth = simdCfg.simdWidth;
47 if constexpr(simdWidth != 1u)
49 constexpr uint32_t numSimdPerFnCall = simdCfg.numSimdPacksPerFnCall;
50 concurrentSimdPackExecution<simdWidth, numSimdPerFnCall, T_MemAlignment>(
62 asParent().getWorkGroup(),
64 asParent().getTraversePolicy(),
65 asParent().getIdxLayoutPolicy()))
76 constexpr auto const& asParent()
const
78 return static_cast<T_Parent const&
>(*this);
81 template<alpaka::concepts::Alignment T_MemAlignment, u
int32_t T_w
idth>
86 alpaka::concepts::IDataSource
auto&&... data)
98 template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width, uint32_t... T_repeat>
102 std::integer_sequence<uint32_t, T_repeat...>,
104 alpaka::concepts::IDataSource
auto&&... data)
111 auto ids = std::make_tuple(*(T_repeat + 1 != 0u ? iter++ : iter++)...);
113 [&](
auto const&... dataIdx)
constexpr
121 template<u
int32_t T_simdW
idth, u
int32_t T_numSimdPerFnCall, alpaka::concepts::Alignment T_MemAlignment>
124 alpaka::concepts::Vector
auto numElements,
126 alpaka::concepts::IDataSource
auto&& data0,
127 alpaka::concepts::IDataSource
auto&&... dataN)
const
129 auto const workGroup = asParent().getWorkGroup();
132 auto const wSize = workGroup.size(acc).back();
135 auto const numElementsPerFnCall = T_simdWidth * T_numSimdPerFnCall;
139 auto const numSimdPackLoops = numElements.back() / numElementsPerFnCall / wSize;
142 auto const remainderBegin = numSimdPackLoops * numElementsPerFnCall * wSize;
145 auto domainSize = numElements.rAssign(remainderBegin);
146 auto stride =
ALPAKA_TYPEOF(numElements)::fill(1).rAssign(T_simdWidth);
150 domainSize.dim() > 1u && std::is_same_v<
ALPAKA_TYPEOF(asParent().getTraversePolicy()), traverse::Flat>)
160 using index_type =
typename IdxType::type;
161 auto wIdx = workGroup.idx(acc).rAssign(index_type{0});
162 auto wSize = workGroup.size(acc).rAssign(index_type{1});
163 auto domSize = domainSize.rAssign(index_type{1});
165 auto wOuter = WorkerGroup{wIdx, wSize};
171 asParent().getTraversePolicy(),
172 asParent().getIdxLayoutPolicy()))
175 auto wIdxInner =
ALPAKA_TYPEOF(domainSize)::fill(0).rAssign(workGroup.idx(acc).back());
176 auto wSizeInner =
ALPAKA_TYPEOF(domainSize)::fill(1).rAssign(workGroup.size(acc).back());
177 auto wInner = WorkerGroup{wIdxInner, wSizeInner};
183 IdxRange{rowIdx, domainSize, stride},
184 asParent().getTraversePolicy(),
185 asParent().getIdxLayoutPolicy())[
CVec<uint32_t,
ALPAKA_TYPEOF(domainSize)::dim() - 1u>{}];
187 for(
auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
189 execute<T_MemAlignment, T_simdWidth>(
192 std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
201 auto simdIdxContainer = onAcc::makeIdxMap(
204 IdxRange{IdxType::fill(0), domainSize, stride},
205 asParent().getTraversePolicy(),
206 asParent().getIdxLayoutPolicy());
208 for(
auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
210 execute<T_MemAlignment, T_simdWidth>(
213 std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
220 ALPAKA_TYPEOF(numElements) remainderDomainSize = numElements.fill(0).rAssign(remainderBegin);
222 for(
auto idx : onAcc::makeIdxMap(
225 IdxRange{remainderDomainSize, numElements},
226 asParent().getTraversePolicy(),
227 asParent().getIdxLayoutPolicy()))
231 SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
232 SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...);
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC.
#define ALPAKA_TYPEOF(...)
Get the type of an instance.
#define ALPAKA_FN_INLINE
Macro defining the inline function attribute.
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(auto const &acc, auto const workGroup, auto const range, T_Traverse traverse=T_Traverse{}, T_IdxLayout idxLayout=T_IdxLayout{})
Creates an index container.
typename GetValueType< T >::type GetValueType_t
Vec< T, sizeof...(T_values), detail::CVec< T, T_values... > > CVec
A vector with compile-time known values.
ALPAKA_FN_HOST_ACC IdxRange(T_Extents const &) -> IdxRange< typename trait::getVec_t< T_Extents >::UniVec >
On some constexpr function signatures ALPAKA_FN_HOST_ACC is required for CUDA; otherwise a __host__ f...