alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
SimdConcurrent.hpp
Go to the documentation of this file.
1/* Copyright 2024 René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
7#include "alpaka/Simd.hpp"
8#include "alpaka/SimdPtr.hpp"
9#include "alpaka/Vec.hpp"
10#include "alpaka/api/trait.hpp"
16
17#include <cstdint>
18#include <new>
19
21{
22 /** concurrent foreach implementation */
23 template<typename T_Parent>
25 {
26 constexpr SimdConcurrent() = default;
27
28 protected:
29 template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment>
31 auto const& acc,
32 alpaka::concepts::Vector auto extents,
33 auto&& func,
35 alpaka::concepts::IDataSource auto&&... dataN) const
36 {
37 auto numElements = typename ALPAKA_TYPEOF(extents)::UniVec{extents};
38 using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;
39
40 constexpr auto simdCfg = T_Parent::template calcSimdPackConfig<ValueType>(
41 ALPAKA_TYPEOF(acc.getApi()){},
42 ALPAKA_TYPEOF(acc.getDeviceKind()){},
43 T_maxConcurrencyInByte);
44
45 constexpr uint32_t simdWidth = simdCfg.simdWidth;
46
47 if constexpr(simdWidth != 1u)
48 {
49 constexpr uint32_t numSimdPerFnCall = simdCfg.numSimdPacksPerFnCall;
51 acc,
52 numElements,
53 ALPAKA_FORWARD(func),
54 ALPAKA_FORWARD(data0),
55 ALPAKA_FORWARD(dataN)...);
56 }
57 else
58 {
59 // execute the algorithm with SIMD width one
60 for(auto idx : onAcc::makeIdxMap(
61 acc,
62 asParent().getWorkGroup(),
63 IdxRange{numElements},
64 asParent().getTraversePolicy(),
65 asParent().getIdxLayoutPolicy()))
66 {
67 func(
68 acc,
69 SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
70 SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...);
71 }
72 }
73 }
74
75 private:
/** Access this object through the T_Parent type.
 *
 * NOTE(review): presumably T_Parent derives from this class, so the cast is a valid downcast — confirm.
 *
 * @return *this, statically cast to T_Parent const&
 */
76 constexpr auto const& asParent() const
77 {
78 return static_cast<T_Parent const&>(*this);
79 }
80
/** Invoke the functor once for a single data index.
 *
 * Each data source is wrapped into a SimdPtr built from the forwarded data source, dataIdx, a
 * default-constructed T_MemAlignment and CVec<uint32_t, T_width>; all wrappers are passed to func after acc.
 *
 * @tparam T_MemAlignment alignment type, default-constructed and handed to every SimdPtr
 * @tparam T_width value stored in the CVec handed to every SimdPtr
 * @param acc passed through as the first argument of func
 * @param dataIdx index handed to every SimdPtr
 * @param func functor to call
 * @param data data sources, one SimdPtr is created per source
 */
81 template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width>
82 ALPAKA_FN_INLINE static constexpr void executeDo(
83 auto const& acc,
84 auto const& dataIdx,
85 auto&& func,
86 alpaka::concepts::IDataSource auto&&... data)
87 {
88 func(acc, SimdPtr{ALPAKA_FORWARD(data), dataIdx, T_MemAlignment{}, CVec<uint32_t, T_width>{}}...);
89 }
90
91 /** calls the functor and forwards the data T_repeat times
92 *
93 * The calls to the functor are independent and compile time unrolled to support instruction parallelism.
94 *
95 * @param iter the caller must ensure that the iterator can be increased T_repeat times without jumping over
96 * iter.end()
97 */
98 template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width, uint32_t... T_repeat>
99 ALPAKA_FN_INLINE static constexpr void execute(
100 auto const& acc,
101 auto& iter,
102 std::integer_sequence<uint32_t, T_repeat...>,
103 auto&& func,
104 alpaka::concepts::IDataSource auto&&... data)
105 {
106 /* We do not check if the iterator points to a valid element, the caller must ensure that we can safely
107 * increase the iterator without jumping over iter.end().
108 *
109 * The ternary operator makes the expression depend on T_repeat so that a pack expansion over iter is possible.
110 */
111 auto ids = std::make_tuple(*(T_repeat + 1 != 0u ? iter++ : iter++)...);
112 std::apply(
113 [&](auto const&... dataIdx) constexpr
114 {
116 ...);
117 },
118 ids);
119 }
120
121 template<uint32_t T_simdWidth, uint32_t T_numSimdPerFnCall, alpaka::concepts::Alignment T_MemAlignment>
123 auto const& acc,
124 alpaka::concepts::Vector auto numElements,
125 auto&& func,
127 alpaka::concepts::IDataSource auto&&... dataN) const
128 {
129 auto const workGroup = asParent().getWorkGroup();
130
131 // we SIMDfy only over the fast moving dimension (columns of memory)
132 auto const wSize = workGroup.size(acc).back();
133
134 /* Number of data elements processed per functor call. */
135 auto const numElementsPerFnCall = T_simdWidth * T_numSimdPerFnCall;
136 /** To avoid an overflow in the index range we divide first by the number of elements per
137 * function call and then by the number of workers.
138 */
139 auto const numSimdPackLoops = numElements.back() / numElementsPerFnCall / wSize;
140
141 // number of elements to jump over to start the remainder loop
142 auto const remainderBegin = numSimdPackLoops * numElementsPerFnCall * wSize;
143
144 // we SIMDfy only over the fast moving dimension (columns of memory)
145 auto domainSize = numElements.rAssign(remainderBegin);
146 auto stride = ALPAKA_TYPEOF(numElements)::fill(1).rAssign(T_simdWidth);
147 using IdxType = ALPAKA_TYPEOF(numElements);
148
149 if constexpr(
150 domainSize.dim() > 1u && std::is_same_v<ALPAKA_TYPEOF(asParent().getTraversePolicy()), traverse::Flat>)
151 {
152 /* For cases where we traverse with the flat policy, we cannot assume that we can blindly increase the
153 * iterator later N times. This could happen in cases where we have enough concurrency. For SIMD
154 * operations we evaluate only the fast-moving dimension, but the flat policy flattens the worker group
155 * and uses all workers on a linear domain. The loop must therefore be split into an outer loop over all
156 * slow dimensions and an inner loop iterating over the fast-moving dimension. For this we need to
157 * build our own groups out of the user-provided workgroup.
158 */
159 // build a worker group with slow-moving dimension threads for the outer loop
160 using index_type = typename IdxType::type;
161 auto wIdx = workGroup.idx(acc).rAssign(index_type{0});
162 auto wSize = workGroup.size(acc).rAssign(index_type{1});
163 auto domSize = domainSize.rAssign(index_type{1});
164
165 auto wOuter = WorkerGroup{wIdx, wSize};
166
167 for(auto rowIdx : onAcc::makeIdxMap(
168 acc,
169 wOuter,
170 IdxRange{domSize},
171 asParent().getTraversePolicy(),
172 asParent().getIdxLayoutPolicy()))
173 {
174 // build a worker group with fast-moving dimension threads for the inner loop
175 auto wIdxInner = ALPAKA_TYPEOF(domainSize)::fill(0).rAssign(workGroup.idx(acc).back());
176 auto wSizeInner = ALPAKA_TYPEOF(domainSize)::fill(1).rAssign(workGroup.size(acc).back());
177 auto wInner = WorkerGroup{wIdxInner, wSizeInner};
178
179 // iterate over the fast-moving dimension
180 auto simdIdxContainer = onAcc::makeIdxMap(
181 acc,
182 wInner,
183 IdxRange{rowIdx, domainSize, stride},
184 asParent().getTraversePolicy(),
185 asParent().getIdxLayoutPolicy())[CVec<uint32_t, ALPAKA_TYPEOF(domainSize)::dim() - 1u>{}];
186
187 for(auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
188 {
190 acc,
191 iter,
192 std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
193 ALPAKA_FORWARD(func),
194 ALPAKA_FORWARD(data0),
195 ALPAKA_FORWARD(dataN)...);
196 }
197 }
198 }
199 else
200 {
201 auto simdIdxContainer = onAcc::makeIdxMap(
202 acc,
203 workGroup,
204 IdxRange{IdxType::fill(0), domainSize, stride},
205 asParent().getTraversePolicy(),
206 asParent().getIdxLayoutPolicy());
207
208 for(auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
209 {
211 acc,
212 iter,
213 std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
214 ALPAKA_FORWARD(func),
215 ALPAKA_FORWARD(data0),
216 ALPAKA_FORWARD(dataN)...);
217 }
218 }
219
220 ALPAKA_TYPEOF(numElements) remainderDomainSize = numElements.fill(0).rAssign(remainderBegin);
221
222 for(auto idx : onAcc::makeIdxMap(
223 acc,
224 workGroup,
225 IdxRange{remainderDomainSize, numElements},
226 asParent().getTraversePolicy(),
227 asParent().getIdxLayoutPolicy()))
228 {
229 func(
230 acc,
231 SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
232 SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...);
233 }
234 }
235 };
236} // namespace alpaka::onAcc::internal
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
Definition common.hpp:30
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:153
#define ALPAKA_FN_INLINE
Macro defining the inline function attribute.
Definition common.hpp:87
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
Definition common.hpp:147
Concept to check for an alignment object.
Definition Alignment.hpp:89
Concept to check if a type is a vector.
Definition Vec.hpp:53
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(auto const &acc, auto const workGroup, auto const range, T_Traverse traverse=T_Traverse{}, T_IdxLayout idxLayout=T_IdxLayout{})
Creates an index container.
Definition interface.hpp:57
typename GetValueType< T >::type GetValueType_t
Definition trait.hpp:65
Vec< T, sizeof...(T_values), detail::CVec< T, T_values... > > CVec
A vector with compile-time known values.
Definition CVec.hpp:31
On some constexpr function signatures ALPAKA_FN_HOST_ACC is required for CUDA; otherwise a __host__ f...
pointer to a SIMD pack with the width T_SimdWidth
Definition SimdPtr.hpp:62
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(auto const &acc, alpaka::concepts::Vector auto extents, auto &&func, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
constexpr auto const & asParent() const
static ALPAKA_FN_INLINE constexpr void execute(auto const &acc, auto &iter, std::integer_sequence< uint32_t, T_repeat... >, auto &&func, alpaka::concepts::IDataSource auto &&... data)
calls the functor and forwards the data T_repeat times
static ALPAKA_FN_INLINE constexpr void executeDo(auto const &acc, auto const &dataIdx, auto &&func, alpaka::concepts::IDataSource auto &&... data)
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto concurrentSimdPackExecution(auto const &acc, alpaka::concepts::Vector auto numElements, auto &&func, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
Linearize the index domain for traversing.
Definition traverse.hpp:22