alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
SimdAlgo.hpp
Go to the documentation of this file.
1/* Copyright 2024 René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
7#include "alpaka/Vec.hpp"
12
13#include <bit>
14#include <cstdint>
15
16namespace alpaka::onAcc
17{
18 /** Creates a functor that operates on contiguous data concurrently.
19 *
20 * The class is automatically configured to use the best fitting SIMD width for the given data type and is able to
21 * expose instruction level parallelism.
22 *
23 * @tparam T_WorkGroup participating thread description. More than one thread can have the same index within the
24 * group. All workers with the same id will get the same index as result.
25 * @tparam T_Traverse Policy to configure the method used to find the next valid index for a worker. @see namespace
26 * traverse
27 * @tparam T_IdxLayout Policy to define how indices will be mapped to worker threads. @see namespace layout
28 */
29 template<
30 typename T_WorkGroup,
33 struct SimdAlgo
34 : protected internal::SimdConcurrent<SimdAlgo<T_WorkGroup, T_Traverse, T_IdxLayout>>
35 , protected internal::SimdTransformReduce<SimdAlgo<T_WorkGroup, T_Traverse, T_IdxLayout>>
36 {
/** Create the algorithm object for the given work group.
 *
 * @param workGroup description of the participating threads; a copy is stored in m_workGroup
 * @param traverse traverse policy instance; the value is not stored, only the type T_Traverse is kept
 * @param idxLayout index layout policy instance; the value is not stored, only the type T_IdxLayout is kept
 */
37 constexpr SimdAlgo(
38 T_WorkGroup const workGroup,
39 T_Traverse traverse = T_Traverse{},
40 T_IdxLayout idxLayout = T_IdxLayout{})
41 : m_workGroup{workGroup}
42 {
// The policy arguments are only handed to alpaka::unused (presumably to silence unused-parameter warnings);
// getTraversePolicy() and getIdxLayoutPolicy() return freshly default-constructed policy objects instead.
43 alpaka::unused(traverse, idxLayout);
44 }
45
/** @return a copy of the work group description that was passed to the constructor */
46 constexpr T_WorkGroup getWorkGroup() const
47 {
48 return m_workGroup;
49 }
50
/** @return a default-constructed T_Traverse; the instance handed to the constructor is not preserved */
51 constexpr T_Traverse getTraversePolicy() const
52 {
53 return T_Traverse{};
54 }
55
/** @return a default-constructed T_IdxLayout; the instance handed to the constructor is not preserved */
56 constexpr T_IdxLayout getIdxLayoutPolicy() const
57 {
58 return T_IdxLayout{};
59 }
60
61 /** execute the functor concurrently over the given data.
62 *
63 * @attention The number of elements to process is derived from the first MdSpan object.
64 * All other MdSpan objects must have at least the same number of elements.
65 * The optimal concurrency is also derived from the first MdSpan.
66 *
67 * @param func the functor to be executed
68 * @param data0 the first data to be processed
69 * @param dataN the remaining data to be processed
70 *
71 * @{
72 */
74 auto const& acc,
75 auto&& func,
77 alpaka::concepts::IDataSource auto&&... dataN) const
78 {
79 concurrent(acc, data0.getExtents(), ALPAKA_FORWARD(func), ALPAKA_FORWARD(data0), ALPAKA_FORWARD(dataN)...);
80 }
81
82 /**
83 * @param extents number of elements to process in each dimension
84 */
86 auto const& acc,
87 alpaka::concepts::Vector auto extents,
88 auto&& func,
90 alpaka::concepts::IDataSource auto&&... dataN) const
91 {
92 using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;
95 ALPAKA_TYPEOF(acc.getApi()){},
96 ALPAKA_TYPEOF(acc.getDeviceKind()){})
97 * sizeof(ValueType)>(
98 acc,
99 extents,
100 ALPAKA_FORWARD(func),
101 ALPAKA_FORWARD(data0),
102 ALPAKA_FORWARD(dataN)...);
103 }
104
105 /** @} */
106
107 /** execute the functor concurrently over the given data.
108 *
109 * @attention The number of elements to process is derived from the first MdSpan object.
110 * All other MdSpan objects must have at least the same number of elements.
111 *
112 * @tparam T_maxConcurrencyInByte
113 * Maximum number of bytes to be used for concurrency.
114 * Concurrency bytes describe a virtual simd pack size which is not exceeded.
115 * Internally a best fitting SIMD width is calculated and instruction parallelism is exposed based on
116 * T_maxConcurrencyInByte.
117 * @tparam T_MemAlignment alignment of the memory, if no alignment is given the alignment will be derived from
118 * the MdSpan data descriptions
119 * @param func the functor to be executed
120 * @param data0 the first data to be processed
121 * @param dataN the remaining data to be processed
122 *
123 * @{
124 */
125 template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
127 auto const& acc,
128 auto&& func,
130 alpaka::concepts::IDataSource auto&&... dataN) const
131 {
133 acc,
134 data0.getExtents(),
135 ALPAKA_FORWARD(func),
136 ALPAKA_FORWARD(data0),
137 ALPAKA_FORWARD(dataN)...);
138 }
139
140 /**
141 * @param extents number of elements to process in each dimension
142 */
143 template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
145 auto const& acc,
146 alpaka::concepts::Vector auto extents,
147 auto&& func,
149 alpaka::concepts::IDataSource auto&&... dataN) const
150 {
152 acc,
153 extents,
154 ALPAKA_FORWARD(func),
155 ALPAKA_FORWARD(data0),
156 ALPAKA_FORWARD(dataN)...);
157 }
158
159 /** @} */
160
161
162 /** @brief transform the input data and reduce it to a single value
163 *
164 * @attention If no extent is given the number of elements to process is derived from the first MdSpan object.
165 * All other MdSpan objects must have at least the same number of elements.
166 *
167 * @param neutralElement the neutral element for the reduction operation
168 * @param reduceFunc The binary reduction operation to be executed, e.g. std::plus. The functor should support
169 * Simd packages.
170 * @param transformFunc N-ary functor to be executed, values of all containers will be passed to the functor
171 * as arguments. The functor should support Simd packages. If not, you can enforce element-wise execution by
172 * wrapping it into
173 * ScalarFunc. If you would like to support stencil executions, wrap the functor into StencilFunc. StencilFunc
174 * receives all arguments as SimdPtr. If StencilFunc is used you should take care to not read outside of
175 * valid memory ranges by using sub-views to your input and output data. Optionally, transformFunc can have an
176 * accelerator as first argument.
177 * If the result of this functor is a structured value, providing an overload to simdize the type
178 * can improve the performance, see alpaka::makeSimdized.
179 * @param data0 the first data to be processed
180 * @param dataN the remaining data to be processed
181 * @return A single reduced value.
182 */
184 auto const& acc,
185 auto const& neutralElement,
186 auto&& reduceFunc,
187 auto&& transformFunc,
189 alpaka::concepts::IDataSource auto&&... dataN) const
190 {
191 return transformReduce(
192 acc,
193 data0.getExtents(),
194 neutralElement,
195 ALPAKA_FORWARD(reduceFunc),
196 ALPAKA_FORWARD(transformFunc),
197 ALPAKA_FORWARD(data0),
198 ALPAKA_FORWARD(dataN)...);
199 }
200
201 /**
202 * @copydoc transformReduce()
203 * @param extents number of elements to process in each dimension
204 */
206 auto const& acc,
207 alpaka::concepts::Vector auto extents,
208 auto const& neutralElement,
209 auto&& reduceFunc,
210 auto&& transformFunc,
212 alpaka::concepts::IDataSource auto&&... dataN) const
213 {
214 using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;
215 return transformReduce<
217 ALPAKA_TYPEOF(acc.getApi()){},
218 ALPAKA_TYPEOF(acc.getDeviceKind()){})
219 * sizeof(ValueType)>(
220 acc,
221 extents,
222 neutralElement,
223 ALPAKA_FORWARD(reduceFunc),
224 ALPAKA_FORWARD(transformFunc),
225 ALPAKA_FORWARD(data0),
226 ALPAKA_FORWARD(dataN)...);
227 }
228
229 /**
230 * @copydoc transformReduce()
231 *
232 * @tparam T_maxConcurrencyInByte
233 * Maximum number of bytes to be used for concurrency.
234 * Concurrency bytes describe a virtual simd pack size which is not exceeded.
235 * Internally a best fitting SIMD width is calculated and instruction parallelism is exposed based on
236 * T_maxConcurrencyInByte.
237 * @tparam T_MemAlignment alignment of the memory, if no alignment is given the alignment will be derived from
238 * the MdSpan data descriptions
239 */
240 template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
242 auto const& acc,
243 auto const& neutralElement,
244 auto&& reduceFunc,
245 auto&& transformFunc,
247 alpaka::concepts::IDataSource auto&&... dataN) const
248 {
250 acc,
251 data0.getExtents(),
252 neutralElement,
253 ALPAKA_FORWARD(reduceFunc),
254 ALPAKA_FORWARD(transformFunc),
255 ALPAKA_FORWARD(data0),
256 ALPAKA_FORWARD(dataN)...);
257 }
258
259 /**
260 * @copydoc transformReduce()
261 *
262 * @param extents number of elements to process in each dimension
263 * @tparam T_maxConcurrencyInByte
264 * Maximum number of bytes to be used for concurrency.
265 * Concurrency bytes describe a virtual simd pack size which is not exceeded.
266 * Internally a best fitting SIMD width is calculated and instruction parallelism is exposed based on
267 * T_maxConcurrencyInByte.
268 * @tparam T_MemAlignment alignment of the memory, if no alignment is given the alignment will be derived from
269 * the MdSpan data descriptions
270 */
271 template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
273 auto const& acc,
274 alpaka::concepts::Vector auto extents,
275 auto const& neutralElement,
276 auto&& reduceFunc,
277 auto&& transformFunc,
279 alpaka::concepts::IDataSource auto&&... dataN) const
280 {
282 acc,
283 extents,
284 neutralElement,
285 ALPAKA_FORWARD(reduceFunc),
286 ALPAKA_FORWARD(transformFunc),
287 ALPAKA_FORWARD(data0),
288 ALPAKA_FORWARD(dataN)...);
289 }
290
291 private:
294
297
/** Calculate how many elements of T_Type fit into one SIMD pack.
 *
 * The byte budget of a pack is the smaller of T_cacheLineInByte and T_maxConcurrencyInByte.
 *
 * @tparam T_Type element type; its sizeof is the divisor
 * @tparam T_maxConcurrencyInByte upper limit in bytes given by the concurrency budget
 * @tparam T_cacheLineInByte upper limit in bytes given by the cache line size
 * @return byte budget divided by sizeof(T_Type); alpaka::divExZero is documented as returning max(a / b, 1),
 * so the result is expected to be at least 1 even if a single element exceeds the budget
 */
298 template<typename T_Type, uint32_t T_maxConcurrencyInByte, uint32_t T_cacheLineInByte>
299 static constexpr auto calcSimdWidth()
300 {
301 constexpr uint32_t maxSimdBytes = std::min(T_cacheLineInByte, T_maxConcurrencyInByte);
302 return alpaka::divExZero(maxSimdBytes, static_cast<uint32_t>(sizeof(T_Type)));
303 }
304
305 template<typename T_Type>
307 {
308 using value_type = T_Type;
309 uint32_t simdWidth;
311 };
312
313 /** Generate a SIMD config for the API and device kind.
314 *
315 * Produces an optimized SIMD configuration based on technical constraints.
316 * The SIMD width is set to a power of two.
317 * If possible, the SIMD configuration is aligned to the cacheline size for the given device kind.
318 *
319 * @param maxConcurrencyInByte The upper limit in bytes a SIMD configuration must not exceed, except if a single
320 * value is larger. This parameter is used to control the register pressure.
321 *
322 * @return a configuration with the width of a single SIMD pack and the number of SIMD packs which should be
323 * used in parallel for a single invocation.
324 */
325 template<typename T_ValueType>
326 [[nodiscard]] static consteval SimdPackConfig<T_ValueType> calcSimdPackConfig(
329 uint32_t maxConcurrencyInByte)
330 {
331 constexpr uint32_t maxArchSimdWidth = getArchSimdWidth<T_ValueType>(api, deviceKind);
332 constexpr uint32_t cachelineBytes = getCachelineSize(api, deviceKind);
333 uint32_t simdWidth = maxArchSimdWidth;
334
335 // Maximum SIMD width allowed by the byte concurrency budget.
336 uint32_t maxWidthAllowed = maxConcurrencyInByte / sizeof(T_ValueType);
337
338 // Clamp max hardware SIMD width and ensure at least 1.
339 uint32_t clampedWidth = std::max(std::min(simdWidth, maxWidthAllowed), 1u);
340
341 // Round down to the nearest power of two.
342 simdWidth = std::bit_floor(clampedWidth);
343
344 uint32_t const simdWidthInByte = simdWidth * sizeof(T_ValueType);
345
346 // Number of SIMD packs that fit into the concurrency budget.
347 uint32_t const numSimdPacksToUtilizeConcurrency = alpaka::divExZero(maxConcurrencyInByte, simdWidthInByte);
348
349 // Number of SIMD packs required to cover one cache line
350 uint32_t const numSimdPacksPerCacheLine = alpaka::divExZero(cachelineBytes, simdWidthInByte);
351
352 // Prefer the largest cache-line multiple that fits into the budget.
353 uint32_t numSimdPacksPerFnCall = numSimdPacksToUtilizeConcurrency;
354 if(numSimdPacksToUtilizeConcurrency >= numSimdPacksPerCacheLine)
355 {
356 uint32_t const cachelineMultiple
357 = (numSimdPacksToUtilizeConcurrency / numSimdPacksPerCacheLine) * numSimdPacksPerCacheLine;
358 numSimdPacksPerFnCall = std::max(cachelineMultiple, 1u);
359 }
360
361 return {simdWidth, numSimdPacksPerFnCall};
362 }
363
364 T_WorkGroup m_workGroup;
365 };
366} // namespace alpaka::onAcc
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
Definition common.hpp:30
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:153
#define ALPAKA_FN_INLINE
Macro defining the inline function attribute.
Definition common.hpp:87
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
Definition common.hpp:147
Concept to check for APIs.
Definition api.hpp:42
Concept to check if something is a device kind.
Definition tag.hpp:147
Concept to check if a type is a vector.
Definition Vec.hpp:53
functionality which is usable on the accelerator compute device from within a kernel.
Definition executor.hpp:38
typename GetValueType< T >::type GetValueType_t
Definition trait.hpp:65
ALPAKA_FN_HOST_ACC constexpr auto divExZero(Integral a, Integral b) -> Integral
Returns the max(a / b, 1) as integer.
Definition utility.hpp:41
consteval uint32_t getNumElemPerThread(concepts::Api auto const api, alpaka::concepts::DeviceKind auto const deviceType)
Get the number of elements to compute per thread.
Definition trait.hpp:177
consteval uint32_t getCachelineSize(concepts::Api auto const api, alpaka::concepts::DeviceKind auto const deviceType)
get the cacheline size in bytes
Definition trait.hpp:190
consteval uint32_t getArchSimdWidth(concepts::Api auto const api, alpaka::concepts::DeviceKind auto const deviceType)
Get the SIMD width in bytes for an API and device kind combination.
Definition trait.hpp:152
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(auto const &acc, alpaka::concepts::Vector auto extents, auto &&func, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
Definition SimdAlgo.hpp:85
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(auto const &acc, alpaka::concepts::Vector auto extents, auto const &neutralElement, auto &&reduceFunc, auto &&transformFunc, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
transform the input data and reduce it to a single value
Definition SimdAlgo.hpp:272
constexpr T_IdxLayout getIdxLayoutPolicy() const
Definition SimdAlgo.hpp:56
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(auto const &acc, alpaka::concepts::Vector auto extents, auto const &neutralElement, auto &&reduceFunc, auto &&transformFunc, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
transform the input data and reduce it to a single value
Definition SimdAlgo.hpp:205
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(auto const &acc, auto const &neutralElement, auto &&reduceFunc, auto &&transformFunc, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
transform the input data and reduce it to a single value
Definition SimdAlgo.hpp:241
constexpr T_WorkGroup getWorkGroup() const
Definition SimdAlgo.hpp:46
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(auto const &acc, auto &&func, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
execute the functor concurrently over the given data.
Definition SimdAlgo.hpp:73
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(auto const &acc, alpaka::concepts::Vector auto extents, auto &&func, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
Definition SimdAlgo.hpp:144
constexpr T_Traverse getTraversePolicy() const
Definition SimdAlgo.hpp:51
static constexpr auto calcSimdWidth()
Definition SimdAlgo.hpp:299
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(auto const &acc, auto &&func, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
execute the functor concurrently over the given data.
Definition SimdAlgo.hpp:126
constexpr SimdAlgo(T_WorkGroup const workGroup, T_Traverse traverse=T_Traverse{}, T_IdxLayout idxLayout=T_IdxLayout{})
Definition SimdAlgo.hpp:37
static consteval SimdPackConfig< T_ValueType > calcSimdPackConfig(alpaka::concepts::Api auto api, alpaka::concepts::DeviceKind auto deviceKind, uint32_t maxConcurrencyInByte)
Generate a SIMD config for the API and device kind.
Definition SimdAlgo.hpp:326
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(auto const &acc, auto const &neutralElement, auto &&reduceFunc, auto &&transformFunc, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
transform the input data and reduce it to a single value
Definition SimdAlgo.hpp:183
internal::SimdConcurrent< SimdAlgo< T_WorkGroup, T_Traverse, T_IdxLayout > > ConcurrentAlgo
Definition SimdAlgo.hpp:292
internal::SimdTransformReduce< SimdAlgo< T_WorkGroup, T_Traverse, T_IdxLayout > > ReduceAlgo
Definition SimdAlgo.hpp:293
concurrent foreach implementation
The index layout will be automatically selected based on the executor.
Definition layout.hpp:27
Linearize the index domain for traversing.
Definition traverse.hpp:22