38 T_WorkGroup
const workGroup,
40 T_IdxLayout idxLayout = T_IdxLayout{})
43 alpaka::unused(traverse, idxLayout);
125 template<u
int32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
143 template<u
int32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
185 auto const& neutralElement,
187 auto&& transformFunc,
208 auto const& neutralElement,
210 auto&& transformFunc,
219 *
sizeof(ValueType)>(
240 template<u
int32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
243 auto const& neutralElement,
245 auto&& transformFunc,
271 template<u
int32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
275 auto const& neutralElement,
277 auto&& transformFunc,
298 template<
typename T_Type, u
int32_t T_maxConcurrencyInByte, u
int32_t T_cacheLineInByte>
301 constexpr uint32_t maxSimdBytes = std::min(T_cacheLineInByte, T_maxConcurrencyInByte);
305 template<
typename T_Type>
325 template<
typename T_ValueType>
329 uint32_t maxConcurrencyInByte)
333 uint32_t simdWidth = maxArchSimdWidth;
336 uint32_t maxWidthAllowed = maxConcurrencyInByte /
sizeof(T_ValueType);
339 uint32_t clampedWidth = std::max(std::min(simdWidth, maxWidthAllowed), 1u);
342 simdWidth = std::bit_floor(clampedWidth);
344 uint32_t
const simdWidthInByte = simdWidth *
sizeof(T_ValueType);
347 uint32_t
const numSimdPacksToUtilizeConcurrency =
alpaka::divExZero(maxConcurrencyInByte, simdWidthInByte);
350 uint32_t
const numSimdPacksPerCacheLine =
alpaka::divExZero(cachelineBytes, simdWidthInByte);
353 uint32_t numSimdPacksPerFnCall = numSimdPacksToUtilizeConcurrency;
354 if(numSimdPacksToUtilizeConcurrency >= numSimdPacksPerCacheLine)
356 uint32_t
const cachelineMultiple
357 = (numSimdPacksToUtilizeConcurrency / numSimdPacksPerCacheLine) * numSimdPacksPerCacheLine;
358 numSimdPacksPerFnCall = std::max(cachelineMultiple, 1u);
361 return {simdWidth, numSimdPacksPerFnCall};
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
#define ALPAKA_TYPEOF(...)
Get the type of instance.
#define ALPAKA_FN_INLINE
Macro defining the inline function attribute.
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
Concept to check for APIs.
Concept to check if something is a device kind.
Concept to check if a type is a vector.
functionality which is usable on the accelerator compute device from within a kernel.
typename GetValueType< T >::type GetValueType_t
ALPAKA_FN_HOST_ACC constexpr auto divExZero(Integral a, Integral b) -> Integral
Returns the max(a / b, 1) as integer.
consteval uint32_t getNumElemPerThread(concepts::Api auto const api, alpaka::concepts::DeviceKind auto const deviceType)
Get the number of elements to compute per thread.
consteval uint32_t getCachelineSize(concepts::Api auto const api, alpaka::concepts::DeviceKind auto const deviceType)
Get the cacheline size in bytes.
consteval uint32_t getArchSimdWidth(concepts::Api auto const api, alpaka::concepts::DeviceKind auto const deviceType)
Get the SIMD width in bytes for an API and device kind combination.
uint32_t numSimdPacksPerFnCall
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(auto const &acc, alpaka::concepts::Vector auto extents, auto &&func, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(auto const &acc, alpaka::concepts::Vector auto extents, auto const &neutralElement, auto &&reduceFunc, auto &&transformFunc, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
Transform the input data and reduce it to a single value.
constexpr T_IdxLayout getIdxLayoutPolicy() const
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(auto const &acc, alpaka::concepts::Vector auto extents, auto const &neutralElement, auto &&reduceFunc, auto &&transformFunc, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
Transform the input data and reduce it to a single value.
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(auto const &acc, auto const &neutralElement, auto &&reduceFunc, auto &&transformFunc, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
Transform the input data and reduce it to a single value.
constexpr T_WorkGroup getWorkGroup() const
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(auto const &acc, auto &&func, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
execute the functor concurrently over the given data.
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(auto const &acc, alpaka::concepts::Vector auto extents, auto &&func, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
constexpr T_Traverse getTraversePolicy() const
static constexpr auto calcSimdWidth()
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(auto const &acc, auto &&func, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
execute the functor concurrently over the given data.
constexpr SimdAlgo(T_WorkGroup const workGroup, T_Traverse traverse=T_Traverse{}, T_IdxLayout idxLayout=T_IdxLayout{})
static consteval SimdPackConfig< T_ValueType > calcSimdPackConfig(alpaka::concepts::Api auto api, alpaka::concepts::DeviceKind auto deviceKind, uint32_t maxConcurrencyInByte)
Generate a SIMD config for the API and device kind.
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(auto const &acc, auto const &neutralElement, auto &&reduceFunc, auto &&transformFunc, alpaka::concepts::IDataSource auto &&data0, alpaka::concepts::IDataSource auto &&... dataN) const
Transform the input data and reduce it to a single value.
internal::SimdConcurrent< SimdAlgo< T_WorkGroup, T_Traverse, T_IdxLayout > > ConcurrentAlgo
internal::SimdTransformReduce< SimdAlgo< T_WorkGroup, T_Traverse, T_IdxLayout > > ReduceAlgo
concurrent foreach implementation
The index layout will be automatically selected based on the executor.
Linearize the index domain for traversing.