21namespace alpaka::onHost::internal
23 struct SimdTransformReduceKernel
25 uint32_t dynSharedMemBytes = 0u;
27 template<
typename T_DataType>
29 onAcc::concepts::Acc
auto const& acc,
30 alpaka::concepts::Vector
auto const& numChunks,
31 alpaka::concepts::Vector
auto const& chunkExtents,
32 alpaka::concepts::Vector
auto const& extentMd,
33 T_DataType
const& neutralElement,
34 alpaka::concepts::IMdSpan
auto output,
35 auto const& reduceFunc,
36 auto const& transformFunc,
37 alpaka::concepts::IDataSource
auto&&... inputs)
const
41 "The neutral element type must match the data output type.");
47 auto tbSum = MdSpan{dynS, chunkExtents, pitchMd};
56 for(
auto elemIdxInFrame : traverseInFrame)
58 tbSum[elemIdxInFrame] = neutralElement;
61 auto const chunkDataExtent = numChunks * chunkExtents;
67 for(
auto chunkIdx : traverseOverFrames)
69 for(alpaka::concepts::Vector
auto elemIdxInChunk : traverseInFrame)
72 alpaka::onAcc::WorkerGroup{chunkIdx + elemIdxInChunk, chunkDataExtent}};
77 .transformReduce(acc, extentMd, neutralElement, reduceFunc, transformFunc, inputs...);
78 auto& tbSumRef = tbSum[elemIdxInChunk];
79 tbSumRef = reduceFunc(tbSumRef, reducedValue);
94 dynS[laneIdInBlock] = reduceFunc(dynS[laneIdInBlock], dynS[linearSharedElemIdx]);
101 for(
auto offset = blockSize / 2; offset > 0; offset /= 2)
104 if(laneIdInBlock < offset)
106 dynS[laneIdInBlock] = reduceFunc(dynS[laneIdInBlock], dynS[laneIdInBlock + offset]);
111 if(laneIdInBlock == 0)
119 using ReduceFunctor =
typename ALPAKA_TYPEOF(reduceFunc)::Functor;
121 static_cast<ReduceFunctor const&
>(reduceFunc),
124 dynS[laneIdInBlock]);
127 atomicInvoke(reduceFunc, acc, output.data(), dynS[laneIdInBlock]);
132 template<
typename T_DataType>
133 inline void transformReduce(
135 alpaka::concepts::Executor
auto const exec,
136 T_DataType
const& neutralElement,
137 alpaka::concepts::IMdSpan
auto out,
141 alpaka::concepts::IDataSource
auto&&... in)
145 auto frameSpec = getFrameSpec<T_DataType>(
queue.getDevice(), exec, extentMd);
152 IndexType multiprocessorScaling = 1u;
157 multiprocessorScaling = 32u;
160 auto const numMultiProcessors =
queue.getDevice().getDeviceProperties().multiProcessorCount;
162 frameSpec.getNumFrames(),
163 static_cast<IndexType
>(numMultiProcessors * multiprocessorScaling));
164 frameSpec =
FrameSpec{adjsutedNumFrames, frameSpec.getFrameExtents(), exec};
171 auto numChunks = frameSpec.getNumFrames();
172 auto chunkExtents = frameSpec.getFrameExtents();
174 auto kernelFn = SimdTransformReduceKernel{
175 static_cast<uint32_t
>(frameSpec.getFrameExtents().product() *
sizeof(T_DataType))};
181 std::stringstream ss;
188 onHost::fill(queue, out, neutralElement, out.getExtents().fill(1));
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
#define ALPAKA_TYPEOF(...)
Get the type of instance.
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
#define ALPAKA_LOG_INFO(logLvl, callable)
Write a meta data message to the output.
consteval auto adjustToLimit(concepts::CVector auto const input)
adjust the input vector to a given limit by halving all components until the product of these is b...
ALPAKA_FN_ACC void atomicInvoke(auto &&fn, concepts::Acc auto const &acc, auto *inOut, auto &&... args)
Defines the equivalent of an atomic invoke for user defined functors.
constexpr auto blocksInGrid
constexpr auto allThreads
Represent the identity of the executor thread.
constexpr auto linearThreadsInBlock
constexpr auto threadsInBlock
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(auto const &acc, auto const workGroup, auto const range, T_Traverse traverse=T_Traverse{}, T_IdxLayout idxLayout=T_IdxLayout{})
Creates an index container.
constexpr auto getDynSharedMem(concepts::Acc auto const &acc) -> T *
Get block shared dynamic memory.
constexpr void syncBlockThreads(concepts::Acc auto const &acc)
Synchronize all threads within a thread block.
FrameSpec(T_NumFrames const &, T_FrameExtents const &) -> FrameSpec< alpaka::trait::getVec_t< T_NumFrames >, alpaka::trait::getVec_t< T_FrameExtents >, alpaka::exec::AnyExecutor >
void fill(Queue< T_Device, T_QueueKind > const &queue, auto &&dest, T_Value elementValue)
fill memory element wise
constexpr auto demangledName()
decltype(auto) getExtents(auto &&any)
Object extents.
typename GetValueType< T >::type GetValueType_t
ALPAKA_FN_HOST_ACC StencilFunc(T_Func &&) -> StencilFunc< T_Func >
constexpr auto calculatePitchesFromExtents(T_Vec const &extent)
Calculate the pitches purely from the extents.
constexpr T_IntegralType linearize(Vec< T_IntegralType, T_dim - 1u, T_Storage > const &dim, Vec< T_IntegralType, T_dim, T_OtherStorage > const &idx)
Give the linear index of an N-dimensional index within an N-dimensional index space.
ALPAKA_FN_HOST_ACC ScalarFunc(T_Func &&) -> ScalarFunc< T_Func >
ALPAKA_FN_HOST KernelBundle(TKernelFn const &, TArgs &&...) -> KernelBundle< TKernelFn, TArgs... >
User defined deduction guide with trailing return type. For CTAD during the construction.
ALPAKA_FN_HOST_ACC IdxRange(T_Extents const &) -> IdxRange< typename trait::getVec_t< T_Extents >::UniVec >