latest/doxygen/internal_2transformReduce_8hpp_source.html

/* Copyright 2025 René Widera, Mehmet Yusufoglu

 * SPDX-License-Identifier: MPL-2.0

 */


#pragma once


#include "alpaka/Vec.hpp"

#include "alpaka/api/util.hpp"

#include "alpaka/core/common.hpp"

#include "alpaka/functor.hpp"

#include "alpaka/mem/MdSpan.hpp"

#include "alpaka/mem/concepts/IDataSource.hpp"

#include "alpaka/mem/concepts/IMdSpan.hpp"

#include "alpaka/onAcc/Acc.hpp"

#include "alpaka/onAcc/SimdAlgo.hpp"

#include "alpaka/onAcc/atomic.hpp"

#include "alpaka/onHost/interface.hpp"

#include "alpaka/onHost/logger/logger.hpp"

#include "alpaka/trait.hpp"


namespace alpaka::onHost::internal

{

    /** Device kernel that transforms the elements of one or more data sources and reduces the transformed
     * values into a single result.
     *
     * Phases, as implemented in operator():
     *  1. every slot of a block-local scratch buffer (dynamic shared memory, one slot per chunk element) is
     *     initialized with the neutral element,
     *  2. the thread blocks walk over the chunks; for each chunk element a SimdAlgo worker computes a partial
     *     result which is folded into the matching scratch slot,
     *  3. the scratch slots are folded down to one slot per thread and then down to slot 0 by a tree reduction,
     *  4. the thread with linear index 0 merges slot 0 into the first output element via an atomic invoke.
     */
    struct SimdTransformReduceKernel
    {
        /** Size in bytes of the dynamic shared memory the kernel is launched with.
         *
         * The kernel body never reads this member; transformReduce() in this file initializes it with one
         * T_DataType slot per chunk element. Presumably the launch machinery picks it up by name — TODO confirm.
         */
        uint32_t dynSharedMemBytes = 0u;

        /** Execute the transform-reduce for the calling thread.
         *
         * @tparam T_DataType value type of the reduction; must be identical to the value type of @p output
         * @param acc accelerator handle providing thread indices, shared memory and block synchronization
         * @param numChunks number of chunks per dimension
         * @param chunkExtents extents of a single chunk; also the extents of the shared scratch buffer
         * @param extentMd extents of the input data handed to SimdAlgo::transformReduce
         * @param neutralElement neutral (identity) value of @p reduceFunc
         * @param output destination; only the element at output.data() is updated
         * @param reduceFunc binary reduction functor, optionally wrapped in ScalarFunc or StencilFunc
         * @param transformFunc functor applied to the input elements before they are reduced
         * @param inputs data sources forwarded to SimdAlgo::transformReduce
         */
        template<typename T_DataType>
        ALPAKA_FN_ACC void operator()(
            onAcc::concepts::Acc auto const& acc,
            alpaka::concepts::Vector auto const& numChunks,
            alpaka::concepts::Vector auto const& chunkExtents,
            alpaka::concepts::Vector auto const& extentMd,
            T_DataType const& neutralElement,
            alpaka::concepts::IMdSpan auto output,
            auto const& reduceFunc,
            auto const& transformFunc,
            alpaka::concepts::IDataSource auto&&... inputs) const
        {
            static_assert(
                std::is_same_v<ALPAKA_TYPEOF(neutralElement), alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(output)>>,
                "The neutral element type must match the data output type.");

            // Block-local scratch buffer, viewed both linearly (dynS) and multi-dimensionally (tbSum).
            // NOTE(review): the linear accesses below assume the pitches derived from chunkExtents describe a
            // densely packed layout — confirm against calculatePitchesFromExtents.
            T_DataType* dynS = onAcc::getDynSharedMem<T_DataType>(acc);
            auto pitchMd = alpaka::calculatePitchesFromExtents<T_DataType>(chunkExtents);
            auto tbSum = MdSpan{dynS, chunkExtents, pitchMd};

            auto traverseInFrame = alpaka::onAcc::makeIdxMap(
                acc,
                alpaka::onAcc::worker::threadsInBlock,
                alpaka::IdxRange{chunkExtents});

            // Phase 1: set every scratch slot to the neutral element of the reduction.
            for(auto elemIdxInFrame : traverseInFrame)
            {
                tbSum[elemIdxInFrame] = neutralElement;
            }

            // Phase 2: distribute the chunks over the blocks of the grid and accumulate per chunk element.
            auto const chunkDataExtent = numChunks * chunkExtents;
            auto traverseOverFrames = alpaka::onAcc::makeIdxMap(
                acc,
                alpaka::onAcc::worker::blocksInGrid,
                alpaka::IdxRange{chunkDataExtent.fill(0), chunkDataExtent, chunkExtents});

            for(auto chunkIdx : traverseOverFrames)
            {
                for(alpaka::concepts::Vector auto elemIdxInChunk : traverseInFrame)
                {
                    // Every element position within the chunk grid acts as one worker out of chunkDataExtent
                    // workers.
                    auto allThreads = alpaka::onAcc::SimdAlgo{
                        alpaka::onAcc::WorkerGroup{chunkIdx + elemIdxInChunk, chunkDataExtent}};

                    // reduce functor with simd package support
                    auto reducedValue
                        = allThreads
                              .transformReduce(acc, extentMd, neutralElement, reduceFunc, transformFunc, inputs...);
                    auto& tbSumRef = tbSum[elemIdxInChunk];
                    tbSumRef = reduceFunc(tbSumRef, reducedValue);
                }
            }

            auto const laneIdInBlock = linearize(acc[alpaka::layer::thread].count(), acc[alpaka::layer::thread].idx());
            auto const blockSize = acc[alpaka::layer::thread].count().product();

            // The slot a thread reads below may have been written by a different thread during phase 2.
            alpaka::onAcc::syncBlockThreads(acc);

            // Phase 3a: fold all slots at or beyond blockSize into the slot owned by the calling thread. Each
            // thread writes only dynS[laneIdInBlock] and reads only slots >= blockSize, so the threads do not
            // interfere with each other.
            // NOTE(review): assumes chunkExtents.product() >= blockSize, otherwise dynS[laneIdInBlock] lies
            // outside the scratch buffer — confirm against the frame specification used for the launch.
            for(auto [linearSharedElemIdx] : alpaka::onAcc::makeIdxMap(
                    acc,
                    alpaka::onAcc::worker::linearThreadsInBlock,
                    alpaka::IdxRange{blockSize, chunkExtents.product()}))
            {
                dynS[laneIdInBlock] = reduceFunc(dynS[laneIdInBlock], dynS[linearSharedElemIdx]);
            }

            // Phase 3b: tree reduction of the first blockSize slots into slot 0.
            // The number of live slots is halved and rounded up in every step, so the odd tail slot of a block
            // size that is not a power of two is carried into the next step instead of being dropped. For power
            // of two block sizes the steps are identical to halving an offset starting at blockSize / 2.
            for(auto numLive = blockSize; numLive > 1; numLive -= numLive / 2)
            {
                auto const numFolded = numLive / 2;
                auto const partnerOffset = numLive - numFolded;

                // Also orders phase 3a (first iteration) and the previous step before the reads of this step.
                alpaka::onAcc::syncBlockThreads(acc);
                if(laneIdInBlock < numFolded)
                {
                    dynS[laneIdInBlock] = reduceFunc(dynS[laneIdInBlock], dynS[laneIdInBlock + partnerOffset]);
                }
            }

            // Phase 4: slot 0 was last written by the thread with linear index 0 itself (or is its own slot when
            // the block holds a single thread), therefore no further barrier is required before reading it.
            if(laneIdInBlock == 0)
            {
                using alpaka::onAcc::atomic::alpakaAtomicInvoke;
                if constexpr(
                    alpaka::concepts::SpecializationOf<ALPAKA_TYPEOF(reduceFunc), ScalarFunc>
                    || alpaka::concepts::SpecializationOf<ALPAKA_TYPEOF(reduceFunc), StencilFunc>)
                {
                    // Handle wrapped reduce functors e.g. ScalarFunc or StencilFunc: the atomic invoke is
                    // called with the wrapped functor type.
                    using ReduceFunctor = typename ALPAKA_TYPEOF(reduceFunc)::Functor;
                    alpakaAtomicInvoke(
                        static_cast<ReduceFunctor const&>(reduceFunc),
                        acc,
                        output.data(),
                        dynS[laneIdInBlock]);
                }
                else
                {
                    alpakaAtomicInvoke(reduceFunc, acc, output.data(), dynS[laneIdInBlock]);
                }
            }
        }
    };


    template<typename T_DataType>

    inline void transformReduce(

        auto const& queue,

        alpaka::concepts::Executor auto const exec,

        T_DataType const& neutralElement,

        alpaka::concepts::IMdSpan auto out,

        auto&& reduceFn,

        auto&& transformFn,

        auto&& in0,

        alpaka::concepts::IDataSource auto&&... in)

    {

        auto extentMd = onHost::getExtents(in0);

        using IndexType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(extentMd)>;

        auto frameSpec = getSimdFrameSpec<T_DataType>(queue.getDevice(), exec, extentMd);


        /* Adjust the launch parameters to not oversubscribe a device too much.

         *

         * @todo: This heuristic should be adjusted based on benchmarking different cases.

         */

        {

            IndexType multiprocessorScaling = 1u;

            if constexpr(!(ALPAKA_TYPEOF(queue.getDevice().getDeviceKind()){} == deviceKind::cpu))

            {

                // For non-CPU devices, we scale the number of frames based on an arbitrary number derived from

                // testing with the dot kernel of the bablestream benchmark.

                multiprocessorScaling = 32u;

            }


            auto const numMultiProcessors = queue.getDevice().getDeviceProperties().multiProcessorCount;

            auto adjsutedNumFrames = alpaka::api::util::adjustToLimit(

                frameSpec.getNumFrames(),

                static_cast<IndexType>(numMultiProcessors * multiprocessorScaling));

            frameSpec = FrameSpec{adjsutedNumFrames, frameSpec.getFrameExtents(), exec};

        }


        /* Derive the chunk size and number of chunks from the SIMD optimized frame specification.

         * The chunking parameters influences the numerical precision because it provides the possibility to control

         * the length of the accumulation chain of a single thread.

         */

        auto numChunks = frameSpec.getNumFrames();

        auto chunkExtents = frameSpec.getFrameExtents();


        auto kernelFn = SimdTransformReduceKernel{

            static_cast<uint32_t>(frameSpec.getFrameExtents().product() * sizeof(T_DataType))};


        ALPAKA_LOG_INFO(

            onHost::logger::memory,

            [&]()

            {

                std::stringstream ss;

                ss << "transformReduce{ extents=" << extentMd << ", value_type=" << onHost::demangledName<T_DataType>()

                   << ", " << frameSpec << ", reduceFn=" << onHost::demangledName(reduceFn)

                   << ", transformFn=" << onHost::demangledName(transformFn) << " }";

                return ss.str();

            });


        onHost::fill(queue, out, neutralElement, out.getExtents().fill(1));

        queue.enqueue(

            frameSpec,

            KernelBundle{

                kernelFn,

                numChunks,

                chunkExtents,

                extentMd,

                neutralElement,

                out,

                ALPAKA_FORWARD(reduceFn),

                ALPAKA_FORWARD(transformFn),

                ALPAKA_FORWARD(in0),

                ALPAKA_FORWARD(in)...});

    }

} // namespace alpaka::onHost::internal

Acc.hpp

IDataSource.hpp

IMdSpan.hpp

MdSpan.hpp

SimdAlgo.hpp

Vec.hpp

util.hpp

common.hpp

ALPAKA_FN_ACC
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC.
Definition common.hpp:31

ALPAKA_TYPEOF
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:154

ALPAKA_FORWARD
#define ALPAKA_FORWARD(instance)
Perfectly forward an instance as argument.
Definition common.hpp:148

functor.hpp

logger.hpp

ALPAKA_LOG_INFO
#define ALPAKA_LOG_INFO(logLvl, callable)
Write a meta data message to the output.
Definition logger.hpp:106

alpaka::api::util::adjustToLimit
consteval auto adjustToLimit(concepts::CVector auto const input)
adjust the input vector to a given limit by halving all components until the product of these is belo...
Definition util.hpp:64

alpaka::deviceKind::cpu
constexpr auto cpu
Definition tag.hpp:168

alpaka::layer::thread
constexpr auto thread
Definition tag.hpp:253

alpaka::onAcc::atomic::alpakaAtomicInvoke
ALPAKA_FN_ACC void alpakaAtomicInvoke(auto &&fn, concepts::Acc auto const &acc, auto *inOut, auto &&... args)
Defines the equivalent of an atomic invoke for user defined functors.
Definition atomic.hpp:199

alpaka::onAcc::worker::blocksInGrid
constexpr auto blocksInGrid
Definition WorkerGroup.hpp:106

alpaka::onAcc::worker::allThreads
constexpr auto allThreads
Represent the identity of the executor thread.
Definition WorkerGroup.hpp:134

alpaka::onAcc::worker::linearThreadsInBlock
constexpr auto linearThreadsInBlock
Definition WorkerGroup.hpp:111

alpaka::onAcc::worker::threadsInBlock
constexpr auto threadsInBlock
Definition WorkerGroup.hpp:107

alpaka::onAcc::makeIdxMap
ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(auto const &acc, auto const workGroup, auto const range, T_Traverse traverse=T_Traverse{}, T_IdxLayout idxLayout=T_IdxLayout{})
Creates an index container.
Definition interface.hpp:57

alpaka::onAcc::getDynSharedMem
constexpr auto getDynSharedMem(concepts::Acc auto const &acc) -> T *
Get block shared dynamic memory.
Definition Acc.hpp:197

alpaka::onAcc::syncBlockThreads
constexpr void syncBlockThreads(concepts::Acc auto const &acc)
Synchronize all threads within a thread block.
Definition Acc.hpp:132

alpaka::onHost::logger::queue
constexpr auto queue
Definition lvl.hpp:127

alpaka::onHost::logger::memory
constexpr auto memory
Definition lvl.hpp:112

alpaka::onHost::FrameSpec
FrameSpec(T_NumFrames const &, T_FrameExtents const &) -> FrameSpec< alpaka::trait::getVec_t< T_NumFrames >, alpaka::trait::getVec_t< T_FrameExtents >, alpaka::exec::AnyExecutor >

alpaka::onHost::fill
void fill(Queue< T_Device, T_QueueKind > const &queue, auto &&dest, T_Value elementValue)
fill memory element wise
Definition Queue.hpp:366

alpaka::onHost::demangledName
constexpr auto demangledName()
Definition demangledName.hpp:49

alpaka::onHost::getExtents
decltype(auto) getExtents(auto &&any)
Object extents.
Definition interface.hpp:25

alpaka::trait::GetValueType_t
typename GetValueType< T >::type GetValueType_t
Definition trait.hpp:65

alpaka::StencilFunc
ALPAKA_FN_HOST_ACC StencilFunc(T_Func &&) -> StencilFunc< T_Func >

alpaka::calculatePitchesFromExtents
constexpr auto calculatePitchesFromExtents(T_Vec const &extent)
Calculate the pitches purely from the extents.
Definition DataPitches.hpp:18

alpaka::linearize
constexpr T_IntegralType linearize(Vec< T_IntegralType, T_dim - 1u, T_Storage > const &dim, Vec< T_IntegralType, T_dim, T_OtherStorage > const &idx)
Give the linear index of an N-dimensional index within an N-dimensional index space.
Definition Vec.hpp:839

alpaka::ScalarFunc
ALPAKA_FN_HOST_ACC ScalarFunc(T_Func &&) -> ScalarFunc< T_Func >

alpaka::KernelBundle
ALPAKA_FN_HOST KernelBundle(TKernelFn const &, TArgs &&...) -> KernelBundle< TKernelFn, TArgs... >
User defined deduction guide with trailing return type. For CTAD during the construction.

alpaka::IdxRange
ALPAKA_FN_HOST_ACC IdxRange(T_Extents const &) -> IdxRange< typename trait::getVec_t< T_Extents >::UniVec >

atomic.hpp

interface.hpp

trait.hpp