latest/doxygen_dev/OmpBlocks_8hpp_source.html

/* Copyright 2024 René Widera

 * SPDX-License-Identifier: MPL-2.0

 */


#pragma once


#include "alpaka/Vec.hpp"

#include "alpaka/api/host/IdxLayer.hpp"

#include "alpaka/api/host/block/mem/SingleThreadStaticShared.hpp"

#include "alpaka/api/host/block/sync/NoOp.hpp"

#include "alpaka/api/host/hwloc/utility.hpp"

#include "alpaka/core/Dict.hpp"

#include "alpaka/core/common.hpp"

#include "alpaka/meta/NdLoop.hpp"

#include "alpaka/onAcc/Acc.hpp"

#include "alpaka/onHost/ThreadSpec.hpp"

#include "alpaka/tag.hpp"


#include <cassert>

#include <stdexcept>

#include <tuple>

#include <type_traits>


#if ALPAKA_OMP


namespace alpaka::onHost

{

    namespace cpu

    {

        /** Host executor that spreads the blocks of a kernel launch over the threads of an OpenMP team.
         *
         * Each block is handled by one OpenMP thread, so only thread specifications with exactly one thread
         * per block are accepted (enforced by the constructor).
         *
         * @tparam T_ThreadSpec thread specification type providing the block and thread extents
         */
        template<onHost::concepts::ThreadSpec T_ThreadSpec>
        struct OmpBlocks
        {
            /** Store the launch configuration.
             *
             * @param threadBlocking block extent and per-block thread extent of the launch
             * @param numaIdx index handed to internal::hwloc::setThreadAffinity() when pinning is enabled
             * @param setThreadAffinity if true, every OpenMP thread sets its affinity before executing blocks
             * @throws std::runtime_error if the product of the per-block thread extent is not 1
             */
            constexpr OmpBlocks(T_ThreadSpec threadBlocking, uint32_t numaIdx, bool setThreadAffinity)
                : m_threadBlocking{std::move(threadBlocking)}
                , m_numaIdx{numaIdx}
                , m_setThreadAffinity{setThreadAffinity}
            {
                bool const hasSingleThreadPerBlock = m_threadBlocking.getNumThreads().product() == 1u;
                if(!hasSingleThreadPerBlock)
                {
                    throw std::runtime_error("Thread block extent must be 1.");
                }
            }

            /** Invoke the kernel once for every block of the stored thread specification.
             *
             * @param kernelBundle callable that is invoked as kernelBundle(acc) for each block
             * @param dict dictionary that is merged into the accelerator passed to the kernel; its
             *             object::deviceKind entry selects the SIMD width of the shared memory object
             *
             * NOTE(review): nothing in here catches exceptions raised inside the parallel region - confirm that
             * kernels and setThreadAffinity() are expected not to throw.
             */
            void operator()(auto const& kernelBundle, auto const& dict) const
            {
                using NumThreadsVecType = typename T_ThreadSpec::NumThreadsVecType;
                using LinearIdxType = typename NumThreadsVecType::type;

#    pragma omp parallel
                {
                    // Everything declared inside this region exists once per OpenMP thread.
                    if(m_setThreadAffinity)
                    {
                        internal::hwloc::setThreadAffinity(m_numaIdx);
                    }

                    // Initialised from the block extent only to obtain a vector of the matching index type; the
                    // value is overwritten before each kernel invocation.
                    auto currentBlockIdx = m_threadBlocking.getNumBlocks();

                    constexpr uint32_t sharedMemSimdWidth
                        = alpaka::getArchSimdWidth<uint8_t>(api::host, ALPAKA_TYPEOF(dict[object::deviceKind]){});
                    auto sharedMem = onAcc::cpu::SingleThreadStaticShared<sharedMemSimdWidth>{};

                    // Entries describing dynamic shared memory; they refer to the same shared memory object that
                    // is registered for layer::shared below.
                    uint32_t dynSharedMemBytes = onHost::getDynSharedMemBytes(m_threadBlocking, kernelBundle);
                    auto const dynSharedLayerEntry = DictEntry{layer::dynShared, std::ref(sharedMem)};
                    auto const dynSharedBytesEntry
                        = DictEntry{object::dynSharedMemBytes, std::ref(dynSharedMemBytes)};

                    // The dynamic shared memory entries are appended only when the user defined a dynamic shared
                    // memory size. Otherwise a kernel touching dynamic shared memory fails with a clean static
                    // assert.
                    auto kernelDict = conditionalAppendDict<
                        trait::HasUserDefinedDynSharedMemBytes<T_ThreadSpec, ALPAKA_TYPEOF(kernelBundle)>::value>(
                        dict,
                        Dict{dynSharedLayerEntry, dynSharedBytesEntry});

                    auto numBlocks = m_threadBlocking.getNumBlocks();

                    // The block layer only references the index and extent, so updating currentBlockIdx in the
                    // loop below is visible through the accelerator.
                    auto const blockEntry = DictEntry{
                        layer::block,
                        onAcc::cpu::GenericLayer{std::cref(currentBlockIdx), std::cref(numBlocks)}};
                    auto const threadEntry = DictEntry{layer::thread, onAcc::cpu::OneLayer<NumThreadsVecType>{}};
                    auto const sharedEntry = DictEntry{layer::shared, std::ref(sharedMem)};
                    // A block consists of one thread only, hence block synchronisation is a no-op.
                    auto const syncEntry = DictEntry{action::threadBlockSync, onAcc::cpu::NoOp{}};
                    auto const warpEntry = DictEntry{object::warpSize, std::integral_constant<uint32_t, 1u>{}};

                    auto acc = onAcc::Acc(
                        joinDict(Dict{blockEntry, threadEntry, sharedEntry, syncEntry, warpEntry}, kernelDict));

                    // The block extent does not change while the blocks are processed.
                    auto const numLinearBlocks = numBlocks.product();

#    pragma omp for nowait
                    for(LinearIdxType linearBlockIdx = 0; linearBlockIdx < numLinearBlocks; ++linearBlockIdx)
                    {
                        currentBlockIdx = mapToND(numBlocks, linearBlockIdx);
                        kernelBundle(acc);
                        // presumably returns the shared memory to a clean state for the next block - confirm
                        // against SingleThreadStaticShared
                        sharedMem.reset();
                    }
                }
            }

            //! block extent and per-block thread extent of the launch
            T_ThreadSpec m_threadBlocking;
            //! index passed to internal::hwloc::setThreadAffinity()
            uint32_t m_numaIdx;
            //! whether the OpenMP threads set their affinity before executing blocks
            bool m_setThreadAffinity;
        };

    } // namespace cpu


    /** Create the OpenMP block executor for a thread specification.
     *
     * This overload is only viable if the executor of threadSpec is exec::CpuOmpBlocks.
     *
     * @param threadSpec thread specification that is copied into the returned executor
     * @param numaIdx index forwarded to the cpu::OmpBlocks constructor
     * @param setThreadAffinity thread affinity flag forwarded to the cpu::OmpBlocks constructor
     * @return cpu::OmpBlocks instance built from the three arguments
     */
    inline auto makeAcc(
        alpaka::onHost::concepts::ThreadSpec auto const& threadSpec,
        uint32_t numaIdx,
        bool setThreadAffinity) requires std::same_as<ALPAKA_TYPEOF(threadSpec.getExecutor()), exec::CpuOmpBlocks>
    {
        return cpu::OmpBlocks{threadSpec, numaIdx, setThreadAffinity};
    }

} // namespace alpaka::onHost


#endif

Acc.hpp

Dict.hpp

NdLoop.hpp

NoOp.hpp

SingleThreadStaticShared.hpp

ThreadSpec.hpp

Vec.hpp

utility.hpp

common.hpp

ALPAKA_TYPEOF
#define ALPAKA_TYPEOF(...)
Get the type of an instance.
Definition common.hpp:154

IdxLayer.hpp

alpaka::onHost::cpu
Definition Device.hpp:30

alpaka::onHost::internal::hwloc::setThreadAffinity
void setThreadAffinity(uint32_t cpuDomainIdx)
Set the affinity of the current thread to all cores of a CPU domain.
Definition utility.hpp:301

alpaka::onHost
Functionality which is usable on the host CPU controller thread.
Definition api.hpp:40

alpaka::onHost::makeAcc
auto makeAcc(alpaka::onHost::concepts::ThreadSpec auto const &threadSpec, uint32_t numaIdx, bool setThreadAffinity)
Definition Serial.hpp:92

alpaka::conditionalAppendDict
constexpr auto conditionalAppendDict(Dict< T_Entries0... > const &dict0, Dict< T_Entries1... > const &dict1)
Definition Dict.hpp:215

alpaka::getArchSimdWidth
consteval uint32_t getArchSimdWidth(concepts::Api auto const api, alpaka::concepts::DeviceKind auto const deviceType)
Get the SIMD width in bytes for an API and device kind combination.
Definition trait.hpp:152

alpaka::joinDict
constexpr auto joinDict(Dict< T_Entries0... > const &dict0, Dict< T_Entries1... > const &dict1)
Definition Dict.hpp:204

alpaka::Dict
ALPAKA_FN_HOST_ACC Dict(Tuple< DictEntry< T_Keys, T_Values >... > const &) -> Dict< DictEntry< T_Keys, T_Values >... >

alpaka::mapToND
constexpr Vec< T_IntegralType, T_dim > mapToND(Vec< T_IntegralType, T_dim, T_Storage > const &extents, T_IntegralType linearIdx)
Maps a linear index to an N-dimensional index.
Definition Vec.hpp:880

tag.hpp