alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
OmpBlocks.hpp
Go to the documentation of this file.
1/* Copyright 2024 René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
7#include "alpaka/Vec.hpp"
12#include "alpaka/core/Dict.hpp"
15#include "alpaka/onAcc/Acc.hpp"
17#include "alpaka/tag.hpp"
18
19#include <cassert>
20#include <stdexcept>
21#include <tuple>
22#include <type_traits>
23
24#if ALPAKA_OMP
25
26namespace alpaka::onHost
27{
28 namespace cpu
29 {
30 template<onHost::concepts::ThreadSpec T_ThreadSpec>
31 struct OmpBlocks
32 {
33 constexpr OmpBlocks(T_ThreadSpec threadBlocking, uint32_t numaIdx, bool setThreadAffinity)
34 : m_threadBlocking{std::move(threadBlocking)}
35 , m_numaIdx{numaIdx}
36 , m_setThreadAffinity{setThreadAffinity}
37 {
38 if(m_threadBlocking.getNumThreads().product() != 1u)
39 {
40 throw std::runtime_error("Thread block extent must be 1.");
41 }
42 }
43
44 void operator()(auto const& kernelBundle, auto const& dict) const
45 {
46 using NumThreadsVecType = typename T_ThreadSpec::NumThreadsVecType;
47# pragma omp parallel
48 {
49 if(m_setThreadAffinity)
50 internal::hwloc::setThreadAffinity(m_numaIdx);
51
52 // copy from num blocks to derive correct index type
53 auto blockIdx = m_threadBlocking.getNumBlocks();
54 constexpr uint32_t simdWidth
55 = alpaka::getArchSimdWidth<uint8_t>(api::host, ALPAKA_TYPEOF(dict[object::deviceKind]){});
56 auto blockSharedMem = onAcc::cpu::SingleThreadStaticShared<simdWidth>{};
57
58 // dynamic shared mem
59 uint32_t blockDynSharedMemBytes = onHost::getDynSharedMemBytes(m_threadBlocking, kernelBundle);
60 auto const blockDynSharedMemEntry = DictEntry{layer::dynShared, std::ref(blockSharedMem)};
61 auto const blockDynSharedMemBytesEntry
62 = DictEntry{object::dynSharedMemBytes, std::ref(blockDynSharedMemBytes)};
63
64 /* Only add dynamic shared memory objects if defined by the user, if not we will get a clean static
65 * assert if the kernel tries to access dynamic shared memory */
66 auto additionalDict = conditionalAppendDict<
67 trait::HasUserDefinedDynSharedMemBytes<T_ThreadSpec, ALPAKA_TYPEOF(kernelBundle)>::value>(
68 dict,
69 Dict{blockDynSharedMemEntry, blockDynSharedMemBytesEntry});
70
71 auto blockCount = m_threadBlocking.getNumBlocks();
72
73 auto const blockLayerEntry = DictEntry{
74 layer::block,
75 onAcc::cpu::GenericLayer{std::cref(blockIdx), std::cref(blockCount)}};
76 auto const threadLayerEntry = DictEntry{layer::thread, onAcc::cpu::OneLayer<NumThreadsVecType>{}};
77 auto const blockSharedMemEntry = DictEntry{layer::shared, std::ref(blockSharedMem)};
78 auto const blockSyncEntry = DictEntry{action::threadBlockSync, onAcc::cpu::NoOp{}};
79 auto const warpSizeEntry = DictEntry{object::warpSize, std::integral_constant<uint32_t, 1u>{}};
80
81 auto acc = onAcc::Acc(joinDict(
82 Dict{blockLayerEntry, threadLayerEntry, blockSharedMemEntry, blockSyncEntry, warpSizeEntry},
83 additionalDict));
84
85 using ThreadIdxType = typename NumThreadsVecType::type;
86# pragma omp for nowait
87 for(ThreadIdxType i = 0; i < blockCount.product(); ++i)
88 {
89 blockIdx = mapToND(blockCount, i);
90 kernelBundle(acc);
91 blockSharedMem.reset();
92 }
93 }
94 }
95
96 T_ThreadSpec m_threadBlocking;
97 uint32_t m_numaIdx;
98 bool m_setThreadAffinity;
99 };
100 } // namespace cpu
101
102 inline auto makeAcc(
103 alpaka::onHost::concepts::ThreadSpec auto const& threadSpec,
104 uint32_t numaIdx,
105 bool setThreadAffinity) requires std::same_as<ALPAKA_TYPEOF(threadSpec.getExecutor()), exec::CpuOmpBlocks>
106 {
107 return cpu::OmpBlocks(threadSpec, numaIdx, setThreadAffinity);
108 }
109} // namespace alpaka::onHost
110
111#endif
#define ALPAKA_TYPEOF(...)
Get the type of an instance.
Definition common.hpp:153
void setThreadAffinity(uint32_t numaIdx)
Set the affinity of the current thread to all cores of the NUMA domain.
Definition utility.hpp:180
Functionality which is usable on the host CPU controller thread.
Definition api.hpp:40
auto makeAcc(alpaka::onHost::concepts::ThreadSpec auto const &threadSpec, uint32_t numaIdx, bool setThreadAffinity)
Definition Serial.hpp:92
constexpr auto conditionalAppendDict(Dict< T_Entries0... > const &dict0, Dict< T_Entries1... > const &dict1)
Definition Dict.hpp:211
consteval uint32_t getArchSimdWidth(concepts::Api auto const api, alpaka::concepts::DeviceKind auto const deviceType)
Get the SIMD width in bytes for an API and device kind combination.
Definition trait.hpp:152
constexpr auto joinDict(Dict< T_Entries0... > const &dict0, Dict< T_Entries1... > const &dict1)
Definition Dict.hpp:200
ALPAKA_FN_HOST_ACC Dict(Tuple< DictEntry< T_Keys, T_Values >... > const &) -> Dict< DictEntry< T_Keys, T_Values >... >
constexpr Vec< T_IntegralType, T_dim > mapToND(Vec< T_IntegralType, T_dim, T_Storage > const &extents, T_IntegralType linearIdx)
Maps a linear index to an N-dimensional index.
Definition Vec.hpp:873