alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
TbbBlocks.hpp
Go to the documentation of this file.
1/* Copyright 2024 Mehmet Yusufoglu, René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
12#include "alpaka/core/Dict.hpp"
13#include "alpaka/onAcc/Acc.hpp"
15#include "alpaka/tag.hpp"
16
17#include <cstddef>
18#include <stdexcept>
19#include <type_traits>
20
21#if ALPAKA_TBB
22# include <oneapi/tbb/blocked_range.h>
23# include <oneapi/tbb/parallel_for.h>
24# include <oneapi/tbb/task_group.h>
25
26namespace alpaka::onHost
27{
28 namespace cpu
29 {
30 template<onHost::concepts::ThreadSpec T_ThreadSpec>
31 struct TbbBlocks
32 {
33 using NumThreadsVecType = typename T_ThreadSpec::NumThreadsVecType;
34
35 // Construct the executor with the thread blocking configuration chosen by the queue.
36 constexpr TbbBlocks(T_ThreadSpec threadBlocking, uint32_t numaIdx, bool setThreadAffinity)
37 : m_threadBlocking(std::move(threadBlocking))
38 , m_numaIdx{numaIdx}
39 , m_setThreadAffinity{setThreadAffinity}
40 {
41 if(m_threadBlocking.getNumThreads().product() != 1u)
42 {
43 throw std::runtime_error("Thread block extent must be 1.");
44 }
45 }
46
47 void operator()(auto const& kernelBundle, auto const& dict) const
48 {
49 auto blockCount = m_threadBlocking.getNumBlocks();
50
51 constexpr uint32_t simdWidth
52 = alpaka::getArchSimdWidth<uint8_t>(api::host, ALPAKA_TYPEOF(dict[object::deviceKind]){});
53
54 oneapi::tbb::task_arena tbbArena;
55
56 auto kernel = [&]
57 {
58 using ThreadIdxType = typename NumThreadsVecType::type;
59 ThreadIdxType const linearNumBlocks = blockCount.product();
60
61 oneapi::tbb::parallel_for(
62 static_cast<ThreadIdxType>(0),
63 linearNumBlocks,
64 [&](ThreadIdxType i)
65 {
66 auto const blockIdx = mapToND(blockCount, i);
67
68 auto blockSharedMem = onAcc::cpu::SingleThreadStaticShared<simdWidth>{};
69 // Compose the accelerator dictionary entries consumed by the kernel.
70 auto const blockLayerEntry
71 = DictEntry{layer::block, onAcc::cpu::GenericLayer{std::cref(blockIdx), blockCount}};
72 auto const threadLayerEntry
73 = DictEntry{layer::thread, onAcc::cpu::OneLayer<NumThreadsVecType>{}};
74 auto const blockSharedMemEntry = DictEntry{layer::shared, std::ref(blockSharedMem)};
75 auto const blockSyncEntry = DictEntry{action::threadBlockSync, onAcc::cpu::NoOp{}};
76
77 // dynamic shared mem
78 uint32_t blockDynSharedMemBytes
79 = onHost::getDynSharedMemBytes(m_threadBlocking, kernelBundle);
80 auto const blockDynSharedMemEntry = DictEntry{layer::dynShared, std::ref(blockSharedMem)};
81 auto const blockDynSharedMemBytesEntry
82 = DictEntry{object::dynSharedMemBytes, std::ref(blockDynSharedMemBytes)};
83
84 auto additionalDict = conditionalAppendDict<
85 trait::HasUserDefinedDynSharedMemBytes<T_ThreadSpec, ALPAKA_TYPEOF(kernelBundle)>::
86 value>(dict, Dict{blockDynSharedMemEntry, blockDynSharedMemBytesEntry});
87
88 auto const warpSizeEntry
89 = DictEntry{object::warpSize, std::integral_constant<uint32_t, 1u>{}};
90
91 auto acc = onAcc::Acc(joinDict(
92 Dict{
93 blockLayerEntry,
94 threadLayerEntry,
95 blockSharedMemEntry,
96 blockSyncEntry,
97 warpSizeEntry},
98 additionalDict));
99
100 kernelBundle(acc);
101 });
102 };
103
104 if(m_numaIdx != internal::hwloc::allNumaDomains && m_setThreadAffinity)
105 {
106 oneapi::tbb::task_arena tbbArena;
107
108 auto const& tbbNumaNodes = oneapi::tbb::info::numa_nodes();
109 if(m_numaIdx >= tbbNumaNodes.size())
110 throw std::out_of_range("Invalid NUMA index");
111 auto tbbNumaIdx = tbbNumaNodes[m_numaIdx];
112 tbbArena.initialize(oneapi::tbb::task_arena::constraints{}.set_numa_id(tbbNumaIdx));
113 tbbArena.execute([&] { oneapi::tbb::this_task_arena::isolate(kernel); });
114 }
115 else
116 {
117 oneapi::tbb::this_task_arena::isolate(kernel);
118 }
119 }
120
121 T_ThreadSpec m_threadBlocking;
122 uint32_t m_numaIdx;
123 bool m_setThreadAffinity;
124 };
125 } // namespace cpu
126
127 inline auto makeAcc(
128 alpaka::onHost::concepts::ThreadSpec auto const& threadSpec,
129 uint32_t numaIdx,
130 bool setThreadAffinity) requires std::same_as<ALPAKA_TYPEOF(threadSpec.getExecutor()), exec::CpuTbbBlocks>
131 {
132 return cpu::TbbBlocks(threadSpec, numaIdx, setThreadAffinity);
133 }
134} // namespace alpaka::onHost
135#endif
#define ALPAKA_TYPEOF(...)
Get the type of an instance.
Definition common.hpp:153
void setThreadAffinity(uint32_t numaIdx)
Set the affinity of the current thread to all cores of the NUMA domain.
Definition utility.hpp:180
constexpr auto kernel
Definition lvl.hpp:142
Functionality which is usable on the host CPU controller thread.
Definition api.hpp:40
auto makeAcc(alpaka::onHost::concepts::ThreadSpec auto const &threadSpec, uint32_t numaIdx, bool setThreadAffinity)
Definition Serial.hpp:92
constexpr auto conditionalAppendDict(Dict< T_Entries0... > const &dict0, Dict< T_Entries1... > const &dict1)
Definition Dict.hpp:211
consteval uint32_t getArchSimdWidth(concepts::Api auto const api, alpaka::concepts::DeviceKind auto const deviceType)
Get the SIMD width in bytes for an API and device kind combination.
Definition trait.hpp:152
constexpr auto joinDict(Dict< T_Entries0... > const &dict0, Dict< T_Entries1... > const &dict1)
Definition Dict.hpp:200
ALPAKA_FN_HOST_ACC Dict(Tuple< DictEntry< T_Keys, T_Values >... > const &) -> Dict< DictEntry< T_Keys, T_Values >... >
constexpr Vec< T_IntegralType, T_dim > mapToND(Vec< T_IntegralType, T_dim, T_Storage > const &extents, T_IntegralType linearIdx)
Maps a linear index to an N-dimensional index.
Definition Vec.hpp:873