22# include <oneapi/tbb/blocked_range.h>
23# include <oneapi/tbb/parallel_for.h>
24# include <oneapi/tbb/task_group.h>
30 template<onHost::concepts::ThreadSpec T_ThreadSpec>
// Alias for the thread specification's NumThreadsVecType.
// Its nested ::type is used as the linear block index type in operator().
33 using NumThreadsVecType =
typename T_ThreadSpec::NumThreadsVecType;
// Constructs the executor from a thread specification, a NUMA index and a
// thread-affinity flag. The thread specification is moved into m_threadBlocking.
//
// Throws std::runtime_error when the product of the thread extents reported by
// getNumThreads() is not 1.
//
// NOTE(review): this listing omits some original lines (braces and the remaining
// member initializers). Presumably numaIdx and setThreadAffinity initialize
// m_numaIdx and m_setThreadAffinity - confirm against the full file.
36 constexpr TbbBlocks(T_ThreadSpec threadBlocking, uint32_t numaIdx,
bool setThreadAffinity)
37 : m_threadBlocking(std::move(threadBlocking))
// Only a thread extent whose product is exactly 1 is accepted.
41 if(m_threadBlocking.getNumThreads().product() != 1u)
43 throw std::runtime_error(
"Thread block extent must be 1.");
// Call operator: builds the per-block dictionary entries and hands the work to
// oneTBB (parallel_for over linear block indices, executed through
// this_task_arena::isolate, optionally inside a NUMA-constrained task arena).
//
// kernelBundle: used to query the dynamic shared memory size and to select the
//               HasUserDefinedDynSharedMemBytes trait.
// dict:         dictionary passed along with the dynamic-shared-memory entries.
//
// Throws std::out_of_range when the NUMA path is taken and m_numaIdx is not a
// valid index into the NUMA node list reported by oneTBB.
//
// NOTE(review): several original lines are missing from this listing (braces,
// the lambda header that defines `kernel` and `i`, the initializer of simdWidth,
// the upper bound of parallel_for, the else branch). The comments below only
// describe what is visible.
47 void operator()(
auto const& kernelBundle,
auto const& dict)
const
// Number of blocks as reported by the thread specification.
49 auto blockCount = m_threadBlocking.getNumBlocks();
// Compile-time SIMD width; its initializer is not visible in this listing.
51 constexpr uint32_t simdWidth
54 oneapi::tbb::task_arena tbbArena;
58 using ThreadIdxType =
typename NumThreadsVecType::type;
// Total number of blocks: product over all components of blockCount.
59 ThreadIdxType
const linearNumBlocks = blockCount.product();
// Parallel loop whose first argument (range begin) is 0 of type ThreadIdxType.
// Presumably the range end is linearNumBlocks - the argument is not visible here.
61 oneapi::tbb::parallel_for(
62 static_cast<ThreadIdxType
>(0),
// Map the linear index i to an N-dimensional block index within blockCount.
66 auto const blockIdx =
mapToND(blockCount, i);
// Shared-memory object referenced by both the layer::shared and the
// layer::dynShared dictionary entries below.
68 auto blockSharedMem = onAcc::cpu::SingleThreadStaticShared<simdWidth>{};
// Block layer entry: refers to the current block index and carries the block count.
70 auto const blockLayerEntry
71 = DictEntry{layer::block, onAcc::cpu::GenericLayer{std::cref(blockIdx), blockCount}};
72 auto const threadLayerEntry
73 = DictEntry{layer::thread, onAcc::cpu::OneLayer<NumThreadsVecType>{}};
74 auto const blockSharedMemEntry = DictEntry{layer::shared, std::ref(blockSharedMem)};
// Thread-block synchronization is a no-op here; presumably because the
// constructor only accepts a thread extent of 1 - confirm.
75 auto const blockSyncEntry = DictEntry{action::threadBlockSync, onAcc::cpu::NoOp{}};
// Dynamic shared memory size in bytes, queried for this thread specification
// and kernel bundle.
78 uint32_t blockDynSharedMemBytes
79 = onHost::getDynSharedMemBytes(m_threadBlocking, kernelBundle);
80 auto const blockDynSharedMemEntry = DictEntry{layer::dynShared, std::ref(blockSharedMem)};
81 auto const blockDynSharedMemBytesEntry
82 = DictEntry{object::dynSharedMemBytes, std::ref(blockDynSharedMemBytes)};
// The two dynamic-shared-memory entries are passed together with dict to a call
// parameterized by HasUserDefinedDynSharedMemBytes. The callee name is not
// visible in this listing (presumably conditionalAppendDict - confirm).
85 trait::HasUserDefinedDynSharedMemBytes<T_ThreadSpec,
ALPAKA_TYPEOF(kernelBundle)>::
86 value>(dict,
Dict{blockDynSharedMemEntry, blockDynSharedMemBytesEntry});
// Warp size is fixed to 1 as a compile-time constant.
88 auto const warpSizeEntry
89 = DictEntry{object::warpSize, std::integral_constant<uint32_t, 1u>{}};
// NUMA path: taken only when a specific NUMA domain is selected and thread
// affinity is enabled.
104 if(m_numaIdx != internal::hwloc::allNumaDomains && m_setThreadAffinity)
106 oneapi::tbb::task_arena tbbArena;
108 auto const& tbbNumaNodes = oneapi::tbb::info::numa_nodes();
// Reject indices outside the NUMA node list reported by oneTBB.
109 if(m_numaIdx >= tbbNumaNodes.size())
110 throw std::out_of_range(
"Invalid NUMA index");
// Translate the index into oneTBB's NUMA node id and constrain the arena to it.
111 auto tbbNumaIdx = tbbNumaNodes[m_numaIdx];
112 tbbArena.initialize(oneapi::tbb::task_arena::constraints{}.set_numa_id(tbbNumaIdx));
// Run the kernel callable inside the constrained arena, wrapped in isolate().
113 tbbArena.execute([&] { oneapi::tbb::this_task_arena::isolate(kernel); });
// Path without an explicit arena (the else header is not visible in this
// listing): run the kernel callable via isolate() in the current arena.
117 oneapi::tbb::this_task_arena::isolate(kernel);
// Thread specification given to the constructor; provides the block count and
// thread extent used by operator().
121 T_ThreadSpec m_threadBlocking;
// When true and a specific NUMA index is selected, operator() runs the kernel
// inside a task arena constrained to that NUMA node.
123 bool m_setThreadAffinity;
// Factory returning a cpu::TbbBlocks built from threadSpec, numaIdx and
// setThreadAffinity. Only participates for thread specifications whose
// getExecutor() type is exec::CpuTbbBlocks (see the requires clause).
//
// NOTE(review): the line carrying the function name and the numaIdx parameter is
// missing from this listing; presumably this is makeAcc(threadSpec, numaIdx,
// setThreadAffinity) - confirm against the full file.
128 alpaka::onHost::concepts::ThreadSpec
auto const& threadSpec,
130 bool setThreadAffinity)
requires std::same_as<
ALPAKA_TYPEOF(threadSpec.getExecutor()), exec::CpuTbbBlocks>
132 return cpu::TbbBlocks(threadSpec, numaIdx, setThreadAffinity);
#define ALPAKA_TYPEOF(...)
Get the type of instance.
void setThreadAffinity(uint32_t numaIdx)
Set the affinity of the current thread to all cores of the NUMA domain.
Functionality which is usable on the host CPU controller thread.
auto makeAcc(alpaka::onHost::concepts::ThreadSpec auto const &threadSpec, uint32_t numaIdx, bool setThreadAffinity)
constexpr auto conditionalAppendDict(Dict< T_Entries0... > const &dict0, Dict< T_Entries1... > const &dict1)
consteval uint32_t getArchSimdWidth(concepts::Api auto const api, alpaka::concepts::DeviceKind auto const deviceType)
Get the SIMD width in bytes for an API and device kind combination.
constexpr auto joinDict(Dict< T_Entries0... > const &dict0, Dict< T_Entries1... > const &dict1)
ALPAKA_FN_HOST_ACC Dict(Tuple< DictEntry< T_Keys, T_Values >... > const &) -> Dict< DictEntry< T_Keys, T_Values >... >
constexpr Vec< T_IntegralType, T_dim > mapToND(Vec< T_IntegralType, T_dim, T_Storage > const &extents, T_IntegralType linearIdx)
Maps a linear index to an N-dimensional index.