// Template header; T_ThreadSpec must satisfy onHost::concepts::ThreadSpec.
30 template<onHost::concepts::ThreadSpec T_ThreadSpec>
/** Construct the executor from a thread specification.
 *
 * @param threadBlocking thread specification, moved into m_threadBlocking
 * @param numaIdx NUMA index; NOTE(review): its member initializer is not visible in this extract
 * @param setThreadAffinity affinity switch; NOTE(review): its member initializer is not visible in this extract
 * @throws std::runtime_error when the product of getNumThreads() differs from 1
 */
33 constexpr OmpBlocks(T_ThreadSpec threadBlocking, uint32_t numaIdx,
bool setThreadAffinity)
34 : m_threadBlocking{std::move(threadBlocking)}
// Reject any specification whose per-block thread extent does not multiply out to exactly 1.
38 if(m_threadBlocking.getNumThreads().product() != 1u)
40 throw std::runtime_error(
"Thread block extent must be 1.");
/** Iterate over every block of the stored thread specification in an OpenMP work-sharing loop.
 *
 * NOTE(review): the kernel invocation itself is not visible in this extract; presumably
 * kernelBundle is called once per loop iteration - confirm against the full file.
 *
 * @param kernelBundle used here to query the dynamic shared memory size and as the second argument of
 *        trait::HasUserDefinedDynSharedMemBytes
 * @param dict not referenced on any visible line; presumably merged with the entries built below - TODO confirm
 */
44 void operator()(
auto const& kernelBundle,
auto const& dict)
const
46 using NumThreadsVecType =
typename T_ThreadSpec::NumThreadsVecType;
// Optionally set the affinity of the current thread for NUMA index m_numaIdx.
49 if(m_setThreadAffinity)
50 internal::hwloc::setThreadAffinity(m_numaIdx);
// Initialized from the block count; overwritten with the actual block index inside the loop below.
53 auto blockIdx = m_threadBlocking.getNumBlocks();
// NOTE(review): the initializer of simdWidth is missing from this extract (presumably getArchSimdWidth - confirm).
54 constexpr uint32_t simdWidth
// Shared-memory object parameterized on simdWidth; reset() is called on it after each loop iteration.
56 auto blockSharedMem = onAcc::cpu::SingleThreadStaticShared<simdWidth>{};
// Dynamic shared memory size in bytes, as returned by onHost::getDynSharedMemBytes for this spec and kernel.
59 uint32_t blockDynSharedMemBytes = onHost::getDynSharedMemBytes(m_threadBlocking, kernelBundle);
// Both dynamic-shared entries wrap references (std::ref) to the locals declared above.
60 auto const blockDynSharedMemEntry = DictEntry{layer::dynShared, std::ref(blockSharedMem)};
61 auto const blockDynSharedMemBytesEntry
62 = DictEntry{object::dynSharedMemBytes, std::ref(blockDynSharedMemBytes)};
// NOTE(review): the call that receives this trait value starts on a line missing from this extract;
// presumably conditionalAppendDict adds the two dynamic-shared entries only when the trait is true - confirm.
67 trait::HasUserDefinedDynSharedMemBytes<T_ThreadSpec,
ALPAKA_TYPEOF(kernelBundle)>::value>(
69 Dict{blockDynSharedMemEntry, blockDynSharedMemBytesEntry});
// Number of blocks; its product is the loop bound and it is the extent passed to mapToND.
71 auto blockCount = m_threadBlocking.getNumBlocks();
// The block layer receives const reference wrappers (std::cref) to blockIdx and blockCount, so it
// observes the per-iteration update of blockIdx (assuming GenericLayer keeps the wrappers - confirm).
73 auto const blockLayerEntry = DictEntry{
75 onAcc::cpu::GenericLayer{std::cref(blockIdx), std::cref(blockCount)}};
// Thread layer is a OneLayer, block synchronization is a NoOp, and the warp size entry is the constant 1.
76 auto const threadLayerEntry = DictEntry{layer::thread, onAcc::cpu::OneLayer<NumThreadsVecType>{}};
77 auto const blockSharedMemEntry = DictEntry{layer::shared, std::ref(blockSharedMem)};
78 auto const blockSyncEntry = DictEntry{action::threadBlockSync, onAcc::cpu::NoOp{}};
79 auto const warpSizeEntry = DictEntry{object::warpSize, std::integral_constant<uint32_t, 1u>{}};
82 Dict{blockLayerEntry, threadLayerEntry, blockSharedMemEntry, blockSyncEntry, warpSizeEntry},
// Element type of the thread-count vector; used as the type of the linear loop counter.
85 using ThreadIdxType =
typename NumThreadsVecType::type;
// Work-sharing loop; nowait drops the implicit barrier at the end of the loop.
86# pragma omp for nowait
87 for(ThreadIdxType i = 0; i < blockCount.product(); ++i)
// Map the linear iteration index to an N-dimensional block index.
89 blockIdx =
mapToND(blockCount, i);
// Call reset() on the shared-memory object at the end of the iteration.
91 blockSharedMem.reset();
// Thread specification supplied at construction.
96 T_ThreadSpec m_threadBlocking;
// NOTE(review): embedded line 97 is missing from this extract; presumably it declares m_numaIdx, which operator() reads.
// When true, operator() calls internal::hwloc::setThreadAffinity(m_numaIdx).
98 bool m_setThreadAffinity;
// NOTE(review): the declaration line carrying the function name is missing from this extract; the lines
// below are its parameter list. Embedded line 104 (presumably the numaIdx parameter) is missing as well.
103 alpaka::onHost::concepts::ThreadSpec
auto const& threadSpec,
105 bool setThreadAffinity)
// Constraint: only participates when the executor type of threadSpec is exec::CpuOmpBlocks.
requires std::same_as<
ALPAKA_TYPEOF(threadSpec.getExecutor()), exec::CpuOmpBlocks>
// Build a cpu::OmpBlocks from the three arguments; the template argument is deduced from threadSpec.
107 return cpu::OmpBlocks(threadSpec, numaIdx, setThreadAffinity);
#define ALPAKA_TYPEOF(...)
Get the type of an instance.
void setThreadAffinity(uint32_t numaIdx)
Set the affinity of the current thread to all cores of the NUMA domain.
Functionality which is usable on the host CPU controller thread.
auto makeAcc(alpaka::onHost::concepts::ThreadSpec auto const &threadSpec, uint32_t numaIdx, bool setThreadAffinity)
constexpr auto conditionalAppendDict(Dict< T_Entries0... > const &dict0, Dict< T_Entries1... > const &dict1)
consteval uint32_t getArchSimdWidth(concepts::Api auto const api, alpaka::concepts::DeviceKind auto const deviceType)
Get the SIMD width in bytes for an API and device kind combination.
constexpr auto joinDict(Dict< T_Entries0... > const &dict0, Dict< T_Entries1... > const &dict1)
ALPAKA_FN_HOST_ACC Dict(Tuple< DictEntry< T_Keys, T_Values >... > const &) -> Dict< DictEntry< T_Keys, T_Values >... >
constexpr Vec< T_IntegralType, T_dim > mapToND(Vec< T_IntegralType, T_dim, T_Storage > const &extents, T_IntegralType linearIdx)
Maps a linear index to an N-dimensional index.