alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
cpuArchSize.hpp
Go to the documentation of this file.
1/* Copyright 2025 René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
#include "alpaka/utility.hpp"

#include <cstddef>
#include <cstdint>
#include <new>
11
12namespace alpaka::onHost::internal
13{
14
/** SIMD width in bytes as defined by std::simd (fallback overload).
 *
 * This unconstrained overload is chosen whenever the constrained overload below is not available, i.e. when
 * std::simd support is disabled or `native_simd<T_Type>` cannot be queried for its size.
 *
 * @tparam T_Type element type the SIMD width is requested for
 * @return 0 if std::simd is not supported or the T_Type is unsupported, else the SIMD width in bytes
 */
template<typename T_Type>
constexpr std::size_t stdSimdWidth()
{
    return 0;
}
// Testing `defined` first keeps the guard free of -Wundef diagnostics when the macro is not set at all.
#if defined(ALPAKA_HAS_STD_SIMD) && ALPAKA_HAS_STD_SIMD
/** SIMD width in bytes as defined by std::simd (std::simd available for T_Type).
 *
 * Being the more constrained overload, it is preferred over the fallback whenever
 * `alpakaStdSimd::native_simd<T_Type>::size()` is a valid expression.
 *
 * @tparam T_Type element type the SIMD width is requested for
 * @return `native_simd<T_Type>::size()` multiplied by `sizeof(T_Type)`
 *         (size() is presumably the number of lanes - confirm against the alpakaStdSimd alias)
 */
template<typename T_Type>
requires requires { alpakaStdSimd::native_simd<T_Type>::size(); }
constexpr std::size_t stdSimdWidth()
{
    return alpakaStdSimd::native_simd<T_Type>::size() * sizeof(T_Type);
}
#endif
32
33
34 template<typename T_Type>
35 constexpr uint32_t getCPUSimdWidth()
36 {
37 constexpr size_t possibleSimdWidthBytes =
38#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX512DQ__) || defined(__AVX512VL__)
39 64u;
40#elif defined(__riscv_vector)
41 64u;
42#elif defined(__riscv)
43 // do not use vectors if the vector extension is not set
44 sizeof(T_Type);
45#elif defined(__AVX2__)
46 32u;
47#elif defined(__SSE__) || defined(__SSE2__) || defined(__SSE4_1__) || defined(__SSE4_2__)
48 16u;
49// Macro to be define by the user to enable SVE backend and specify SVE size
50#elif defined(SVE_VECTOR_BITS)
51 SVE_VECTOR_BITS / 8;
52// If user has specified SVE vector lenght using the flag -msve-vector-bits
53#elif defined(__ARM_FEATURE_SVE_BITS)
54 __ARM_FEATURE_SVE_BITS / 8;
55// ARM e.g. nvidia grace hopper
56#elif defined(__ARM_FEATURE_SVE2_AES)
57 16u;
58// ARM e.g AWS Graviton 3
59#elif defined(__ARM_FEATURE_SVE)
60 32u;
61#elif defined(__ARM_NEON)
62 16u;
63#elif defined(__ALTIVEC__)
64 16u;
65#else
66 sizeof(T_Type);
67#endif
68
69 // we assume that the standard is maintaining the vector length better than we, therefore take it if vector
70 // types are supported
71 constexpr size_t simdWidthInByte = stdSimdWidth<T_Type>() ? stdSimdWidth<T_Type>() : possibleSimdWidthBytes;
72
73 return alpaka::divExZero(simdWidthInByte, sizeof(T_Type));
74 }
75
/** Assumed number of instructions a CPU core can issue at the same time.
 *
 * Intel CPUs can issue 4 commands, AMD CPUs typically 2. Both vendors can not be told apart here, so the
 * larger value is taken. ARM SVE can typically issue 4 commands as well.
 *
 * @return the current default of 4
 */
constexpr uint32_t getCPUNumPipelines()
{
    return 4u;
}
87
/** Cache line size of the CPU in bytes.
 *
 * Uses std::hardware_constructive_interference_size if the standard library announces it through the feature
 * test macro __cpp_lib_hardware_interference_size. Both the constant and the macro are provided by <new>, which
 * therefore must be included before this point; otherwise the fallback is taken silently.
 *
 * @return std::hardware_constructive_interference_size if available, else 64
 */
constexpr uint32_t getCPUCachelineSize()
{
#if defined(__cpp_lib_hardware_interference_size)
    // The library constant is a std::size_t; convert explicitly to the return type.
    return static_cast<uint32_t>(std::hardware_constructive_interference_size);
#else
    // Fallback value, typically 64 bytes
    return 64u;
#endif
}
100
101} // namespace alpaka::onHost::internal
ALPAKA_FN_HOST_ACC constexpr auto divExZero(Integral a, Integral b) -> Integral
Returns the max(a / b, 1) as integer.
Definition utility.hpp:41