alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
warp.hpp
Go to the documentation of this file.
1/* Copyright 2025 Mehmet Yusufoglu, René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
8#include "alpaka/concepts.hpp"
11
12#include <algorithm>
13#include <cstdint>
14
15#if ALPAKA_LANG_ONEAPI
16# include <sycl/sycl.hpp>
17
18namespace alpaka::onAcc::warp::internal
19{
20 // GPU back-ends use native SYCL subgroup operations.
21 template<alpaka::onAcc::concepts::Acc T_Acc>
22 struct Activemask::Op<T_Acc, api::OneApi>
23 {
24 auto operator()(T_Acc const&, api::OneApi) const
25 {
26 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
27
28 return getMask(sg);
29 }
30
31 static auto getMask(auto const subGroup)
32 {
33 auto sgMask = sycl::ext::oneapi::group_ballot(subGroup, true);
34
35 constexpr auto const warpSize = T_Acc::getWarpSize();
36 using ReturnType = std::conditional_t<warpSize <= 32, uint32_t, uint64_t>;
37 ReturnType mask;
38 sgMask.extract_bits(mask, 0u);
39 return mask;
40 };
41 };
42
43 template<alpaka::onAcc::concepts::Acc T_Acc>
44 struct GetLaneIdx::Op<T_Acc, api::OneApi>
45 {
46 constexpr auto operator()(T_Acc const&, api::OneApi) const
47 {
48 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
49 // lane id within the warp subgroup
50 return sg.get_local_id()[0];
51 }
52 };
53
54 template<alpaka::onAcc::concepts::Acc T_Acc>
55 struct GetWarpIdx::Op<T_Acc, api::OneApi>
56 {
57 constexpr auto operator()(T_Acc const&, api::OneApi) const
58 {
59 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
60 // lane id within the warp subgroup
61 return sg.get_group_linear_id();
62 }
63 };
64
65 template<alpaka::onAcc::concepts::Acc T_Acc>
66 struct All::Op<T_Acc, api::OneApi>
67 {
68 bool operator()(T_Acc const& acc, api::OneApi, int32_t predicate) const
69 {
70 using DeviceKind = ALPAKA_TYPEOF(acc[object::deviceKind]);
71 if constexpr(DeviceKind{} == alpaka::deviceKind::amdGpu)
72 {
73 /* Workaround for AMD GPUs: Sycl is taking the results of the non active threads into account
74 * and therefore even if all participating threads have a true predicate the result will be false.
75 * We vote with ballot and mask the result with the active thread mask.
76 */
77 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
78 auto activeMask = Activemask::Op<T_Acc, api::OneApi>::getMask(sg);
79 auto sgMask = sycl::ext::oneapi::group_ballot(sg, predicate != 0);
80
81 constexpr auto const warpSize = T_Acc::getWarpSize();
82 using ReturnType = std::conditional_t<warpSize <= 32, uint32_t, uint64_t>;
83 ReturnType predicateMask;
84 sgMask.extract_bits(predicateMask, 0u);
85 return activeMask & predicateMask == activeMask;
86 }
87 else
88 {
89 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
90 return sycl::all_of_group(sg, predicate != 0);
91 }
92 }
93 };
94
95 template<alpaka::onAcc::concepts::Acc T_Acc>
96 struct Any::Op<T_Acc, api::OneApi>
97 {
98 bool operator()(T_Acc const& acc, api::OneApi, int32_t predicate) const
99 {
100 using DeviceKind = ALPAKA_TYPEOF(acc[object::deviceKind]);
101 if constexpr(DeviceKind{} == alpaka::deviceKind::amdGpu)
102 {
103 /* Workaround for AMD GPUs: Sycl is taking the results of non active threads into account
104 * and therefore even if all participating threads have a false predicate the result will be true.
105 * We vote with ballot and mask the result with the active thread mask.
106 */
107 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
108 auto activeMask = Activemask::Op<T_Acc, api::OneApi>::getMask(sg);
109 auto sgMask = sycl::ext::oneapi::group_ballot(sg, predicate != 0);
110
111 constexpr auto const warpSize = T_Acc::getWarpSize();
112 using ReturnType = std::conditional_t<warpSize <= 32, uint32_t, uint64_t>;
113 ReturnType predicateMask;
114 sgMask.extract_bits(predicateMask, 0u);
115 return activeMask & predicateMask;
116 }
117 else
118 {
119 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
120 return sycl::any_of_group(sg, predicate != 0);
121 }
122 }
123 };
124
125 template<alpaka::onAcc::concepts::Acc T_Acc>
126 struct Ballot::Op<T_Acc, api::OneApi>
127 {
128 auto operator()(T_Acc const&, api::OneApi, int32_t predicate) const
129 {
130 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
131 auto sgMask = sycl::ext::oneapi::group_ballot(sg, predicate != 0);
132
133 constexpr auto const warpSize = T_Acc::getWarpSize();
134 using ReturnType = std::conditional_t<warpSize <= 32, uint32_t, uint64_t>;
135 ReturnType mask;
136 sgMask.extract_bits(mask, 0u);
137 return mask;
138 }
139 };
140
141 template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
142 struct Shfl::Op<T_Acc, api::OneApi, T>
143 {
144 constexpr T operator()(T_Acc const&, api::OneApi, T const& value, uint32_t srcLane, uint32_t width) const
145 {
146 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
147 uint32_t laneIdxInWarp = sg.get_local_id()[0];
148 uint32_t partitionOffset = (laneIdxInWarp / width) * width;
149 uint32_t srcInPartitionLaneIdx = partitionOffset + (srcLane % width);
150
151 return sycl::select_from_group(sg, value, srcInPartitionLaneIdx);
152 }
153 };
154
155 template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
156 struct ShflDown::Op<T_Acc, api::OneApi, T>
157 {
158 constexpr T operator()(T_Acc const&, api::OneApi, T const& value, uint32_t delta, uint32_t width) const
159 {
160 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
161
162 uint32_t laneIdxInWarp = sg.get_local_id()[0];
163 uint32_t groupEndIdx = (laneIdxInWarp / width + 1) * width;
164
165 T result = sycl::shift_group_left(sg, value, delta);
166 if(laneIdxInWarp + delta >= groupEndIdx)
167 result = value;
168 return result;
169 }
170 };
171
172 template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
173 struct ShflUp::Op<T_Acc, api::OneApi, T>
174 {
175 constexpr T operator()(T_Acc const&, api::OneApi, T const& value, uint32_t delta, uint32_t width) const
176 {
177 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
178
179 uint32_t laneIdxInWarp = sg.get_local_id()[0];
180 uint32_t groupStartIdx = (laneIdxInWarp / width) * width;
181
182 T result = sycl::shift_group_right(sg, value, delta);
183 if(laneIdxInWarp - groupStartIdx < delta)
184 result = value;
185 return result;
186 }
187 };
188
189 template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
190 struct ShflXor::Op<T_Acc, api::OneApi, T>
191 {
192 constexpr T operator()(T_Acc const&, api::OneApi, T const& value, uint32_t laneMask, uint32_t width) const
193 {
194 sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
195 uint32_t laneIdxInWarp = sg.get_local_id()[0];
196 uint32_t groupStartIdx = (laneIdxInWarp / width) * width;
197 uint32_t const relativeIdx = laneIdxInWarp - groupStartIdx;
198 uint32_t const sourceLane = (relativeIdx % width) ^ laneMask;
199 return sycl::select_from_group(sg, value, sourceLane < width ? groupStartIdx + sourceLane : laneIdxInWarp);
200 }
201 };
202} // namespace alpaka::onAcc::warp::internal
203#endif
#define ALPAKA_TYPEOF(...)
Get the type of instance.
Definition common.hpp:153
constexpr auto amdGpu
Definition tag.hpp:190
constexpr WarpSize warpSize
Definition tag.hpp:44
constexpr DeviceKind deviceKind
Definition tag.hpp:30
constexpr Api api
Definition tag.hpp:24