alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
memFence.hpp
Go to the documentation of this file.
1/* Copyright 2025 Mehmet Yusufoglu, René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
11#include "alpaka/onAcc/Acc.hpp"
14
15#include <type_traits>
16
17#if ALPAKA_LANG_CUDA
18
20{
21
22 namespace detail
23 {
24 template<concepts::MemoryOrder TMemOrder>
25 [[maybe_unused]] static constexpr __device__ void cuda_ptx_fence_system([[maybe_unused]] TMemOrder const)
26 {
27# if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(9, 0, 0)
28 // full acquire/release semantics support
29 if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
30 { // Relaxed ordering requires no fence
31 }
32 else if constexpr(std::is_same_v<TMemOrder, order::Acquire>)
33 {
34 asm volatile("fence.acquire.sys;" ::);
35 }
36 else if constexpr(std::is_same_v<TMemOrder, order::Release>)
37 {
38 asm volatile("fence.release.sys;" ::);
39 }
40 else if constexpr(std::is_same_v<TMemOrder, order::AcqRel>)
41 {
42 asm volatile("fence.acq_rel.sys;" ::);
43 }
44 else
45 { // Sequential consistency
46 asm volatile("fence.sc.sys;" ::);
47 }
48# elif ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
49 // only acq_rel and sc available
50 if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
51 { // Relaxed ordering requires no fence
52 }
53 else if constexpr(std::is_same_v<TMemOrder, order::Acquire>)
54 {
55 asm volatile("fence.acq_rel.sys;" ::);
56 }
57 else if constexpr(std::is_same_v<TMemOrder, order::Release>)
58 {
59 asm volatile("fence.acq_rel.sys;" ::);
60 }
61 else if constexpr(std::is_same_v<TMemOrder, order::AcqRel>)
62 {
63 asm volatile("fence.acq_rel.sys;" ::);
64 }
65 else
66 {
67 // Sequential consistency
68 asm volatile("fence.sc.sys;" ::);
69 }
70# endif
71 }
72
73 template<concepts::MemoryOrder TMemOrder>
74 [[maybe_unused]] static constexpr __device__ void cuda_ptx_fence_device([[maybe_unused]] TMemOrder const)
75 {
76# if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(9, 0, 0)
77 // full acquire/release semantics support
78 if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
79 { // Relaxed ordering requires no fence
80 }
81 else if constexpr(std::is_same_v<TMemOrder, order::Acquire>)
82 {
83 asm volatile("fence.acquire.gpu;" ::);
84 }
85 else if constexpr(std::is_same_v<TMemOrder, order::Release>)
86 {
87 asm volatile("fence.release.gpu;" ::);
88 }
89 else if constexpr(std::is_same_v<TMemOrder, order::AcqRel>)
90 {
91 asm volatile("fence.acq_rel.gpu;" ::);
92 }
93 else
94 { // Sequential consistency
95 asm volatile("fence.sc.gpu;" ::);
96 }
97# elif ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
98 // only acq_rel and sc available
99 if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
100 { // Relaxed ordering requires no fence
101 }
102 else if constexpr(std::is_same_v<TMemOrder, order::Acquire>)
103 {
104 asm volatile("fence.acq_rel.gpu;" ::);
105 }
106 else if constexpr(std::is_same_v<TMemOrder, order::Release>)
107 {
108 asm volatile("fence.acq_rel.gpu;" ::);
109 }
110 else if constexpr(std::is_same_v<TMemOrder, order::AcqRel>)
111 {
112 asm volatile("fence.acq_rel.gpu;" ::);
113 }
114 else
115 {
116 // Sequential consistency
117 asm volatile("fence.sc.gpu;" ::);
118 }
119# endif
120 }
121
122 template<concepts::MemoryOrder TMemOrder>
123 [[maybe_unused]] static constexpr __device__ void cuda_ptx_fence_block([[maybe_unused]] TMemOrder const)
124 {
125# if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(9, 0, 0)
126 // full acquire/release semantics support
127 if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
128 { // Relaxed ordering requires no fence
129 }
130 else if constexpr(std::is_same_v<TMemOrder, order::Acquire>)
131 {
132 asm volatile("fence.acquire.cta;" ::);
133 }
134 else if constexpr(std::is_same_v<TMemOrder, order::Release>)
135 {
136 asm volatile("fence.release.cta;" ::);
137 }
138 else if constexpr(std::is_same_v<TMemOrder, order::AcqRel>)
139 {
140 asm volatile("fence.acq_rel.cta;" ::);
141 }
142 else
143 { // Sequential consistency
144 asm volatile("fence.sc.cta;" ::);
145 }
146# elif ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
147 // only acq_rel and sc available
148 if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
149 { // Relaxed ordering requires no fence
150 }
151 else if constexpr(std::is_same_v<TMemOrder, order::Acquire>)
152 {
153 asm volatile("fence.acq_rel.cta;" ::);
154 }
155 else if constexpr(std::is_same_v<TMemOrder, order::Release>)
156 {
157 asm volatile("fence.acq_rel.cta;" ::);
158 }
159 else if constexpr(std::is_same_v<TMemOrder, order::AcqRel>)
160 {
161 asm volatile("fence.acq_rel.cta;" ::);
162 }
163 else
164 { // Sequential consistency
165 asm volatile("fence.sc.cta;" ::);
166 }
167# endif
168 }
169
170 template<concepts::MemoryOrder TMemOrder>
171 [[maybe_unused]] static constexpr __device__ void cuda_mem_fence_block([[maybe_unused]] TMemOrder const order)
172 {
173 if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
174 { // Relaxed ordering requires no fence
175 return;
176 }
177# ifdef ALPAKA_CUDA_ATOMIC
178 ::cuda::atomic_thread_fence(MemOrderCuda::get(order), ::cuda::thread_scope_block);
179# else
180# if ALPAKA_ARCH_PTX
181# if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
182 cuda_ptx_fence_block(order);
183# else
184 __threadfence_block();
185# endif
186# endif
187# endif
188 }
189
190 template<concepts::MemoryOrder TMemOrder>
191 [[maybe_unused]] static constexpr __device__ void cuda_mem_fence_device([[maybe_unused]] TMemOrder const order)
192 {
193 if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
194 { // Relaxed ordering requires no fence
195 return;
196 }
197# ifdef ALPAKA_CUDA_ATOMIC
198 ::cuda::atomic_thread_fence(MemOrderCuda::get(order), ::cuda::thread_scope_device);
199# else
200# if ALPAKA_ARCH_PTX
201# if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
202 cuda_ptx_fence_device(order);
203# else
204 __threadfence();
205# endif
206# endif
207# endif
208 }
209
210 template<concepts::MemoryOrder TMemOrder>
211 [[maybe_unused]] static constexpr __device__ void cuda_mem_fence_system([[maybe_unused]] TMemOrder const order)
212 {
213 if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
214 { // Relaxed ordering requires no fence
215 return;
216 }
217# ifdef ALPAKA_CUDA_ATOMIC
218 ::cuda::atomic_thread_fence(MemOrderCuda::get(order), ::cuda::thread_scope_system);
219# else
220# if ALPAKA_ARCH_PTX
221# if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
222 cuda_ptx_fence_system(order);
223# else
224 __threadfence_system();
225# endif
226# endif
227# endif
228 }
229
230 } // namespace detail
231
232 /** Specializations should not have to be enabled for backend combinations without CUDA
233 * Removing this top level guard will not cause issues if intrinsics like __threadfence_block are protected
234 * inside the specialization.
235 */
236 template<typename T_Api, typename T_Scope, concepts::MemoryOrder T_Order>
237 requires std::same_as<T_Api, api::Cuda>
238 struct MemoryFence::Op<T_Api, T_Scope, T_Order>
239 {
240 ALPAKA_FN_ACC constexpr void operator()(onAcc::concepts::Acc auto const&, T_Scope const, T_Order const order)
241 const
242 {
243 // Host pass is not allowed.
244# if ALPAKA_ARCH_PTX
245 if constexpr(std::is_same_v<T_Scope, scope::Block>)
246 {
247 detail::cuda_mem_fence_block(order);
248 }
249 else if constexpr(std::is_same_v<T_Scope, scope::Device>)
250 {
251 detail::cuda_mem_fence_device(order);
252 }
253 else if constexpr(std::is_same_v<T_Scope, scope::System>)
254 {
255 detail::cuda_mem_fence_system(order);
256 }
257# endif
258 }
259 };
260} // namespace alpaka::onAcc::internalCompute
261
262#endif // ALPAKA_LANG_CUDA
#define ALPAKA_FN_ACC
All functions that can be used on an accelerator have to be attributed with ALPAKA_FN_ACC or ALPAKA_F...
Definition common.hpp:30
constexpr void operator()(T_Acc const &acc, T_MemoryOrder const order, T_Scope const scope) const