alpaka
Abstraction Library for Parallel Kernel Acceleration
Loading...
Searching...
No Matches
ComputeApi.hpp
Go to the documentation of this file.
1/* Copyright 2024 Jeffrey Kelling, Rene Widera, Bernhard Manfred Gruber, René Widera
2 * SPDX-License-Identifier: MPL-2.0
3 */
4
5#pragma once
6
11#include "alpaka/tag.hpp"
12
13#include <cstddef>
14
15#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
16
17namespace alpaka::onAcc
18{
19 namespace unifiedCudaHip
20 {
21
/** Functor whose call operator issues the CUDA/HIP `__syncthreads()` intrinsic. */
22 struct Sync
23 {
// Device-only call operator: takes no arguments and returns nothing.
24 __device__ void operator()() const
25 {
// Barrier intrinsic supplied by the CUDA/HIP device runtime.
26 __syncthreads();
27 }
28 };
29
30 namespace internal
31 {
32 /** Trait providing a uniform warp size abstraction for the CUDA and HIP backends only.
33 *
34 * Use onAcc::internal::GetWarpSize to query the warp size independently of the API.
35 * The warp size must be a std::integral_constant<uint32_t,X>.
36 */
37 struct WarpSize
38 {
// `Get` is only declared here; no definition is visible in this section of the file.
// NOTE(review): presumably specialized elsewhere, once per supported device kind — confirm.
39 template<alpaka::concepts::DeviceKind T_DeviceKind>
40 struct Get;
41 };
42 } // namespace internal
43 } // namespace unifiedCudaHip
44} // namespace alpaka::onAcc
45
47{
48 template<typename T, typename T_Acc>
49 requires alpaka::concepts::UnifiedCudaHipExecutor<ALPAKA_TYPEOF(std::declval<T_Acc>()[object::exec])>
51 {
// Returns the start of the unsized `extern __shared__` byte array, reinterpreted as `T*`.
// NOTE(review): the array has no extent here; presumably it is the kernel's dynamic
// shared memory whose size is chosen at launch — confirm against the launch code.
52 __device__ decltype(auto) operator()(auto const& acc) const
53 {
// `acc` is not otherwise used in this body.
54 alpaka::unused(acc);
55 // Because unaligned access to variables is not allowed in device code,
56 // the buffer is aligned to std::max_align_t so that every type with a
57 // fundamental alignment is aligned correctly.
58 // See:
59 // - http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared
60 // - http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vector-types
61 extern __shared__ std::byte shMem alignas(std::max_align_t)[];
// The cast is a prvalue, so decltype(auto) deduces the return type `T*`.
62 return reinterpret_cast<T*>(shMem);
63 }
64 };
65
66 template<typename T, size_t T_uniqueId, typename T_Acc>
67 requires alpaka::concepts::UnifiedCudaHipExecutor<ALPAKA_TYPEOF(std::declval<T_Acc>()[object::exec])>
69 {
// Returns a reference to a `T` viewed over a function-local `__shared__` byte buffer.
// No constructor of `T` is invoked here; the raw storage is only reinterpreted.
70 __device__ decltype(auto) operator()(auto const& acc) const
71 {
// `acc` is not otherwise used in this body.
72 alpaka::unused(acc);
// Storage sized and aligned for exactly one `T`.
73 __shared__ uint8_t shMem alignas(alignof(T))[sizeof(T)];
// The dereference is an lvalue, so decltype(auto) deduces the return type `T&`.
74 return *(reinterpret_cast<T*>(shMem));
75 }
76 };
77} // namespace alpaka::onAcc::internalCompute
78
79#endif
Functionality that is usable on the accelerator compute device from within a kernel.
Definition executor.hpp:38