Warp and Subgroup Functions
A warp is a hardware-scheduled group of threads that share a common execution context and execute instructions collectively, while individual threads may be active or inactive (masked) due to control-flow divergence. Threads in a warp can exchange values via warp shuffle functions without going through shared memory. A thread block may contain multiple warps. The number of threads within a thread block is not required to be a multiple of the warp size. Threads in different warps cannot use warp shuffle functions to exchange values. A warp is always a one-dimensional group of threads, even within n-dimensional kernels.
When to Reach for Warp Functions
Use warp functions when:
you want fast communication among threads that execute in lock-step
you are implementing a reduction or prefix-style pattern inside a warp
you need ballot-style voting or lane-to-lane value exchange.
A Warp Reduction With shflDown
The following example reduces one value per lane to one value per warp.
In this example, makeIdxMap distributes the index range over warps directly (using the worker onAcc::worker::linearWarpsInGrid) instead of first mapping at the thread-block level.
struct WarpSumKernel
{
    /** Sum the input in warp-sized chunks using warp shuffles.
     *
     * Every lane loads one element of the current chunk, the lanes combine their values with
     * `shflDown`, and lane 0 stores the result for that chunk in `out`.
     *
     * `in` and `out` have to be one-dimensional; the requires clause rejects anything else.
     *
     * @param acc accelerator handle used for the warp queries and the index mapping
     * @param in one-dimensional data source providing the values to add up
     * @param out one-dimensional view receiving one result per warp-sized chunk
     */
    ALPAKA_FN_ACC void operator()(
        onAcc::concepts::Acc auto const& acc,
        concepts::IDataSource auto const& in,
        concepts::IMdSpan auto out) const
        requires(concepts::Dim<ALPAKA_TYPEOF(in), 1u> && concepts::Dim<ALPAKA_TYPEOF(out), 1u>)
    {
        auto const extents = pCast<uint32_t>(in.getExtents());
        auto const lanesPerWarp = onAcc::warp::getSize(acc);
        auto const lane = onAcc::warp::getLaneIdx(acc);

        // Partial chunks are not handled: the input length has to be a multiple of the warp size.
        ALPAKA_ASSERT_ACC((extents.x() % lanesPerWarp) == 0u);

        // Walk over the input in steps of one warp size; chunkBegin is the first index of a chunk.
        for(auto [chunkBegin] :
            onAcc::makeIdxMap(acc, onAcc::worker::linearWarpsInGrid, IdxRange{0u, extents, lanesPerWarp}))
        {
            auto sum = in[Vec{chunkBegin + lane}];

            // Halve the shuffle distance every round until it reaches zero.
            for(uint32_t delta = lanesPerWarp / 2u; delta != 0u; delta /= 2u)
            {
                sum += onAcc::warp::shflDown(acc, sum, delta);
            }

            // Only lane 0 publishes the result of its chunk.
            if(lane == 0u)
            {
                out[chunkBegin / lanesPerWarp] = sum;
            }
        }
    }
};
Important rules:
All participating threads must call the same warp intrinsic in a compatible control-flow region.
Use the actual warp size reported by the accelerator instead of hard-coding 32, which is typical for NVIDIA devices.
On host devices, the warp size can be 1. The code still compiles and runs, but the subgroup behavior is naturally trivial there.
Other warp functions:
onAcc::warp::shfl to broadcast from a chosen lane
onAcc::warp::shflUp to read from a lower lane
onAcc::warp::shflXor to read from the lane whose index is the calling lane's index XOR-ed with a mask
onAcc::warp::all and onAcc::warp::any for voting between participating warp threads
onAcc::warp::ballot for predicate masks
Complete Source File
190_warp.cpp
1/* Copyright 2026 René Widera
2 * SPDX-License-Identifier: ISC
3 */
4
5#include "docsTest.hpp"
6
7#include <alpaka/alpaka.hpp>
8
9#include <catch2/catch_template_test_macros.hpp>
10#include <catch2/catch_test_macros.hpp>
11
12#include <vector>
13
14using namespace alpaka;
15
16struct WarpSumKernel
17{
18 /** Warp kernel
19 *
20 * This kernel assumes that `in` and `out` are one-dimensional.
21 * The requires clause enforces this constraint.
22 */
23 ALPAKA_FN_ACC void operator()(
24 onAcc::concepts::Acc auto const& acc,
25 concepts::IDataSource auto const& in,
26 concepts::IMdSpan auto out) const
27 requires(concepts::Dim<ALPAKA_TYPEOF(in), 1u> && concepts::Dim<ALPAKA_TYPEOF(out), 1u>)
28 {
29 auto const warpSize = onAcc::warp::getSize(acc);
30 auto const idxInWarp = onAcc::warp::getLaneIdx(acc);
31 auto const workSize = pCast<uint32_t>(in.getExtents());
32
33 // This example requires that the work size is a multiple of the warp size.
34 ALPAKA_ASSERT_ACC((workSize.x() % warpSize) == 0u);
35
36 for(auto [blockBase] :
37 onAcc::makeIdxMap(acc, onAcc::worker::linearWarpsInGrid, IdxRange{0u, workSize, warpSize}))
38 {
39 auto value = in[Vec{blockBase + idxInWarp}];
40 for(uint32_t offset = warpSize / 2u; offset > 0u; offset /= 2u)
41 value += onAcc::warp::shflDown(acc, value, offset);
42
43 if(onAcc::warp::getLaneIdx(acc) == 0u)
44 {
45 out[blockBase / warpSize] = value;
46 }
47 }
48 }
49};
50
51
52TEMPLATE_LIST_TEST_CASE("tutorial warp shuffle reduction", "[docs]", docs::test::TestBackends)
53{
54 auto cfg = TestType::makeDict();
55 auto deviceSpec = cfg[object::deviceSpec];
56 auto exec = cfg[object::exec];
57
58 auto selector = onHost::makeDeviceSelector(deviceSpec);
59 if(!selector.isAvailable())
60 return;
61 onHost::concepts::Device auto device = selector.makeDevice(0);
62 onHost::Queue queue = device.makeQueue(queueKind::blocking);
63 auto const warpSize = device.getDeviceProperties().warpSize;
64
65 auto const blocks = 2u;
66
67 std::vector<uint32_t> hostInput(blocks * warpSize);
68 std::vector<uint32_t> hostOutput(blocks, 0u);
69 std::vector<uint32_t> expectedOutput(blocks, 0u);
70
71 for(uint32_t blockIdx = 0; blockIdx < blocks; ++blockIdx)
72 {
73 for(uint32_t laneIdx = 0; laneIdx < warpSize; ++laneIdx)
74 {
75 auto const value = blockIdx * warpSize + laneIdx + 1u;
76 hostInput[blockIdx * warpSize + laneIdx] = value;
77 expectedOutput[blockIdx] += value;
78 }
79 }
80
81 auto inputBuffer = onHost::allocLike(device, hostInput);
82 auto outputBuffer = onHost::allocLike(device, hostOutput);
83
84 onHost::memcpy(queue, inputBuffer, hostInput);
85 onHost::memset(queue, outputBuffer, 0x00);
86
87 onHost::concepts::FrameSpec auto frameSpec = onHost::FrameSpec{Vec{blocks}, Vec{warpSize}, exec};
88 queue.enqueue(frameSpec, KernelBundle{WarpSumKernel{}, inputBuffer, outputBuffer});
89
90 onHost::memcpy(queue, hostOutput, outputBuffer);
91 onHost::wait(queue);
92
93 for(uint32_t blockIdx = 0; blockIdx < blocks; ++blockIdx)
94 CHECK(hostOutput[blockIdx] == expectedOutput[blockIdx]);
95}