#pragma once

// ============================================================================
// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/alpaka.hpp ==
// ==
/* Copyright 2024 René Widera, Mehmet Yusufoglu
 * SPDX-License-Identifier: MPL-2.0
 */

// #pragma once
	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/CVec.hpp ==
	// ==
	/* Copyright 2024 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/Vec.hpp ==
		// ==
		/* Copyright 2024 René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/cast.hpp ==
			// ==
			/* Copyright 2025 René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/common.hpp ==
				// ==
				/* Copyright 2024 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, René Widera, Andrea Bocci, Aurora Perego
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/config.hpp ==
					// ==
					/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Jeffrey Kelling,
					 *                Bernhard Manfred Gruber, Jan Stephan, Mehmet Yusufoglu
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/PP.hpp ==
						// ==
						/* Copyright 2024 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						//! Paste two tokens together into a single token.
						#define ALPAKA_PP_CAT(lhs, rhs) lhs##rhs
						//! Helper: swallow the first argument and keep all remaining ones.
						#define ALPAKA_PP_REMOVE_FIRST_COMMA_DO(dropped, ...) __VA_ARGS__
						//! Strip the leading comma of an argument list, e.g. `, a, b` becomes `a, b`.
						//! A dummy `0` is put in front of the list so that the leading comma terminates the swallowed argument.
						#define ALPAKA_PP_REMOVE_FIRST_COMMA(...) ALPAKA_PP_REMOVE_FIRST_COMMA_DO(0 __VA_ARGS__)

						/** Remove one optional pair of enclosing parentheses from a macro argument.
						 *
						 * Solution from https://stackoverflow.com/a/62984543
						 *
						 * How it works: `ISHALPAKA` is placed in front of the argument. If the argument is parenthesized,
						 * `ISHALPAKA (...)` is a macro call that re-emits `ISHALPAKA` followed by the bare content. In both cases
						 * `VAN` is pasted in front, producing `VANISHALPAKA`, which expands to nothing.
						 * @{
						 */
						#define VANISHALPAKA
						#define ALPAKAESC_(...) VAN##__VA_ARGS__
						#define ALPAKAESC(...) ALPAKAESC_(__VA_ARGS__)
						#define ISHALPAKA(...) ISHALPAKA __VA_ARGS__
						#define ALPAKA_PP_REMOVE_BRACKETS_DO(arg) ALPAKAESC(ISHALPAKA arg)
						/** @} */

						//! `ALPAKA_PP_REMOVE_BRACKETS((a, b))` yields `a, b`; an argument without parentheses is passed through.
						#define ALPAKA_PP_REMOVE_BRACKETS(arg) ALPAKA_PP_REMOVE_BRACKETS_DO(arg)

						/* Version number encoding: one unsigned integer built from decimal digit groups
						 *   - 4 digits for the major version (max 9999)
						 *   - 3 digits for the minor version (max 999)
						 *   - 5 digits for the patch version (max 99999)
						 * A component outside of its range wraps around (modulo).
						 * example: version 1.2.3 -> 0001 002 00003
						 */
						#define ALPAKA_VERSION_NUMBER(major, minor, patch)                                                                    \
						    ((((major) % 10'000llu) * 100'000'000llu) + (((minor) % 1'000llu) * 100'000llu) + ((patch) % 100'000llu))

						// Value used when a feature is not present; evaluates to 0 and is therefore false in `#if`.
						#define ALPAKA_VERSION_NUMBER_NOT_AVAILABLE ALPAKA_VERSION_NUMBER(0llu, 0llu, 0llu)
						// Largest encodable value, used when a feature is present but its version is not known.
						#define ALPAKA_VERSION_NUMBER_UNKNOWN ALPAKA_VERSION_NUMBER(9999llu, 999llu, 99999llu)

						// Converters from vendor specific decimal layouts to ALPAKA_VERSION_NUMBER.
						// The macro name spells the layout: each letter is one decimal digit of the input.

						// YYYYMMDD -> (YYYY, MM, DD)
						#define ALPAKA_YYYYMMDD_TO_VERSION(date)                                                                              \
						    ALPAKA_VERSION_NUMBER((date) / 10000llu, ((date) / 100llu) % 100llu, (date) % 100llu)

						// YYYYMM -> (YYYY, MM, 0)
						#define ALPAKA_YYYYMM_TO_VERSION(date) ALPAKA_VERSION_NUMBER(((date) / 100llu) % 10000llu, (date) % 100llu, 0llu)

						// VVRRP -> (VV, RR, P)
						#define ALPAKA_VVRRP_TO_VERSION(code)                                                                                 \
						    ALPAKA_VERSION_NUMBER(((code) / 1000llu) % 10000llu, ((code) / 10llu) % 100llu, (code) % 10llu)

						// VRP -> (V, R, P); the major part may have more than one digit
						#define ALPAKA_VRP_TO_VERSION(code)                                                                                   \
						    ALPAKA_VERSION_NUMBER(((code) / 100llu) % 10000llu, ((code) / 10llu) % 10llu, (code) % 10llu)

						// VRRPP -> (V, RR, PP); the major part may have more than one digit
						#define ALPAKA_VRRPP_TO_VERSION(code)                                                                                 \
						    ALPAKA_VERSION_NUMBER(((code) / 10000llu) % 10000llu, ((code) / 100llu) % 100llu, (code) % 100llu)
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/PP.hpp ==
						// ============================================================================

						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/hipConfig.hpp ==
						// ==
						/* Copyright 2025 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/core/PP.hpp"    // amalgamate: file already inlined

						// We can not use ALPAKA_LANG_HIP because this file is required by core/config.hpp where ALPAKA_LANG_HIP is defined.
						// Derive ALPAKA_AMDGPU_ARCH from the compiler provided __gfxXXX__ macro.
						// The macro is only set here during HIP device compilation and only if it was not already defined.
						#if defined(__HIP__)

						#    include <hip/hip_version.h>

						// version numbers are only defined on the device side
						#    if !defined(ALPAKA_AMDGPU_ARCH) && defined(__HIP__) && defined(__HIP_DEVICE_COMPILE__)                           \
						        && __HIP_DEVICE_COMPILE__ == 1

						/* Map AMDGPU arch macro -> ALPAKA_VRRPP_TO_VERSION(wrapped code)
						 *  Rules:
						 *   - gfx9xy (numeric): 9xy -> 90x0y  (e.g., 908->90008, 906->90006, 942->90402)
						 *   - gfx10xy / gfx11xy: stxy -> st0x0y (e.g., 1036->100306, 1103->110003)
						 *   - Suffix: a == 10 (90a->90010), b == 11 (90b->90011), c == 12 (90c->90012)
						 *
						 * An overview of AMD GPU architectures can be found here:
						 * https://llvm.org/docs/AMDGPUUsage.html#processors
						 */

						#        if defined(__gfx1251__)
						/* RDNA 4 APU variant */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(120501)
						#        elif defined(__gfx1250__)
						/* RDNA 4 APU (APU) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(120500)
						#        elif defined(__gfx1201__)
						/* RDNA 4 dGPU (RX 9070 / RX 9070 GRE / 9070 XT) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(120001)
						#        elif defined(__gfx1200__)
						/* RDNA 4 dGPU (RX 9060 / RX 9060 XT) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(120000)

						#        elif defined(__gfx1153__)
						/* RDNA 3.5 iGPU (Medusa Point / Strix Halo successor) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110503)
						#        elif defined(__gfx1152__)
						/* RDNA 3.5 iGPU (Krackan Point) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110502)
						#        elif defined(__gfx1151__)
						/* RDNA 3.5 iGPU (Strix Halo) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110501)
						#        elif defined(__gfx1150__)
						/* RDNA 3.5 iGPU (Radeon 890M on Strix Point) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110500)

						#        elif defined(__gfx1103__)
						/* RDNA 3 APU (Radeon 780M, 760M, ROG Ally Extreme) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110003)
						#        elif defined(__gfx1102__)
						/* RDNA 3 Desktop (RX 7600 / 7600 XT) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110002)
						#        elif defined(__gfx1101__)
						/* RDNA 3 Desktop (RX 7700 / 7700 XT, Pro W7700 / V710) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110001)
						#        elif defined(__gfx1100__)
						/* RDNA 3 Desktop (RX 7900 XT, XTX, Pro W7900) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(110000)

						#        elif defined(__gfx1036__)
						/* RDNA 2 APU (Radeon Graphics 128-SP iGPU) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100306)
						#        elif defined(__gfx1035__)
						/* RDNA 2 APU (Radeon 660M, 680M) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100305)
						#        elif defined(__gfx1034__)
						/* RDNA 2 Mobile (Pro W6300/W6400, RX 6400-6500) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100304)
						#        elif defined(__gfx1033__)
						/* RDNA 2 APU (Steam Deck) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100303)
						#        elif defined(__gfx1032__)
						/* RDNA 2 Desktop (RX 6600 XT, 6650 XT/S, 6700S) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100302)
						#        elif defined(__gfx1031__)
						/* RDNA 2 Desktop (RX 6700 series, 6750/6850M XT) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100301)
						#        elif defined(__gfx1030__)
						/* RDNA 2 Desktop (RX 6800 / 6900 XT, Pro W6800) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100300)

						#        elif defined(__gfx1013__)
						/* RDNA 1 Mobile (RX 5300M / 5500M) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100103)
						#        elif defined(__gfx1012__)
						/* RDNA 1 Desktop (RX 5500 / 5500 XT) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100102)
						#        elif defined(__gfx1011__)
						/* RDNA 1 Desktop (Pro V520) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100101)
						#        elif defined(__gfx1010__)
						/* RDNA 1 Desktop (RX 5700 / 5700 XT, Pro 5600 XT/M) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(100100)

						#        elif defined(__gfx942__)
						/* CDNA 3 (Instinct MI300 series: MI300/MI300A/MI300X) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90402)
						#        elif defined(__gfx941__)
						/* CDNA 2/3 (Instinct MI210) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90401)
						#        elif defined(__gfx940__)
						/* CDNA 2 (Instinct MI200) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90400)

						#        elif defined(__gfx90c__)
						/* CDNA 1 (Renoir APUs), c -> 12 */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90012)
						#        elif defined(__gfx90b__)
						/* (If present) b -> 11 */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90011)
						#        elif defined(__gfx90a__)
						/* CDNA 2 (Instinct MI250 / MI250X), a -> 10 */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90010)
						#        elif defined(__gfx908__)
						/* CDNA 1 (Instinct MI100) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90008)
						#        elif defined(__gfx906__)
						/* Vega 20 (Radeon VII, Instinct MI50/60) */
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VRRPP_TO_VERSION(90006)

						#        else
						// No known __gfxXXX__ macro matched: warn and fall back to the "unknown" version number.
						#            warning                                                                                                  \
						                "Unknown AMDGPU architecture, please define __gfxXXX__ macro for your target. Until alpaka is updated you can define the macro ALPAKA_AMDGPU_ARCH to avoid this warning."
						#            define ALPAKA_AMDGPU_ARCH ALPAKA_VERSION_NUMBER_UNKNOWN
						#        endif

						#    endif
						#endif
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/hipConfig.hpp ==
						// ============================================================================

						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/version.hpp ==
						// ==
						/* Copyright 2025 Benjamin Worpitz, Erik Zenker, Jan Stephan, René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "core/PP.hpp"    // amalgamate: file already inlined

						//! Individual components of the alpaka library version (major.minor.patch).
						#define ALPAKA_VERSION_MAJOR 3
						#define ALPAKA_VERSION_MINOR 0
						#define ALPAKA_VERSION_PATCH 0

						//! The alpaka library version number
						//!
						//! Encoded with ALPAKA_VERSION_NUMBER, so it can be compared against other ALPAKA_VERSION_NUMBER(...) values.
						#define ALPAKA_VERSION ALPAKA_VERSION_NUMBER(ALPAKA_VERSION_MAJOR, ALPAKA_VERSION_MINOR, ALPAKA_VERSION_PATCH)
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/version.hpp ==
						// ============================================================================


					// Sanity checks for the CMake integration: compilation is aborted if the marker macro of a CMake target is
					// defined while the matching *_FINALIZE_CALLED macro is missing.

					// guard cmake target alpaka
					#ifdef ALPAKA_CMAKE_TARGET_ALPAKA
					#    ifndef ALPAKA_CMAKE_TARGET_ALPAKA_FINALIZE_CALLED
					#        error "After adding the cmake target alpaka or alpaka::alpaka you should call 'alpaka_finalize(targetName)'"
					#    endif
					#endif
					// guard cmake target alpaka::headers
					#ifdef ALPAKA_CMAKE_TARGET_HEADERS
					#    ifndef ALPAKA_CMAKE_TARGET_HEADERS_FINALIZE_CALLED
					#        error "After adding the cmake target alpaka::headers you should call 'alpaka_finalize(targetName)'"
					#    endif
					#endif
					// guard cmake target alpaka::cuda
					#ifdef ALPAKA_CMAKE_TARGET_CUDA
					#    ifndef ALPAKA_CMAKE_TARGET_CUDA_FINALIZE_CALLED
					#        error "After adding the cmake target alpaka::cuda you should call 'alpaka_finalize(targetName)'"
					#    endif
					#endif
					// guard cmake target alpaka::hip
					#ifdef ALPAKA_CMAKE_TARGET_HIP
					#    ifndef ALPAKA_CMAKE_TARGET_HIP_FINALIZE_CALLED
					#        error "After adding the cmake target alpaka::hip you should call 'alpaka_finalize(targetName)'"
					#    endif
					#endif
					// guard cmake target alpaka::oneapi
					#ifdef ALPAKA_CMAKE_TARGET_ONEAPI
					#    ifndef ALPAKA_CMAKE_TARGET_ONEAPI_FINALIZE_CALLED
					#        error "After adding the cmake target alpaka::oneapi you should call 'alpaka_finalize(targetName)'"
					#    endif
					#endif
					// guard cmake target alpaka::host
					#ifdef ALPAKA_CMAKE_TARGET_HOST
					#    ifndef ALPAKA_CMAKE_TARGET_HOST_FINALIZE_CALLED
					#        error "After adding the cmake target alpaka::host you should call 'alpaka_finalize(targetName)'"
					#    endif
					#endif

					// __INTEL_COMPILER is checked to detect the Intel Classic compiler (icpc), which is not supported anymore.
					// The diagnostic points the user to its LLVM based successor, whose driver is called icpx.
					#ifdef __INTEL_COMPILER
					#    warning                                                                                                          \
					        "The Intel Classic compiler (icpc) is no longer supported. Please upgrade to the Intel LLVM compiler (icpx)."
					#endif

					// ######## detect operating systems ########

					// Every ALPAKA_OS_* macro is 1 if the operating system was detected and 0 otherwise.
					// A value defined before this point (e.g. on the command line) is kept untouched.

					// Windows: detected via _WIN64 or __MINGW64__
					#ifndef ALPAKA_OS_WINDOWS
					#    if !defined(_WIN64) && !defined(__MINGW64__)
					#        define ALPAKA_OS_WINDOWS 0
					#    else
					#        define ALPAKA_OS_WINDOWS 1
					#    endif
					#endif


					// Linux: detected via __linux, __linux__ or __gnu_linux__
					#ifndef ALPAKA_OS_LINUX
					#    if !defined(__linux) && !defined(__linux__) && !defined(__gnu_linux__)
					#        define ALPAKA_OS_LINUX 0
					#    else
					#        define ALPAKA_OS_LINUX 1
					#    endif
					#endif

					// Apple: detected via __APPLE__
					#ifndef ALPAKA_OS_IOS
					#    ifdef __APPLE__
					#        define ALPAKA_OS_IOS 1
					#    else
					#        define ALPAKA_OS_IOS 0
					#    endif
					#endif

					// Cygwin: detected via __CYGWIN__
					#ifndef ALPAKA_OS_CYGWIN
					#    ifdef __CYGWIN__
					#        define ALPAKA_OS_CYGWIN 1
					#    else
					#        define ALPAKA_OS_CYGWIN 0
					#    endif
					#endif

					// ### architectures

					// X86 (64-bit): 1 if __x86_64__ or _M_X64 is defined, 0 otherwise. A predefined value is kept.
					#ifndef ALPAKA_ARCH_X86
					#    if !defined(__x86_64__) && !defined(_M_X64)
					#        define ALPAKA_ARCH_X86 0
					#    else
					#        define ALPAKA_ARCH_X86 1
					#    endif
					#endif

					// RISCV: 1 if __riscv is defined, 0 otherwise. A predefined value is kept.
					#ifndef ALPAKA_ARCH_RISCV
					#    ifdef __riscv
					#        define ALPAKA_ARCH_RISCV 1
					#    else
					#        define ALPAKA_ARCH_RISCV 0
					#    endif
					#endif

					// ARM (32-bit and 64-bit): 1 if any known ARM target macro is defined, 0 otherwise.
					// A predefined value is kept.
					//   - __ARM_ARCH, __arm__, __arm64, __aarch64__: GCC/clang style target macros
					//   - _M_ARM, _M_ARM64: MSVC target macros (the MSVC counterpart of the _M_X64 check used for X86)
					#if !defined(ALPAKA_ARCH_ARM)
					#    if defined(__ARM_ARCH) || defined(__arm__) || defined(__arm64) || defined(__aarch64__) || defined(_M_ARM)        \
					        || defined(_M_ARM64)
					#        define ALPAKA_ARCH_ARM 1
					#    else
					#        define ALPAKA_ARCH_ARM 0
					#    endif
					#endif

					/** NVIDIA device compile
					 *
					 * Set from __CUDA_ARCH__ if that macro is defined, otherwise ALPAKA_VERSION_NUMBER_NOT_AVAILABLE.
					 * The version on the host side will always be ALPAKA_VERSION_NUMBER_NOT_AVAILABLE.
					 * A value defined before this point is kept untouched.
					 *
					 *   Rules:
					 *   - sm75 -> ALPAKA_VERSION_NUMBER(7,5,0)
					 *   - sm91 -> ALPAKA_VERSION_NUMBER(9,1,0)
					 */
					#ifndef ALPAKA_ARCH_PTX
					#    ifndef __CUDA_ARCH__
					#        define ALPAKA_ARCH_PTX ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#    else
					#        define ALPAKA_ARCH_PTX ALPAKA_VRP_TO_VERSION(__CUDA_ARCH__)
					#    endif
					#endif

					/** HIP device compile
					 *
					 * Takes the value of ALPAKA_AMDGPU_ARCH during HIP device compilation.
					 * The version on the host side will always be ALPAKA_VERSION_NUMBER_NOT_AVAILABLE.
					 * On the device side unknown version will be set to ALPAKA_VERSION_NUMBER_UNKNOWN.
					 * A value defined before this point is kept untouched.
					 *
					 *  Rules:
					 *   - the last two digits will be handled as HEX values and support 0-9 and a-f
					 *   - gfx9xy (numeric): 9xy -> ALPAKA_VERSION_NUMBER(9,x,y)
					 *   - gfx10xy / gfx11xy: stxy -> ALPAKA_VERSION_NUMBER(st,x,y)
					 *   - Suffix: a == 10, b == 11, c == 12
					 *      - gfx90a -> ALPAKA_VERSION_NUMBER(9,0,10)
					 *      - gfx90c -> ALPAKA_VERSION_NUMBER(9,0,12)
					 */
					#ifndef ALPAKA_ARCH_AMD
					#    if defined(__HIP__) && defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1
					#        define ALPAKA_ARCH_AMD ALPAKA_AMDGPU_ARCH
					#    endif
					#endif
					// Fallback for everything that is not a HIP device compile pass.
					#ifndef ALPAKA_ARCH_AMD
					#    define ALPAKA_ARCH_AMD ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#endif

					// ######## compiler ########

					// HIP compiler detection
					// A value defined before this point is kept untouched.
					#ifndef ALPAKA_COMP_HIP
					#    ifndef __HIP__
					#        define ALPAKA_COMP_HIP ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#    else // __HIP__ is defined by hip-clang and vanilla clang in HIP mode.
					#        include <hip/hip_version.h>
					// HIP doesn't give us a patch level for the last entry, just a gitdate
					#        define ALPAKA_COMP_HIP ALPAKA_VERSION_NUMBER(HIP_VERSION_MAJOR, HIP_VERSION_MINOR, 0)
					#    endif
					#endif

					// nvcc compiler: version taken from __CUDACC_VER_MAJOR__/MINOR__/BUILD__ if __NVCC__ is defined
					#ifndef __NVCC__
					#    define ALPAKA_COMP_NVCC ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#else
					#    define ALPAKA_COMP_NVCC ALPAKA_VERSION_NUMBER(__CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__, __CUDACC_VER_BUILD__)
					#endif

					// clang compiler: version taken from __clang_major__/minor__/patchlevel__ if __clang__ is defined
					#ifndef __clang__
					#    define ALPAKA_COMP_CLANG ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#else
					#    define ALPAKA_COMP_CLANG ALPAKA_VERSION_NUMBER(__clang_major__, __clang_minor__, __clang_patchlevel__)
					#endif

					// MSVC compiler
					// _MSC_FULL_VER is laid out as VVRRPPPPP (major, minor, 5 digit build number),
					// therefore the major version is obtained by dividing by 10'000'000 (not by taking the remainder).
					#if defined(_MSC_VER)
					#    define ALPAKA_COMP_MSVC                                                                                          \
					        ALPAKA_VERSION_NUMBER((_MSC_FULL_VER) / 10'000'000, ((_MSC_FULL_VER) / 100000) % 100, (_MSC_FULL_VER) % 100000)
					#else
					#    define ALPAKA_COMP_MSVC ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#endif

					// gnu compiler (excluding compilers which emulate the gnu compiler like clang)
					// The patch level falls back to 0 if __GNUC_PATCHLEVEL__ is not provided.
					#if !defined(__GNUC__) || defined(__clang__)
					#    define ALPAKA_COMP_GNUC ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#elif defined(__GNUC_PATCHLEVEL__)
					#    define ALPAKA_COMP_GNUC ALPAKA_VERSION_NUMBER(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)
					#else
					#    define ALPAKA_COMP_GNUC ALPAKA_VERSION_NUMBER(__GNUC__, __GNUC_MINOR__, 0)
					#endif

					// IBM compiler
					// only clang based is supported
					#ifndef __ibmxl__
					#    define ALPAKA_COMP_IBM ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#else
					#    define ALPAKA_COMP_IBM ALPAKA_VERSION_NUMBER(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__)
					#endif

					// clang CUDA compiler detection
					// Currently __CUDA__ is only defined by clang when compiling CUDA code.
					#if !defined(__clang__) || !defined(__CUDA__)
					#    define ALPAKA_COMP_CLANG_CUDA ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#else
					#    define ALPAKA_COMP_CLANG_CUDA ALPAKA_VERSION_NUMBER(__clang_major__, __clang_minor__, __clang_patchlevel__)
					#endif

					// PGI and NV HPC SDK compiler detection
					#ifndef __PGI
					#    define ALPAKA_COMP_PGI ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#else
					#    define ALPAKA_COMP_PGI ALPAKA_VERSION_NUMBER(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__)
					#endif

					// Intel LLVM compiler detection
					// Requires both SYCL_LANGUAGE_VERSION and __INTEL_LLVM_COMPILER; a predefined value is kept untouched.
					#ifndef ALPAKA_COMP_ICPX
					#    if !defined(SYCL_LANGUAGE_VERSION) || !defined(__INTEL_LLVM_COMPILER)
					#        define ALPAKA_COMP_ICPX ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#    else
					// The version string for icpx 2023.1.0 is 20230100, which is decoded to ALPAKA_VERSION_NUMBER(2023, 1, 0).
					#        define ALPAKA_COMP_ICPX ALPAKA_YYYYMMDD_TO_VERSION(__INTEL_LLVM_COMPILER)
					#    endif
					#endif

					// ######## C++ language ########

					//---------------------------------------HIP-----------------------------------
					// __HIP__ is defined by both hip-clang and vanilla clang in HIP mode.
					// https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_porting_guide.md#compiler-defines-summary
					// A value defined before this point is kept untouched.
					#ifndef ALPAKA_LANG_HIP
					#    ifndef __HIP__
					#        define ALPAKA_LANG_HIP ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#    else
					#        include <hip/hip_version.h>
					// HIP doesn't give us a patch level for the last entry, just a gitdate
					#        define ALPAKA_LANG_HIP ALPAKA_VERSION_NUMBER(HIP_VERSION_MAJOR, HIP_VERSION_MINOR, 0)
					#    endif
					#endif

					// CUDA
					// ALPAKA_LANG_CUDA is derived from CUDART_VERSION if __CUDACC__ or __CUDA__ is defined, otherwise it is
					// ALPAKA_VERSION_NUMBER_NOT_AVAILABLE. A value defined before this point is kept untouched.
					#if !defined(ALPAKA_LANG_CUDA)
					#    if defined(__CUDACC__) || defined(__CUDA__)
					#        include <cuda.h>
					// ALPAKA_CUDA_ATOMIC signals that the header <cuda/atomic> was found and included.
					#        if __has_include(<cuda/atomic>)
					#            define ALPAKA_CUDA_ATOMIC
					#            include <cuda/atomic>
					#            if ALPAKA_COMP_CLANG_CUDA && defined(_Float16)
					// The -Wreserved-identifier diagnostic is silenced only around the #undef of the reserved name _Float16.
					#                pragma clang diagnostic push
					#                pragma clang diagnostic ignored "-Wreserved-identifier"
					// We see errors when using clang as the CUDA compiler if TBB is also enabled.
					// Errors occur inside TBB because the _Float16 macro is redefined and pulled in from <cuda/atomic>.
					#                undef _Float16
					#                pragma clang diagnostic pop
					#            endif
					#        endif
					// CUDA doesn't give us a patch level for the last entry, just zero.
					#        define ALPAKA_LANG_CUDA ALPAKA_VVRRP_TO_VERSION(CUDART_VERSION)
					#    else
					#        define ALPAKA_LANG_CUDA ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#    endif
					#endif

					// SYCL
					// SYCL_LANGUAGE_VERSION is a six digit number: the SYCL version followed by a two digit revision
					// (e.g. 202012), therefore it is decoded with the YYYYMM converter -> ALPAKA_VERSION_NUMBER(2020, 12, 0).
					// A value defined before this point is kept untouched.
					#if !defined(ALPAKA_LANG_SYCL)
					#    if defined(SYCL_LANGUAGE_VERSION)
					#        define ALPAKA_LANG_SYCL ALPAKA_YYYYMM_TO_VERSION(SYCL_LANGUAGE_VERSION)
					#    else
					#        define ALPAKA_LANG_SYCL ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#    endif
					#endif

					// Intel oneAPI
					// Guarded independently of ALPAKA_LANG_SYCL so that ALPAKA_LANG_ONEAPI is always defined, even if the
					// user predefines ALPAKA_LANG_SYCL, and so that a predefined ALPAKA_LANG_ONEAPI is not redefined.
					#if !defined(ALPAKA_LANG_ONEAPI)
					#    if (ALPAKA_COMP_ICPX)
					// ONE API must be detected via the ICPX compiler see
					// https://www.intel.com/content/www/us/en/docs/dpcpp-cpp-compiler/developer-guide-reference/2023-2/use-predefined-macros-to-specify-intel-compilers.html
					#        define ALPAKA_LANG_ONEAPI ALPAKA_COMP_ICPX
					#    else
					#        define ALPAKA_LANG_ONEAPI ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#    endif
					#endif

					// OpenMP
					// If the compiler defines _OPENMP, <omp.h> is included and the YYYYMM date stored in _OPENMP is decoded.
					// A value defined before this point is kept untouched.
					#ifndef ALPAKA_OMP
					#    ifdef _OPENMP
					#        include <omp.h>
					#        define ALPAKA_OMP ALPAKA_YYYYMM_TO_VERSION(_OPENMP)
					#    else
					#        define ALPAKA_OMP ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#    endif
					#endif

					// oneTBB
					// There is no compiler predefined macro like _OPENMP for oneTBB, therefore __has_include is used to find the
					// oneTBB version header. If the header is available ALPAKA_TBB is set to the real version, otherwise it
					// falls back to ALPAKA_VERSION_NUMBER_NOT_AVAILABLE. A value defined before this point is kept untouched.
					#ifndef ALPAKA_TBB
					#    // alpaka assumes that TBB can be activated for usage if the TBB headers can be found.
					#    // If CMake is not used, e.g. in compiler explorers or other build engines, the macro ALPAKA_DISABLE_TBB
					#    // must be set if the TBB headers are available but the linker flags for TBB are not passed.
					#    // This situation can occur if icpx is used, since oneAPI is mostly shipping TBB directly.
					#    if defined(__has_include) && !defined(ALPAKA_DISABLE_TBB)
					#        if __has_include(<oneapi/tbb/version.h>)
					#            include <oneapi/tbb/version.h>
					#        endif
					#    endif
					#    // The TBB headers define TBB_VERSION_*; the patch level is optional and defaults to 0.
					#    if !defined(TBB_VERSION_MAJOR)
					#        define ALPAKA_TBB ALPAKA_VERSION_NUMBER_NOT_AVAILABLE
					#    elif defined(TBB_VERSION_PATCH)
					#        define ALPAKA_TBB ALPAKA_VERSION_NUMBER(TBB_VERSION_MAJOR, TBB_VERSION_MINOR, TBB_VERSION_PATCH)
					#    else
					#        define ALPAKA_TBB ALPAKA_VERSION_NUMBER(TBB_VERSION_MAJOR, TBB_VERSION_MINOR, 0)
					#    endif
					#endif
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/config.hpp ==
					// ============================================================================


				#include <type_traits>
				#include <utility>

				#if ALPAKA_LANG_HIP
				// HIP defines some keywords like __forceinline__ in header files.
				#    include <hip/hip_runtime.h>
				#endif

				//! Function attributes for code which can be executed on an accelerator.
				//!
				//! Every function that can be used on an accelerator has to be attributed with ALPAKA_FN_ACC or
				//! ALPAKA_FN_HOST_ACC. The macros expand to the CUDA/HIP execution space qualifiers if CUDA or HIP is
				//! the active language, otherwise they expand to nothing.
				//!
				//! \code{.cpp}
				//! Usage:
				//! ALPAKA_FN_ACC
				//! auto add(std::int32_t a, std::int32_t b)
				//! -> std::int32_t;
				//! \endcode
				//! @{
				#if !ALPAKA_LANG_CUDA && !ALPAKA_LANG_HIP
				#    define ALPAKA_FN_ACC
				#    define ALPAKA_FN_HOST_ACC
				#    define ALPAKA_FN_HOST
				#else
				#    define ALPAKA_FN_ACC __device__ __host__
				#    define ALPAKA_FN_HOST_ACC __device__ __host__
				#    define ALPAKA_FN_HOST __host__
				#endif
				//! @}

				//! Attribute for accelerator functions with external linkage.
				//!
				//! All functions marked with ALPAKA_FN_ACC or ALPAKA_FN_HOST_ACC that are exported to / imported from different
				//! translation units have to be attributed with ALPAKA_FN_EXTERN. Note that this needs to be applied to both the
				//! declaration and the definition.
				//!
				//! Usage:
				//! ALPAKA_FN_ACC ALPAKA_FN_EXTERN auto add(std::int32_t a, std::int32_t b) -> std::int32_t;
				//!
				//! Warning: If this is used together with the SYCL back-end make sure that your SYCL runtime supports generic
				//! address spaces. Otherwise it is forbidden to use pointers as parameter or return type for functions marked
				//! with ALPAKA_FN_EXTERN.
				#if !ALPAKA_LANG_SYCL
				#    define ALPAKA_FN_EXTERN
				#else
				/*
				   This is required by the SYCL standard, section 5.10.1 "SYCL functions and member functions linkage":

				   The default behavior in SYCL applications is that all the definitions and declarations of the functions and member
				   functions are available to the SYCL compiler, in the same translation unit. When this is not the case, all the
				   symbols that need to be exported to a SYCL library or from a C++ library to a SYCL application need to be defined
				   using the macro: SYCL_EXTERNAL.
				*/
				#    define ALPAKA_FN_EXTERN SYCL_EXTERNAL
				#endif

				//! Disable nvcc warning:
				//! 'calling a __host__ function from __host__ __device__ function.'
				//!
				//! Expands to the `hd_warning_disable` pragma for CUDA code which is not compiled with clang
				//! (spelled `__pragma` for MSVC and `_Pragma` otherwise) and to nothing in every other case.
				//!
				//! Usage:
				//! ALPAKA_NO_HOST_ACC_WARNING
				//! ALPAKA_FN_HOST_ACC function_declaration()
				//! WARNING: Only use this method if there is no other way.
				//! Most cases can be solved by #if ALPAKA_ARCH_PTX or #if ALPAKA_LANG_CUDA.
				#if !ALPAKA_LANG_CUDA || ALPAKA_COMP_CLANG_CUDA
				#    define ALPAKA_NO_HOST_ACC_WARNING
				#elif ALPAKA_COMP_MSVC
				#    define ALPAKA_NO_HOST_ACC_WARNING __pragma(hd_warning_disable)
				#else
				#    define ALPAKA_NO_HOST_ACC_WARNING _Pragma("hd_warning_disable")
				#endif

				//! Macro defining the inline function attribute.
				//!
				//! Selects the forced inlining spelling of the active language/compiler.
				//! The macro should stay on the left hand side of keywords, e.g. 'static', 'constexpr', 'explicit' or the return type.
				#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
				#    define ALPAKA_FN_INLINE __forceinline__
				#else
				#    if !ALPAKA_COMP_MSVC
				// For gcc, clang, and clang-based compilers like Intel icpx
				#        define ALPAKA_FN_INLINE [[gnu::always_inline]] inline
				#    else
				// TODO: With C++20 [[msvc::forceinline]] can be used.
				#        define ALPAKA_FN_INLINE __forceinline
				#    endif
				#endif

				//! Declaration prefix for a variable lying in global accelerator device memory.
				//!
				//! Example:
				//!   ALPAKA_STATIC_ACC_MEM_GLOBAL alpaka::DevGlobal<TAcc, int> variable;
				//!
				//! The macro turns the declaration into a variable template with the template parameter TAcc.
				//! Those variables behave like ordinary variables when used in file-scope,
				//! but inside kernels the get() method must be used to access the variable.
				//! They are declared inline to resolve to a single instance across multiple
				//! translation units.
				//! Like ordinary variables, only one definition is allowed (ODR).
				//! Failure to do so might lead to linker errors.
				//!
				//! In contrast to ordinary variables, you can not define such variables
				//! as static compilation unit local variables with internal linkage
				//! because this is forbidden by CUDA.
				//!
				//! \attention It is not allowed to initialize the variable together with the declaration.
				//!            To initialize the variable alpaka::memcpy must be used.
				//! \code{.cpp}
				//! ALPAKA_STATIC_ACC_MEM_GLOBAL alpaka::DevGlobal<TAcc, int> foo;
				//!
				//! struct DeviceMemoryKernel
				//! {
				//!    ALPAKA_NO_HOST_ACC_WARNING
				//!    template<typename TAcc>
				//!    ALPAKA_FN_ACC void operator()(TAcc const& acc) const
				//!    {
				//!      auto a = foo<TAcc>.get();
				//!    }
				//!  }
				//!
				//! void initFoo() {
				//!     auto extent = alpaka::Vec<alpaka::DimInt<1u>, size_t>{1};
				//!     int initialValue = 42;
				//!     alpaka::ViewPlainPtr<DevHost, int, alpaka::DimInt<1u>, size_t> bufHost(&initialValue, devHost, extent);
				//!     alpaka::memcpy(queue, foo<Acc>, bufHost, extent);
				//! }
				//! \endcode
				#if (ALPAKA_LANG_CUDA && (ALPAKA_COMP_CLANG_CUDA || (ALPAKA_COMP_NVCC && ALPAKA_ARCH_PTX))) || ALPAKA_LANG_HIP
				// Device variable: `static` is used unless relocatable device code (RDC) is enabled.
				#    if !defined(__CUDACC_RDC__) && !defined(__CLANG_RDC__)
				#        define ALPAKA_STATIC_ACC_MEM_GLOBAL template<typename TAcc> __device__ static
				#    else
				#        define ALPAKA_STATIC_ACC_MEM_GLOBAL template<typename TAcc> __device__ inline
				#    endif
				#else
				// Everything else: a plain inline variable template.
				#    define ALPAKA_STATIC_ACC_MEM_GLOBAL template<typename TAcc> inline
				#endif

				/** Perfectly forward an instance as argument.
				 *
				 * Expands to `std::forward<decltype(instance)>(instance)`, i.e. the value category is taken from the
				 * declared type of `instance`. Intended for forwarding references such as `auto&&` / `T&&` parameters.
				 */
				#define ALPAKA_FORWARD(instance) std::forward<decltype(instance)>(instance)

				/** Get the decayed type of an instance or expression.
				 *
				 * Expands to `std::decay_t<decltype(...)>`: references and top-level const/volatile qualifiers are removed
				 * (arrays and functions decay to pointers). This is often required because traits are mostly defined for
				 * the plain type only.
				 */
				#define ALPAKA_TYPEOF(...) std::decay_t<decltype(__VA_ARGS__)>
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/common.hpp ==
				// ============================================================================

				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/trait.hpp ==
				// ==
				/* Copyright 2024 René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/utility.hpp ==
					// ==
					/* Copyright 2024 Benjamin Worpitz, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
					 * SPDX-License-Identifier: MPL-2.0
					 */
					// #pragma once
					// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

					#include <algorithm>
					#include <bit>
					#include <climits>
					#include <concepts>
					#include <type_traits>
					#include <utility>

					namespace alpaka
					{
					    namespace core
					    {
					        //! Convert any type to a reference type.
					        //
					        // This function is equivalent to std::declval() but can be used
					        // within an alpaka accelerator kernel too.
					        // It may only be used in unevaluated contexts such as decltype(); the accelerator
					        // variant below is a declaration only (no definition is visible here).
					        //
					        // NOTE(review): the HIP branch tests ALPAKA_COMP_HIP whereas ALPAKA_STATIC_ACC_MEM_GLOBAL
					        // tests ALPAKA_LANG_HIP - confirm that the difference is intended.
					        // Note that `&&` binds tighter than `||` in the condition below.
					#if ALPAKA_LANG_CUDA && ALPAKA_COMP_CLANG_CUDA || ALPAKA_COMP_HIP
					        template<class T>
					        ALPAKA_FN_HOST_ACC std::add_rvalue_reference_t<T> declval();
					#else
					        using std::declval;
					#endif
					    } // namespace core

					    /// Integer division of a by b, rounded up.
					    ///
					    /// Evaluated as (a + b - 1) / b; no overflow or sign handling is performed.
					    template<std::integral Integral>
					    [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto divCeil(Integral a, Integral b) -> Integral
					    {
					        // bias the numerator so that the truncating division rounds up
					        auto const biasedNumerator = a + b - Integral{1};
					        return biasedNumerator / b;
					    }

					    /// Returns max(a / b, 1) as integer.
					    ///
					    /// The quotient is clamped from below to one, so the result is never zero.
					    /// b must not be zero.
					    template<std::integral Integral>
					    [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto divExZero(Integral a, Integral b) -> Integral
					    {
					        // For types narrower than int, a / b is promoted to int. Casting back to Integral keeps both
					        // operands of the comparison the same type; std::max(a / b, Integral{1}) fails template
					        // argument deduction for such types.
					        auto const quotient = static_cast<Integral>(a / b);
					        return quotient < Integral{1} ? Integral{1} : quotient;
					    }

					    /// Raises base to the power n by repeated multiplication, in integers.
					    ///
					    /// n == 0 yields 1. No overflow detection is performed.
					    template<typename Integral, typename = std::enable_if_t<std::is_integral_v<Integral>>>
					    [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto intPow(Integral base, Integral n) -> Integral
					    {
					        if(n == 0)
					            return 1;

					        // start with base^1 and multiply until n factors are accumulated
					        Integral power = base;
					        Integral factorsApplied = 1;
					        while(factorsApplied < n)
					        {
					            power *= base;
					            factorsApplied++;
					        }
					        return power;
					    }

					    /// Computes the floor of the nth root of value, in integers.
					    ///
					    /// Binary search for the largest r in [0, value] with r^n <= value.
					    /// The search never computes value + 1 and never evaluates a power that exceeds value, therefore it
					    /// is free of integer overflow over the whole range of Integral (including the maximum value).
					    ///
					    /// @param value radicand, expected to be >= 0 (a negative value yields 0)
					    /// @param n degree of the root, expected to be >= 1 (n == 0 yields value, as r^0 == 1 <= value)
					    /// @return largest r with r^n <= value
					    template<typename Integral, typename = std::enable_if_t<std::is_integral_v<Integral>>>
					    [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr auto nthRootFloor(Integral value, Integral n) -> Integral
					    {
					        // adapted from: https://en.wikipedia.org/wiki/Integer_square_root
					        // invariant: lo^n <= value and every candidate > hi is too large
					        Integral lo = 0;
					        Integral hi = value;
					        while(lo < hi)
					        {
					            // upper midpoint, computed without forming lo + hi; always in (lo, hi], hence >= 1
					            Integral const span = hi - lo;
					            Integral const mid = lo + span / 2 + span % 2;

					            // Decide whether mid^n <= value. A multiplication is only executed if its result is known to
					            // be <= value, therefore no intermediate product can overflow.
					            bool fits = true;
					            if(n != 0)
					            {
					                Integral power = mid; // mid <= hi <= value
					                for(Integral i = 1; fits && i < n; ++i)
					                {
					                    if(power > value / mid)
					                        fits = false; // power * mid would exceed value
					                    else
					                        power *= mid;
					                }
					            }

					            if(fits)
					                lo = mid;
					            else
					                hi = mid - 1;
					        }
					        return lo;
					    }

					    /** Zero-based index of the most significant set bit of value.
					     *
					     * The value is reinterpreted as its unsigned counterpart and the leading zero bits are counted.
					     *
					     * @param value input value, presumably expected to be != 0 (for 0 the subtraction wraps around)
					     */
					    template<std::integral T>
					    inline constexpr T firstSetBit(T value)
					    {
					        using Unsigned = std::make_unsigned_t<T>;
					        auto const leadingZeros = std::countl_zero(static_cast<Unsigned>(value));
					        // index of the top-most bit of T
					        auto const topBitIdx = sizeof(T) * CHAR_BIT - 1;
					        return topBitIdx - leadingZeros;
					    }

					    /** Round down to the nearest power of two that is lower than or equal to the value.
					     *
					     * @param value input value >0
					     * @return one shifted left by the index reported by firstSetBit(value)
					     */
					    template<std::integral T>
					    inline constexpr T roundDownToPowerOfTwo(T value)
					    {
					        T const highestBit = firstSetBit(value);
					        return T{1} << highestBit;
					    }

					    /** Checks if T is an instance of the template U.
					     *
					     * @tparam T full type specialization
					     * @tparam U unspecialized template type (type template parameters only)
					     *
					     * The value is true if T is a specialization of U, false otherwise.
					     *
					     * @{
					     */
					    // primary template: T is not of the form U<...>
					    template<typename T, template<typename...> typename U>
					    inline constexpr bool isSpecializationOf_v = false;

					    // partial specialization: matches exactly the types spelled U<Vs...>
					    template<template<typename...> typename U, typename... Vs>
					    inline constexpr bool isSpecializationOf_v<U<Vs...>, U> = true;

					    /** @} */

					    namespace concepts
					    {
					        /** Satisfied if T, with references and cv-qualifiers stripped, is a specialization of the template U.
					         *
					         * @tparam T full type specialization (may be cv-qualified and/or a reference)
					         * @tparam U unspecialized template type
					         */
					        template<typename T, template<typename...> typename U>
					        concept SpecializationOf = isSpecializationOf_v<std::remove_cv_t<std::remove_reference_t<T>>, U>;
					    } // namespace concepts

					    /**
					     * @brief Integer power of base by exponent using exponentiation by squaring.
					     *
					     * base and exponent must have the same (decayed) integral type, which is also the result type.
					     * An exponent of zero yields one. No overflow detection is performed.
					     */
					    constexpr auto ipow(std::integral auto const base, std::integral auto const exponent)
					        requires std::same_as<ALPAKA_TYPEOF(base), ALPAKA_TYPEOF(exponent)>
					    {
					        using T_Res = ALPAKA_TYPEOF(base);
					        T_Res result = T_Res{1};

					        // square holds base^(2^k) for the k-th binary digit of the exponent
					        T_Res square = base;
					        T_Res remaining = exponent;
					        while(remaining != T_Res{0})
					        {
					            if(remaining % T_Res{2})
					                result *= square;

					            remaining /= T_Res{2};
					            // square only if another digit follows, so no power beyond the needed one is formed
					            if(remaining != T_Res{0})
					                square *= square;
					        }

					        return result;
					    }
					} // namespace alpaka
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/utility.hpp ==
					// ============================================================================

					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/vecConcepts.hpp ==
					// ==
					/* Copyright 2024 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
					// #include <concepts>    // amalgamate: file already included
					#include <string>
					#include <type_traits>

					namespace alpaka
					{
					    namespace concepts
					    {
					        namespace detail
					        {
					            // integral to integral:
					            // same signedness and not more value bits, or unsigned to a strictly wider signed type
					            template<typename T_From, typename T_To>
					            constexpr bool integralIntegralLossless
					                = std::is_integral_v<T_From> && std::is_integral_v<T_To>
					                  && ((std::is_signed_v<T_From> == std::is_signed_v<T_To>
					                       && std::numeric_limits<T_From>::digits <= std::numeric_limits<T_To>::digits)
					                      || (std::is_unsigned_v<T_From> && std::is_signed_v<T_To>
					                          && std::numeric_limits<T_From>::digits < std::numeric_limits<T_To>::digits));

					            //  floating-point to floating-point:
					            //  same radix, mantissa and exponent range of the source fit into the target
					            template<typename T_From, typename T_To>
					            constexpr bool floatFloatLossless
					                = std::is_floating_point_v<T_From> && std::is_floating_point_v<T_To>
					                  && std::numeric_limits<T_From>::radix == std::numeric_limits<T_To>::radix
					                  && std::numeric_limits<T_From>::digits <= std::numeric_limits<T_To>::digits
					                  && std::numeric_limits<T_From>::max_exponent <= std::numeric_limits<T_To>::max_exponent
					                  && std::numeric_limits<T_From>::min_exponent >= std::numeric_limits<T_To>::min_exponent;

					            //  integral to floating-point
					            //  numeric_limits::digits for integers excludes the sign bit
					            template<typename T_From, typename T_To>
					            constexpr bool integralFloatLossless = std::is_integral_v<T_From> && std::is_floating_point_v<T_To>
					                                                   && (std::numeric_limits<T_From>::digits + std::is_signed_v<T_From>)
					                                                          <= std::numeric_limits<T_To>::digits;
					        } // namespace detail

					        /** Concept to check if a type can be lossless converted to another type.
					         *
					         * This concept ensures that a type `T_From` can be converted to a type `T_To` without any loss of information.
					         * It checks for implicit convertibility, signedness compatibility, and precision preservation for both integer
					         * and floating-point types.
					         *
					         * @tparam T_From The source type to be converted.
					         * @tparam T_To The target type to which the source type is converted.
					         */
					        template<typename T_From, typename T_To>
					        concept LosslesslyConvertible
					            = std::convertible_to<T_From, T_To>
					              && (detail::integralIntegralLossless<T_From, T_To> || detail::floatFloatLossless<T_From, T_To>
					                  || detail::integralFloatLossless<T_From, T_To>);

					        /** Concept to check if a type is implicitly convertible to another type.
					         *
					         * The concept evaluates the *value* of `std::is_convertible_v<T_From, T_To>`. A simple requirement
					         * `requires { std::is_convertible_v<...>; }` would only test that the expression is well-formed and
					         * would therefore be satisfied for every pair of types.
					         *
					         * @tparam T_From The source type to be converted.
					         * @tparam T_To The target type to which the source type is converted.
					         */
					        template<typename T_From, typename T_To>
					        concept Convertible = std::is_convertible_v<T_From, T_To>;
					    } // namespace concepts
					} // namespace alpaka
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/vecConcepts.hpp ==
					// ============================================================================


				// #include <concepts>    // amalgamate: file already included
				#include <cstdint>
				#include <limits>

				namespace alpaka
				{
				    /** This type is used in cases where a template type parameter is not required and can optionally be passed to a
				     * trait or concept.
				     *
				     * It is an empty tag type, e.g. used as default for value-type parameters of concepts.
				     */
				    struct NotRequired
				    {
				    };

				    // Sentinel for "no dimensionality requested": the largest uint32_t value.
				    constexpr uint32_t notRequiredDim = std::numeric_limits<uint32_t>::max();
				    // Sentinel for "no width requested"; shares the value of notRequiredDim.
				    constexpr uint32_t notRequiredWidth = notRequiredDim;

				    namespace trait
				    {
				        /** Dimensionality of a type.
				         *
				         * The primary template queries the static member function `T::dim()`.
				         */
				        template<typename T>
				        struct GetDim
				        {
				            static constexpr uint32_t value = T::dim();
				        };

				        /** Integral (scalar) types are treated as one-dimensional. */
				        template<std::integral T>
				        struct GetDim<T>
				        {
				            static constexpr uint32_t value = 1u;
				        };

				        /** Shortcut for `GetDim<T>::value`. */
				        template<typename T>
				        constexpr uint32_t getDim_v = GetDim<T>::value;

				        /** Value type of a type.
				         *
				         * The primary template exposes the nested alias `T::value_type`.
				         */
				        template<typename T>
				        struct GetValueType
				        {
				            using type = typename T::value_type;
				        };

				        /** Fundamental types are their own value type. */
				        template<typename T>
				        requires(std::is_fundamental_v<T>)
				        struct GetValueType<T>
				        {
				            using type = T;
				        };

				        // resolve handles: types exposing `element_type` forward to the value type of that element type
				        template<typename T>
				        requires requires() { typename T::element_type; }
				        struct GetValueType<T>
				        {
				            using type = typename GetValueType<typename T::element_type>::type;
				        };

				        /** Shortcut for `typename GetValueType<T>::type`. */
				        template<typename T>
				        using GetValueType_t = typename GetValueType<T>::type;

				        /** Check if a type used as kernel argument is trivially copyable
				         *
				         * @attention In case this trait is specialized for a user type, the user should be sure that the result of
				         * calling the copy constructor is equivalent to using memcpy to duplicate the object. An existing destructor
				         * must be free of side effects.
				         *
				         * It is implementation defined whether the closure type of a lambda is trivially copyable.
				         * Therefore, the default implementation is true for trivially copyable or empty (stateless) types.
				         *
				         * @tparam T type to check
				         *
				         * The second, defaulted template parameter is unused by the primary template; presumably it serves as a
				         * hook for SFINAE-based specializations.
				         */
				        template<typename T, typename = void>
				        struct IsKernelArgumentTriviallyCopyable
				            : std::bool_constant<std::is_empty_v<T> || std::is_trivially_copyable_v<T>>
				        {
				        };

				        /** Check if the kernel type is trivially copyable
				         *
				         * @attention In case this trait is specialized for a user type, the user should be sure that the result of
				         * calling the copy constructor is equivalent to using memcpy to duplicate the object. An existing destructor
				         * must be free of side effects.
				         *
				         * The default implementation is true for trivially copyable types. When compiling CUDA code with nvcc it
				         * is additionally true for the closure types of extended device and extended host-device lambdas.
				         *
				         * @tparam T type to check
				         */
				        template<typename T, typename = void>
				        struct IsKernelTriviallyCopyable
				#if ALPAKA_LANG_CUDA && ALPAKA_COMP_NVCC
				            : std::bool_constant<
				                  std::is_trivially_copyable_v<T> || __nv_is_extended_device_lambda_closure_type(T)
				                  || __nv_is_extended_host_device_lambda_closure_type(T)>
				#else
				            : std::is_trivially_copyable<T>
				#endif
				        {
				        };
				    } // namespace trait

				    /** Shortcut for `trait::IsKernelArgumentTriviallyCopyable<T>::value`. */
				    template<typename T>
				    inline constexpr bool isKernelArgumentTriviallyCopyable_v = trait::IsKernelArgumentTriviallyCopyable<T>::value;

				    /** Shortcut for `trait::IsKernelTriviallyCopyable<T>::value`. */
				    template<typename T>
				    inline constexpr bool isKernelTriviallyCopyable_v = trait::IsKernelTriviallyCopyable<T>::value;

				    /** Dimensionality of the type of the given object.
				     *
				     * Only the type of the argument is inspected (via trait::getDim_v); the object itself is not read.
				     *
				     * @param any object whose type is queried
				     * @return value of trait::getDim_v<T>
				     */
				    template<typename T>
				    [[nodiscard]] consteval auto getDim([[maybe_unused]] T const& any) -> uint32_t
				    {
				        constexpr uint32_t dimension = trait::getDim_v<T>;
				        return dimension;
				    }

				    /** Boolean form of concepts::LosslesslyConvertible<T_From, T_To>. */
				    template<typename T_From, typename T_To>
				    constexpr bool isLosslesslyConvertible_v = concepts::LosslesslyConvertible<T_From, T_To>;

				    /** Boolean form of concepts::Convertible<T_From, T_To>. */
				    template<typename T_From, typename T_To>
				    constexpr bool isConvertible_v = concepts::Convertible<T_From, T_To>;

				    namespace concepts
				    {
				        /** @brief Concept to check for a kernel function object
				         *
				         * @details
				         * The kernel function object must be trivially copyable.
				         * A type is accepted if trait::IsKernelTriviallyCopyable holds for it (this is the trait dedicated to
				         * kernel types; it covers CUDA extended lambdas and user specializations) or if it fulfils the kernel
				         * argument requirement trait::IsKernelArgumentTriviallyCopyable (trivially copyable or empty).
				         */
				        template<typename T>
				        concept KernelFn = isKernelTriviallyCopyable_v<T> || isKernelArgumentTriviallyCopyable_v<T>;

				        /** @brief Concept to check for a kernel argument object
				         *
				         * @details
				         * A kernel call requires that its arguments are trivially copyable, which this concept requires.
				         * The check is delegated to trait::IsKernelArgumentTriviallyCopyable.
				         */
				        template<typename T>
				        concept KernelArg = isKernelArgumentTriviallyCopyable_v<T>;
				    } // namespace concepts
				} // namespace alpaka
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/trait.hpp ==
				// ============================================================================


			namespace alpaka
			{
			    namespace internal
			    {
			        /** Tag type bundling the functor used by pCast().
			         *
			         * `Op<T_To, T_Input>` is the customization point: its call operator receives the input and returns the
			         * converted object. The primary template only declares the call operator; no definition is visible
			         * here, so implementations are presumably provided by specializations for concrete input types.
			         */
			        struct PCast
			        {
			            template<typename T_To, typename T_Input>
			            struct Op
			            {
			                decltype(auto) operator()(auto&& any) const;
			            };
			        };

			        /** Tag type bundling the functor used by lpCast().
			         *
			         * The default `Op<T_To, T_Input>` simply delegates to `PCast::Op<T_To, T_Input>`; the lossless
			         * requirement itself is enforced by the constraint on lpCast().
			         */
			        struct LPCast
			        {
			            template<typename T_To, typename T_Input>
			            struct Op
			            {
			                // constexpr: lpCast() is declared constexpr, without it the default path could never be
			                // evaluated at compile time even if the PCast operation is constexpr
			                constexpr decltype(auto) operator()(auto&& any) const
			                {
			                    return PCast::Op<T_To, T_Input>{}(any);
			                }
			            };
			        };
			    } // namespace internal

			    /** Precision cast: converts the storage type of a combined data type.
			     *
			     * @code
			     * alpaka::Vec<float, 4> foo{0.f, 0.f, 0.f, 0.f};
			     * alpaka::Vec<int32_t, 4> bar = alpaka::pCast<int32_t>(foo);
			     * @endcode
			     *
			     * The conversion itself is performed by `internal::PCast::Op<T_To, InputType>`.
			     *
			     * @tparam T_To The target type to which the input is cast.
			     * @param input The input value to be cast. Its nested `type` alias must satisfy
			     *              `isConvertible_v<type, T_To>`.
			     * @return result of the cast operation, i.e. the input with exchanged storage type
			     */
			    template<typename T_To>
			    constexpr decltype(auto) pCast(auto&& input) requires(isConvertible_v<typename ALPAKA_TYPEOF(input)::type, T_To>)
			    {
			        using InputType = ALPAKA_TYPEOF(input);
			        using CastOp = internal::PCast::Op<T_To, InputType>;
			        return CastOp{}(input);
			    }

			    /** Lossless precision cast: converts the storage type of a combined data type.
			     *
			     * In contrast to pCast() the call is only valid if the nested `type` alias of the input is losslessly
			     * convertible to the target type `T_To`.
			     *
			     * @code
			     * alpaka::Vec<float, 4> foo{0.f, 0.f, 0.f, 0.f};
			     * // Invalid, loss of precision due to conversion from float to int
			     * // alpaka::Vec<int32_t, 4> bar = alpaka::lpCast<int32_t>(foo);
			     * alpaka::Vec<double, 4> bar = alpaka::lpCast<double>(foo);
			     * @endcode
			     *
			     * The conversion itself is performed by `internal::LPCast::Op<T_To, InputType>`.
			     *
			     * @tparam T_To The target type to which the input is cast.
			     * @param input The input value to be cast. Its nested `type` alias must satisfy
			     *              `isLosslesslyConvertible_v<type, T_To>`.
			     * @return result of the cast operation, i.e. the input with exchanged storage type
			     */
			    template<typename T_To>
			    constexpr decltype(auto) lpCast(auto&& input)
			        requires(isLosslesslyConvertible_v<typename ALPAKA_TYPEOF(input)::type, T_To>)
			    {
			        using InputType = ALPAKA_TYPEOF(input);
			        using CastOp = internal::LPCast::Op<T_To, InputType>;
			        return CastOp{}(input);
			    }
			} // namespace alpaka
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/cast.hpp ==
			// ============================================================================

		// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/util.hpp ==
			// ==
			/* Copyright 2024 René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

			#include <cstdio>
			#include <tuple>
			// #include <utility>    // amalgamate: file already included

			namespace alpaka
			{
			    /** Remove a std::reference_wrapper around a value.
			     *
			     * For a std::reference_wrapper<U> argument the result is U& (the wrapped object); for every other
			     * argument the result is a decayed copy of the argument.
			     *
			     * @param value object which is optionally wrapped into a std::reference_wrapper
			     */
			    template<typename T>
			    constexpr decltype(auto) unWrapp(T&& value)
			    {
			        // U& for reference_wrapper<U>, otherwise the decayed argument type
			        using Unwrapped = std::unwrap_reference_t<std::decay_t<T>>;
			        return Unwrapped(std::forward<T>(value));
			    }

			    /** Pointer type T with the volatile qualifier stripped from the pointee. */
			    template<typename T>
			    using RemoveVolatileFromPointer_t = std::add_pointer_t<std::remove_volatile_t<std::remove_pointer_t<T>>>;

			    /**
			     * @brief Convert a pointer, whose pointee may be volatile, to `void*` or `void const*`.
			     *
			     * The volatile qualifier of the pointee is dropped, constness of the pointee is preserved.
			     * Useful for freeing the memory.
			     *
			     * @tparam T The type of the given pointer, must be a pointer type.
			     * @param inPtr The pointer to convert.
			     * @return `void const*` if the pointee is const, `void*` otherwise
			     */
			    template<typename T>
			    auto* toVoidPtr(T inPtr)
			    {
			        static_assert(std::is_pointer_v<T>);
			        using Pointee = std::remove_pointer_t<T>;
			        using NonVolatilePtr = RemoveVolatileFromPointer_t<T>;
			        using Result = std::conditional_t<std::is_const_v<Pointee>, void const*, void*>;

			        // first drop volatile, then erase the pointee type
			        auto const nonVolatile = const_cast<NonVolatilePtr>(inPtr);
			        return reinterpret_cast<Result>(nonVolatile);
			    }
			} // namespace alpaka
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/util.hpp ==
			// ============================================================================

		// #include "alpaka/trait.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/unused.hpp ==
			// ==
			/* Copyright 2026 René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			namespace alpaka
			{
			    /** Utility to mark variables as unused to avoid compiler warnings
			     *
			     * Using `[[maybe_unused]]` on function parameters makes an interface long. Sometimes only the type of an
			     * argument is used within a function and not the instance itself; passing such an argument to this
			     * function keeps the function interface clean and readable.
			     *
			     * The function accepts any number of arguments and does nothing with them.
			     */
			    inline constexpr void unused([[maybe_unused]] auto&&... values)
			    {
			    }
			} // namespace alpaka
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/unused.hpp ==
			// ============================================================================


		#include <array>
		// #include <concepts>    // amalgamate: file already included
		// #include <cstdint>    // amalgamate: file already included
		#include <iosfwd>
		#include <ranges>
		#include <sstream>
		// #include <string>    // amalgamate: file already included
		#include <type_traits>
		// #include <utility>    // amalgamate: file already included

		namespace alpaka
		{
		    namespace trait
		    {
		        /** Marks a type as vector; false by default, to be specialized for vector types. */
		        template<typename T>
		        struct IsVector : std::false_type
		        {
		        };

		        /** Marks a type as CVector; false by default, to be specialized for such types. */
		        template<typename T>
		        struct IsCVector : std::false_type
		        {
		        };
		    } // namespace trait

		    /** Shortcut for `trait::IsVector<T>::value`. */
		    template<typename T>
		    constexpr bool isVector_v = trait::IsVector<T>::value;

		    /** Shortcut for `trait::IsCVector<T>::value`. */
		    template<typename T>
		    constexpr bool isCVector_v = trait::IsCVector<T>::value;

		    namespace concepts
		    {

		        /** Concept to check if a type is a vector
		         *
		         * @tparam T Type to check
		         * @tparam T_ValueType enforce a value type of the vector, if not provided the value type is not checked
		         * @tparam T_dim enforce a dimensionality of the vector, if not provided the value is not checked
		         *
		         * Note that isVector_v is queried for T as given, whereas the value type is looked up for the decayed T.
		         */
		        template<typename T, typename T_ValueType = alpaka::NotRequired, uint32_t T_dim = alpaka::notRequiredDim>
		        concept Vector = isVector_v<T>
		                         && (std::same_as<T_ValueType, trait::GetValueType_t<std::decay_t<T>>>
		                             || std::same_as<T_ValueType, alpaka::NotRequired>)
		                         && ((T_dim == alpaka::notRequiredDim) || (T::dim() == T_dim));

		        /** Concept to check if a type is a vector or scalar variable
		         *
		         * Scalars are integral or floating-point types.
		         *
		         * @tparam T Type to check
		         * @tparam T_ValueType enforce a value type of T, if not provided the value type is not checked
		         */
		        template<typename T, typename T_ValueType = alpaka::NotRequired>
		        concept VectorOrScalar = (isVector_v<T> || std::integral<T> || std::floating_point<T>)
		                                 && (std::same_as<T_ValueType, trait::GetValueType_t<std::decay_t<T>>>
		                                     || std::same_as<T_ValueType, alpaka::NotRequired>);

		        /** Concept to check if a type is a CVector
		         *
		         * @details
		         * Checks whether the given type is a CVector. For more information, refer to the implementation alpaka::CVec.
		         *
		         * @tparam T Type to check
		         * @tparam T_ValueType enforce a value type of T, if not provided the value type is not checked
		         */
		        template<typename T, typename T_ValueType = alpaka::NotRequired>
		        concept CVector = isCVector_v<T>
		                          && (std::same_as<T_ValueType, trait::GetValueType_t<std::decay_t<T>>>
		                              || std::same_as<T_ValueType, alpaka::NotRequired>);

		        /** Concept to check if a type is a vector or a specific other type
		         *
		         * @tparam T Type to check
		         * @tparam T_RequiredComponent enforce that T is a vector or a specific other type
		         */
		        template<typename T, typename T_RequiredComponent>
		        concept TypeOrVector = (isVector_v<T> || std::is_same_v<T, T_RequiredComponent>);

		        /** Concept to check if a type is a vector or implicitly convertible to a specific other type
		         *
		         * @tparam T Type to check
		         * @tparam T_RequiredComponent type T must be convertible to if it is not a vector
		         */
		        template<typename T, typename T_RequiredComponent>
		        concept VectorOrConvertibleType = (isVector_v<T> || std::is_convertible_v<T, T_RequiredComponent>);
		    } // namespace concepts

		    /** Array storage for vector data
		     *
		     * This class is a workaround and is simply wrapping std::array. It is required because the dim in std::array
		     * in the template signature is size_t. This produces template deduction issues for math::Vec if we use
		     * array as default storage without this wrapper.
		     *
		     * The std::array base is inherited protected; only operator[] is re-exported.
		     *
		     * @tparam T_Type element type
		     * @tparam T_dim number of elements
		     */
		    template<typename T_Type, uint32_t T_dim>
		    struct ArrayStorage : protected std::array<T_Type, T_dim>
		    {
		        using type = T_Type;
		        using BaseType = std::array<T_Type, T_dim>;
		        using BaseType::operator[];

		        // constructor is required because exposing the array constructors does not work
		        // all arguments are forwarded into the brace initialization of the underlying std::array
		        template<typename... T_Args>
		        constexpr ArrayStorage(T_Args&&... args) : BaseType{std::forward<T_Args>(args)...}
		        {
		        }

		        // copy the content of an existing std::array
		        constexpr ArrayStorage(std::array<T_Type, T_dim> const& data) : BaseType{data}
		        {
		        }
		    };

		    namespace detail
		    {
		        /** Storage which keeps all component values in the template signature.
		         *
		         * The type has no data members; the values are the non-type template arguments T_values.
		         *
		         * @tparam T component type
		         * @tparam T_values value of each component
		         */
		        template<typename T, T... T_values>
		        struct CVec
		        {
		            using type = T;

		            /** Number of components. */
		            static consteval uint32_t dim()
		            {
		                return sizeof...(T_values);
		            }

		            /** Read access to the component with the given index.
		             *
		             * An index that matches none of the components 1 .. dim()-1 yields the first component, i.e. an
		             * out-of-range index is not detected.
		             */
		            constexpr T operator[](std::integral auto const idx) const
		            {
		                // default initializes with first value
		                T result = std::get<0>(std::forward_as_tuple(T_values...));

		                if constexpr(dim() > 1u)
		                {
		                    // Fold over the remaining values: the first lambda parameter swallows component 0, `i`
		                    // counts the component index and the `||` fold stops at the first matching index.
		                    [[maybe_unused]] bool _ = std::apply(
		                        [idx, &result](auto&&, auto&&... values) constexpr
		                        {
		                            using IdxType = ALPAKA_TYPEOF(idx);
		                            IdxType i{1u};
		                            return ((idx == i++ && (result = values, true)) || ...);
		                        },
		                        std::forward_as_tuple(T_values...));
		                }
		                return result;
		            }

		            /** Create a CVec with the same number of components where each component is T_value. */
		            template<T T_value>
		            static constexpr auto fill()
		            {
		                using IotaSeq = std::make_integer_sequence<T, dim()>;
		                return integerSequenceToCVec(IotaSeq{}, [](auto&&) constexpr { return T_value; });
		            }

		        private:
		            /** Build a CVec whose components are the results of applying op to each index of the sequence. */
		            template<T... T_indices>
		            static constexpr auto integerSequenceToCVec(
		                std::integer_sequence<T, T_indices...>,
		                auto const op = std::identity{})
		            {
		                return CVec<T, op(T_indices)...>{};
		            }
		        };

		        /** Trait to detect storage types which hold their state in the template signature.
		         *
		         * False for all types except specializations of CVec.
		         */
		        template<typename T>
		        struct TemplateSignatureStorage : std::false_type
		        {
		        };

		        /** CVec keeps its values as template arguments. */
		        template<typename T_Type, T_Type... T_values>
		        struct TemplateSignatureStorage<CVec<T_Type, T_values...>> : std::true_type
		        {
		        };

		        /** Shortcut for `TemplateSignatureStorage<T>::value`. */
		        template<typename T>
		        constexpr bool TemplateSignatureStorage_v = TemplateSignatureStorage<T>::value;
		    } // namespace detail

		    template<typename T_Type, uint32_t T_dim, typename T_Storage = ArrayStorage<T_Type, T_dim>>
		    struct Vec : private T_Storage
		    {
		        using Storage = T_Storage;
		        using type = T_Type;
		        using ParamType = type;

		        using index_type = uint32_t;
		        using size_type = uint32_t;
		        using rank_type = uint32_t;

		        // universal vec used as fallback if T_Storage is holding the state in the template signature
		        using UniVec = Vec<T_Type, T_dim>;

		        /*Vecs without elements are not allowed*/
		        static_assert(T_dim > 0u);

		        constexpr Vec() = default;

		        /** Initialize via a generator expression
		         *
		         * The generator must return the value for the corresponding index of the component which is passed to the
		         * generator.
		         */
		        template<
		            typename F,
		            std::enable_if_t<std::is_invocable_v<F, std::integral_constant<uint32_t, 0u>>, uint32_t> = 0u>
		        constexpr explicit Vec(F&& generator)
		            : Vec(std::forward<F>(generator), std::make_integer_sequence<uint32_t, T_dim>{})
		        {
		        }

		    private:
		        template<typename F, uint32_t... Is>
		        constexpr explicit Vec(F&& generator, std::integer_sequence<uint32_t, Is...>)
		            : Storage{generator(std::integral_constant<uint32_t, Is>{})...}
		        {
		        }

		    public:
		        /** Constructor for N-dimensional vector
		         *
		         * @attention This constructor allows implicit casts.
		         *
		         * @param args value of each dimension, x,y,z,...
		         *
		         * A constexpr vector should be initialized with {} instead of () because at least
		         * CUDA 11.6 has problems in cases where a compile time evaluation is required.
		         * @code{.cpp}
		         *   constexpr auto vec1 = Vec{ 1 };
		         *   constexpr auto vec2 = Vec{ 1, 2 };
		         *   //or explicit
		         *   constexpr auto vec3 = Vec<int, 3u>{ 1, 2, 3 };
		         *   constexpr auto vec4 = Vec<int, 3u>{ {1, 2, 3} };
		         * @endcode
		         */
		        template<typename... T_Args>
		        requires(std::is_convertible_v<T_Args, T_Type> && ...)
		        constexpr Vec(T_Args const&... args) : Storage(static_cast<T_Type>(args)...)
		        {
		        }

		        constexpr Vec(Vec const& other) = default;

		        constexpr Vec(T_Storage const& other) : T_Storage{other}
		        {
		        }

		        /** constructor allows changing the storage policy
		         */
		        template<typename T_OtherStorage>
		        constexpr Vec(Vec<T_Type, T_dim, T_OtherStorage> const& other)
		            : Vec([&](uint32_t const i) constexpr { return other[i]; })
		        {
		        }

		        /** Allow static_cast / explicit cast to member type for 1D vector */
		        template<uint32_t T_deferDim = T_dim, typename = typename std::enable_if<T_deferDim == 1u>::type>
		        constexpr explicit operator type()
		        {
		            return (*this)[0];
		        }

		        static consteval uint32_t dim()
		        {
		            return T_dim;
		        }

		        /**
		         * Creates a Vec where all dimensions are set to the same value
		         *
		         * @param value Value which is set for all dimensions
		         * @return new Vec<...>
		         */
		        static constexpr auto fill(concepts::Convertible<T_Type> auto const& value)
		        {
		            if constexpr(requires { detail::TemplateSignatureStorage_v<T_Storage>; })
		            {
		                return UniVec([=](uint32_t const) { return static_cast<T_Type>(value); });
		            }
		            else
		            {
		                return Vec([=](uint32_t const) { return static_cast<T_Type>(value); });
		            }
		        }

		        template<auto T_v>
		        requires(isConvertible_v<ALPAKA_TYPEOF(T_v), T_Type>)
		        static constexpr auto fill() requires requires { T_Storage::template fill<T_v>(); }
		        {
		            return Vec<T_Type, T_dim, ALPAKA_TYPEOF(T_Storage::template fill<static_cast<T_Type>(T_v)>())>{};
		        }

		        constexpr Vec toRT() const

		        {
		            return *this;
		        }

		        constexpr Vec revert() const
		        {
		            Vec invertedVec{};
		            for(uint32_t i = 0u; i < T_dim; i++)
		                invertedVec[T_dim - 1 - i] = (*this)[i];

		            return invertedVec;
		        }

		        /** Compiler generated copy and move assignment for vectors of exactly the same type. */
		        constexpr Vec& operator=(Vec const&) = default;
		        constexpr Vec& operator=(Vec&&) = default;

		        /** Unary minus.
		         *
		         * @return new vector where each component is the negated component of this vector
		         */
		        constexpr Vec operator-() const
		        {
		            return Vec([this](uint32_t const i) constexpr { return -(*this)[i]; });
		        }

		/* Generates two member assignment operators for the operator token `op`:
		 *  - one taking a Vec with the same value type and dimension but an arbitrary storage policy,
		 *  - one taking a scalar which fulfills concepts::LosslesslyConvertible<T_Type>.
		 * Both apply `op` component-wise and return *this. If the expression `unWrapp((*this)[i]) op rhs` is valid
		 * the operation is applied to the result of unWrapp(), otherwise directly to the component.
		 * NOTE(review): unWrapp() is defined outside of this section, presumably it unwraps reference wrappers.
		 */
		#define ALPAKA_VECTOR_ASSIGN_OP(op)                                                                                   \
		    template<typename T_OtherStorage>                                                                                 \
		    constexpr Vec& operator op(Vec<T_Type, T_dim, T_OtherStorage> const& rhs)                                         \
		    {                                                                                                                 \
		        for(uint32_t i = 0u; i < T_dim; i++)                                                                          \
		        {                                                                                                             \
		            if constexpr(requires { unWrapp((*this)[i]) op rhs[i]; })                                                 \
		            {                                                                                                         \
		                unWrapp((*this)[i]) op rhs[i];                                                                        \
		            }                                                                                                         \
		            else                                                                                                      \
		            {                                                                                                         \
		                (*this)[i] op rhs[i];                                                                                 \
		            }                                                                                                         \
		        }                                                                                                             \
		        return *this;                                                                                                 \
		    }                                                                                                                 \
		    constexpr Vec& operator op(concepts::LosslesslyConvertible<T_Type> auto const value)                              \
		    {                                                                                                                 \
		        for(uint32_t i = 0u; i < T_dim; i++)                                                                          \
		        {                                                                                                             \
		            if constexpr(requires { unWrapp((*this)[i]) op value; })                                                  \
		            {                                                                                                         \
		                unWrapp((*this)[i]) op value;                                                                         \
		            }                                                                                                         \
		            else                                                                                                      \
		            {                                                                                                         \
		                (*this)[i] op value;                                                                                  \
		            }                                                                                                         \
		        }                                                                                                             \
		        return *this;                                                                                                 \
		    }

		        /** component-wise assignment operators
		         *
		         * Each operator exists for a vector (any storage policy) and for a scalar right hand side.
		         * @{
		         */
		        ALPAKA_VECTOR_ASSIGN_OP(+=)
		        ALPAKA_VECTOR_ASSIGN_OP(-=)
		        ALPAKA_VECTOR_ASSIGN_OP(/=)
		        ALPAKA_VECTOR_ASSIGN_OP(*=)
		        ALPAKA_VECTOR_ASSIGN_OP(=)
		        /** @} */

		#undef ALPAKA_VECTOR_ASSIGN_OP

		        /** Access a component by index (mutable overload).
		         *
		         * Forwards to the operator[] of the storage and returns exactly what the storage returns.
		         *
		         * @param idx index of the component
		         */
		        constexpr decltype(auto) operator[](std::integral auto const idx)
		        {
		            return Storage::operator[](idx);
		        }

		        /** Access a component by index (const overload), forwards to the operator[] of the storage.
		         *
		         * @param idx index of the component
		         */
		        constexpr decltype(auto) operator[](std::integral auto const idx) const
		        {
		            return Storage::operator[](idx);
		        }

		/* Generates a mutable and a const accessor `functionName()` returning the component at index `indexPos`. */
		#define ALPAKA_NAMED_ARRAY_ACCESS(functionName, indexPos)                                                             \
		    /* `indexPos` is an unsigned expression such as `T_dim - 2u`. If the accessor must not exist for the              \
		     * current dimension (e.g. `y()` with `T_dim` equal to 1) the subtraction wraps around to a value which is        \
		     * much bigger than `T_dim`. Therefore the `requires` clause is sufficient to disable the accessor                \
		     * in this case. */                                                                                               \
		    constexpr decltype(auto) functionName() requires((indexPos) < T_dim)                                              \
		    {                                                                                                                 \
		        return (*this)[indexPos];                                                                                     \
		    }                                                                                                                 \
		    constexpr decltype(auto) functionName() const requires((indexPos) < T_dim)                                        \
		    {                                                                                                                 \
		        return (*this)[indexPos];                                                                                     \
		    }

		        /** named member access
		         *
		         * The names count from the back of the vector: x() is the last component (index T_dim - 1), y() the
		         * second to last, z() the third to last and w() the fourth to last.
		         * Example for a 4-dimensional vector: index -> name [0->w, 1->z, 2->y, 3->x]
		         * An accessor is only available if its index is valid for T_dim.
		         * @{
		         */
		        ALPAKA_NAMED_ARRAY_ACCESS(x, T_dim - 1u)
		        ALPAKA_NAMED_ARRAY_ACCESS(y, T_dim - 2u)
		        ALPAKA_NAMED_ARRAY_ACCESS(z, T_dim - 3u)
		        ALPAKA_NAMED_ARRAY_ACCESS(w, T_dim - 4u)
		        /** @} */

		#undef ALPAKA_NAMED_ARRAY_ACCESS

		        /** Access the last component (index T_dim - 1), mutable overload. */
		        constexpr decltype(auto) back()
		        {
		            return (*this)[T_dim - 1u];
		        }

		        /** Access the last component (index T_dim - 1), const overload. */
		        constexpr decltype(auto) back() const
		        {
		            return (*this)[T_dim - 1u];
		        }

		        /** Shrink the number of elements of a vector.
		         *
		         * The components with the highest indices are kept, their order is unchanged.
		         *
		         * @tparam T_numElements New dimension of the vector, must not be larger than T_dim.
		         * @return Last T_numElements elements of the origin vector
		         */
		        template<uint32_t T_numElements>
		        constexpr Vec<T_Type, T_numElements> rshrink() const
		        {
		            static_assert(T_numElements <= T_dim);
		            // index of the first component which is kept
		            constexpr uint32_t firstKept = T_dim - T_numElements;
		            Vec<T_Type, T_numElements> tail{};
		            for(uint32_t idx = 0u; idx < T_numElements; ++idx)
		            {
		                tail[idx] = (*this)[firstKept + idx];
		            }
		            return tail;
		        }

		        /** Shrink the vector by one element
		         *
		         * Drops the last component, only available for vectors with more than one component.
		         *
		         * @return vector with the first T_dim - 1 components of this vector
		         */
		        constexpr Vec<T_Type, T_dim - 1u> eraseBack() const requires(T_dim > 1u)
		        {
		            constexpr auto numKept = T_dim - 1u;
		            Vec<T_Type, numKept> shortened{};
		            for(uint32_t idx = 0u; idx < numKept; ++idx)
		            {
		                shortened[idx] = (*this)[idx];
		            }
		            return shortened;
		        }

		        /** Shrink the number of elements of a vector.
		         *
		         * The elements are collected backwards, starting at startIdx. The result keeps the element order of
		         * the origin vector.
		         *
		         * @tparam T_numElements New dimension of the vector, must not be larger than T_dim.
		         * @param startIdx Index within the origin vector which will be the last element in the result.
		         *        The index is expected to be non-negative.
		         * @return T_numElements elements of the origin vector ending with the index startIdx.
		         *         Indexing will wrap around when the begin of the origin vector is reached.
		         */
		        template<uint32_t T_numElements>
		        constexpr Vec<type, T_numElements> rshrink(std::integral auto const startIdx) const
		        {
		            static_assert(T_numElements <= T_dim);
		            // value-initialized like in the overload without start index, no component is left indeterminate
		            Vec<type, T_numElements> result{};
		            for(uint32_t i = 0u; i < T_numElements; i++)
		            {
		                // adding T_dim before subtracting i keeps the index non-negative before the modulo is applied
		                result[T_numElements - 1u - i] = (*this)[(T_dim + startIdx - i) % T_dim];
		            }
		            return result;
		        }

		        /** Assign a value to the given index position
		         *
		         * This vector is not modified, the update is applied to a copy.
		         *
		         * @tparam T_elementIdx Index of the element from the begin which shall be replaced; range: [ 0; T_dim - 1 ]
		         * @param value value to assign to the element at the given index position
		         * @return copy of the vector where the element at the index position is replaced by value
		         */
		        template<uint32_t T_elementIdx = 0>
		        constexpr Vec<T_Type, T_dim> assign(T_Type const& value) const requires(T_elementIdx < T_dim)
		        {
		            auto result = *this;
		            result[T_elementIdx] = value;
		            return result;
		        }

		        /** Assign values to the selected index positions
		         *
		         * This vector is not modified, the update is applied to a copy.
		         *
		         * @param selection CVec with the indices of the elements which shall be replaced; indices range must be
		         * [0; T_dim -1]
		         * @param value vector with the values to assign to the selected elements, its dimension must not be
		         * larger than T_dim
		         * @return copy of the vector where the selected index positions are updated with value
		         */
		        constexpr Vec<T_Type, T_dim> assign(
		            concepts::CVector auto const selection,
		            concepts::Vector<T_Type> auto const& value) const requires(ALPAKA_TYPEOF(value)::dim() <= T_dim)
		        {
		            auto result = *this;
		            // ref() provides a vector of references to the selected components of the copy
		            result.ref(selection) = value;
		            return result;
		        }

		        /** Assign a value to the given index position, by default to the last element
		         *
		         * This vector is not modified, the update is applied to a copy.
		         *
		         * NOTE(review): the index is used as a plain index from the begin (`result[T_elementIdx]`), it is not
		         * counted from the back. The only difference to assign() is the default index `T_dim - 1`. Confirm
		         * whether counting from the back was intended before relying on non-default indices.
		         *
		         * @tparam T_elementIdx Index of the element which shall be replaced; range: [ 0; T_dim - 1 ],
		         *                      defaults to the last element
		         * @param value value to assign to the element at the given index position
		         * @return copy of the vector where the element at the index position is replaced by value
		         */
		        template<uint32_t T_elementIdx = T_dim - 1u>
		        constexpr Vec<T_Type, T_dim> rAssign(T_Type const& value) const requires(T_elementIdx < T_dim)
		        {
		            Vec<T_Type, T_dim> result = *this;
		            result[T_elementIdx] = value;
		            return result;
		        }

		        /** Removes a component
		         *
		         * Only available for vectors with at least two components.
		         *
		         * @tparam dimToRemove index which shall be removed; range: [ 0; T_dim - 1 ]
		         * @return vector with `T_dim - 1` elements, the order of the remaining components is unchanged
		         */
		        template<std::integral auto dimToRemove>
		        constexpr Vec<type, T_dim - 1u> remove() const requires(T_dim >= 2u)
		        {
		            Vec<type, T_dim - 1u> reduced{};
		            int const numKept = static_cast<int>(T_dim - 1u);
		            for(int dst = 0; dst < numKept; ++dst)
		            {
		                int src = dst;
		                // all components behind the removed one move one position to the front
		                if(dst >= static_cast<int>(dimToRemove))
		                {
		                    src = dst + 1;
		                }
		                reduced[dst] = (*this)[src];
		            }
		            return reduced;
		        }

		        /** Multiplies all components.
		         *
		         * The first component is the start value, the remaining components are multiplied in index order.
		         *
		         * @return product of components
		         */
		        constexpr type product() const
		        {
		            type accumulated = (*this)[0];
		            for(uint32_t idx = 1u; idx < T_dim; ++idx)
		            {
		                accumulated *= (*this)[idx];
		            }
		            return accumulated;
		        }

		        /** Adds up all components.
		         *
		         * The first component is the start value, the remaining components are added in index order.
		         *
		         * @return sum of components
		         */
		        constexpr type sum() const
		        {
		            type accumulated = (*this)[0];
		            for(uint32_t idx = 1u; idx < T_dim; ++idx)
		            {
		                accumulated += (*this)[idx];
		            }
		            return accumulated;
		        }

		        /**
		         * == comparison operator.
		         *
		         * Compares two vectors component by component, the storage policies may differ.
		         *
		         * @param rhs Vec to compare to
		         * @return true if all components in both vectors are equal, else false
		         */
		        template<typename T_OtherStorage>
		        constexpr bool operator==(Vec<T_Type, T_dim, T_OtherStorage> const& rhs) const
		        {
		            for(uint32_t idx = 0u; idx < T_dim; ++idx)
		            {
		                // components behind the first mismatch do not need to be compared
		                if(!((*this)[idx] == rhs[idx]))
		                {
		                    return false;
		                }
		            }
		            return true;
		        }

		        /**
		         * != comparison operator.
		         *
		         * Negation of the == comparison, the storage policies may differ.
		         *
		         * @param rhs Vec to compare to
		         * @return true if at least one component differs between both vectors, else false
		         */
		        template<typename T_OtherStorage>
		        constexpr bool operator!=(Vec<T_Type, T_dim, T_OtherStorage> const& rhs) const
		        {
		            bool const isEqual = ((*this) == rhs);
		            return !isEqual;
		        }

		        /** Component-wise minimum of two vectors.
		         *
		         * @param rhs vector with the same value type and dimension, the storage policy may differ
		         * @return UniVec where each component is the smaller component of this vector and rhs
		         */
		        template<typename T_OtherStorage>
		        constexpr auto min(Vec<T_Type, T_dim, T_OtherStorage> const& rhs) const
		        {
		            typename Vec::UniVec smallest{};
		            for(uint32_t idx = 0u; idx < T_dim; ++idx)
		            {
		                smallest[idx] = std::min((*this)[idx], rhs[idx]);
		            }
		            return smallest;
		        }

		        /** create string out of the vector
		         *
		         * @param separator string to separate components of the vector
		         * @param enclosings string with the symbols which enclose the vector
		         *                   empty string  ? no enclose symbols
		         *                   one letter    ? begin and end enclose symbol are equal
		         *                   >= 2 letters  ? letter[0] = begin enclose symbol
		         *                                   letter[1] = end enclose symbol
		         *
		         * example:
		         * .toString(";","|")     -> |x;...;z|
		         * .toString(",","[]")    -> [x,...,z]
		         */
		        std::string toString(std::string const separator = ",", std::string const enclosings = "{}") const
		        {
		            std::string opening;
		            std::string closing;

		            if(size_t const numEnclosings = enclosings.size(); numEnclosings > 0)
		            {
		                opening = enclosings[0];
		                // with a single symbol the begin symbol is reused, this avoids an out of range access
		                closing = enclosings[numEnclosings > 1 ? 1 : 0];
		            }

		            std::stringstream out;
		            out << opening << (*this)[0];
		            for(uint32_t idx = 1u; idx < T_dim; ++idx)
		            {
		                out << separator << (*this)[idx];
		            }
		            out << closing;

		            return out.str();
		        }

		        /** swizzle operator
		         *
		         * Selects components by compile-time indices.
		         *
		         * @param v compile-time vector (CVec storage) whose values T_values are the indices to select
		         * @return new Vec<T_Type, N> with the components (*this)[T_values]... where N is the number of indices
		         */
		        template<typename T, T... T_values>
		        constexpr auto operator[](Vec<T, sizeof...(T_values), detail::CVec<T, T_values...>> const v) const
		        {
		            using InType = ALPAKA_TYPEOF(v);
		            return Vec<T_Type, InType::dim()>{(*this)[T_values]...};
		        }

		        /** Creates a vector of references to the selected components (mutable overload).
		         *
		         * The result uses a std::array of the wrappers returned by std::ref as storage, one wrapper for each
		         * selected component of this vector.
		         *
		         * @param v compile-time vector (CVec storage) whose values T_values are the indices to select
		         * @return Vec with the dimension of v which refers to the components (*this)[T_values]...
		         */
		        template<typename T, T... T_values>
		        constexpr auto ref(Vec<T, sizeof...(T_values), detail::CVec<T, T_values...>> const v)
		        {
		            using InType = ALPAKA_TYPEOF(v);
		            // element zero is only used to derive the wrapper type
		            using ArrayType = std::array<ALPAKA_TYPEOF(std::ref((*this)[T{0}])), sizeof...(T_values)>;
		            auto array = ArrayType{std::ref((*this)[T_values])...};
		            return Vec<T_Type, InType::dim(), ALPAKA_TYPEOF(array)>{array};
		        }

		        /** Creates a vector of references to the selected components (const overload).
		         *
		         * Same as the mutable overload, the wrappers are created from the components of a const vector.
		         *
		         * @param v compile-time vector (CVec storage) whose values T_values are the indices to select
		         * @return Vec with the dimension of v which refers to the components (*this)[T_values]...
		         */
		        template<typename T, T... T_values>
		        constexpr auto ref(Vec<T, sizeof...(T_values), detail::CVec<T, T_values...>> const v) const
		        {
		            using InType = ALPAKA_TYPEOF(v);
		            // element zero is only used to derive the wrapper type
		            using ArrayType = std::array<ALPAKA_TYPEOF(std::ref((*this)[T{0}])), sizeof...(T_values)>;
		            auto array = ArrayType{std::ref((*this)[T_values])...};
		            return Vec<T_Type, InType::dim(), ALPAKA_TYPEOF(array)>{array};
		        }

		        /** reduce all elements to a single value
		         *
		         * For better numerical stability a tree reduce algorithm is used.
		         * The work is delegated to reduce_range() with its default range which covers all elements.
		         *
		         * @param reduceFunc binary functor executed to reduce the elements, it is called with two values of
		         *                   the component type. The binary operation must be associative.
		         * @return the type of the result depends on the binary functor
		         */
		        [[nodiscard]] constexpr auto reduce(auto&& reduceFunc) const
		            -> decltype(reduceFunc(std::declval<type>(), std::declval<type>()))
		        {
		            return reduce_range(ALPAKA_FORWARD(reduceFunc));
		        }

		    private:
		        /** reduce over a range of elements
		         *
		         * A range with a single element returns this element. Larger ranges are split at the midpoint, both
		         * halves are reduced recursively and the two results are combined with the functor. For SYCL the
		         * elements are combined in a loop from T_start to T_end instead.
		         *
		         * @tparam T_start start index
		         * @tparam T_end end index (excluded)
		         * @param reduceFunc binary functor executed to reduce the range
		         * @return the type of the result depends on the binary functor
		         */
		        template<uint32_t T_start = 0u, uint32_t T_end = dim()>
		        [[nodiscard]] constexpr auto reduce_range(auto&& reduceFunc) const
		            -> decltype(reduceFunc(std::declval<type>(), std::declval<type>()))
		        {
		            // elements in the range
		            constexpr uint32_t size = T_end - T_start;
		            // single element termination
		            if constexpr(size == 1u)
		            {
		                return (*this)[T_start];
		            }
		            else
		            {
		                /* The else branch is required to discard the code below for single element ranges. Without
		                 * it the recursive calls are instantiated for the terminating case too, which creates an
		                 * empty range specialization that only calls itself.
		                 */
		#if ALPAKA_LANG_SYCL
		                // SYCL can not call recursive functions
		                auto result = (*this)[T_start];
		                for(uint32_t i = T_start + 1u; i < T_end; ++i)
		                {
		                    result = reduceFunc(result, (*this)[i]);
		                }
		                return result;
		#else
		                // split range at midpoint
		                constexpr uint32_t mid = T_start + size / 2u;

		                // recursively reduce both halves and combine
		                return reduceFunc(
		                    reduce_range<T_start, mid>(ALPAKA_FORWARD(reduceFunc)),
		                    reduce_range<mid, T_end>(ALPAKA_FORWARD(reduceFunc)));
		#endif
		            }
		        }
		    };

		    /** Tuple-like access to the component with the compile-time index I of a const vector.
		     *
		     * @return the component by value (the return type is deduced with plain auto)
		     */
		    template<std::size_t I, typename T_Type, uint32_t T_dim, typename T_Storage>
		    constexpr auto get(Vec<T_Type, T_dim, T_Storage> const& v)
		    {
		        return v[I];
		    }

		    /** Tuple-like access to the component with the compile-time index I of a mutable vector.
		     *
		     * @return exactly what v[I] returns, a reference stays a reference (decltype(auto))
		     */
		    template<std::size_t I, typename T_Type, uint32_t T_dim, typename T_Storage>
		    constexpr decltype(auto) get(Vec<T_Type, T_dim, T_Storage>& v)
		    {
		        return v[I];
		    }

		    /** Specialization for a vector without components.
		     *
		     * The vector stores nothing, therefore all zero-dimensional vectors compare equal.
		     */
		    template<typename Type>
		    struct Vec<Type, 0>
		    {
		        using type = Type;
		        static constexpr uint32_t T_dim = 0;

		        /** Conversion to a zero-dimensional vector with another value type. */
		        template<typename OtherType>
		        constexpr operator Vec<OtherType, 0>() const
		        {
		            return Vec<OtherType, 0>{};
		        }

		        /**
		         * == comparison operator.
		         *
		         * There is no component which could differ.
		         *
		         * @return always true
		         */
		        constexpr bool operator==(Vec const&) const
		        {
		            return true;
		        }

		        /**
		         * != comparison operator.
		         *
		         * There is no component which could differ.
		         *
		         * @return always false
		         */
		        constexpr bool operator!=(Vec const&) const
		        {
		            return false;
		        }

		        static constexpr Vec create(Type)
		        {
		            /* This method must never be instantiated, the assertion below always fails.
		             * It exists only for Visual Studio to handle alpaka::Size_t< 0 >.
		             */
		            static_assert(sizeof(Type) != 0 && false);
		        }
		    };

		    // type deduction guide
		    // Vec(a, b, ...) deduces the value type from the first argument and the dimension from the number of
		    // arguments, the storage policy is ArrayStorage.
		    template<typename T_1, typename... T_Args>
		    ALPAKA_FN_HOST_ACC Vec(T_1, T_Args...)
		        -> Vec<T_1, uint32_t(sizeof...(T_Args) + 1u), ArrayStorage<T_1, uint32_t(sizeof...(T_Args) + 1u)>>;

		    /** Writes the vector to an output stream.
		     *
		     * The text is created by toString() with its default separator and enclosings.
		     *
		     * @param s output stream
		     * @param vec vector to write
		     * @return the stream s
		     */
		    template<typename Type, uint32_t T_dim, typename T_Storage>
		    std::ostream& operator<<(std::ostream& s, Vec<Type, T_dim, T_Storage> const& vec)
		    {
		        s << vec.toString();
		        return s;
		    }

		/* Generates three free component-wise operators for the operator token `op`:
		 * vector op vector (the storage policies may differ), vector op scalar and scalar op vector.
		 *  - typenameOrConcept: `typename` or a concept which constrains the value type of the vector
		 *  - resultScalarType: value type of the result vector
		 * A scalar operand must fulfill concepts::LosslesslyConvertible<T_Type>.
		 */
		#define ALPAKA_VECTOR_BINARY_OP(typenameOrConcept, resultScalarType, op)                                              \
		    template<typenameOrConcept T_Type, uint32_t T_dim, typename T_Storage, typename T_OtherStorage>                   \
		    constexpr auto operator op(                                                                                       \
		        const Vec<T_Type, T_dim, T_Storage>& lhs,                                                                     \
		        const Vec<T_Type, T_dim, T_OtherStorage>& rhs)                                                                \
		    {                                                                                                                 \
		        /* to avoid allocation side effects the result is always a vector                                             \
		         * with default policies                                                                                      \
		         */                                                                                                           \
		        Vec<resultScalarType, T_dim> result{};                                                                        \
		        for(uint32_t i = 0u; i < T_dim; i++)                                                                          \
		            result[i] = lhs[i] op rhs[i];                                                                             \
		        return result;                                                                                                \
		    }                                                                                                                 \
		                                                                                                                      \
		    template<                                                                                                         \
		        typenameOrConcept T_Type,                                                                                     \
		        concepts::LosslesslyConvertible<T_Type> T_ValueType,                                                          \
		        uint32_t T_dim,                                                                                               \
		        typename T_Storage>                                                                                           \
		    constexpr auto operator op(const Vec<T_Type, T_dim, T_Storage>& lhs, T_ValueType rhs)                             \
		    {                                                                                                                 \
		        /* to avoid allocation side effects the result is always a vector                                             \
		         * with default policies                                                                                      \
		         */                                                                                                           \
		        Vec<resultScalarType, T_dim> result{};                                                                        \
		        for(uint32_t i = 0u; i < T_dim; i++)                                                                          \
		            result[i] = lhs[i] op rhs;                                                                                \
		        return result;                                                                                                \
		    }                                                                                                                 \
		    template<                                                                                                         \
		        typenameOrConcept T_Type,                                                                                     \
		        concepts::LosslesslyConvertible<T_Type> T_ValueType,                                                          \
		        uint32_t T_dim,                                                                                               \
		        typename T_Storage>                                                                                           \
		    constexpr auto operator op(T_ValueType lhs, const Vec<T_Type, T_dim, T_Storage>& rhs)                             \
		    {                                                                                                                 \
		        /* to avoid allocation side effects the result is always a vector                                             \
		         * with default policies                                                                                      \
		         */                                                                                                           \
		        Vec<resultScalarType, T_dim> result{};                                                                        \
		        for(uint32_t i = 0u; i < T_dim; i++)                                                                          \
		            result[i] = lhs op rhs[i];                                                                                \
		        return result;                                                                                                \
		    }

		    /** binary operators
		     *
		     * Arithmetic operators return a vector with the value type of the operands, comparison and logical
		     * operators return a vector of bool. The operators %, <<, >>, &, | and ^ are restricted to integral value
		     * types.
		     * @{
		     */
		    ALPAKA_VECTOR_BINARY_OP(typename, T_Type, +)
		    ALPAKA_VECTOR_BINARY_OP(typename, T_Type, -)
		    ALPAKA_VECTOR_BINARY_OP(typename, T_Type, *)
		    ALPAKA_VECTOR_BINARY_OP(typename, T_Type, /)
		    ALPAKA_VECTOR_BINARY_OP(typename, bool, >=)
		    ALPAKA_VECTOR_BINARY_OP(typename, bool, >)
		    ALPAKA_VECTOR_BINARY_OP(typename, bool, <=)
		    ALPAKA_VECTOR_BINARY_OP(typename, bool, <)
		    ALPAKA_VECTOR_BINARY_OP(typename, bool, &&)
		    ALPAKA_VECTOR_BINARY_OP(typename, bool, ||)
		    ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, %)
		    ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, <<)
		    ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, >>)
		    ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, &)
		    ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, |)
		    ALPAKA_VECTOR_BINARY_OP(std::integral, T_Type, ^)
		    /** @} */

		#undef ALPAKA_VECTOR_BINARY_OP

		    /** Give the linear index of an N-dimensional index within an N-dimensional index space.
		     *
		     * The component with the highest index is the fastest running dimension of the linear index.
		     *
		     * @tparam T_IntegralType vector data type (must be an integral type)
		     * @tparam T_dim dimension of the vector, should be >= 2
		     * @param dim N-dimensional dim of the index space (N can be one dimension less compared to idx, in this
		     *            case the extent of the first dimension is left out because it is not required)
		     * @param idx N-dimensional index within the index space
		     *            @attention behaviour is undefined for negative index
		     *            @attention if idx is outside of dim the result will be outside of the the index domain too
		     * @return linear index within the index domain
		     *
		     * @{
		     */
		    template<std::integral T_IntegralType, typename T_Storage, typename T_OtherStorage, uint32_t T_dim>
		    constexpr T_IntegralType linearize(
		        Vec<T_IntegralType, T_dim - 1u, T_Storage> const& dim,
		        Vec<T_IntegralType, T_dim, T_OtherStorage> const& idx) requires(T_dim >= 2u)
		    {
		        T_IntegralType linearIdx{idx[0]};
		        // dim[k] holds the extent of the dimension k + 1 of the index
		        for(uint32_t k = 0u; k + 1u < T_dim; ++k)
		        {
		            linearIdx = linearIdx * dim[k] + idx[k + 1u];
		        }

		        return linearIdx;
		    }

		    /** Overload where dim and idx have the same dimension.
		     *
		     * The first component of dim is dropped with rshrink<T_dim - 1u>() and the call is forwarded to
		     * linearize() with the shortened extents.
		     */
		    template<std::integral T_IntegralType, typename T_Storage, typename T_OtherStorage, uint32_t T_dim>
		    constexpr T_IntegralType linearize(
		        Vec<T_IntegralType, T_dim, T_Storage> const& dim,
		        Vec<T_IntegralType, T_dim, T_OtherStorage> const& idx)
		    {
		        return linearize(dim.template rshrink<T_dim - 1u>(), idx);
		    }

		    /** Overload for one-dimensional vectors.
		     *
		     * The extent is not required, the linear index is the only component of idx.
		     * The function is constexpr like the overloads for higher dimensions, so one-dimensional indices can be
		     * linearized in constant expressions too.
		     *
		     * @param idx one-dimensional index
		     * @return the component of idx
		     */
		    template<std::integral T_IntegralType, typename T_Storage, typename T_OtherStorage>
		    ALPAKA_FN_HOST_ACC constexpr T_IntegralType linearize(
		        Vec<T_IntegralType, 1u, T_Storage> const&,
		        Vec<T_IntegralType, 1u, T_OtherStorage> const& idx)
		    {
		        return idx.x();
		    }

		    /** @} */

		    /** Maps a linear index to an N-dimensional index
		     *
		     * The component with the highest index is the fastest running dimension of the linear index.
		     *
		     * @tparam T_IntegralType vector data type (must be an integral type)
		     * @param extents N-dimensional index space
		     * @param linearIdx Linear index within extents.
		     *        @attention If linearIdx is an index outside of extents the result will be outside of the index
		     * domain too.
		     * @return N-dimensional index
		     *
		     * @{
		     */
		    template<std::integral T_IntegralType, typename T_Storage, uint32_t T_dim>
		    constexpr Vec<T_IntegralType, T_dim> mapToND(
		        Vec<T_IntegralType, T_dim, T_Storage> const& extents,
		        T_IntegralType linearIdx) requires(T_dim >= 2u)
		    {
		        constexpr uint32_t lastDim = T_dim - 1u;

		        // strides[k] is the number of elements covered by one step in dimension k,
		        // i.e. the product of extents[k + 1] up to extents[T_dim - 1]
		        Vec<T_IntegralType, lastDim> strides;
		        strides.back() = extents.back();
		        for(uint32_t k = lastDim - 1u; k > 0u; --k)
		        {
		            strides[k - 1u] = extents[k] * strides[k];
		        }

		        Vec<T_IntegralType, T_dim> ndIdx;
		        for(uint32_t k = 0u; k < lastDim; ++k)
		        {
		            ndIdx[k] = linearIdx / strides[k];
		            // keep the remainder for the faster running dimensions
		            linearIdx -= strides[k] * ndIdx[k];
		        }
		        ndIdx[lastDim] = linearIdx;
		        return ndIdx;
		    }

		    /** Overload for a one-dimensional index space.
		     *
		     * The extents are not used, the result is a one-dimensional vector which holds linearIdx.
		     */
		    template<std::integral T_IntegralType, typename T_Storage>
		    constexpr Vec<T_IntegralType, 1u> mapToND(
		        Vec<T_IntegralType, 1u, T_Storage> const& extents,
		        T_IntegralType linearIdx)
		    {
		        // marks the parameter as intentionally unused
		        alpaka::unused(extents);
		        return {linearIdx};
		    }

		    /** @} */

		    namespace trait
		    {
		        /** Every Vec is a vector, independent of value type, dimension and storage policy. */
		        template<typename T_Type, uint32_t T_dim, typename T_Storage>
		        struct IsVector<Vec<T_Type, T_dim, T_Storage>> : std::true_type
		        {
		        };

		        /** A Vec with detail::CVec storage (values are template parameters) is a compile-time vector. */
		        template<typename T_Type, uint32_t T_dim, T_Type... T_values>
		        struct IsCVector<Vec<T_Type, T_dim, detail::CVec<T_Type, T_values...>>> : std::true_type
		        {
		        };
		    } // namespace trait

		    namespace trait
		    {
		        /** Dimension of a Vec is its template parameter T_dim. */
		        template<typename T_Type, uint32_t T_dim, typename T_Storage>
		        struct GetDim<alpaka::Vec<T_Type, T_dim, T_Storage>>
		        {
		            static constexpr uint32_t value = T_dim;
		        };

		        /** Maps a type to the vector type which is used to represent it.
		         *
		         * Only the specializations below are defined.
		         */
		        template<typename T>
		        struct GetVec;

		        /** An integral type is represented by a one-dimensional vector. */
		        template<std::integral T>
		        struct GetVec<T>
		        {
		            using type = Vec<T, 1u>;
		        };

		        /** A Vec is represented by itself. */
		        template<typename T_Type, uint32_t T_dim, typename T_Storage>
		        struct GetVec<alpaka::Vec<T_Type, T_dim, T_Storage>>
		        {
		            using type = alpaka::Vec<T_Type, T_dim, T_Storage>;
		        };

		        /** Shortcut for typename GetVec<T>::type */
		        template<typename T>
		        using getVec_t = typename GetVec<T>::type;

		        /** Value type of a Vec is its template parameter T_Type. */
		        template<typename T_Type, uint32_t T_dim, typename T_Storage>
		        struct GetValueType<Vec<T_Type, T_dim, T_Storage>>
		        {
		            using type = T_Type;
		        };

		    } // namespace trait

		    /** Creates the vector representation trait::getVec_t<T> of the given object.
		     *
		     * An integral value results in a one-dimensional vector, a Vec results in a vector of the same type.
		     *
		     * NOTE(review): the function is consteval and can therefore only be called in constant expressions -
		     * confirm that run-time values are intentionally not supported.
		     *
		     * @param any integral value or vector
		     */
		    template<typename T>
		    consteval auto getVec(T const& any)
		    {
		        return trait::getVec_t<T>{any};
		    }

		    namespace internal
		    {
		        /** Cast of the value type of a vector from T_Type to T_To. */
		        template<typename T_To, typename T_Type, uint32_t T_dim, typename T_Storage>
		        struct PCast::Op<T_To, alpaka::Vec<T_Type, T_dim, T_Storage>>
		        {
		            /** The value types differ: each component is converted with static_cast<T_To>.
		             *
		             * @return new vector of the type Vec<T_To, T_dim, T_Storage>::UniVec
		             */
		            constexpr auto operator()(auto&& input) const
		                requires std::convertible_to<T_Type, T_To> && (!std::same_as<T_To, T_Type>)
		            {
		                return typename alpaka::Vec<T_To, T_dim, T_Storage>::UniVec([&](uint32_t idx) constexpr
		                                                                            { return static_cast<T_To>(input[idx]); });
		            }

		            /** The value types are equal: the input is forwarded unchanged, no new vector is created. */
		            constexpr decltype(auto) operator()(auto&& input) const requires std::same_as<T_To, T_Type>
		            {
		                return std::forward<decltype(input)>(input);
		            }
		        };
		    } // namespace internal

		    /** Evaluates `(a + b - 1) / b` with vector operands.
		     *
		     * Both vectors must have the same value type. For positive integral components this is the quotient
		     * rounded up, assuming the Vec operators act component-wise (their definitions are not part of this
		     * section -- TODO confirm).
		     *
		     * @todo the function for integral values is defined in Utils.hpp
		     * move this to a better place, e.g. math and expose this for the user too
		     *
		     * @param a dividend
		     * @param b divisor
		     * @return result of the vector expression above
		     */
		    template<concepts::Vector T_Vector0, concepts::Vector T_Vector1>
		    requires(std::is_same_v<trait::GetValueType_t<T_Vector0>, trait::GetValueType_t<T_Vector1>>)
		    [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr concepts::Vector auto divCeil(T_Vector0 a, T_Vector1 b)
		    {
		        return (a + b - T_Vector0::fill(1)) / b;
		    }

		    /** Divide two vectors and raise every component of the result to at least one.
		     *
		     * Both vectors must have the same value type.
		     *
		     * @param a dividend
		     * @param b divisor
		     * @return `a / b` where each component smaller than 1 is replaced by 1
		     */
		    template<concepts::Vector T_Vector0, concepts::Vector T_Vector1>
		    requires(std::is_same_v<trait::GetValueType_t<T_Vector0>, trait::GetValueType_t<T_Vector1>>)
		    [[nodiscard]] ALPAKA_FN_HOST_ACC constexpr concepts::Vector auto divExZero(T_Vector0 a, T_Vector1 b)
		    {
		        using Component = alpaka::trait::GetValueType_t<T_Vector0>;

		        auto quotient = a / b;
		        for(uint32_t idx = 0u; idx < a.dim(); ++idx)
		        {
		            // lower bound of one per component
		            quotient[idx] = std::max(quotient[idx], Component{1u});
		        }
		        return quotient;
		    }
		}; // namespace alpaka

		namespace std
		{
		    /** Tuple protocol: number of components of an alpaka::Vec.
		     *
		     * Derives from std::integral_constant so that the specialization is a complete unary type trait
		     * (provides value, value_type, type and the conversion operators), as the standard requires for
		     * program-defined std::tuple_size specializations.
		     */
		    template<typename T_Type, uint32_t T_dim, typename T_Storage>
		    struct tuple_size<alpaka::Vec<T_Type, T_dim, T_Storage>> : std::integral_constant<std::size_t, T_dim>
		    {
		    };

		    /** Tuple protocol: every component of an alpaka::Vec has the type T_Type. */
		    template<std::size_t I, typename T_Type, uint32_t T_dim, typename T_Storage>
		    struct tuple_element<I, alpaka::Vec<T_Type, T_dim, T_Storage>>
		    {
		        using type = T_Type;
		    };
		} // namespace std
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/Vec.hpp ==
		// ============================================================================

	// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

	// #include <array>    // amalgamate: file already included
	// #include <concepts>    // amalgamate: file already included
	// #include <cstdint>    // amalgamate: file already included
	#include <functional>
	#include <type_traits>
	// #include <utility>    // amalgamate: file already included

	namespace alpaka
	{
	    /** @brief A vector with compile-time known values
	     *
	     * @details
	     * A CVec is guaranteed to be constexpr, because all of its values are stored in the type. A CVec instance
	     * satisfies alpaka::concepts::Vector. Some ways to create common types of vectors are fillCVec() and
	     * iotaCVec().
	     *
	     * @tparam T The type of the vector's stored values
	     * @tparam T_values List of values of type T that the vector stores; the length of the vector is inferred from the
	     * length of this list
	     */
	    template<typename T, T... T_values>
	    using CVec = Vec<T, sizeof...(T_values), detail::CVec<T, T_values...>>;

	    namespace detail
	    {
	        template<typename T, T... T_values>
	        [[nodiscard]] constexpr auto integerSequenceToCVec(std::integer_sequence<T, T_values...>)
	        {
	            return alpaka::CVec<T, T_values...>{};
	        }

	        /** Convert a CVec into the std::integer_sequence holding the same values in the same order. */
	        template<typename T, T... T_values>
	        [[nodiscard]] constexpr auto toIntegerSequence(alpaka::CVec<T, T_values...>)
	        {
	            using Sequence = std::integer_sequence<T, T_values...>;
	            return Sequence{};
	        }

	        /** Join two integer sequences of the same value type.
	         *
	         * @return sequence with the values of the first argument followed by the values of the second argument
	         */
	        template<typename Int, Int... Is1, Int... Is2>
	        [[nodiscard]] constexpr auto combine(std::integer_sequence<Int, Is1...>, std::integer_sequence<Int, Is2...>)
	        {
	            using Joined = std::integer_sequence<Int, Is1..., Is2...>;
	            return Joined{};
	        }

	        /** Join any number (at least one) of integer sequences from left to right.
	         *
	         * A single argument is returned unchanged; otherwise the first argument is combined with the
	         * concatenation of the remaining ones.
	         */
	        template<typename First, typename... Rest>
	        [[nodiscard]] constexpr auto concatenate(First first, Rest... rest)
	        {
	            if constexpr(sizeof...(Rest) == 0)
	            {
	                return first;
	            }
	            else
	            {
	                return combine(first, concatenate(rest...));
	            }
	        }

	        /** Sequence with zero or one element, selected by a compile-time flag.
	         *
	         * @tparam pred if true the result is the empty sequence (value dropped), if false the result is the
	         * one-element sequence holding T_v (value kept)
	         * @tparam T value type of the sequence
	         * @tparam T_v the value in question
	         */
	        template<bool pred, typename T, T T_v>
	        using selectValue = std::conditional_t<pred, std::integer_sequence<T>, std::integer_sequence<T, T_v>>;

	        /** @brief Remove all values from an integer sequence for which a filter returns true
	         *
	         * @details
	         * Each value is mapped through selectValue: a value for which `op` returns true becomes an empty
	         * sequence, every other value becomes a one-element sequence. The pieces are concatenated, so the
	         * result holds the values for which `op` returns false, in their original order.
	         *
	         * @tparam T_UnaryOp The type of the function or functor to filter with. Must take one argument and return a
	         * boolean. It is called inside a template argument and must therefore be usable in a constant expression.
	         * @tparam T The type of the given values.
	         * @tparam T_values The values to filter.
	         * @param op The filter function/functor.
	         * @param _ An integer sequence of values to filter
	         * @return The integer sequence of all values for which op returned false
	         */
	        template<typename T_UnaryOp, typename T, T... T_values>
	        [[nodiscard]] constexpr auto filterValues(T_UnaryOp const op, std::integer_sequence<T, T_values...> _)
	        {
	            alpaka::unused(_);
	            return concatenate(selectValue<op(T_values), T, T_values>{}...);
	        }

	        /** Predicate functor testing membership in a compile-time list of values
	         *
	         * @details
	         * The list of values is taken from the template arguments of the given sequence type. The call operator
	         * `operator()(T value)` returns true if `value` compares equal to at least one value of the list and
	         * false otherwise (also for an empty list).
	         *
	         * @tparam T_Seq The sequence to check against
	         */
	        template<typename T_Seq>
	        struct Contains;

	        /** Specialization for any template of the form `T_Seq<T, T_values...>`. */
	        template<typename T, template<typename, T...> typename T_Seq, T... T_values>
	        struct Contains<T_Seq<T, T_values...>>
	        {
	            using argument_type = T;

	            constexpr bool operator()(T value) const
	            {
	                // fold over ||: compared left to right, stops at the first match
	                return (... || (value == T_values));
	            }
	        };

	        /* this specialization is required for clang20 but in principle the specialization above should cover it
	         * compile error: CVec.hpp:92:51: error: implicit instantiation of undefined template
	         * 'alpaka::detail::Contains<std::integer_sequence<unsigned int, 0>>' 92 |         return
	         * integerSequenceToCVec(filterValues(Contains<ALPAKA_TYPEOF(rightSeq)>{}, toIntegerSequence(left)));
	         */
	        template<typename T, T... T_values>
	        struct Contains<std::integer_sequence<T, T_values...>>
	        {
	            using argument_type = T;

	            constexpr bool operator()(T value) const
	            {
	                // fold over ||: compared left to right, stops at the first match
	                return (... || (value == T_values));
	            }
	        };
	    } // namespace detail

	    /** Create and return a CVec of the given length with the values 0, 1, ..., T_dim - 1
	     *
	     * @details
	     * The values are those of std::make_integer_sequence<T, T_dim>. The function is consteval, so the result
	     * is always produced at compile time.
	     *
	     * @tparam T Type of the stored values
	     * @tparam T_dim Length of the vector
	     *
	     * @return The vector containing the iota sequence
	     */
	    template<typename T, uint32_t T_dim>
	    [[nodiscard]] consteval auto iotaCVec()
	    {
	        return detail::integerSequenceToCVec(std::make_integer_sequence<T, T_dim>{});
	    }

	    /** Create and return a CVec of the given length in which every component is T_val
	     *
	     * @details
	     * The vector is built recursively: a vector of length T_dim - 1 is extended by one more T_val. The function
	     * is consteval, so the result is always produced at compile time. A length of zero is rejected by a
	     * static_assert.
	     *
	     * @tparam T Type of the stored values
	     * @tparam T_dim Length of the vector, must be larger than zero
	     * @tparam T_val Value to fill the vector with
	     *
	     * @return The filled vector
	     */
	    template<typename T, uint32_t T_dim, T T_val>
	    [[nodiscard]] consteval auto fillCVec()
	    {
	        static_assert(T_dim > 0);

	        if constexpr(T_dim == 1)
	        {
	            return CVec<T, T_val>{};
	        }
	        else
	        {
	            // appends one more T_val to the value list of the shorter vector
	            auto appendValue = []<T... T_values>(CVec<T, T_values...>) -> auto { return CVec<T, T_values..., T_val>{}; };
	            return appendValue(fillCVec<T, T_dim - 1, T_val>());
	        }
	    }

	    /** Filter the left vector with the right vector's values
	     *
	     * Both vectors are converted to integer sequences; the values of the left sequence that are reported by
	     * detail::Contains as part of the right sequence are dropped by detail::filterValues.
	     *
	     * @param left compile-time vector providing the candidate values
	     * @param right compile-time vector providing the values to remove
	     * @return A CVec that contains all values of the left vector that don't exist in the right vector. Preserves
	     * original order.
	     */
	    [[nodiscard]] constexpr auto filter(concepts::CVector auto left, concepts::CVector auto right)
	    {
	        using namespace detail;
	        // only the type of rightSeq is used, it carries the values to remove
	        constexpr auto rightSeq = toIntegerSequence(right);

	        return integerSequenceToCVec(
	            filterValues(detail::Contains<ALPAKA_TYPEOF(rightSeq)>{}, toIntegerSequence(left)));
	    }

	} // namespace alpaka
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/CVec.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/Simd.hpp ==
	// ==
	/* Copyright 2025 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	/** @file This file provides a basic implementation of a SIMD vector.
	 *
	 * The implementation is based on the class Vec:
	 *   - the storage policy should become the native SIMD implementation, e.g. std::simd
	 *   - load/store and SIMD specifics should be implemented in the storage policy
	 *   - the name of the storage policy should be changed
	 *
	 *   The current operator implementations rely on the compiler's auto-vectorization.
	 */

	// #pragma once
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/SimdMask.hpp ==
		// ==
		/* Copyright 2025 René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */

		/** @file This file provides a basic implementation of a SIMD vector.
		 *
		 * The implementation is based on the class Vec:
		 *   - the storage policy should become the native SIMD implementation, e.g. std::simd
		 *   - load/store and SIMD specifics should be implemented in the storage policy
		 *   - the name of the storage policy should be changed
		 *
		 *   The current operator implementations rely on the compiler's auto-vectorization.
		 */

		// #pragma once
		// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
		// #include "alpaka/cast.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/util.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/Alignment.hpp ==
			// ==
			/* Copyright 2025 René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			// #include <cstdint>    // amalgamate: file already included
			// #include <limits>    // amalgamate: file already included
			#include <type_traits>

			namespace alpaka
			{
			    /** @brief Compile-time, strongly typed byte-alignment of memory
			     *
			     * @details
			     * The alignment in bytes is a template argument, so objects of this type carry no runtime state and
			     * should be declared `constexpr`. The default argument, the maximum value of uint32_t, is a marker for
			     * "no explicit alignment": in that case Alignment::get() falls back to the natural alignment of the type
			     * it is asked about.
			     *
			     * @tparam T_byte The alignment in bytes, or the maximum value of uint32_t for automatic alignment.
			     */
			    template<uint32_t T_byte = std::numeric_limits<uint32_t>::max()>
			    struct Alignment
			    {
			        /** Byte-alignment to use for objects of the given type.
			         *
			         * @details
			         * An explicitly set alignment that is smaller than `alignof(T_Type)` is rejected by a
			         * `static_assert`.
			         *
			         * @tparam T_Type The type for which to get the alignment.
			         * @return value of T_byte if it was set explicitly, else the alignment of T_Type
			         */
			        template<typename T_Type>
			        static consteval uint32_t get()
			        {
			            constexpr bool isExplicit = T_byte != std::numeric_limits<uint32_t>::max();
			            if constexpr(isExplicit)
			            {
			                static_assert(
			                    value >= alignof(T_Type),
			                    "tried to use alignment that is smaller than the alignment of the type it's for");
			                return value;
			            }
			            else
			            {
			                // automatic alignment
			                return static_cast<uint32_t>(alignof(T_Type));
			            }
			        }

			    private:
			        //! the raw template argument, including the "automatic" marker value
			        static constexpr uint32_t value = T_byte;

			        //! raw access to the stored value, without a type to validate against
			        static consteval uint32_t get()
			        {
			            return value;
			        }
			    };

			    /** Alignment with the default template argument: get() returns the alignment of the queried type. */
			    using AutoAligned = Alignment<>;

			    namespace trait
			    {
			        /** Type trait: true only for specializations of alpaka::Alignment, false for every other type. */
			        template<typename T_Type>
			        struct IsAlignment : std::false_type
			        {
			        };

			        template<uint32_t T_byte>
			        struct IsAlignment<Alignment<T_byte>> : std::true_type
			        {
			        };
			    } // namespace trait

			    /** Shortcut for trait::IsAlignment<T_Type>::value */
			    template<typename T_Type>
			    constexpr bool isAlignment_v = trait::IsAlignment<T_Type>::value;

			    namespace concepts
			    {
			        /** @brief Concept to check for an alignment object
			         *
			         * @details
			         * An alignment represents a byte alignment of memory. The class is used for strong typing.
			         * For more information, refer to the struct alpaka::Alignment or the general documentation.
			         * The concept is satisfied exactly by the types for which trait::IsAlignment is true.
			         *
			         * @todo link to alignment documentation in the general docs
			         */
			        template<typename T>
			        concept Alignment = trait::IsAlignment<T>::value;
			    } // namespace concepts
			} // namespace alpaka
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/Alignment.hpp ==
			// ============================================================================

			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/concepts.hpp ==
			// ==
			/* Copyright 2025 René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			/** @file This file provides a basic implementation of a SIMD vector.
			 *
			 * The implementation is based on the class Vec:
			 *   - the storage policy should become the native SIMD implementation, e.g. std::simd
			 *   - load/store and SIMD specifics should be implemented in the storage policy
			 *   - the name of the storage policy should be changed
			 *
			 *   The current operator implementations rely on the compiler's auto-vectorization.
			 */

			// #pragma once
			// #include <concepts>    // amalgamate: file already included
			// #include <cstdint>    // amalgamate: file already included
			#include <type_traits>

			namespace alpaka
			{
			    namespace trait
			    {
			        /** Type trait marking SIMD vector types; false by default, to be specialized for SIMD types. */
			        template<typename T>
			        struct IsSimd : std::false_type
			        {
			        };

			        /** Type trait marking SIMD mask types; false by default, to be specialized for mask types. */
			        template<typename T>
			        struct IsSimdMask : std::false_type
			        {
			        };

			    } // namespace trait

			    /** Shortcut for trait::IsSimd<T>::value */
			    template<typename T>
			    constexpr bool isSimd_v = trait::IsSimd<T>::value;

			    /** Shortcut for trait::IsSimdMask<T>::value */
			    template<typename T>
			    constexpr bool isSimdMask_v = trait::IsSimdMask<T>::value;

			    namespace concepts
			    {
			        /** Satisfied by types for which trait::IsSimd is true. */
			        template<typename T>
			        concept Simd = isSimd_v<T>;

			        /** Satisfied by types for which trait::IsSimdMask is true. */
			        template<typename T>
			        concept SimdMask = isSimdMask_v<T>;

			        /** Satisfied by SIMD types and by integral or floating point scalar types. */
			        template<typename T>
			        concept SimdOrScalar = (isSimd_v<T> || std::integral<T> || std::floating_point<T>);

			        /** Satisfied by SIMD types (of any component type) and by exactly the type T_RequiredComponent. */
			        template<typename T, typename T_RequiredComponent>
			        concept TypeOrSimd = (isSimd_v<T> || std::is_same_v<T, T_RequiredComponent>);

			        /** Satisfied by SIMD types (of any component type) and by types convertible to T_RequiredComponent. */
			        template<typename T, typename T_RequiredComponent>
			        concept SimdOrConvertibleType = (isSimd_v<T> || std::is_convertible_v<T, T_RequiredComponent>);
			    } // namespace concepts
			} // namespace alpaka
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/concepts.hpp ==
			// ============================================================================

			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/StdSimdMask.hpp ==
			// ==
			/* Copyright 2026 René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			/** @file This file provides a basic implementation of a SIMD vector.
			 *
			 * The implementation is based on the class Vec:
			 *   - the storage policy should become the native SIMD implementation, e.g. std::simd
			 *   - load/store and SIMD specifics should be implemented in the storage policy
			 *   - the name of the storage policy should be changed
			 *
			 *   The current operator implementations rely on the compiler's auto-vectorization.
			 */

			// #pragma once
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/api.hpp ==
				// ==
				/* Copyright 2024 René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */


				// #pragma once
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/Api.hpp ==
					// ==
					/* Copyright 2024 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */


					// #pragma once
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/trait.hpp ==
						// ==
						/* Copyright 2024 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */


						// #pragma once
							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/concepts/api.hpp ==
							// ==
							/* Copyright 2024 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/concepts/hasName.hpp ==
								// ==
								/* Copyright 2024 René Widera
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/internal/interface.hpp ==
									// ==
									/* Copyright 2024 René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/KernelBundle.hpp ==
										// ==
										/* Copyright 2023 René Widera, Mehmet Yusufoglu
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
											// ============================================================================
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/apply.hpp ==
											// ==
											/* Copyright 2025 René Widera
											 * SPDX-License-Identifier: MPL-2.0
											 */

											// #pragma once
											// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

											// #include <utility>    // amalgamate: file already included

											namespace alpaka
											{
											    namespace detail
											    {
											        /** Call func with the tuple elements selected by the index pack T_idx.
											         *
											         * `get` is called unqualified after `using std::get;`, so both std::get and a `get` overload
											         * found by argument dependent lookup are candidates.
											         *
											         * @param func callable, forwarded before it is invoked
											         * @param tuple tuple-like object, forwarded into each get call
											         * @return whatever func returns, the reference category is preserved
											         */
											        template<typename T_Func, typename T_TupleLike, std::size_t... T_idx>
											        ALPAKA_FN_INLINE constexpr decltype(auto) applyImpl(
											            T_Func&& func,
											            T_TupleLike&& tuple,
											            std::index_sequence<T_idx...>)
											        {
											            using std::get;
											            return std::forward<T_Func>(func)(get<T_idx>(std::forward<T_TupleLike>(tuple))...);
											        }
											    } // namespace detail

											    /** Invoke a callable with the elements of a tuple-like object as arguments.
											     *
											     * The number of elements is taken from std::tuple_size of the decayed tuple type; the callable and the
											     * tuple are perfectly forwarded to detail::applyImpl, which performs the unpacking.
											     *
											     * @param func The function to apply.
											     * @param tuple The tuple-like object containing the arguments for the function.
											     * @return The result of applying the function to the elements of the tuple-like object.
											     */
											    template<typename T_Func, typename T_TupleLike>
											    ALPAKA_FN_INLINE constexpr decltype(auto) apply(T_Func&& func, T_TupleLike&& tuple)
											    {
											        using TupleType = std::decay_t<T_TupleLike>;
											        /** @attention Do not use std::tuple_size_v here because it results in compile issues with gcc11.4 */
											        using ElementIndices = std::make_index_sequence<std::tuple_size<TupleType>::value>;

											        return detail::applyImpl(std::forward<T_Func>(func), std::forward<T_TupleLike>(tuple), ElementIndices{});
											    }
											} // namespace alpaka
											// ==
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/apply.hpp ==
											// ============================================================================

											// ============================================================================
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/Dict.hpp ==
											// ==
											/* Copyright 2024 René Widera
											 * SPDX-License-Identifier: MPL-2.0
											 */

											// #pragma once
												// ============================================================================
												// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/Tuple.hpp ==
												// ==
												/* Copyright 2025 Tapish Narwal, René Widera
												 * SPDX-License-Identifier: MPL-2.0
												 */

												// #pragma once
												// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
												// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

												// #include <tuple>    // amalgamate: file already included
												#include <type_traits>
												// #include <utility>    // amalgamate: file already included

												namespace alpaka
												{
												    // forward declaration, the definition follows the implementation details
												    template<typename... T_Args>
												    struct Tuple;

												    namespace detail
												    {
												        /** Storage of a single tuple element.
												         *
												         * @tparam I position of the element; makes the leaf types of a tuple distinct even if two
												         * elements have the same type
												         * @tparam T type of the stored element
												         */
												        template<std::size_t I, typename T>
												        struct TupleLeaf
												        {
												            using type = T;
												            T value;
												        };

												        template<typename IndexSequence, typename... T_Args>
												        struct TupleImpl;

												        /** Tuple storage: inherits from one TupleLeaf per element, indexed by Is. */
												        template<std::size_t... Is, typename... T_Args>
												        struct TupleImpl<std::index_sequence<Is...>, T_Args...> : TupleLeaf<Is, T_Args>...
												        {
												            /** Initialize the i-th leaf from the i-th argument (perfectly forwarded). */
												            template<typename... T_CArgs>
												            constexpr TupleImpl(T_CArgs&&... us) noexcept((std::is_nothrow_constructible_v<T_Args, T_CArgs&&> && ...))
												                : TupleLeaf<Is, T_Args>{std::forward<T_CArgs>(us)}...
												            {
												            }

												            /** Default constructor, only available if all element types are default constructible. */
												            constexpr TupleImpl() requires(std::is_default_constructible_v<T_Args> && ...)
												            = default;
												        };
												    } // namespace detail

												    /** basic tuple implementation
												     *
												     * This class is trivially copyable if all members are trivially copyable too and can therefore be used
												     * as a collection to pass arguments into kernels. You should use @see alpaka::apply to apply an operation
												     * to the tuple.
												     */
												    template<typename... T_Args>
												    struct Tuple : detail::TupleImpl<std::make_index_sequence<sizeof...(T_Args)>, T_Args...>
												    {
												        //! std::tuple with the same element types, used to look up element types by index
												        using StdTuple = std::tuple<T_Args...>;
												        using Base = detail::TupleImpl<std::make_index_sequence<sizeof...(T_Args)>, T_Args...>;

												        /** Construct the tuple element-wise from the given arguments.
												         *
												         * Only participates if the number of arguments equals the number of elements (and is not
												         * zero), each element is constructible from its argument, and the first argument is not
												         * itself this Tuple type, which keeps this overload from competing with the copy/move
												         * constructors.
												         */
												        template<typename... T_CArgs>
												        requires(
												            sizeof...(T_Args) == sizeof...(T_CArgs) && sizeof...(T_Args) > 0
												            && (!std::is_same_v<std::remove_cvref_t<std::tuple_element_t<0, std::tuple<T_CArgs...>>>, Tuple>)
												            && (std::is_constructible_v<T_Args, T_CArgs &&> && ...))
												        constexpr Tuple(T_CArgs&&... us) noexcept((std::is_nothrow_constructible_v<T_Args, T_CArgs&&> && ...))
												            : Base(std::forward<T_CArgs>(us)...)
												        {
												        }

												        /** Default constructor, only available if all element types are default constructible. */
												        constexpr Tuple() requires(std::is_default_constructible_v<T_Args> && ...)
												        = default;

												        /** get element by index
												         *
												         * The element is accessed by casting this object to the TupleLeaf base with the index I.
												         *
												         * @tparam I index which should not be larger than the number of elements -1
												         * @{
												         */
												        template<size_t I>
												        constexpr auto const& get() const
												        {
												            static_assert(I < sizeof...(T_Args), "Index is outside of the allowed range.");
												            return static_cast<detail::TupleLeaf<I, std::tuple_element_t<I, StdTuple>> const&>(*this).value;
												        }

												        template<size_t I>
												        constexpr auto& get()
												        {
												            static_assert(I < sizeof...(T_Args), "Index is outside of the allowed range.");
												            return static_cast<detail::TupleLeaf<I, std::tuple_element_t<I, StdTuple>>&>(*this).value;
												        }

												        /** @} */
												    };

												    /** Deduction guide: element types are deduced from forwarding references.
												     *
												     * An lvalue argument of type X therefore deduces the element type X& (the tuple stores a reference),
												     * an rvalue argument deduces X (the tuple stores a value).
												     * NOTE(review): unlike std::tuple the arguments are not decayed -- confirm this is intended.
												     */
												    template<typename... T_Args>
												    Tuple(T_Args&&...) -> Tuple<T_Args...>;

												    /** Free function access to a tuple element, forwards to the member function Tuple::get<T_idx>().
												     *
												     * @tparam T_idx index of the element
												     * @param t any specialization of alpaka::Tuple
												     * @return whatever the selected member get() returns (a reference to the stored element)
												     */
												    template<size_t T_idx>
												    constexpr decltype(auto) get(concepts::SpecializationOf<Tuple> auto&& t) noexcept
												    {
												        return ALPAKA_FORWARD(t).template get<T_idx>();
												    }

												    /** Create a Tuple from the given arguments.
												     *
												     * The element types are chosen by class template argument deduction of Tuple from the forwarded
												     * arguments.
												     */
												    constexpr auto makeTuple(auto&&... args)
												    {
												        return Tuple{ALPAKA_FORWARD(args)...};
												    }
												} // namespace alpaka

												namespace std
												{
												    // Specialization of tuple_size for our custom Tuple
												    template<typename... T_Args>
												    struct tuple_size<alpaka::Tuple<T_Args...>> : std::integral_constant<std::size_t, sizeof...(T_Args)>
												    {
												    };

												    // Specialization of tuple_element for our custom Tuple: the element type is looked up in the
												    // std::tuple with the same element types (Tuple::StdTuple)
												    template<std::size_t I, typename... T_Args>
												    struct tuple_element<I, alpaka::Tuple<T_Args...>>
												    {
												        using type = typename std::tuple_element_t<I, typename alpaka::Tuple<T_Args...>::StdTuple>;
												    };
												} // namespace std
												// ==
												// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/Tuple.hpp ==
												// ============================================================================

											// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
											// #include "alpaka/core/util.hpp"    // amalgamate: file already inlined
											// #include "alpaka/unused.hpp"    // amalgamate: file already inlined
											// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

											// #include <cstdio>    // amalgamate: file already included
											// #include <tuple>    // amalgamate: file already included
											// #include <utility>    // amalgamate: file already included

											namespace alpaka
											{
											    namespace internal
											    {
											        /** Find the position of the first tuple element whose nested `KeyType` is X.
											         *
											         * `value` is the zero-based index of the first matching element, or -1 if no element matches
											         * (or the tuple is empty). The primary template is selected only if T_Tuple is not a
											         * specialization of a variadic type template; instantiating it fails the static_assert.
											         *
											         * Based on https://stackoverflow.com/a/64606884
											         *
											         * @tparam X key type to search for
											         * @tparam T_Tuple tuple-like template specialization; each element type must provide `KeyType`
											         */
											        template<typename X, typename T_Tuple>
											        struct KeyIdx
											        {
											            static_assert(sizeof(T_Tuple) && false);
											        };

											        template<typename X, template<typename...> typename T_Tuple, typename... T>
											        struct KeyIdx<X, T_Tuple<T...>>
											        {
											            template<std::size_t... idx>
											            static constexpr ssize_t find_idx(std::index_sequence<idx...>)
											            {
											                ssize_t found_idx = -1;
											                // The fold over || stops at the first element whose key matches; the assignment inside
											                // the matching operand records its index.
											                // notUsed is required to avoid warning that the expression is not used
											                [[maybe_unused]] bool notUsed
											                    = ((std::is_same_v<X, typename T::KeyType>
											                        && (found_idx = static_cast<ssize_t>(idx), true))
											                       || ...);
											                return found_idx;
											            }

											            static constexpr ssize_t value = find_idx(std::index_sequence_for<T...>{});
											        };

											        /** Empty tuple: there is nothing to find, value is always -1.
											         *
											         * Declared with the same class-key as the primary template to avoid mismatched-tag warnings;
											         * the access of the members is unchanged (find_idx private, value public).
											         */
											        template<typename X, template<typename...> typename T_Tuple>
											        struct KeyIdx<X, T_Tuple<>>
											        {
											        private:
											            static constexpr ssize_t find_idx(std::index_sequence<>)
											            {
											                return -1;
											            }

											        public:
											            static constexpr ssize_t value = find_idx(std::index_sequence_for<>{});
											        };
											    } // namespace internal

											    /** Position of the entry with the key type T_Key inside a tuple-like object.
											     *
											     * Only the types of the arguments are evaluated, their values are ignored.
											     *
											     * @return zero-based index of the first entry whose KeyType is T_Key, -1 if there is none
											     */
											    template<typename T_Key, typename T_Tuple>
											    inline consteval ssize_t getIdx(T_Tuple&&, T_Key const& = T_Key{})
											    {
											        using TupleType = std::decay_t<T_Tuple>;
											        return internal::KeyIdx<T_Key, TupleType>::value;
											    }

											    /** Check whether a tuple-like object has an entry with the key type T_Key.
											     *
											     * Only the types of the arguments are evaluated, their values are ignored.
											     *
											     * @return true if an entry with KeyType equal to T_Key exists, else false
											     */
											    template<typename T_Key, typename T_Tuple>
											    consteval bool hasTag(T_Tuple&&, T_Key const& = T_Key{})
											    {
											        using TupleType = std::decay_t<T_Tuple>;
											        return internal::KeyIdx<T_Key, TupleType>::value != -1;
											    }

											    /** Access the value stored under the key type T_Key.
											     *
											     * Compilation fails with "Member in dict missing!" if no entry with that key type exists.
											     *
											     * @param t tuple-like object of entries which provide `KeyType` and a member `value`
											     * @return result of unWrapp() applied to the `value` member of the matching entry
											     * (unWrapp is defined outside of this section; presumably it unwraps reference wrappers -- TODO confirm)
											     */
											    template<typename T_Key, typename T_Tuple>
											    inline constexpr decltype(auto) getTag(T_Tuple&& t, T_Key const& = T_Key{})
											    {
											        constexpr auto idx = internal::KeyIdx<T_Key, std::decay_t<T_Tuple>>::value;
											        static_assert(idx != -1, "Member in dict missing!");
											        static_assert(idx < std::tuple_size_v<std::decay_t<T_Tuple>>, "index out of range!");
											        return unWrapp(get<idx>(std::forward<T_Tuple>(t)).value);
											    }

											    /** Single key/value pair of a Dict.
											     *
											     * The key exists only as the type KeyType, no key object is stored. The payload is kept in the public
											     * member `value`.
											     *
											     * @tparam T_Key type used to identify the entry
											     * @tparam T_Value type of the stored payload
											     */
											    template<typename T_Key, typename T_Value>
											    struct DictEntry
											    {
											        using KeyType = T_Key;
											        using ValueType = T_Value;

											        //! payload of the entry
											        T_Value value;

											        constexpr DictEntry() = default;

											        /** Create an entry holding a copy of the given value.
											         *
											         * The key argument is ignored, it is only used to select/deduce T_Key.
											         */
											        constexpr DictEntry(T_Key const, T_Value const& initialValue) : value{initialValue}
											        {
											        }
											    };

											    namespace trait
											    {
											        /** Conversion trait with an identity primary template.
											         *
											         * get() hands the argument back unchanged. Because the return type is plain `auto` the caller always
											         * receives a decayed copy, never a reference.
											         *
											         * @tparam T_Object type the trait is selected for
											         * @tparam T_Sfinae hook for constrained specializations
											         */
											        template<typename T_Object, typename T_Sfinae = void>
											        struct ToDictEntry
											        {
											            template<typename T>
											            static constexpr auto get(T&& data)
											            {
											                // equivalent to perfect forwarding of `data`
											                return static_cast<T&&>(data);
											            }
											        };
											    } // namespace trait

											    /** Primary template, not usable on its own.
											     *
											     * The static_assert condition is always false, so instantiating this template with arguments that are not
											     * matched by the DictEntry partial specialization is a compile error. The `sizeof...` term makes the
											     * condition depend on the template parameters (presumably so that the assertion only fires on
											     * instantiation).
											     */
											    template<typename... T_DictEntry>
											    struct Dict
											    {
											        static_assert(sizeof...(T_DictEntry) && false);
											    };

											    /** Dictionary addressed by key types, stored as a Tuple of DictEntry objects.
											     *
											     * Entries are accessed with operator[] by passing an instance of the key type, or positionally with
											     * get<idx>().
											     *
											     * @tparam T_Keys key types of the entries
											     * @tparam T_Values value types of the entries, in the same order as T_Keys
											     */
											    template<typename... T_Keys, typename... T_Values>
											    struct Dict<DictEntry<T_Keys, T_Values>...> : Tuple<DictEntry<T_Keys, T_Values>...>
											    {
											        using TupleType = Tuple<DictEntry<T_Keys, T_Values>...>;

											        //! Create a dictionary from a tuple which already holds all entries.
											        constexpr Dict(Tuple<DictEntry<T_Keys, T_Values>...> const& data) : Tuple<DictEntry<T_Keys, T_Values>...>{data}
											        {
											        }

											        //! Create a dictionary from a list of entries.
											        constexpr Dict(DictEntry<T_Keys, T_Values> const&... dictEntries)
											            : Tuple<DictEntry<T_Keys, T_Values>...>{dictEntries...}
											        {
											        }

											        constexpr Dict(Dict const&) = default;
											        constexpr Dict(Dict&&) = default;

											        /** Create a dictionary where every entry is default constructed.
											         *
											         * Only available if all value types are default initializable. The constraint must be a fold over
											         * `&&`: a fold over the comma operator evaluates to its last operand only, so all but the last value
											         * type would stay unchecked, and it is not a boolean constraint for an empty dictionary.
											         */
											        static constexpr auto makeDict() requires(std::default_initializable<T_Values> && ...)
											        {
											            return Dict{alpaka::makeTuple(DictEntry<T_Keys, T_Values>{}...)};
											        }

											        /** Access the value stored under the key type of `tag`.
											         *
											         * A key which is not part of the dictionary is a compile error (see getTag()).
											         */
											        ALPAKA_NO_HOST_ACC_WARNING
											        constexpr decltype(auto) operator[](auto const tag) const
											        {
											            return getTag(*this, tag);
											        }

											        //! Non-const overload of the key based access.
											        ALPAKA_NO_HOST_ACC_WARNING
											        constexpr decltype(auto) operator[](auto const tag)
											        {
											            return getTag(*this, tag);
											        }
											    };

											    /** Positional access to the entry T_idx of a mutable dictionary.
											     *
											     * Forwards to the member function template `get<T_idx>()` of the passed object.
											     */
											    template<size_t T_idx>
											    constexpr decltype(auto) get(concepts::SpecializationOf<Dict> auto& t) noexcept
											    {
											        return t.template get<T_idx>();
											    }

											    /** Positional access to the entry T_idx of a constant dictionary.
											     *
											     * Forwards to the member function template `get<T_idx>()` of the passed object.
											     */
											    template<size_t T_idx>
											    constexpr decltype(auto) get(concepts::SpecializationOf<Dict> auto const& t) noexcept
											    {
											        return t.template get<T_idx>();
											    }

											    // Class template argument deduction guides for Dict.
											    // Deduce the key and value types from a tuple of entries.
											    template<typename... T_Keys, typename... T_Values>
											    ALPAKA_FN_HOST_ACC Dict(Tuple<DictEntry<T_Keys, T_Values>...> const&) -> Dict<DictEntry<T_Keys, T_Values>...>;

											    // Deduce the key and value types from a list of entries.
											    template<typename... T_Keys, typename... T_Values>
											    ALPAKA_FN_HOST_ACC Dict(DictEntry<T_Keys, T_Values> const&...) -> Dict<DictEntry<T_Keys, T_Values>...>;

											} // namespace alpaka

											namespace std
											{
											    /** Number of entries of an alpaka::Dict, required for the tuple protocol (e.g. structured bindings).
											     *
											     * The standard requires specializations of tuple_size to have the base characteristic
											     * integral_constant<size_t, N>, therefore the value is provided by inheritance instead of a hand
											     * written `value` member.
											     */
											    template<typename... T_Keys, typename... T_Values>
											    struct tuple_size<alpaka::Dict<alpaka::DictEntry<T_Keys, T_Values>...>>
											        : std::integral_constant<std::size_t, sizeof...(T_Keys)>
											    {
											    };

											    /** Type of the entry I of an alpaka::Dict.
											     *
											     * The type is whatever alpaka::get<I>() returns for an rvalue of the underlying alpaka::Tuple.
											     * NOTE(review): depending on the return type of that get<I>() overload this can be a reference type -
											     * confirm this is intended.
											     */
											    template<std::size_t I, typename... T_Keys, typename... T_Values>
											    struct tuple_element<I, alpaka::Dict<alpaka::DictEntry<T_Keys, T_Values>...>>
											    {
											        using type = decltype(alpaka::get<I>(std::declval<alpaka::Tuple<alpaka::DictEntry<T_Keys, T_Values>...>>()));
											    };
											} // namespace std

											namespace alpaka
											{

											    /** Create a new Dict from the selected entries of two dictionaries.
											     *
											     * The entries of dict0 selected by idx0 come first, followed by the entries of dict1 selected by idx1.
											     * Both dictionaries are taken by const reference: the entries are copied exactly once into the result
											     * instead of copying each dictionary as a whole beforehand.
											     *
											     * @param dict0 dictionary providing the leading entries
											     * @param dict1 dictionary providing the trailing entries
											     * @return Dict with the entry types deduced from the selected entries
											     */
											    template<std::size_t... idx0, std::size_t... idx1, typename T_Dict0, typename T_Dict1>
											    constexpr auto joinDictHelper(
											        std::index_sequence<idx0...>,
											        std::index_sequence<idx1...>,
											        T_Dict0 const& dict0,
											        T_Dict1 const& dict1)
											    {
											        return Dict{get<idx0>(dict0)..., get<idx1>(dict1)...};
											    }

											    /** Concatenate two dictionaries.
											     *
											     * @param dict0 dictionary whose entries come first in the result
											     * @param dict1 dictionary whose entries are appended
											     * @return new Dict holding all entries of dict0 followed by all entries of dict1
											     */
											    template<typename... T_Entries0, typename... T_Entries1>
											    constexpr auto joinDict(Dict<T_Entries0...> const& dict0, Dict<T_Entries1...> const& dict1)
											    {
											        // one index per entry, so every entry of both dictionaries is selected
											        return joinDictHelper(
											            std::index_sequence_for<T_Entries0...>{},
											            std::index_sequence_for<T_Entries1...>{},
											            dict0,
											            dict1);
											    }

											    /** Append dict1 to dict0 if the compile-time condition is true.
											     *
											     * This overload is selected for `condition == true` and produces the same result as joinDict().
											     *
											     * @return new Dict holding all entries of dict0 followed by all entries of dict1
											     */
											    template<bool condition, typename... T_Entries0, typename... T_Entries1>
											    requires(condition == true)
											    constexpr auto conditionalAppendDict(Dict<T_Entries0...> const& dict0, Dict<T_Entries1...> const& dict1)
											    {
											        return joinDict(dict0, dict1);
											    }

											    /** Overload selected for `condition == false`: nothing is appended.
											     *
											     * @param dict1 ignored
											     * @return copy of dict0
											     */
											    template<bool condition, typename... T_Entries0, typename... T_Entries1>
											    requires(condition == false)
											    constexpr auto conditionalAppendDict(Dict<T_Entries0...> const& dict0, Dict<T_Entries1...> const& dict1)
											    {
											        alpaka::unused(dict1);
											        return dict0;
											    }
											} // namespace alpaka
											// ==
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/Dict.hpp ==
											// ============================================================================

											// ============================================================================
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/RemoveRestrict.hpp ==
											// ==
											/* Copyright 2021 Rene Widera
											 * SPDX-License-Identifier: MPL-2.0
											 */

											// #pragma once
											// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

											namespace alpaka
											{
											    //! Removes __restrict__ from a type
											    //!
											    //! Primary template: types which are not restrict qualified pointers are passed through unchanged.
											    template<typename T>
											    struct remove_restrict
											    {
											        using type = T;
											    };

											    // The qualifier is spelled `__restrict` when ALPAKA_COMP_MSVC is set and `__restrict__` otherwise.
											    // In both cases a restrict qualified pointer `T*` is mapped to the plain pointer type `T*`.
											#if ALPAKA_COMP_MSVC
											    template<typename T>
											    struct remove_restrict<T* __restrict>
											    {
											        using type = T*;
											    };
											#else
											    template<typename T>
											    struct remove_restrict<T* __restrict__>
											    {
											        using type = T*;
											    };
											#endif

											    //! Helper to remove __restrict__ from a type
											    template<typename T>
											    using remove_restrict_t = typename remove_restrict<T>::type;
											} // namespace alpaka
											// ==
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/RemoveRestrict.hpp ==
											// ============================================================================

										// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
										// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
										// #include "alpaka/trait.hpp"    // amalgamate: file already inlined
										// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

										// #include <tuple>    // amalgamate: file already included
										#include <type_traits>

										namespace alpaka
										{
										    namespace onHost
										    {
										        /** Provides an instance of an object which can be used within the compute kernel
										         *
										         * The functor Op is selected by the type of the object. The primary template defined here is
										         * the identity.
										         */
										        struct MakeAccessibleOnAcc
										        {
										            template<typename T_Any>
										            struct Op
										            {
										                /** Identity for constant objects
										                 *
										                 * @attention returns a reference to the original data
										                 * @return the passed object itself
										                 */
										                auto const& operator()(auto const& any) const
										                {
										                    return any;
										                }

										                /** Identity for mutable objects
										                 *
										                 * @attention returns a reference to the original data
										                 * @return the passed object itself
										                 */
										                auto& operator()(auto& any) const
										                {
										                    return any;
										                }
										            };
										        };

										        /** Provides an instance of an object which can be used within the compute kernel
										         *
										         * Dispatches to MakeAccessibleOnAcc::Op selected by ALPAKA_TYPEOF(any).
										         *
										         * @attention the identity implementation returns a reference to the argument, the result must not
										         *            outlive the passed object
										         * @return compute kernel compatible object if MakeAccessibleOnAcc is specialized else the identity
										         */
										        inline decltype(auto) makeAccessibleOnAcc(auto&& any)
										        {
										            return MakeAccessibleOnAcc::Op<ALPAKA_TYPEOF(any)>{}(ALPAKA_FORWARD(any));
										        }
										    } // namespace onHost

										    //! \brief The class used to bind kernel function object and arguments together. Once an instance of this class
										    //! is created, arguments are not needed to be separately given to functions who need kernel function and
										    //! arguments.
										    //! \tparam TKernelFn The kernel function object type.
										    //! \tparam TArgs Kernel function object
										    //! invocation argument types as a parameter pack.
										    template<typename TKernelFn, typename... TArgs>
										    class KernelBundle
										    {
										    public:
										        //! The function object type
										        using KernelFn = std::decay_t<TKernelFn>;
										        //! Tuple type to encapsulate kernel function argument types and argument values
										        //!
										        //! Each argument is stored as the type returned by onHost::makeAccessibleOnAcc() with a restrict
										        //! qualifier removed. Without arguments an empty std::tuple is used.
										        using ArgTuple = std::conditional_t<
										            sizeof...(TArgs) == 0,
										            std::tuple<>,
										            alpaka::Tuple<remove_restrict_t<ALPAKA_TYPEOF(onHost::makeAccessibleOnAcc(std::declval<TArgs>()))>...>>;

										        //! Create a bundle for a kernel which is called without arguments.
										        //! NOTE(review): m_args is initialized from an empty std::tuple, this presumably only compiles if
										        //! TArgs is empty.
										        constexpr KernelBundle(KernelFn const& kernelFn) : m_kernelFn{kernelFn}, m_args(std::tuple<>{})
										        {
										            static_assert(
										                alpaka::concepts::KernelFn<KernelFn>,
										                "Kernel functor must be trivially copyable or specialize trait::IsKernelTriviallyCopyable<>!");
										        }

										        //! Create a bundle from a kernel and its arguments.
										        //! Each argument is passed through onHost::makeAccessibleOnAcc() before it is stored.
										        constexpr KernelBundle(KernelFn const& kernelFn, auto&&... args)
										            : m_kernelFn{kernelFn}
										            , m_args(onHost::makeAccessibleOnAcc(ALPAKA_FORWARD(args))...)
										        {
										            static_assert(
										                alpaka::concepts::KernelFn<KernelFn>,
										                "Kernel functor must be trivially copyable or specialize trait::IsKernelTriviallyCopyable<>!");
										            static_assert(
										                (alpaka::concepts::KernelArg<
										                     remove_restrict_t<ALPAKA_TYPEOF(onHost::makeAccessibleOnAcc(std::declval<TArgs>()))>>
										                 && ...),
										                "All kernel arguments must be trivially copyable or specialize "
										                "trait::IsKernelArgumentTriviallyCopyable<>!");
										        }

										        constexpr KernelBundle(KernelBundle const& b) = default;
										        constexpr KernelBundle& operator=(KernelBundle const&) = default;

										        /** allow move assignment and construction
										         *
										         *  @attention if the functor or the arguments contain non movable types the move operators can be
										         * inaccessible.
										         *
										         *  @{
										         */
										        constexpr KernelBundle(KernelBundle&& b) = default;
										        constexpr KernelBundle& operator=(KernelBundle&&) = default;

										        /** @} */

										        /** Invoke the kernel with the stored arguments.
										         *
										         * Only available if the kernel is callable with TAcc followed by the stored argument types. The
										         * static assertions additionally enforce that the call operator of the kernel is const and returns
										         * void.
										         *
										         * @param acc accelerator object passed as first argument to the kernel
										         */
										        template<typename TAcc>
										        requires(
										            alpaka::concepts::KernelFn<KernelFn>
										            && std::is_invocable_v<
										                std::remove_const_t<KernelFn>,
										                TAcc,
										                remove_restrict_t<ALPAKA_TYPEOF(onHost::makeAccessibleOnAcc(std::declval<TArgs>()))>...>)
										        constexpr auto operator()(TAcc const& acc) const
										        {
										            static_assert(
										                std::is_invocable_v<
										                    std::add_const_t<KernelFn>,
										                    TAcc,
										                    remove_restrict_t<ALPAKA_TYPEOF(onHost::makeAccessibleOnAcc(std::declval<TArgs>()))>...>,
										                "the operator() function of a kernel must be marked const");
										            static_assert(
										                std::same_as<
										                    void,
										                    std::invoke_result_t<
										                        std::add_const_t<KernelFn>,
										                        TAcc,
										                        remove_restrict_t<ALPAKA_TYPEOF(onHost::makeAccessibleOnAcc(std::declval<TArgs>()))>...>>,
										                "the return type of the operator() function of a kernel must be void");
										            alpaka::apply(
										                /* It is required to take the arguments as const reference.
										                 * The reason is that these arguments are shared between threads in a block. If the user likes to
										                 * mutate these, a non const copy should be used in the kernel function signature. This is the
										                 * reason why we can not keep const correctness for buffers and views within the copy-constructor
										                 * of these.
										                 */
										                [&](alpaka::concepts::KernelArg auto const&... args) constexpr { m_kernelFn(acc, args...); },
										                m_args);
										        }

										        //! Copy of the kernel function object
										        KernelFn m_kernelFn;
										        // Store the argument types without const and reference
										        ArgTuple m_args;
										    };

										    //! \brief User defined deduction guide with trailing return type. For CTAD during the construction.
										    //! \tparam TKernelFn The kernel function object type.
										    //! \tparam TArgs Kernel function object argument types as a parameter pack.
										    //! \param kernelFn The kernel object
										    //! \param args The kernel invocation arguments.

										    //! \return Kernel function bundle. An instance of KernelBundle which consists of the kernel function object
										    //! and its arguments.
										    template<typename TKernelFn, typename... TArgs>
										    ALPAKA_FN_HOST KernelBundle(TKernelFn const&, TArgs&&...) -> KernelBundle<TKernelFn, TArgs...>;

										    namespace trait
										    {
										        //! Type trait which is true if T is a specialization of the class template KernelBundle.
										        template<typename T>
										        struct IsKernelBundle : std::bool_constant<isSpecializationOf_v<T, KernelBundle>>
										        {
										        };
										    } // namespace trait

										    //! Shortcut for trait::IsKernelBundle<T>::value
										    template<typename T>
										    constexpr bool isKernelBundle_v = trait::IsKernelBundle<T>::value;

										} // namespace alpaka

										namespace alpaka::concepts
										{
										    /** Concept to check if a type is a KernelBundle
										     *
										     * The check is based on isKernelBundle_v and is applied to T exactly as given.
										     *
										     * @tparam T Type to check
										     */
										    template<typename T>
										    concept KernelBundle = isKernelBundle_v<T>;
										} // namespace alpaka::concepts
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/KernelBundle.hpp ==
										// ============================================================================

									// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
									// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
									// #include "alpaka/mem/Alignment.hpp"    // amalgamate: file already inlined
										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/Handle.hpp ==
										// ==
										/* Copyright 2024 René Widera
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										#include <memory>
										#include <mutex>
										#include <type_traits>

										namespace alpaka::onHost
										{
										    /** Get a shared instance of T_Object, creating it on demand.
										     *
										     * The instance is only tracked by a weak pointer: it lives as long as at least one of the returned
										     * shared pointers is alive. If no instance is alive, a new one is created from `args`, otherwise `args`
										     * are ignored. Lookup and creation are serialized by a mutex.
										     *
										     * The function local statics exist once per template instantiation, i.e. per combination of T_Object
										     * and T_Args.
										     *
										     * @param args constructor arguments, only used if a new instance is created
										     * @return shared pointer to the live instance
										     */
										    template<typename T_Object, typename... T_Args>
										    inline auto make_sharedSingleton(T_Args&&... args)
										    {
										        static std::mutex guard;
										        static std::weak_ptr<T_Object> cached;

										        std::lock_guard<std::mutex> lock(guard);
										        std::shared_ptr<T_Object> instance = cached.lock();
										        if(!instance)
										        {
										            // no live instance: create one and remember it without taking ownership
										            instance = std::make_shared<T_Object>(std::forward<T_Args>(args)...);
										            cached = instance;
										        }
										        return instance;
										    }

										    //! Handle to an object, implemented as a std::shared_ptr.
										    template<typename T>
										    using Handle = std::shared_ptr<T>;
										} // namespace alpaka::onHost
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/Handle.hpp ==
										// ============================================================================

										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/demangledName.hpp ==
										// ==
										/* Copyright 2024 René Widera
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

										#include <source_location>
										// #include <string>    // amalgamate: file already included
										#include <string_view>

										/** Reference type with a known name, used to locate the type name inside a function signature string.
										 *
										 * This type is required to be in the global namespace to avoid invalid offsets during demangling.
										 */
										struct AlpakaDemangleReferenceType
										{
										};

										namespace alpaka::onHost
										{
										    /// \file
										    /// use source_location to derive the demangled type name
										    /// based on:
										    /// https://www.reddit.com/r/cpp/comments/lfi6jt/finally_a_possibly_portable_way_to_convert_types/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button

										    /** Get the signature of this function template instantiation.
										     *
										     * @tparam T type which should show up in the signature
										     * @return function name as reported by std::source_location; the exact format is compiler specific
										     */
										    template<typename T>
										    constexpr auto EmbedTypeIntoSignature()
										    {
										        return std::string_view{std::source_location::current().function_name()};
										    }

										    /** Extract the readable name of T from a function signature.
										     *
										     * The signature generated for AlpakaDemangleReferenceType is used to measure how many characters
										     * precede and follow the type name. The same offsets are then applied to the signature generated
										     * for T.
										     */
										    template<typename T>
										    struct Demangled
										    {
										        //! @return view on the part of the signature which holds the name of T
										        static constexpr auto name()
										        {
										            constexpr std::string_view referenceName = "AlpakaDemangleReferenceType";
										            auto const referenceSignature = EmbedTypeIntoSignature<AlpakaDemangleReferenceType>();
										            // number of characters in front of the type name
										            auto const prefixLength = referenceSignature.find(referenceName);
										            // number of characters behind the type name
										            auto const suffixLength = referenceSignature.size() - prefixLength - referenceName.size();
										            auto const signature = EmbedTypeIntoSignature<T>();
										            // everything between prefix and suffix is the name of T
										            return signature.substr(prefixLength, signature.size() - prefixLength - suffixLength);
										        }
										    };

										    /** Readable name of the type T
										     *
										     * @return copy of Demangled<T>::name() as std::string
										     */
										    template<typename T>
										    constexpr auto demangledName()
										    {
										        return std::string(Demangled<T>::name());
										    }

										    /** Readable name of the type of the passed object
										     *
										     * The object is only used to deduce T, its value is never read.
										     *
										     * @return same result as demangledName<T>()
										     */
										    template<typename T>
										    constexpr auto demangledName(T const&)
										    {
										        return demangledName<T>();
										    }

										    /** Simplify the C++ signature of a function
										     *
										     *  Template parameters will be left out and the alpaka namespace will be removed:
										     *    - each outermost template argument list is replaced by `<...>`
										     *    - the qualifier `alpaka::` is removed, but only if it is a complete namespace name. An identifier
										     *      which merely ends with `alpaka` (e.g. `myalpaka::`) is kept untouched.
										     *
										     *  @attention every `<` is treated as the start of a template argument list, therefore signatures which
										     *  contain e.g. `operator<` may be truncated.
										     *
										     *  @param deName function signature, typically a demangled name
										     *  @return simplified signature
										     */
										    inline std::string simplifyFunctionSignature(std::string const& deName)
										    {
										        // Pass 1: collapse template argument lists.
										        std::string collapsed;
										        collapsed.reserve(deName.size());

										        int templateDepth = 0;
										        for(char const c : deName)
										        {
										            if(c == '<')
										            {
										                // only the outermost list leaves a placeholder behind
										                if(templateDepth++ == 0)
										                    collapsed += "<...>";
										                continue;
										            }
										            if(c == '>' && templateDepth > 0)
										            {
										                --templateDepth;
										                continue;
										            }
										            // an unbalanced '>' (depth zero) is kept like any other character
										            if(templateDepth > 0)
										                continue;
										            collapsed += c;
										        }

										        // Pass 2: remove "alpaka::" from the signatures.
										        auto const isIdentifierChar = [](char const ch)
										        { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9') || ch == '_'; };

										        constexpr std::string_view alpakaNamespace = "alpaka::";
										        std::string result;
										        result.reserve(collapsed.size());
										        for(size_t i = 0; i < collapsed.size();)
										        {
										            // a match which continues a longer identifier is not the alpaka namespace
										            bool const atIdentifierStart = i == 0 || !isIdentifierChar(collapsed[i - 1]);
										            if(atIdentifierStart && collapsed.compare(i, alpakaNamespace.size(), alpakaNamespace) == 0)
										            {
										                i += alpakaNamespace.size();
										                continue;
										            }
										            result += collapsed[i++];
										        }
										        return result;
										    }

										} // namespace alpaka::onHost
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/demangledName.hpp ==
										// ============================================================================


									namespace alpaka
									{
									    /** alpaka internal implementations.
									     *
									     * @attention do not use any functions from this namespace in your user applications.
									     *          The interface can change at any time without further notice and is for internal use only.
									     */
									    namespace internal
									    {
									        /** Get a name which only depends on the type of an object. */
									        struct GetStaticName
									        {
									            template<typename T_Any>
									            struct Op
									            {
									                /** @return T_Any::getName() if the type provides this static member function, else the
									                 * demangled type name of the object
									                 */
									                auto operator()([[maybe_unused]] T_Any const& any) const
									                {
									                    if constexpr(requires { T_Any::getName(); })
									                        return T_Any::getName();
									                    else
									                        return onHost::demangledName(any);
									                }
									            };
									        };

									        /** Get the name of an object instance. */
									        struct GetName
									        {
									            template<typename T_Any>
									            struct Op
									            {
									                //! @return result of the member function getName() of the object
									                auto operator()(T_Any const& any) const
									                {
									                    return any.getName();
									                }
									            };
									        };

									        /** Get the api of an object. */
									        struct GetApi
									        {
									            template<typename T_Any>
									            struct Op
									            {
									                //! @return result of the member function getApi() of the object
									                inline constexpr auto operator()(auto&& any) const
									                {
									                    return any.getApi();
									                }
									            };
									        };

									        /** Get the api of an object.
									         *
									         * Dispatches to GetApi::Op selected by the decayed type of the object.
									         */
									        inline constexpr auto getApi(auto&& any)
									        {
									            return GetApi::Op<std::decay_t<decltype(any)>>{}(any);
									        }

									        /** Get the api of the object a handle points to.
									         *
									         * The handle is dereferenced and GetApi::Op is selected by the type of the pointee.
									         *
									         * NOTE(review): the parameter is an rvalue reference, not a forwarding reference, so this overload
									         * is only selected for temporary/moved handles - confirm that lvalue handles are handled elsewhere.
									         */
									        template<typename T_Any>
									        inline constexpr auto getApi(onHost::Handle<T_Any>&& anyHandle)
									        {
									            return GetApi::Op<ALPAKA_TYPEOF(*anyHandle.get())>{}(*anyHandle.get());
									        }

									        /** Get the device kind of an object. */
									        struct GetDeviceType
									        {
									            template<typename T_Any>
									            struct Op
									            {
									                //! @return result of the member function getDeviceKind() of the object
									                inline constexpr auto operator()(auto&& any) const
									                {
									                    return any.getDeviceKind();
									                }
									            };
									        };

									        /** Get the device kind of an object.
									         *
									         * Dispatches to GetDeviceType::Op selected by the decayed type of the object.
									         */
									        inline constexpr auto getDeviceKind(auto&& any)
									        {
									            return GetDeviceType::Op<std::decay_t<decltype(any)>>{}(any);
									        }

									        /** Get the alignment of an object. */
									        struct GetAlignment
									        {
									            template<typename T_Any>
									            struct Op
									            {
									                //! Selected if the object provides the member function getAlignment().
									                //! @return result of any.getAlignment()
									                constexpr auto operator()(auto&& any) const requires requires { any.getAlignment(); }
									                {
									                    return any.getAlignment();
									                }

									                //! Fallback for objects without the member function getAlignment().
									                //! @return default constructed Alignment<>
									                constexpr auto operator()(auto&& any) const
									                {
									                    alpaka::unused(any);
									                    return Alignment<>{};
									                }
									            };
									        };

									        /** Get the alignment of an object.
									         *
									         * Dispatches to GetAlignment::Op selected by the decayed type of the object.
									         */
									        constexpr auto getAlignment(auto&& any)
									        {
									            return GetAlignment::Op<std::decay_t<decltype(any)>>{}(any);
									        }

									        /** Load data from a data source as SIMD vector
									         *
									         * A data source is not required to have physically stored data, it can also be a generator, therefore
									         * only the data source knows how to create a SIMD vector.
									         */
									        struct LoadAsSimd
									        {
									            template<typename T_AnyDataSource, alpaka::concepts::Alignment T_Alignment, alpaka::concepts::Vector T_Idx>
									            struct Op
									            {
									                /** Get data as SIMD vector
									                 *
									                 * Only declared here, no definition is visible at this point.
									                 *
									                 * @see loadAsSimd for more details.
									                 */
									                template<uint32_t T_simdWidth>
									                constexpr auto load(auto&& anyDataSource, T_Alignment dataAlignment, T_Idx const& index) const;
									            };
									        };

									        /** Get data as SIMD vector
									         *
									         * Load T_simdWidth contiguous data starting from index. The data is contiguous in the fast moving
									         * dimension of index.
									         *
									         * @tparam T_simdWidth number of elements in the SIMD vector
									         * @param anyDataSource data source to load data from
									         * @param dataAlignment Alignment of the data source resulting SIMD vector. This can be smaller or equal
									         * compared to the data source alignment due to possible offsets applied before.
									         * @param index Offset index relative to the first element of data source.
									         * @return SIMD vector with data loaded from the data source, aligned to dataAlignment
									         */
									        template<uint32_t T_simdWidth>
									        constexpr auto loadAsSimd(auto&& anyDataSource, auto dataAlignment, auto const& index)
									        {
									            // the implementation is selected by the types of data source, alignment and index
									            return LoadAsSimd::Op<ALPAKA_TYPEOF(anyDataSource), ALPAKA_TYPEOF(dataAlignment), ALPAKA_TYPEOF(index)>{}
									                .template load<T_simdWidth>(ALPAKA_FORWARD(anyDataSource), dataAlignment, index);
									        }
									    } // namespace internal
									} // namespace alpaka
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/internal/interface.hpp ==
									// ============================================================================


								// #include <concepts>    // amalgamate: file already included
								#include <type_traits>

								namespace alpaka::concepts
								{
								    /** Concept for types which provide a name that only depends on the type.
								     *
								     * Satisfied if internal::GetStaticName::Op, selected by the decayed type of T, can be called with an
								     * object of type T and the result is convertible to std::string.
								     */
								    template<typename T>
								    concept HasStaticName = requires(T t) {
								        { internal::GetStaticName::Op<std::decay_t<T>>{}(t) } -> std::convertible_to<std::string>;
								    };

								    /** Concept for types whose instances provide a name.
								     *
								     * Satisfied if internal::GetName::Op<T> can be called with an object of type T and the result is
								     * convertible to std::string. In contrast to HasStaticName the type T is not decayed.
								     */
								    template<typename T>
								    concept HasName = requires(T t) {
								        { internal::GetName::Op<T>{}(t) } -> std::convertible_to<std::string>;
								    };
								} // namespace alpaka::concepts
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/concepts/hasName.hpp ==
								// ============================================================================


							// #include <concepts>    // amalgamate: file already included

							namespace alpaka
							{
							    namespace detail
							    {
							        //! Empty tag base class, trait::IsApi detects APIs by inheritance from this type.
							        struct ApiBase
							        {
							        };
							    } // namespace detail

							    namespace trait
							    {
							        //! Type trait which is true if T_Type is derived from detail::ApiBase (or is ApiBase itself).
							        template<typename T_Type>
							        struct IsApi : std::is_base_of<detail::ApiBase, T_Type>
							        {
							        };
							    } // namespace trait

							    //! Shortcut for trait::IsApi<T_Type>::value
							    template<typename T_Type>
							    constexpr bool isApi_v = trait::IsApi<T_Type>::value;

							    namespace concepts
							    {
							        /** @brief Concept to check for APIs
							         *
							         * @details
							         * This concept requires that the template is an API. An API in alpaka is the representation of a software
							         * library that can target one or multiple accelerators. Examples of APIs are alpaka::api::Cuda and
							         * alpaka::api::Host. An Api together with an alpaka::concepts::DeviceKind can make up an
							         * alpaka::onHost::Device.
							         *
							         * A type satisfies the concept if isApi_v is true for it and it fulfills HasStaticName.
							         */
							        template<typename T>
							        concept Api = isApi_v<T> && requires(T t) { requires HasStaticName<T>; };
							    } // namespace concepts
							} // namespace alpaka
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/concepts/api.hpp ==
							// ============================================================================

							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/trait.hpp ==
							// ==
							/* Copyright 2024 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							// #include "alpaka/api/concepts/api.hpp"    // amalgamate: file already inlined
							// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/internal/math.hpp ==
								// ==
								/* Copyright 2023 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber,
								 * Jeffrey Kelling, Sergei Bastrakov, Andrea Bocci, René Widera
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once

								// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/internal/stlMath.hpp ==
									// ==
									/* Copyright 2023 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber,
									 * Jeffrey Kelling, Sergei Bastrakov, Andrea Bocci, René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									#include <cmath>
									#include <complex>

									namespace alpaka::math::internal
									{
									    /** Empty tag type naming the math implementation that is backed by the functions of namespace std. */
									    struct StlMath
									    {
									    };

									    /** Tag object of type StlMath, to be passed wherever a math implementation is expected. */
									    constexpr StlMath stlMath{};

									} // namespace alpaka::math::internal
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/internal/stlMath.hpp ==
									// ============================================================================


								// #include <cmath>    // amalgamate: file already included
								// #include <complex>    // amalgamate: file already included
								#include <type_traits>

								namespace alpaka::math::internal
								{

								/* Generates the functor tag `struct FUNC_NAME` with a nested `Op<T_MathImpl, T_Arg>` whose call operator
								 * forwards its argument to the unary function `OP_NAME`.
								 *
								 * For the StlMath implementation `std::OP_NAME` is made visible by a using-declaration before the call;
								 * every other implementation relies on the plain unqualified call.
								 *
								 * The parameter of the call operator must not carry the name of any generated function (one of them is
								 * `arg`), otherwise it would hide the function that should be called.
								 */
								#define ALPAKA_MATH_UNARY_FUNCTOR(FUNC_NAME, OP_NAME) \
								    struct FUNC_NAME \
								    { \
								        template<typename T_MathImpl, typename T_Arg> \
								        struct Op \
								        { \
								            constexpr auto operator()(T_MathImpl, T_Arg const& operand) const \
								            { \
								                if constexpr(!std::same_as<T_MathImpl, StlMath>) \
								                { \
								                    return OP_NAME(operand); \
								                } \
								                else \
								                { \
								                    /* only StlMath adds namespace std to the candidates of the unqualified call */ \
								                    using std::OP_NAME; \
								                    return OP_NAME(operand); \
								                } \
								            } \
								        }; \
								    }

								    // Functor tags for the unary math functions. Every line below expands to `struct <first argument>`
								    // whose nested `Op` calls the function named by the second argument.

								    // absolute value
								    ALPAKA_MATH_UNARY_FUNCTOR(Abs, abs);

								    // cosine with its inverse and hyperbolic variants
								    ALPAKA_MATH_UNARY_FUNCTOR(Cos, cos);
								    ALPAKA_MATH_UNARY_FUNCTOR(Acos, acos);
								    ALPAKA_MATH_UNARY_FUNCTOR(Acosh, acosh);
								    ALPAKA_MATH_UNARY_FUNCTOR(Cosh, cosh);

								    // sine with its inverse and hyperbolic variants
								    ALPAKA_MATH_UNARY_FUNCTOR(Sin, sin);
								    ALPAKA_MATH_UNARY_FUNCTOR(Asin, asin);
								    ALPAKA_MATH_UNARY_FUNCTOR(Asinh, asinh);
								    ALPAKA_MATH_UNARY_FUNCTOR(Sinh, sinh);

								    // tangent with its inverse and hyperbolic variants
								    ALPAKA_MATH_UNARY_FUNCTOR(Tan, tan);
								    ALPAKA_MATH_UNARY_FUNCTOR(Atan, atan);
								    ALPAKA_MATH_UNARY_FUNCTOR(Atanh, atanh);
								    ALPAKA_MATH_UNARY_FUNCTOR(Tanh, tanh);

								    // cube root
								    ALPAKA_MATH_UNARY_FUNCTOR(Cbrt, cbrt);

								    // rounding
								    ALPAKA_MATH_UNARY_FUNCTOR(Ceil, ceil);
								    ALPAKA_MATH_UNARY_FUNCTOR(Round, round);
								    ALPAKA_MATH_UNARY_FUNCTOR(Lround, lround);
								    ALPAKA_MATH_UNARY_FUNCTOR(Llround, llround);

								    ALPAKA_MATH_UNARY_FUNCTOR(Trunc, trunc);
								    ALPAKA_MATH_UNARY_FUNCTOR(Floor, floor);

								    // logarithms
								    ALPAKA_MATH_UNARY_FUNCTOR(Log, log);
								    ALPAKA_MATH_UNARY_FUNCTOR(Log2, log2);
								    ALPAKA_MATH_UNARY_FUNCTOR(Log10, log10);

								    // exponential, square root, argument (phase angle) and error function
								    ALPAKA_MATH_UNARY_FUNCTOR(Exp, exp);
								    ALPAKA_MATH_UNARY_FUNCTOR(Sqrt, sqrt);
								    ALPAKA_MATH_UNARY_FUNCTOR(Arg, arg);
								    ALPAKA_MATH_UNARY_FUNCTOR(Erf, erf);

								    // floating point classification
								    ALPAKA_MATH_UNARY_FUNCTOR(Isnan, isnan);
								    ALPAKA_MATH_UNARY_FUNCTOR(Isinf, isinf);
								    ALPAKA_MATH_UNARY_FUNCTOR(Isfinite, isfinite);

								    // complex conjugate
								    ALPAKA_MATH_UNARY_FUNCTOR(Conj, conj);

								    // the generator macro is only needed for the declarations above
								#undef ALPAKA_MATH_UNARY_FUNCTOR

								    namespace detail
								    {
								        //! Generic reciprocal square root, computed as `1 / sqrt(value)`.
								        //!
								        //! Acts as the fallback implementation when no better ADL match was found.
								        template<typename T_Arg>
								        ALPAKA_FN_INLINE constexpr auto rsqrt(T_Arg const& value)
								        {
								            // std::sqrt is only one candidate, a `sqrt` found through ADL for T_Arg is considered as well
								            using std::sqrt;
								            return static_cast<T_Arg>(1) / sqrt(value);
								        }
								    } // namespace detail

								    /** Functor tag for the reciprocal square root. */
								    struct Rsqrt
								    {
								        template<typename T_MathImpl, typename T_Arg>
								        struct Op
								        {
								            constexpr auto operator()(T_MathImpl, T_Arg const& value) const
								            {
								                if constexpr(!std::same_as<T_MathImpl, StlMath>)
								                {
								                    // other implementations: plain unqualified call without the generic fallback
								                    return rsqrt(value);
								                }
								                else
								                {
								                    // StlMath: offer the generic detail::rsqrt fallback next to whatever ADL finds
								                    using detail::rsqrt;
								                    return rsqrt(value);
								                }
								            }
								        };
								    };

								    /** Functor tag for the two-argument arc tangent `atan2(y, x)`. */
								    struct Atan2
								    {
								        template<typename T_MathImpl, typename T_Y, typename T_X>
								        struct Op
								        {
								            constexpr auto operator()(T_MathImpl, T_Y const& ordinate, T_X const& abscissa) const
								            {
								                if constexpr(!std::same_as<T_MathImpl, StlMath>)
								                {
								                    // other implementations: plain unqualified call
								                    return atan2(ordinate, abscissa);
								                }
								                else
								                {
								                    // StlMath: make std::atan2 a candidate of the unqualified call
								                    using std::atan2;
								                    return atan2(ordinate, abscissa);
								                }
								            }
								        };
								    };

								    namespace detail
								    {
								        //! Generic combined sine/cosine: stores `sin(arg)` in `result_sin` and `cos(arg)` in `result_cos`.
								        //!
								        //! Acts as the fallback implementation when no better ADL match was found.
								        template<typename T_Arg>
								        constexpr auto sincos(T_Arg const& arg, T_Arg& result_sin, T_Arg& result_cos)
								        {
								            // std::sin / std::cos are only candidates, overloads found through ADL are considered as well
								            using std::cos;
								            using std::sin;

								            result_sin = sin(arg);
								            result_cos = cos(arg);
								        }
								    } // namespace detail

								    /** Functor tag for the combined sine/cosine; both results are written to the output references. */
								    struct SinCos
								    {
								        template<typename T_MathImpl, typename T_Arg>
								        struct Op
								        {
								            constexpr auto operator()(T_MathImpl, T_Arg const& arg, T_Arg& result_sin, T_Arg& result_cos) const
								            {
								                if constexpr(!std::same_as<T_MathImpl, StlMath>)
								                {
								                    // other implementations: plain unqualified call without the generic fallback
								                    return sincos(arg, result_sin, result_cos);
								                }
								                else
								                {
								                    // StlMath: offer the generic detail::sincos fallback next to whatever ADL finds
								                    using detail::sincos;
								                    return sincos(arg, result_sin, result_cos);
								                }
								            }
								        };
								    };

								    /** Functor tag for `copysign(mag, sgn)`. */
								    struct Copysign
								    {
								        template<typename T_MathImpl, typename T_Mag, typename T_Sgn>
								        struct Op
								        {
								            constexpr auto operator()(T_MathImpl, T_Mag const& magnitude, T_Sgn const& sign) const
								            {
								                if constexpr(!std::same_as<T_MathImpl, StlMath>)
								                {
								                    // other implementations: plain unqualified call
								                    return copysign(magnitude, sign);
								                }
								                else
								                {
								                    // StlMath: make std::copysign a candidate of the unqualified call
								                    using std::copysign;
								                    return copysign(magnitude, sign);
								                }
								            }
								        };
								    };

								    /** Functor tag for the minimum of two values. */
								    struct Min
								    {
								        template<typename T_MathImpl, typename T_A, typename T_B>
								        struct Op
								        {
								            constexpr auto operator()(T_MathImpl, T_A const& lhs, T_B const& rhs) const
								            {
								                if constexpr(!std::same_as<T_MathImpl, StlMath>)
								                {
								                    // other implementations: plain unqualified call
								                    return min(lhs, rhs);
								                }
								                else
								                {
								                    // StlMath: make std::min a candidate of the unqualified call
								                    using std::min;
								                    return min(lhs, rhs);
								                }
								            }
								        };
								    };

								    /** Functor tag for the maximum of two values. */
								    struct Max
								    {
								        template<typename T_MathImpl, typename T_A, typename T_B>
								        struct Op
								        {
								            constexpr auto operator()(T_MathImpl, T_A const& lhs, T_B const& rhs) const
								            {
								                if constexpr(!std::same_as<T_MathImpl, StlMath>)
								                {
								                    // other implementations: plain unqualified call
								                    return max(lhs, rhs);
								                }
								                else
								                {
								                    // StlMath: make std::max a candidate of the unqualified call
								                    using std::max;
								                    return max(lhs, rhs);
								                }
								            }
								        };
								    };

								    /** Functor tag for `pow(base, exponent)`. */
								    struct Pow
								    {
								        template<typename T_MathImpl, typename T_Base, typename T_Exp>
								        struct Op
								        {
								            constexpr auto operator()(T_MathImpl, T_Base const& base, T_Exp const& exponent) const
								            {
								                if constexpr(!std::same_as<T_MathImpl, StlMath>)
								                {
								                    // other implementations: plain unqualified call
								                    return pow(base, exponent);
								                }
								                else
								                {
								                    // StlMath: make std::pow a candidate of the unqualified call
								                    using std::pow;
								                    return pow(base, exponent);
								                }
								            }
								        };
								    };

								    /** Functor tag for `fmod(x, y)`. */
								    struct Fmod
								    {
								        template<typename T_MathImpl, typename T_X, typename T_Y>
								        struct Op
								        {
								            constexpr auto operator()(T_MathImpl, T_X const& dividend, T_Y const& divisor) const
								            {
								                if constexpr(!std::same_as<T_MathImpl, StlMath>)
								                {
								                    // other implementations: plain unqualified call
								                    return fmod(dividend, divisor);
								                }
								                else
								                {
								                    // StlMath: make std::fmod a candidate of the unqualified call
								                    using std::fmod;
								                    return fmod(dividend, divisor);
								                }
								            }
								        };
								    };

								    /** Functor tag for `remainder(x, y)`. */
								    struct Remainder
								    {
								        template<typename T_MathImpl, typename T_X, typename T_Y>
								        struct Op
								        {
								            constexpr auto operator()(T_MathImpl, T_X const& dividend, T_Y const& divisor) const
								            {
								                if constexpr(!std::same_as<T_MathImpl, StlMath>)
								                {
								                    // other implementations: plain unqualified call
								                    return remainder(dividend, divisor);
								                }
								                else
								                {
								                    // StlMath: make std::remainder a candidate of the unqualified call
								                    using std::remainder;
								                    return remainder(dividend, divisor);
								                }
								            }
								        };
								    };

								    /** Functor tag for the fused multiply-add `fma(x, y, z)`. */
								    struct Fma
								    {
								        template<typename T_MathImpl, typename T_X, typename T_Y, typename T_Z>
								        struct Op
								        {
								            constexpr auto operator()(
								                T_MathImpl,
								                T_X const& factorA,
								                T_Y const& factorB,
								                T_Z const& addend) const
								            {
								                if constexpr(!std::same_as<T_MathImpl, StlMath>)
								                {
								                    // other implementations: plain unqualified call
								                    return fma(factorA, factorB, addend);
								                }
								                else
								                {
								                    // StlMath: make std::fma a candidate of the unqualified call
								                    using std::fma;
								                    return fma(factorA, factorB, addend);
								                }
								            }
								        };
								    };
								} // namespace alpaka::math::internal

									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/internal/stlMathImpl.hpp ==
									// ==
									/* Copyright 2023 Alexander Matthes, Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber,
									 * Jeffrey Kelling, Sergei Bastrakov, Andrea Bocci, René Widera, Mehmet Yusufoglu
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									/** @file This file contains specializations of methods where we do not want to fall back to `std::*` functions.
									 */

										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/Unreachable.hpp ==
										// ==
										/* Copyright 2022 Jan Stephan, Jeffrey Kelling
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

										//! Marks the current code position as unreachable.
										//!
										//! Before CUDA 11.5 nvcc is unable to correctly identify return statements in 'if constexpr' branches. It will issue
										//! a false warning about a missing return statement unless it is told that the following code section is unreachable.
										//! NOTE(review): the text above names CUDA 11.5 while the version check below switches at 11.3 - confirm which
										//! version is meant.
										//!
										//! Expansion, as selected by the conditions below:
										//! - `ALPAKA_COMP_NVCC && ALPAKA_ARCH_PTX` with CUDA >= 11.3: `__builtin_unreachable()`
										//! - `ALPAKA_COMP_NVCC && ALPAKA_ARCH_PTX` with an older CUDA: `return <arguments>`
										//! - `ALPAKA_COMP_MSVC`: `__assume(false)`
										//! - `ALPAKA_COMP_GNUC` or `ALPAKA_COMP_CLANG`: `__builtin_unreachable()`
										//! - otherwise: expands to nothing
										//!
										//! \param x A dummy value for the expected return type of the calling function. It is only used by the
										//!          `return` expansion, all other expansions ignore it.
										#if (ALPAKA_COMP_NVCC && ALPAKA_ARCH_PTX)
										#    if ALPAKA_LANG_CUDA >= ALPAKA_VERSION_NUMBER(11, 3, 0)
										#        define ALPAKA_UNREACHABLE(...) __builtin_unreachable()
										#    else
										#        define ALPAKA_UNREACHABLE(...) return __VA_ARGS__
										#    endif
										#elif ALPAKA_COMP_MSVC
										#    define ALPAKA_UNREACHABLE(...) __assume(false)
										#elif ALPAKA_COMP_GNUC || ALPAKA_COMP_CLANG
										#    define ALPAKA_UNREACHABLE(...) __builtin_unreachable()
										#else
										#    define ALPAKA_UNREACHABLE(...)
										#endif
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/Unreachable.hpp ==
										// ============================================================================

										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/decay.hpp ==
										// ==
										/* Copyright 2023 Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										#include <type_traits>

										namespace alpaka
										{
										    //! True when T and U name the same type after both went through std::decay, i.e. after references and
										    //! top-level cv-qualifiers were dropped and array/function types were turned into pointers.
										    //! Example: is_decayed_v<volatile float, float> returns true.
										    template<typename T, typename U>
										    inline constexpr bool is_decayed_v = std::is_same<std::decay_t<T>, std::decay_t<U>>::value;
										} // namespace alpaka
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/decay.hpp ==
										// ============================================================================

										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/internal/ieee754.hpp ==
										// ==
										/* Copyright 2025 Mehmet Yusufoglu, Andrea Bocci, René Widera
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										// #include "alpaka/core/Unreachable.hpp"    // amalgamate: file already inlined
										// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

										// #include <bit>    // amalgamate: file already included
										// #include <cstdint>    // amalgamate: file already included
										#include <type_traits>

										namespace alpaka::math::internal
										{
										    namespace concepts
										    {
										        /** Checks for single and double floating point precision
										         *
										         * Satisfied by exactly `float` and `double`. The comparison uses std::is_same_v on T as given,
										         * therefore cv-qualified or reference types and `long double` do not satisfy the concept.
										         */
										        template<typename T>
										        concept FloatingPoint = std::is_same_v<T, float> || std::is_same_v<T, double>;
										    } // namespace concepts

										    // Bit pattern checks keep isnan portable across host/device and fast-math builds.
										    //
										    // Reports true when all bits selected by the exponent mask are set and at least one bit selected by the
										    // fraction mask is set.
										    template<concepts::FloatingPoint T>
										    constexpr bool ieeeIsnan(T const& arg)
										    {
										        if constexpr(std::is_same_v<T, float>)
										        {
										            constexpr uint32_t exponentField = 0x7F80'0000;
										            constexpr uint32_t fractionField = 0x007F'FFFF;
										            auto const raw = std::bit_cast<uint32_t>(arg);
										            bool const exponentAllOnes = (raw & exponentField) == exponentField;
										            bool const fractionNonZero = (raw & fractionField) != 0u;
										            return exponentAllOnes && fractionNonZero;
										        }
										        else if constexpr(std::is_same_v<T, double>)
										        {
										            constexpr uint64_t exponentField = 0x7FF0'0000'0000'0000ULL;
										            constexpr uint64_t fractionField = 0x000F'FFFF'FFFF'FFFFULL;
										            auto const raw = std::bit_cast<uint64_t>(arg);
										            bool const exponentAllOnes = (raw & exponentField) == exponentField;
										            bool const fractionNonZero = (raw & fractionField) != 0u;
										            return exponentAllOnes && fractionNonZero;
										        }

										        // both admissible types returned above, this only silences missing-return diagnostics
										        ALPAKA_UNREACHABLE(T{});
										    }

										    // Bit pattern based isinf.
										    //
										    // Reports true when all bits selected by the exponent mask are set and no bit selected by the fraction
										    // mask is set. The sign bit is not part of either mask.
										    template<concepts::FloatingPoint T>
										    constexpr bool ieeeIsinf(T const& arg)
										    {
										        if constexpr(std::is_same_v<T, float>)
										        {
										            constexpr uint32_t exponentField = 0x7F80'0000;
										            constexpr uint32_t fractionField = 0x007F'FFFF;
										            auto const raw = std::bit_cast<uint32_t>(arg);
										            bool const exponentAllOnes = (raw & exponentField) == exponentField;
										            bool const fractionIsZero = (raw & fractionField) == 0u;
										            return exponentAllOnes && fractionIsZero;
										        }
										        else if constexpr(std::is_same_v<T, double>)
										        {
										            constexpr uint64_t exponentField = 0x7FF0'0000'0000'0000ULL;
										            constexpr uint64_t fractionField = 0x000F'FFFF'FFFF'FFFFULL;
										            auto const raw = std::bit_cast<uint64_t>(arg);
										            bool const exponentAllOnes = (raw & exponentField) == exponentField;
										            bool const fractionIsZero = (raw & fractionField) == 0u;
										            return exponentAllOnes && fractionIsZero;
										        }

										        // both admissible types returned above, this only silences missing-return diagnostics
										        ALPAKA_UNREACHABLE(T{});
										    }

										    // Bit pattern based isfinite.
										    //
										    // Reports true unless all bits selected by the exponent mask are set, i.e. for every value that neither
										    // ieeeIsnan nor ieeeIsinf would accept.
										    template<concepts::FloatingPoint T>
										    constexpr bool ieeeIsfinite(T const& arg)
										    {
										        if constexpr(std::is_same_v<T, float>)
										        {
										            constexpr uint32_t exponentField = 0x7F80'0000;
										            auto const raw = std::bit_cast<uint32_t>(arg);
										            bool const exponentAllOnes = (raw & exponentField) == exponentField;
										            return !exponentAllOnes;
										        }
										        else if constexpr(std::is_same_v<T, double>)
										        {
										            constexpr uint64_t exponentField = 0x7FF0'0000'0000'0000ULL;
										            auto const raw = std::bit_cast<uint64_t>(arg);
										            bool const exponentAllOnes = (raw & exponentField) == exponentField;
										            return !exponentAllOnes;
										        }

										        // both admissible types returned above, this only silences missing-return diagnostics
										        ALPAKA_UNREACHABLE(T{});
										    }
										} // namespace alpaka::math::internal
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/internal/ieee754.hpp ==
										// ============================================================================

									// #include "alpaka/math/internal/math.hpp"    // amalgamate: file already inlined
									// #include "alpaka/math/internal/stlMath.hpp"    // amalgamate: file already inlined

									// #include <cmath>    // amalgamate: file already included
									#include <type_traits>

									namespace alpaka::math::internal
									{
									    /** Minimum of two arithmetic values for the StlMath implementation.
									     *
									     * - both arguments integral: `std::min`. `std::min` only accepts two arguments of one type, therefore
									     *   arguments of two different integral types are converted to their `std::common_type_t` first. This
									     *   is limited to types of equal signedness, so that the conversion can never change a value; mixing
									     *   signed and unsigned arguments is rejected at compile time.
									     * - at least one argument is `float` or `double`: `std::fmin`
									     * - every other combination is rejected at compile time
									     *
									     * @param a first value
									     * @param b second value
									     * @return the smaller value; for two different integral types it has the type
									     *         `std::common_type_t<T_A, T_B>`
									     */
									    template<typename T_A, typename T_B>
									    requires(std::is_arithmetic_v<T_A> && std::is_arithmetic_v<T_B>)
									    struct Min::Op<StlMath, T_A, T_B>
									    {
									        constexpr auto operator()(StlMath, T_A const& a, T_B const& b) const
									        {
									            if constexpr(std::is_integral_v<T_A> && std::is_integral_v<T_B>)
									            {
									                using std::min;
									                if constexpr(std::is_same_v<T_A, T_B>)
									                {
									                    return min(a, b);
									                }
									                else
									                {
									                    static_assert(
									                        std::is_signed_v<T_A> == std::is_signed_v<T_B>,
									                        "Mixed signed/unsigned integral arguments are not supported, cast them to one type");
									                    // widen both values to the common type, std::min cannot deduce from two types
									                    using Common = std::common_type_t<T_A, T_B>;
									                    return min(static_cast<Common>(a), static_cast<Common>(b));
									                }
									            }
									            else if constexpr(
									                is_decayed_v<T_A, float> || is_decayed_v<T_B, float> || is_decayed_v<T_A, double>
									                || is_decayed_v<T_B, double>)
									            {
									                using std::fmin;
									                return fmin(a, b);
									            }
									            else
									                static_assert(!sizeof(T_A), "Unsupported data type");

									            ALPAKA_UNREACHABLE(std::common_type_t<T_A, T_B>{});
									        }
									    };

									    /** Maximum of two arithmetic values for the StlMath implementation.
									     *
									     * - both arguments integral: `std::max`. `std::max` only accepts two arguments of one type, therefore
									     *   arguments of two different integral types are converted to their `std::common_type_t` first. This
									     *   is limited to types of equal signedness, so that the conversion can never change a value; mixing
									     *   signed and unsigned arguments is rejected at compile time.
									     * - at least one argument is `float` or `double`: `std::fmax`
									     * - every other combination is rejected at compile time
									     *
									     * @param a first value
									     * @param b second value
									     * @return the larger value; for two different integral types it has the type
									     *         `std::common_type_t<T_A, T_B>`
									     */
									    template<typename T_A, typename T_B>
									    requires(std::is_arithmetic_v<T_A> && std::is_arithmetic_v<T_B>)
									    struct Max::Op<StlMath, T_A, T_B>
									    {
									        constexpr auto operator()(StlMath, T_A const& a, T_B const& b) const
									        {
									            if constexpr(std::is_integral_v<T_A> && std::is_integral_v<T_B>)
									            {
									                using std::max;
									                if constexpr(std::is_same_v<T_A, T_B>)
									                {
									                    return max(a, b);
									                }
									                else
									                {
									                    static_assert(
									                        std::is_signed_v<T_A> == std::is_signed_v<T_B>,
									                        "Mixed signed/unsigned integral arguments are not supported, cast them to one type");
									                    // widen both values to the common type, std::max cannot deduce from two types
									                    using Common = std::common_type_t<T_A, T_B>;
									                    return max(static_cast<Common>(a), static_cast<Common>(b));
									                }
									            }
									            else if constexpr(
									                is_decayed_v<T_A, float> || is_decayed_v<T_B, float> || is_decayed_v<T_A, double>
									                || is_decayed_v<T_B, double>)
									            {
									                using std::fmax;
									                return fmax(a, b);
									            }
									            else
									                static_assert(!sizeof(T_A), "Unsupported data type");

									            ALPAKA_UNREACHABLE(std::common_type_t<T_A, T_B>{});
									        }
									    };

									    //! Custom IEEE 754 bitwise implementation of isnan
									    //! std counterpart does not work correctly for `-ffast-math` flags at CPU.
									    template<std::floating_point T_Arg>
									    struct Isnan::Op<StlMath, T_Arg>
									    {
									        constexpr auto operator()(StlMath, T_Arg const& arg) const -> bool
									        {
									            return ieeeIsnan(arg);
									        }
									    };

									    //! Custom IEEE 754 bitwise implementation of isinf
									    //! std counterpart does not work correctly for `-ffast-math` flags at CPU.
									    template<std::floating_point T_Arg>
									    struct Isinf::Op<StlMath, T_Arg>
									    {
									        constexpr auto operator()(StlMath, T_Arg const& arg) const -> bool
									        {
									            return ieeeIsinf(arg);
									        }
									    };

									    //! Custom IEEE 754 bitwise implementation of isinf
									    //! std counterpart does not work correctly for `-ffast-math` flags at CPU.
									    template<std::floating_point T_Arg>
									    struct Isfinite::Op<StlMath, T_Arg>
									    {
									        constexpr auto operator()(StlMath, T_Arg const& arg) const -> bool
									        {
									            return ieeeIsfinite(arg);
									        }
									    };

									    //! Custom IEEE 754 bitwise implementation of isnan
									    //! std counterpart does not work correctly for `-ffast-math` flags at CPU.
									} // namespace alpaka::math::internal
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/internal/stlMathImpl.hpp ==
									// ============================================================================

								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/internal/math.hpp ==
								// ============================================================================

								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/tag.hpp ==
								// ==
								/* Copyright 2024 René Widera
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
								// #include "alpaka/core/PP.hpp"    // amalgamate: file already inlined
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/Tag.hpp ==
									// ==
									/* Copyright 2024 René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									#include <cstddef>
									#include <type_traits>

									namespace alpaka
									{
									    /** Empty tag type that is distinguished by its template argument only.
									     *
									     * @tparam T_Id type that identifies the tag. The default is the closure type of a lambda written
									     *         directly in the default argument - presumably so that each `Tag<>` spelled without an
									     *         explicit argument names its own distinct type (TODO confirm).
									     */
									    template<typename T_Id = decltype([]() -> void {})>
									    struct Tag
									    {
									    };

									/* Defines a constexpr tag object called `name`.
									 *
									 * Its type is `Tag<std::integral_constant<std::size_t, N>>`, where N is the value of `__COUNTER__` at the
									 * point of expansion, so every expansion within one translation unit yields a different tag type.
									 * `std::size_t` is spelled with its namespace because only <cstddef> style headers guarantee the name and
									 * they guarantee it in namespace std, not in the global namespace.
									 */
									#define ALPAKA_TAG(name)                                                                                              \
									    constexpr Tag<std::integral_constant<std::size_t, __COUNTER__>> name                                              \
									    {                                                                                                                 \
									    }

									    namespace trait
									    {
									        /** Primary template: a type is not a tag unless the specialization below matches. */
									        template<typename T_Object, typename T_Sfinae = void>
									        struct IsTag : std::false_type
									        {
									        };

									        /** Every specialization of alpaka::Tag is a tag. */
									        template<typename T_TagId>
									        struct IsTag<Tag<T_TagId>> : std::true_type
									        {
									        };

									        /** Shorthand for `IsTag<T_Object>::value`. */
									        template<typename T_Object>
									        constexpr bool isTag_v = IsTag<T_Object>::value;

									    } // namespace trait

									} // namespace alpaka
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/Tag.hpp ==
									// ============================================================================

								// #include "alpaka/core/util.hpp"    // amalgamate: file already inlined
								// #include "alpaka/unused.hpp"    // amalgamate: file already inlined

								#include <cassert>
								// #include <string>    // amalgamate: file already included
								// #include <tuple>    // amalgamate: file already included

								namespace alpaka
								{
								    namespace object
								    {
								        /** Empty type of the `api` object. */
								        struct Api
								        {
								        };

								        constexpr Api api{};

								        /** Empty type of the `deviceKind` object. */
								        struct DeviceKind
								        {
								        };

								        constexpr DeviceKind deviceKind{};

								        // objects whose type is an alpaka::Tag generated by ALPAKA_TAG
								        ALPAKA_TAG(exec);

								        ALPAKA_TAG(launchedWidthFrameSpec);

								        ALPAKA_TAG(deviceSpec);

								        ALPAKA_TAG(dynSharedMemBytes);

								        /** Empty type of the `warpSize` object. */
								        struct WarpSize
								        {
								        };

								        constexpr WarpSize warpSize{};
								    } // namespace object

								    namespace queueKind
								    {
								        namespace detail
								        {
								            /** Empty marker base; trait::IsQueueKind treats every type derived from it as a queue kind. */
								            struct QueueKindBase
								            {
								            };
								        } // namespace detail

								        namespace trait
								        {
								            /** Type trait that is true when T_Candidate is detail::QueueKindBase or derives from it. */
								            template<typename T_Candidate>
								            struct IsQueueKind : std::is_base_of<detail::QueueKindBase, T_Candidate>
								            {
								            };
								        } // namespace trait

								        /** Shorthand for `trait::IsQueueKind<T_Candidate>::value`. */
								        template<typename T_Candidate>
								        constexpr bool isQueueKind_v = trait::IsQueueKind<T_Candidate>::value;
								    } // namespace queueKind

								    namespace concepts
								    {
								        /** Concept to check if a type is a queue kind
								         *
								         * @details
								         * Satisfied when queueKind::isQueueKind_v is true for the type.
								         * Example queue kinds are alpaka::queueKind::Blocking or alpaka::queueKind::NonBlocking.
								         */
								        template<typename T_Candidate>
								        concept QueueKind = queueKind::isQueueKind_v<T_Candidate>;
								    } // namespace concepts

								    namespace queueKind
								    {
								        constexpr bool operator==(alpaka::concepts::QueueKind auto lhs, alpaka::concepts::QueueKind auto rhs)
								        {
								            return std::is_same_v<ALPAKA_TYPEOF(lhs), ALPAKA_TYPEOF(rhs)>;
								        }

								        constexpr bool operator!=(alpaka::concepts::QueueKind auto lhs, alpaka::concepts::QueueKind auto rhs)
								        {
								            return !(lhs == rhs);
								        }

								        /** Queue should block during the task execution
								         */
								        struct Blocking : detail::QueueKindBase
								        {
								            static std::string getName()
								            {
								                return "Blocking";
								            }
								        };

								        constexpr auto blocking = Blocking{};

								        /** Queue should process task asynchronously
								         */
								        struct NonBlocking : detail::QueueKindBase
								        {
								            static std::string getName()
								            {
								                return "NonBlocking";
								            }
								        };

								        constexpr auto nonBlocking = NonBlocking{};
								    } // namespace queueKind

								    namespace deviceKind
								    {
								        namespace detail
								        {
								            /** Empty marker base; trait::IsDeviceKind treats every type derived from it as a device kind. */
								            struct DeviceKindBase
								            {
								            };
								        } // namespace detail

								        namespace trait
								        {
								            /** Type trait that is true when T_Candidate is detail::DeviceKindBase or derives from it. */
								            template<typename T_Candidate>
								            struct IsDeviceKind : std::is_base_of<detail::DeviceKindBase, T_Candidate>
								            {
								            };
								        } // namespace trait

								        /** Shorthand for `trait::IsDeviceKind<T_Candidate>::value`. */
								        template<typename T_Candidate>
								        constexpr bool isDeviceKind_v = trait::IsDeviceKind<T_Candidate>::value;
								    } // namespace deviceKind

								    namespace concepts
								    {
								        /** @brief Concept to check if something is a device kind
								         *
								         * @details
								         * A device kind in alpaka is a type of acceleration device, such as a GPU vendor. Examples are
								         * alpaka::deviceKind::amdGpu or alpaka::deviceKind::cpu. Together with an alpaka::onHost::Api, it can make
								         * up an alpaka::onHost::Device.
								         *
								         * The concept is satisfied if alpaka::deviceKind::isDeviceKind_v<T_DeviceKind> is true, which by default
								         * holds for types derived from alpaka::deviceKind::detail::DeviceKindBase.
								         */
								        template<typename T_DeviceKind>
								        concept DeviceKind = deviceKind::isDeviceKind_v<T_DeviceKind>;
								    } // namespace concepts

								    namespace deviceKind
								    {
								        constexpr bool operator==(concepts::DeviceKind auto lhs, concepts::DeviceKind auto rhs)
								        {
								            return std::is_same_v<ALPAKA_TYPEOF(lhs), ALPAKA_TYPEOF(rhs)>;
								        }

								        constexpr bool operator!=(concepts::DeviceKind auto lhs, concepts::DeviceKind auto rhs)
								        {
								            return !(lhs == rhs);
								        }

								        struct Cpu : detail::DeviceKindBase
								        {
								            static std::string getName()
								            {
								                return "Cpu";
								            }
								        };

								        constexpr auto cpu = Cpu{};

								        struct NumaCpu : detail::DeviceKindBase
								        {
								            static std::string getName()
								            {
								                return "NumaCpu";
								            }
								        };

								        constexpr auto numaCpu = NumaCpu{};

								        struct AmdGpu : detail::DeviceKindBase
								        {
								            static std::string getName()
								            {
								                return "AmdGpu";
								            }
								        };

								        constexpr auto amdGpu = AmdGpu{};

								        struct NvidiaGpu : detail::DeviceKindBase
								        {
								            static std::string getName()
								            {
								                return "NvidiaGpu";
								            }
								        };

								        constexpr auto nvidiaGpu = NvidiaGpu{};

								        struct IntelGpu : detail::DeviceKindBase
								        {
								            static std::string getName()
								            {
								                return "IntelGpu";
								            }
								        };

								        constexpr auto intelGpu = IntelGpu{};

								        constexpr auto allDevices = std::make_tuple(cpu, numaCpu, amdGpu, nvidiaGpu, intelGpu);

								    } // namespace deviceKind

								    namespace layer
								    {
								        namespace detail
								        {
								            /** Common base class of the layer tags; IsLayer detects layers through it. */
								            struct LayerBase
								            {
								            };
								        } // namespace detail

								        namespace trait
								        {
								            /** Trait deciding whether T_Layer is a layer.
								             *
								             * The primary template is true for detail::LayerBase itself and for every class derived from it.
								             */
								            template<typename T_Layer>
								            struct IsLayer : std::is_base_of<detail::LayerBase, T_Layer>
								            {
								            };
								        } // namespace trait

								        /** Shortcut for trait::IsLayer<T_Layer>::value */
								        template<typename T_Layer>
								        constexpr bool isLayer_v = trait::IsLayer<T_Layer>::value;
								    } // namespace layer

								    namespace concepts
								    {
								        /** @brief Concept to check for a compute layer of an accelerator
								         *
								         * @details
								         * A layer is one specific part of the compute hierarchy of accelerators, for example alpaka::layer::Thread or
								         * alpaka::layer::Block.
								         *
								         * The concept is satisfied if alpaka::layer::isLayer_v<T_Layer> is true, which by default holds for types
								         * derived from alpaka::layer::detail::LayerBase.
								         */
								        template<typename T_Layer>
								        concept Layer = layer::isLayer_v<T_Layer>;
								    } // namespace concepts

								    namespace layer
								    {
								        struct Thread : detail::LayerBase
								        {
								        };

								        constexpr auto thread = Thread{};

								        struct Block : detail::LayerBase
								        {
								        };

								        constexpr auto block = Block{};

								        ALPAKA_TAG(shared);
								        ALPAKA_TAG(dynShared);
								    } // namespace layer

								    namespace action
								    {
								        // Tag generated by the ALPAKA_TAG macro (defined outside of this section).
								        // NOTE(review): presumably declares a tag type plus an instance named threadBlockSync - confirm against
								        // the macro definition.
								        ALPAKA_TAG(threadBlockSync);
								    } // namespace action

								    /** Type without any members. */
								    struct Empty
								    {
								    };

								    namespace exec
								    {
								        namespace trait
								        {
								            /** Trait marking an executor as sequential.
								             *
								             * The primary template is false; a specialization is required to mark T_Executor as sequential.
								             */
								            template<typename T_Executor>
								            struct IsSeqExecutor : std::false_type
								            {
								            };
								        } // namespace trait

								        /** Shortcut for trait::IsSeqExecutor<T_Exec>::value */
								        template<typename T_Exec>
								        constexpr bool isSeqExecutor_v = trait::IsSeqExecutor<T_Exec>::value;
								    } // namespace exec

								    /** Check if an executor can only be used with a single thread per block
								     *
								     * Only the type of the argument is evaluated, the value itself is ignored.
								     *
								     * @param exec executor instance used to deduce T_Exec
								     * @return true if a block can only have a single thread, else false
								     */
								    template<typename T_Exec>
								    consteval bool isSeqExecutor(T_Exec exec)
								    {
								        // the parameter only carries the type, silence unused warnings
								        alpaka::unused(exec);
								        constexpr bool singleThreadOnly = exec::isSeqExecutor_v<T_Exec>;
								        return singleThreadOnly;
								    }
								} // namespace alpaka
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/tag.hpp ==
								// ============================================================================


							// #include <algorithm>    // amalgamate: file already included
							// #include <cstdint>    // amalgamate: file already included

							namespace alpaka
							{
							    namespace trait
							    {
							        /** Maps all APIs by default to the STL math functions.
							         *
							         * The primary Op template returns alpaka::math::internal::stlMath for every API type.
							         */
							        struct GetMathImpl
							        {
							            template<alpaka::concepts::Api T_Api>
							            struct Op
							            {
							                /** @return alpaka::math::internal::stlMath, the API argument is only used for type selection */
							                constexpr decltype(auto) operator()(T_Api const) const
							                {
							                    return alpaka::math::internal::stlMath;
							                }
							            };
							        };

							        /** Get the math implementation for an API.
							         *
							         * @param api API the math implementation is requested for
							         * @return result of calling GetMathImpl::Op<T_Api> with api
							         */
							        template<alpaka::concepts::Api T_Api>
							        constexpr decltype(auto) getMathImpl(T_Api const api)
							        {
							            return GetMathImpl::Op<T_Api>{}(api);
							        }

							        /** Defines the implementation used for intrinsics
							         *
							         * There is no generic default: the call operator of the primary Op template only contains a failing
							         * static_assert.
							         */
							        struct GetIntrinsicImpl
							        {
							            template<alpaka::concepts::Api T_Api>
							            struct Op
							            {
							                constexpr decltype(auto) operator()(T_Api const) const
							                {
							                    // sizeof(T_Api) makes the condition depend on the template parameter
							                    static_assert(
							                        sizeof(T_Api) && false,
							                        "Intrinsic implementation for the current used API is not defined.");
							                    return 0;
							                }
							            };
							        };

							        /** Get the intrinsic implementation for an API.
							         *
							         * @param api API the intrinsic implementation is requested for
							         * @return result of calling GetIntrinsicImpl::Op<T_Api> with api
							         */
							        template<alpaka::concepts::Api T_Api>
							        constexpr decltype(auto) getIntrinsicImpl(T_Api const api)
							        {
							            return GetIntrinsicImpl::Op<T_Api>{}(api);
							        }

							        /** SIMD width of T_Type for an API and device kind combination.
							         *
							         * Called by alpaka::getArchSimdWidth(). There is no generic default: the call operator of the primary Op
							         * template only contains a failing static_assert.
							         */
							        struct GetArchSimdWidth
							        {
							            template<typename T_Type, alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
							            struct Op
							            {
							                consteval uint32_t operator()(T_Api const, T_DeviceKind const) const
							                {
							                    // sizeof(T_Api) makes the condition depend on the template parameter
							                    static_assert(sizeof(T_Api) && false, "Missing definition of GetArchSimdWidth for API.");
							                    return 1u;
							                }
							            };
							        };

							        /** Number of commands a CPU can issue at the same time.
							         *
							         * Called by alpaka::getNumPipelines(). There is no generic default: the call operator of the primary Op
							         * template only contains a failing static_assert.
							         */
							        struct GetNumPipelines
							        {
							            template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
							            struct Op
							            {
							                /** @return the return value must be >= 1 */
							                consteval uint32_t operator()(T_Api const, T_DeviceKind const) const
							                {
							                    // sizeof(T_Api) makes the condition depend on the template parameter
							                    static_assert(sizeof(T_Api) && false, "Missing definition of GetNumPipelines for API.");
							                    return 1u;
							                }
							            };
							        };

							        /** Cacheline size for an API and device kind combination.
							         *
							         * Called by alpaka::getCachelineSize(). There is no generic default: the call operator of the primary Op
							         * template only contains a failing static_assert.
							         */
							        struct GetCachelineSize
							        {
							            template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
							            struct Op
							            {
							                consteval uint32_t operator()(T_Api const, T_DeviceKind const) const
							                {
							                    // sizeof(T_Api) makes the condition depend on the template parameter
							                    static_assert(sizeof(T_Api) && false, "GetCachelineSize for the current used API is not defined.");
							                    // NOTE(review): looks like an arbitrary placeholder value - confirm
							                    return 42u;
							                }
							            };
							        };

							        // true for alpaka executor implementations; the primary template is false for every type
							        template<typename T>
							        struct IsExecutor : std::false_type
							        {
							        };

							        /** Adjusts the requested alignment in order to meet device specific constraints.
							         *
							         * Called by alpaka::getAdjustedAlignment(). The primary Op template returns the alignment unchanged.
							         */
							        struct GetAdjustedAlignment
							        {
							            template<typename T_Type, concepts::Api T_Api, concepts::DeviceKind T_DeviceKind>
							            struct Op
							            {
							                /** @param alignment the requested alignment
							                 *  @return the unmodified alignment
							                 */
							                consteval uint32_t operator()(T_Api const, T_DeviceKind const, uint32_t const alignment) const
							                {
							                    return alignment;
							                }
							            };
							        };
							    } // namespace trait

							    /** Shortcut for trait::IsExecutor<T>::value */
							    template<typename T>
							    constexpr bool isExecutor = trait::IsExecutor<T>::value;

							    namespace concepts
							    {
							        /** @brief Concept to check for an executor
							         *
							         * @details
							         * An executor in alpaka is a specific way of executing on an alpaka::onHost::Device. Examples of executors are
							         * alpaka::exec::GpuCuda or alpaka::onHost::cpu::OmpBlocks.
							         *
							         * The concept is satisfied if alpaka::isExecutor<T> is true, i.e. if alpaka::trait::IsExecutor<T>::value
							         * is true.
							         */
							        template<typename T>
							        concept Executor = alpaka::isExecutor<T>;
							    } // namespace concepts

							    /** Compare two executors by type.
							     *
							     * @return true if ALPAKA_TYPEOF yields the same type for both arguments, else false
							     */
							    constexpr bool operator==(concepts::Executor auto lhs, concepts::Executor auto rhs)
							    {
							        using LhsExecutor = ALPAKA_TYPEOF(lhs);
							        using RhsExecutor = ALPAKA_TYPEOF(rhs);
							        return std::is_same_v<LhsExecutor, RhsExecutor>;
							    }

							    /** Negation of operator== for executors.
							     *
							     * @return true if the two executors differ, else false
							     */
							    constexpr bool operator!=(concepts::Executor auto lhs, concepts::Executor auto rhs)
							    {
							        bool const sameExecutor = (lhs == rhs);
							        return !sameExecutor;
							    }

							    /** Get the SIMD width for an API and device kind combination.
							     *
							     * The value is provided by trait::GetArchSimdWidth::Op.
							     *
							     * @tparam T_Type data type
							     * @param api API the width is requested for
							     * @param deviceType device kind the width is requested for
							     * @return number of elements that can be processed in parallel in a vector register
							     */
							    template<typename T_Type>
							    consteval uint32_t getArchSimdWidth(
							        concepts::Api auto const api,
							        alpaka::concepts::DeviceKind auto const deviceType)
							    {
							        using ApiType = ALPAKA_TYPEOF(api);
							        using DeviceKindType = ALPAKA_TYPEOF(deviceType);
							        using SimdWidthOp = trait::GetArchSimdWidth::Op<T_Type, ApiType, DeviceKindType>;
							        return SimdWidthOp{}(api, deviceType);
							    }

							    /** Get the number of instructions that can be issued in parallel
							     *
							     * The value is provided by trait::GetNumPipelines::Op.
							     *
							     * @param api API the value is requested for
							     * @param deviceType device kind the value is requested for
							     * @return number of pipelines
							     */
							    consteval uint32_t getNumPipelines(
							        concepts::Api auto const api,
							        alpaka::concepts::DeviceKind auto const deviceType)
							    {
							        using ApiType = ALPAKA_TYPEOF(api);
							        using DeviceKindType = ALPAKA_TYPEOF(deviceType);
							        using NumPipelinesOp = trait::GetNumPipelines::Op<ApiType, DeviceKindType>;
							        return NumPipelinesOp{}(api, deviceType);
							    }

							    /** Get the number of elements to compute per thread
							     *
							     * The result is the product of the SIMD width for the corresponding data type and the number of
							     * instructions that can be issued in parallel.
							     *
							     * @tparam T_Type The data type used to determine the SIMD width.
							     * @param api API the value is requested for
							     * @param deviceType device kind the value is requested for
							     * @return The minimum number of elements a thread should compute to achieve optimal utilization.
							     */
							    template<typename T_Type>
							    consteval uint32_t getNumElemPerThread(
							        concepts::Api auto const api,
							        alpaka::concepts::DeviceKind auto const deviceType)
							    {
							        uint32_t const simdWidth = getArchSimdWidth<T_Type>(api, deviceType);
							        uint32_t const numPipelines = getNumPipelines(api, deviceType);
							        return simdWidth * numPipelines;
							    }

							    /** Get the cacheline size in bytes
							     *
							     * Cache line size is the distance between two memory addresses that is guaranteed to be free of false
							     * sharing. The value is provided by trait::GetCachelineSize::Op.
							     *
							     * @param api API the value is requested for
							     * @param deviceType device kind the value is requested for
							     * @return cacheline size in bytes
							     */
							    consteval uint32_t getCachelineSize(
							        concepts::Api auto const api,
							        alpaka::concepts::DeviceKind auto const deviceType)
							    {
							        using ApiType = ALPAKA_TYPEOF(api);
							        using DeviceKindType = ALPAKA_TYPEOF(deviceType);
							        using CachelineSizeOp = trait::GetCachelineSize::Op<ApiType, DeviceKindType>;
							        return CachelineSizeOp{}(api, deviceType);
							    }

							    /**
							     * @brief Adjusts the memory alignment based on a specific API and device kind.
							     *
							     * The adjustment is delegated to trait::GetAdjustedAlignment::Op.
							     *
							     * @tparam T_Type The data type being allocated.
							     * @param api API the alignment is adjusted for
							     * @param deviceType device kind the alignment is adjusted for
							     * @param alignment the previously selected alignment
							     * @return adjusted alignment in bytes
							     */
							    template<typename T_Type>
							    consteval uint32_t getAdjustedAlignment(
							        concepts::Api auto const api,
							        concepts::DeviceKind auto const deviceType,
							        auto const alignment)
							    {
							        using ApiType = ALPAKA_TYPEOF(api);
							        using DeviceKindType = ALPAKA_TYPEOF(deviceType);
							        using AdjustOp = trait::GetAdjustedAlignment::Op<T_Type, ApiType, DeviceKindType>;
							        return AdjustOp{}(api, deviceType, alignment);
							    }

							    namespace onAcc::trait
							    {
							        /** Defines the implementation used for atomic operations toghether with the used executor */
							        struct GetAtomicImpl
							        {
							            template<alpaka::concepts::Executor T_Executor, typename T_AtomicScope>
							            struct Op
							            {
							                constexpr decltype(auto) operator()(T_Executor const) const
							                {
							                    static_assert(
							                        sizeof(T_Executor) && false,
							                        "Atomic implementation for the current used executor is not defined.");
							                    return 0;
							                }
							            };
							        };

							        template<alpaka::concepts::Executor T_Executor, typename T_AtomicScope>
							        constexpr decltype(auto) getAtomicImpl(T_Executor const executor, T_AtomicScope const atomicScope)
							        {
							            return GetAtomicImpl::Op<T_Executor, T_AtomicScope>{}(executor, atomicScope);
							        }
							    } // namespace onAcc::trait
							} // namespace alpaka
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/trait.hpp ==
							// ============================================================================


						#include <type_traits>

						namespace alpaka::unifiedCudaHip::trait
						{
						    /** Trait marking an executor as unified executor; the primary template is false for every executor.
						     *
						     * NOTE(review): presumably specialized for the executors shared by the CUDA and HIP backends - confirm.
						     */
						    template<alpaka::concepts::Executor T_Executor>
						    struct IsUnifiedExecutor : std::false_type
						    {
						    };

						    /** Trait marking an API as unified API; the primary template is false for every API.
						     *
						     * NOTE(review): presumably specialized for the CUDA and HIP APIs - confirm.
						     */
						    template<alpaka::concepts::Api T_Api>
						    struct IsUnifiedApi : std::false_type
						    {
						    };
						} // namespace alpaka::unifiedCudaHip::trait
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/trait.hpp ==
						// ============================================================================

						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/concepts.hpp ==
						// ==
						/* Copyright 2024 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/api/concepts/api.hpp"    // amalgamate: file already inlined
						// #include "alpaka/concepts/hasName.hpp"    // amalgamate: file already inlined
							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/AssignableFrom.hpp ==
							// ==
							/* Copyright 2026 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							// #include "alpaka/trait.hpp"    // amalgamate: file already inlined
							// #include "alpaka/unused.hpp"    // amalgamate: file already inlined

							// #include <concepts>    // amalgamate: file already included

							namespace alpaka::concepts
							{
							    /** Check whether a value of the type T_From can be assigned to a variable of the type T_To
							     *
							     * Read the check as a variable of the type T_From is assigned to a variable of the type T_To.
							     *
							     * @attention it is not equal to std::is_assignable
							     *
							     * Equivalent to execute:
							     *
							     * @code
							     *      T_To to;
							     *      T_From from;
							     *      to = from;
							     * @endcode
							     */
							    template<typename T_To, typename T_From>
							    concept AssignableFrom = requires(T_To to, T_From from) { to = from; };
							} // namespace alpaka::concepts
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/AssignableFrom.hpp ==
							// ============================================================================

							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/ExpectedValueType.hpp ==
							// ==
							/* Copyright 2025 Simeon Ehrig
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							// #include "alpaka/trait.hpp"    // amalgamate: file already inlined

							// #include <concepts>    // amalgamate: file already included

							namespace alpaka::concepts
							{
							    /** Check whether the specified data type T_Data matches the expected type T_Expected. If the expected type
							     * is `alpaka::NotRequired`, every data type satisfies the concept.
							     **/
							    template<typename T_Data, typename T_Expected>
							    concept ExpectedValueType = std::same_as<T_Expected, T_Data> || std::same_as<T_Expected, alpaka::NotRequired>;
							} // namespace alpaka::concepts
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/ExpectedValueType.hpp ==
							// ============================================================================

							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/IBuffer.hpp ==
							// ==
							/* Copyright 2025 Simeon Ehrig
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							// #include "alpaka/mem/concepts/ExpectedValueType.hpp"    // amalgamate: file already inlined
								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/IView.hpp ==
								// ==
								/* Copyright 2025 Simeon Ehrig
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
								// #include "alpaka/api/concepts/api.hpp"    // amalgamate: file already inlined
								// #include "alpaka/mem/concepts/ExpectedValueType.hpp"    // amalgamate: file already inlined
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/IMdSpan.hpp ==
									// ==
									/* Copyright 2025 Simeon Ehrig
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
									// #include "alpaka/mem/Alignment.hpp"    // amalgamate: file already inlined
									// #include "alpaka/mem/concepts/ExpectedValueType.hpp"    // amalgamate: file already inlined
										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/IDataSource.hpp ==
										// ==
										/* Copyright 2025 Simeon Ehrig
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
										// #include "alpaka/mem/Alignment.hpp"    // amalgamate: file already inlined
										// #include "alpaka/mem/concepts/AssignableFrom.hpp"    // amalgamate: file already inlined
										// #include "alpaka/mem/concepts/ExpectedValueType.hpp"    // amalgamate: file already inlined
										// #include "alpaka/trait.hpp"    // amalgamate: file already inlined

										// #include <concepts>    // amalgamate: file already included

										namespace alpaka::concepts
										{
										    namespace impl
										    {
										        /** @brief Interface concept for objects describing a multidimensional data source.
										         *
										         * @details
										         *
										         * An object that implements the interface returns a value for a multidimensional index. Therefore, it behaves
										         * like multidimensional memory that can only be read. It is not permitted to write a new value to an index
										         * position. An `IDataSource` object has an immutable, fixed multidimensional size. The `IDataSource` object is
										         * not required to reference the storage. It may create or calculate the data instead of reading it from
										         * memory.
										         *
										         * The immutable extent is required for algorithms such as `alpaka::onHost::transform`.
										         *
										         * @param t Object that implements the `IDataSource` interface. May or may not have a const modifier.
										         * @param vec Vector with the same number of elements as the dimension of the `IDataSource` like object.
										         * Used to call the access operator.
										         *
										         *
										         * @section membertypes Member types
										         * - <b>T::value_type</b>: The element type. May or may not be const.
										         * - <b>T::index_type</b>: The index type of the pitch.
										         *
										         * @note The access operator [] with an integral as an argument is only available if the dimension is one.
										         **/
										        template<typename T>
										        concept IDataSource = requires(T t, alpaka::Vec<typename T::index_type, T::dim()> vec) {
										            typename T::value_type;
										            typename T::index_type;

										            /* Non const data sources must be assignable.
										             * You can NOT assign const data sources to non const data sources because this will remove the
										             * const-ness.
										             */
										            requires concepts::AssignableFrom<std::decay_t<T>, std::decay_t<T>>;

										            // only the non-const type is moveable
										            requires std::movable<std::remove_const_t<T>>;

										            /// The bool operator returns true if the access operator returns valid values. For example, memory
										            /// access may be invalid after moving the DataSource.
										            static_cast<bool>(t);

										            // the dimension must be available as static member function returning exactly uint32_t
										            { T::dim() } -> std::same_as<uint32_t>;

										            /* check multi-dimensional access operator
										             if T has no reference type, the access operator needs to return a copy
										             `|| requires { typename T::reference; }` is only required, that the statement becomes true in any case
										            */
										            requires (!requires { typename T::reference; } &&
										                requires { { t[vec] } -> std::same_as<typename T::value_type>;})
										                || requires { typename T::reference; };

										            /* check 1-dimensional access operator
										            checking for (T::dim() != 1u) disables the access operator requirement for multidimensional
										            IDataSources */
										            requires
										                      (T::dim() != 1u) ||
										                      (T::dim() == 1u && !requires { typename T::reference; } &&
										                          requires {{ t[0] } -> std::same_as<typename T::value_type>; })
										                      || requires { typename T::reference; };

										            // typically the alignment of the value_type.
										            { t.getAlignment() } -> alpaka::concepts::Alignment;
										            /// @todo implement concept alpaka::concepts::Extents and use it as return value
										            t.getExtents();
										            /// @todo implement concept alpaka::concepts::Pitches and use it as return value
										            t.getPitches();
										        };
										    } // namespace impl

										    /** @brief Interface concept for objects describing a multidimensional data source.
										     *
										     * @details
										     * Checks alpaka::concepts::impl::IDataSource on T with references removed. Additionally, the value type of
										     * the decayed T must match T_ValueType, unless T_ValueType is `alpaka::NotRequired` (the default).
										     **/
										    template<typename T, typename T_ValueType = alpaka::NotRequired>
										    concept IDataSource = requires {
										        requires impl::IDataSource<std::remove_reference_t<T>>;
										        requires ExpectedValueType<trait::GetValueType_t<std::decay_t<T>>, T_ValueType>;
										    };
										} // namespace alpaka::concepts
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/IDataSource.hpp ==
										// ============================================================================

									// #include "alpaka/trait.hpp"    // amalgamate: file already inlined

									// #include <concepts>    // amalgamate: file already included

									namespace alpaka::concepts
									{
									    namespace impl
									    {
									        /** @brief Interface concept for objects describing multidimensional memory access.
									         *
									         * @details
									         *
									         * An object of type `alpaka::mdspan` does not store any information about the storage location, e.g., whether
									         * the memory is located on a CPU or a GPU. The interface corresponds to that of a standard library container
									         * with contiguous memory, but has some differences to support multidimensional memory. For example, instead of
									         * the member function `size()`, which returns the 1D size, `alpaka::mdspan` like objects provide the function
									         *`getExtents()`, which returns the size of each dimension.
									         *
									         * @param t Object of type `alpaka::mdspan`. May or may not have a const modifier.
									         * @param mut_t Mutable object of type `alpaka::mdspan`. Does not have a const modifier.
									         * @param const_t Constant object of type `alpaka::mdspan`. Does have a const modifier.
									         * @param vec Vector with the same number of elements as the dimension of the `alpaka::mdspan` like object.
									         * Used to call the access operator.
									         *
									         *  @section components Components
									         *
									         * An `alpaka::mdspan` like object contains 4 components:
									         * - A pointer to the actual memory.
									         * - An extents object that describes the number of dimensions and their respective sizes.
									         * - A pitch object that specifies how many bytes are required to jump to the next element in each dimension.
									         * - An alignment object that describes how the elements are aligned in memory, see:
									         * alpaka::concepts::Alignment
									         *
									         * @section membertypes Member types
									         * - <b>T::reference</b>: The element reference type is either const or non-const, depending on
									         *`T::value_type`.
									         * - <b>T::const_reference</b>: The constant reference type for an element. Always const.
									         * - <b>T::pointer</b>: The element pointer type is either const or non-const, depending on
									         *`T::value_type`.
									         * - <b>T::const_pointer</b>: The constant pointer type for an element. Always pointer-to-const.
									         **/
									        template<typename T, typename T_Mut, typename T_Const>
									        concept IMdSpan
									            = requires(T t, T_Mut mut_t, T_Const const_t, alpaka::Vec<typename T::index_type, T::dim()> vec) {
									                  // every IMdSpan must also fulfill the IDataSource interface
									                  requires IDataSource<T>;

									                  typename T::reference;
									                  typename T::const_reference;
									                  typename T::pointer;
									                  typename T::const_pointer;

									                  // dereferencing and data() must respect the const-ness of the object
									                  { *mut_t } -> std::same_as<typename T::reference>;
									                  { *const_t } -> std::same_as<typename T::const_reference>;
									                  { mut_t.data() } -> std::same_as<typename T::pointer>;
									                  { const_t.data() } -> std::same_as<typename T::const_pointer>;

									                  // multi-dimensional access operator
									                  { mut_t[vec] } -> std::same_as<typename T::reference>;
									                  { const_t[vec] } -> std::same_as<typename T::const_reference>;
									                  // only if MdSpan like object is 1D, the access operator with an integral is available
									                  requires(T::dim() != 1u) || (T::dim() == 1u && requires {
									                              { mut_t[typename T::index_type{0}] } -> std::same_as<typename T::reference>;
									                          });
									                  requires(T::dim() != 1u) || (T::dim() == 1u && requires {
									                              { const_t[typename T::index_type{0}] } -> std::same_as<typename T::const_reference>;
									                          });
									                  /// @todo add getSlice, getConstSlice and getView, getConstView functions
									              };

									    } // namespace impl

									    /** @brief Interface concept for objects describing multidimensional memory access.
									     *
									     * @details
									     * An object of type `alpaka::mdspan` does not store any information about the storage location, e.g., whether
									     * the memory is located on a CPU or a GPU.
									     *
									     * The interface is checked on T with references removed, using its non-const and its const variant.
									     * Additionally, the value type of the decayed T must match T_ValueType, unless T_ValueType is
									     * `alpaka::NotRequired` (the default).
									     *
									     * @attention Use `alpaka::concepts::IMdSpan` to restrict types in your code. The actual interface is described
									     * in alpaka::concepts::impl::IMdSpan.
									     **/
									    template<typename T, typename T_ValueType = alpaka::NotRequired>
									    concept IMdSpan = requires {
									        requires impl::IMdSpan<
									            std::remove_reference_t<T>,
									            std::remove_const_t<std::remove_reference_t<T>>,
									            std::add_const_t<std::remove_reference_t<T>>>;
									        requires ExpectedValueType<trait::GetValueType_t<std::decay_t<T>>, T_ValueType>;
									    };
									} // namespace alpaka::concepts
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/IMdSpan.hpp ==
									// ============================================================================

								// #include "alpaka/trait.hpp"    // amalgamate: file already inlined

								#include <type_traits>

								namespace alpaka::concepts
								{
								    namespace impl
								    {
								        /** @brief Interface concept for objects describing api-related multidimensional memory access.
								         *
								         * @details
								         * An `alpaka::view`-like object contains information about the device(s) to which it is connected. The
								         * `alpaka::view`-like object has no memory ownership, and therefore, it does not manage the memory lifetime.
								         * The represented memory can have any dimensionality.
								         *
								         * Any object fitting the `IView` concept is also an `IMdSpan`.
								         *
								         * Beyond the `IMdSpan` requirements the concept checks that `t.getApi()` returns a type fulfilling
								         * `alpaka::concepts::Api` and that both `getSubView` overloads can be called with
								         * `alpaka::Vec<typename T::index_type, T::dim()>` arguments. The return types of `getSubView` are not
								         * constrained here.
								         *
								         * @tparam T type to check, without reference
								         * @tparam T_Mut non-const variant of `T`, forwarded to `IMdSpan`
								         * @tparam T_Const const variant of `T`, forwarded to `IMdSpan`
								         **/
								        template<typename T, typename T_Mut, typename T_Const>
								        concept IView = requires(T t, alpaka::Vec<typename T::index_type, T::dim()> vec) {
								            requires IMdSpan<T, T_Mut, T_Const>;
								            { t.getApi() } -> alpaka::concepts::Api;

								            /** @brief Creates a sub view to a part of the memory.
								             *
								             * @see alpaka::View::getSubView
								             *
								             * @{
								             */
								            t.getSubView(vec /* extents */) /* -> alpaka::concepts::impl::IView */;
								            t.getSubView(vec /* offset */, vec /* extents */) /* -> alpaka::concepts::impl::IView */;

								            /** @} */
								        };
								    } // namespace impl

								    /** @brief Interface concept for objects describing api-related multidimensional memory access.
								     *
								     * @details
								     * An `alpaka::view`-like object contains information about the device(s) to which it is connected. The
								     * `alpaka::view`-like object has no memory ownership, and, therefore, it does not manage the memory lifetime.
								     * The represented memory can have any dimensionality.
								     *
								     * The check is performed on `T` with references removed: the stripped type, its non-const variant and its
								     * const variant are forwarded to alpaka::concepts::impl::IView.
								     *
								     * @tparam T type to check; may be const-qualified and/or a reference
								     * @tparam T_ValueType type that `trait::GetValueType_t<std::decay_t<T>>` is checked against via
								     * `ExpectedValueType`; defaults to `alpaka::NotRequired` (presumably "accept any value type" -- confirm
								     * against the definition of `ExpectedValueType`)
								     *
								     * @attention Use this concept (`alpaka::concepts::IView`) to restrict types in your code. The actual
								     * interface is described in alpaka::concepts::impl::IView.
								     **/
								    template<typename T, typename T_ValueType = alpaka::NotRequired>
								    // NOTE(review): the parameter `t` is declared but not used by any requirement below
								    concept IView = requires(T t) {
								        // interface check on the reference-free type plus its mutable and const variants
								        requires impl::IView<
								            std::remove_reference_t<T>,
								            std::remove_const_t<std::remove_reference_t<T>>,
								            std::add_const_t<std::remove_reference_t<T>>>;
								        // value type check on the decayed type
								        requires ExpectedValueType<trait::GetValueType_t<std::decay_t<T>>, T_ValueType>;
								    };
								} // namespace alpaka::concepts
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/IView.hpp ==
								// ============================================================================

							// #include "alpaka/trait.hpp"    // amalgamate: file already inlined

							#include <type_traits>

							namespace alpaka::concepts
							{
							    /** No-op placeholder callable for use inside concept definitions.
							     *
							     * Takes no arguments and returns nothing. It stands in for an empty lambda, which nvcc could not handle
							     * inside concepts.
							     */
							    inline auto empty_callable() -> void
							    {
							        return;
							    }

							    namespace impl
							    {
							        /** @brief Interface concept for objects describing multidimensional owned memory.
							         *
							         * @details
							         * An `alpaka::buffer`-like object contains information about the device(s) to which it is connected. The
							         * `alpaka::buffer`-like object has memory ownership and therefore manages memory lifetime according to the
							         * RAII principle. The represented memory can have any dimensionality.
							         *
							         * Any object that fulfills the `IBuffer` concept is also an `IView` and `IMdSpan`.
							         *
							         * The concept itself only verifies that both member functions listed below can be called with
							         * `alpaka::concepts::empty_callable`; their return types are not constrained.
							         *
							         * @tparam T type to check, without reference
							         * @tparam T_Mut non-const variant of `T`, forwarded to `IView`
							         * @tparam T_Const const variant of `T`, forwarded to `IView`
							         *
							         * @section memberfunction member functions
							         *
							         * - <b>t.addDestructorAction</b>: Adds a destructor action to the shared buffer.
							         * @code{.unparsed}
							         *    The action will be executed when the buffer is destroyed.
							         *    This can be used to add additional cleanup actions e.g. waiting on a specific queue.
							         *    Actions are executed in FIFO order.
							         * @endcode
							         * - <b>t.destructorWaitFor</b>: Add an action to be executed when the shared_ptr is destroyed.
							         **/
							        template<typename T, typename T_Mut, typename T_Const>
							        concept IBuffer = requires(T t) {
							            requires IView<T, T_Mut, T_Const>;
							            // empty_callable is used instead of an empty lambda (see its documentation)
							            t.addDestructorAction(alpaka::concepts::empty_callable);
							            t.destructorWaitFor(alpaka::concepts::empty_callable);
							        };
							    } // namespace impl

							    /** @brief Interface concept for objects describing multidimensional owned memory.
							     *
							     * @details
							     * An `alpaka::buffer`-like object contains information about the device(s) to which it is connected. The
							     * `alpaka::buffer`-like object has memory ownership and therefore manages memory lifetime according to the RAII
							     * principle.
							     *
							     * The check is performed on `T` with references removed: the stripped type, its non-const variant and its
							     * const variant are forwarded to alpaka::concepts::impl::IBuffer.
							     *
							     * @tparam T type to check; may be const-qualified and/or a reference
							     * @tparam T_ValueType type that `trait::GetValueType_t<std::decay_t<T>>` is checked against via
							     * `ExpectedValueType`; defaults to `alpaka::NotRequired` (presumably "accept any value type" -- confirm
							     * against the definition of `ExpectedValueType`)
							     *
							     * @attention Use this concept (`alpaka::concepts::IBuffer`) to restrict types in your code. The actual
							     * interface is described in alpaka::concepts::impl::IBuffer.
							     **/
							    template<typename T, typename T_ValueType = alpaka::NotRequired>
							    // NOTE(review): the parameter `t` is declared but not used by any requirement below
							    concept IBuffer = requires(T t) {
							        // interface check on the reference-free type plus its mutable and const variants
							        requires impl::IBuffer<
							            std::remove_reference_t<T>,
							            std::remove_const_t<std::remove_reference_t<T>>,
							            std::add_const_t<std::remove_reference_t<T>>>;
							        // value type check on the decayed type
							        requires ExpectedValueType<trait::GetValueType_t<std::decay_t<T>>, T_ValueType>;
							    };
							} // namespace alpaka::concepts
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/IBuffer.hpp ==
							// ============================================================================

						// #include "alpaka/mem/concepts/IMdSpan.hpp"    // amalgamate: file already inlined
						// #include "alpaka/mem/concepts/IView.hpp"    // amalgamate: file already inlined
						// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

						// #include <concepts>    // amalgamate: file already included

						namespace alpaka
						{
						    namespace concepts
						    {
						        /** @brief Concept to check if the given type has a `get()` function.
						         *
						         * @details
						         * Satisfied if `t.get()` is a valid expression for an object `t` of type `T`. The return type is not
						         * constrained.
						         */
						        template<typename T>
						        concept HasGet = requires(T t) { t.get(); };

						        /** @brief Concept to check if the given type has a static `dim()` function
						         *
						         * @details
						         * Satisfied if `T::dim()` is a valid expression. The return type is not constrained.
						         */
						        template<typename T>
						        concept HasStaticDim = requires(T t) { T::dim(); };

						        /** @brief Concept to check if the given type is of the given dimensionality
						         *
						         * @details
						         * The checked type must also fulfill HasStaticDim. This concept does not reference HasStaticDim
						         * itself; it only evaluates `T::dim() == T_dim`.
						         *
						         * @tparam T The type to check
						         * @tparam T_dim The dimension the checked type should have
						         */
						        template<typename T, unsigned int T_dim>
						        concept Dim = (T::dim() == T_dim);

						        /** @brief Concept to check if the given type is a GPU DeviceKind
						         *
						         * @details
						         * Satisfied if `T` fulfills `alpaka::concepts::DeviceKind` and a value-initialized `T` compares equal to
						         * `deviceKind::nvidiaGpu`, `deviceKind::amdGpu` or `deviceKind::intelGpu`.
						         */
						        template<typename T>
						        concept GpuType
						            = alpaka::concepts::DeviceKind<T>
						              && (T{} == deviceKind::nvidiaGpu || T{} == deviceKind::amdGpu || T{} == deviceKind::intelGpu);

						        /** @brief Concept to check if the given type is a pointer, using std::is_pointer
						         *
						         * @details
						         * `T` is passed to `std::is_pointer_v` unmodified, i.e. references are not stripped.
						         */
						        template<typename T>
						        concept Pointer = std::is_pointer_v<T>;

						        /** @brief Concept to check that a device specification with an API and device kind can be extracted.
						         *
						         * @details
						         * Satisfied if `internal::getApi(t)` yields a type fulfilling `alpaka::concepts::Api` and
						         * `internal::getDeviceKind(t)` yields a type fulfilling `alpaka::concepts::DeviceKind`.
						         */
						        template<typename T>
						        concept DeviceSpec = requires(T t) {
						            { internal::getApi(t) } -> alpaka::concepts::Api;
						            { internal::getDeviceKind(t) } -> alpaka::concepts::DeviceKind;
						        };
						    } // namespace concepts

						    namespace internal
						    {
						        /** GetApi specialization for arguments that already fulfill alpaka::concepts::Api.
						         *
						         * The call operator hands the received api object back to the caller by value.
						         *
						         * @tparam T_Api api type this specialization is selected for
						         */
						        template<alpaka::concepts::Api T_Api>
						        struct GetApi::Op<T_Api>
						        {
						            constexpr auto operator()(auto&& apiTag) const
						            {
						                return apiTag;
						            }
						        };
						    } // namespace internal
						} // namespace alpaka
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/concepts.hpp ==
						// ============================================================================

					// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/trait.hpp ==
						// ==
						/* Copyright 2024 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "Handle.hpp"    // amalgamate: file already inlined
						// #include "alpaka/KernelBundle.hpp"    // amalgamate: file already inlined
						// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/executor.hpp ==
							// ==
							/* Copyright 2024 René Widera, Mehmet Yusufoglu
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/executor.hpp ==
								// ==
								/* Copyright 2024 René Widera
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
								// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/tag.hpp ==
									// ==
									/* Copyright 2024 René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									namespace alpaka
									{
									    namespace onAcc::internal
									    {
									        /** Empty tag type naming the atomic implementation used by the CUDA and HIP executors. */
									        struct CudaHipAtomic
									        {
									        };

									        /** Tag object of type CudaHipAtomic. */
									        constexpr CudaHipAtomic cudaHipAtomic{};
									    } // namespace onAcc::internal

									    namespace math::internal
									    {
									        /** Empty tag type; presumably selects the CUDA/HIP math implementation -- confirm at the use sites. */
									        struct CudaHipMath
									        {
									        };

									        /** Tag object of type CudaHipMath. */
									        constexpr CudaHipMath cudaHipMath{};
									    } // namespace math::internal

									    namespace internal
									    {
									        /** Empty tag type; presumably selects the CUDA/HIP intrinsic implementation -- confirm at the use sites. */
									        struct CudaHipIntrinsic
									        {
									        };

									        /** Tag object of type CudaHipIntrinsic. */
									        constexpr CudaHipIntrinsic cudaHipIntrinsic{};
									    } // namespace internal
									} // namespace alpaka
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/tag.hpp ==
									// ============================================================================

								// #include "alpaka/api/unifiedCudaHip/trait.hpp"    // amalgamate: file already inlined

								// #include <string>    // amalgamate: file already included

								namespace alpaka
								{
								    namespace exec
								    {
								        /** Executor tag type named "GpuCuda". */
								        struct GpuCuda
								        {
								            /** @return the name of this executor, "GpuCuda" */
								            static auto getName() -> std::string
								            {
								                return std::string{"GpuCuda"};
								            }
								        };

								        /** Tag object of type GpuCuda. */
								        constexpr auto gpuCuda = GpuCuda{};
								    } // namespace exec

								    namespace trait
								    {
								        /** Registers exec::GpuCuda as an executor: `IsExecutor<exec::GpuCuda>::value` is `true`. */
								        template<>
								        struct IsExecutor<exec::GpuCuda> : std::true_type
								        {
								        };
								    } // namespace trait
								} // namespace alpaka

								namespace alpaka::onAcc::trait
								{
								    /** Atomic implementation lookup for the GpuCuda executor.
								     *
								     * For every atomic scope type the call operator returns the `internal::cudaHipAtomic` tag object.
								     *
								     * @tparam T_AtomicScope scope type of the atomic operation; it is not inspected by this specialization
								     */
								    template<typename T_AtomicScope>
								    struct GetAtomicImpl::Op<alpaka::exec::GpuCuda, T_AtomicScope>
								    {
								        // decltype(auto) applied to an unparenthesized variable name yields the variable's declared type,
								        // so the tag object is returned by value and not by reference
								        constexpr decltype(auto) operator()(alpaka::exec::GpuCuda const, T_AtomicScope const) const
								        {
								            return internal::cudaHipAtomic;
								        }
								    };
								} // namespace alpaka::onAcc::trait

								namespace alpaka::unifiedCudaHip::trait
								{
								    /** Registers exec::GpuCuda as a unified CUDA/HIP executor: `IsUnifiedExecutor<...>::value` is `true`. */
								    template<>
								    struct IsUnifiedExecutor<alpaka::exec::GpuCuda> : std::true_type
								    {
								    };
								} // namespace alpaka::unifiedCudaHip::trait
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/executor.hpp ==
								// ============================================================================

								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/executor.hpp ==
								// ==
								/* Copyright 2024 René Widera
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
								// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
								// #include "alpaka/api/unifiedCudaHip/tag.hpp"    // amalgamate: file already inlined
								// #include "alpaka/api/unifiedCudaHip/trait.hpp"    // amalgamate: file already inlined

								// #include <string>    // amalgamate: file already included

								namespace alpaka
								{
								    namespace exec
								    {
								        /** Executor tag type named "GpuHip". */
								        struct GpuHip
								        {
								            /** @return the name of this executor, "GpuHip" */
								            static auto getName() -> std::string
								            {
								                return std::string{"GpuHip"};
								            }
								        };

								        /** Tag object of type GpuHip. */
								        constexpr auto gpuHip = GpuHip{};
								    } // namespace exec

								    namespace trait
								    {
								        /** Registers exec::GpuHip as an executor: `IsExecutor<exec::GpuHip>::value` is `true`. */
								        template<>
								        struct IsExecutor<exec::GpuHip> : std::true_type
								        {
								        };
								    } // namespace trait
								} // namespace alpaka

								namespace alpaka::onAcc::trait
								{
								    /** Atomic implementation lookup for the GpuHip executor.
								     *
								     * For every atomic scope type the call operator returns the `internal::cudaHipAtomic` tag object, i.e. the
								     * same tag the GpuCuda executor specialization returns.
								     *
								     * @tparam T_AtomicScope scope type of the atomic operation; it is not inspected by this specialization
								     */
								    template<typename T_AtomicScope>
								    struct GetAtomicImpl::Op<alpaka::exec::GpuHip, T_AtomicScope>
								    {
								        // decltype(auto) applied to an unparenthesized variable name yields the variable's declared type,
								        // so the tag object is returned by value and not by reference
								        constexpr decltype(auto) operator()(alpaka::exec::GpuHip const, T_AtomicScope const) const
								        {
								            return internal::cudaHipAtomic;
								        }
								    };
								} // namespace alpaka::onAcc::trait

								namespace alpaka::unifiedCudaHip::trait
								{
								    /** Registers exec::GpuHip as a unified CUDA/HIP executor: `IsUnifiedExecutor<...>::value` is `true`. */
								    template<>
								    struct IsUnifiedExecutor<alpaka::exec::GpuHip> : std::true_type
								    {
								    };
								} // namespace alpaka::unifiedCudaHip::trait
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/executor.hpp ==
								// ============================================================================

								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/executor.hpp ==
								// ==
								/* Copyright 2024 René Widera, Mehmet Yusufoglu
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/tag.hpp ==
									// ==
									/* Copyright 2024 René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									namespace alpaka::onAcc::internal
									{
									    /** Empty tag type; presumably selects the standard-library based atomic implementation -- confirm at the
									     * use sites.
									     */
									    struct StlAtomic
									    {
									    };

									    /** Tag object of type StlAtomic. */
									    constexpr StlAtomic stlAtomic{};
									} // namespace alpaka::onAcc::internal

									namespace alpaka::internal
									{
									    /** Empty tag type naming the intrinsic implementation that is built on the standard library. */
									    struct StlIntrinsic
									    {
									    };

									    /** Tag object of type StlIntrinsic. */
									    constexpr StlIntrinsic stlIntrinsic{};
									} // namespace alpaka::internal
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/tag.hpp ==
									// ============================================================================

								// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/internal/stlIntrinsic.hpp ==
									// ==
									/* Copyright 2025 Luca Venerando Greco, René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									// #include "alpaka/api/host/tag.hpp"    // amalgamate: file already inlined
									// #include "alpaka/core/Unreachable.hpp"    // amalgamate: file already inlined
										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/internal/intrinsic.hpp ==
										// ==
										/* Copyright 2025 Luca Venerando Greco, René Widera
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										// #include <cstdint>    // amalgamate: file already included

										namespace alpaka::internal::intrinsic
										{
										    /** Dispatch point for the population count intrinsic.
										     *
										     * `Op` is specialized per intrinsic implementation tag; the primary template only declares the call
										     * operator.
										     */
										    struct Popcount
										    {
										        template<typename T_IntrinsicImpl, typename T_Arg>
										        struct Op
										        {
										            auto operator()(T_IntrinsicImpl const, T_Arg const& value) const -> int32_t;
										        };
										    };

										    /** Dispatch point for the find-first-set intrinsic.
										     *
										     * `Op` is specialized per intrinsic implementation tag; the primary template only declares the call
										     * operator.
										     */
										    struct Ffs
										    {
										        template<typename T_IntrinsicImpl, typename T_Arg>
										        struct Op
										        {
										            auto operator()(T_IntrinsicImpl const, T_Arg const& value) const -> int32_t;
										        };
										    };

										    /** Dispatch point for the count-leading-zeros intrinsic.
										     *
										     * `Op` is specialized per intrinsic implementation tag; the primary template only declares the call
										     * operator.
										     */
										    struct Clz
										    {
										        template<typename T_IntrinsicImpl, typename T_Arg>
										        struct Op
										        {
										            auto operator()(T_IntrinsicImpl const, T_Arg const& value) const -> int32_t;
										        };
										    };
										} // namespace alpaka::internal::intrinsic
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/internal/intrinsic.hpp ==
										// ============================================================================


									// #include <bit>    // amalgamate: file already included
									#include <type_traits>

									namespace alpaka::internal::intrinsic
									{
									    /** Population count for the StlIntrinsic implementation.
									     *
									     * The bits of `val` are reinterpreted as an unsigned integer of the same size and the number of set bits is
									     * returned via `std::popcount`.
									     *
									     * @tparam T_Arg argument type; `sizeof(T_Arg)` must be 4 or 8, any other size triggers the static_assert
									     */
									    template<typename T_Arg>
									    struct Popcount::Op<alpaka::internal::StlIntrinsic, T_Arg>
									    {
									        constexpr auto operator()(alpaka::internal::StlIntrinsic const, T_Arg const& val) const
									        {
									            if constexpr(sizeof(T_Arg) == 4u)
									            {
									                auto const bits = std::bit_cast<unsigned int>(val);
									                return std::popcount(bits);
									            }
									            else if constexpr(sizeof(T_Arg) == 8u)
									            {
									                auto const bits = std::bit_cast<unsigned long long>(val);
									                return std::popcount(bits);
									            }
									            else
									            {
									                // the condition depends on T_Arg, so it only fires when this branch is instantiated
									                static_assert(!sizeof(T_Arg), "Unsupported data type, sizeof() must be 4 or 8");
									            }

									            ALPAKA_UNREACHABLE(int{});
									        }
									    };

									    /** Find-first-set for the StlIntrinsic implementation.
									     *
									     * The bits of `val` are reinterpreted as an unsigned integer of the same size. The result is the 1-based
									     * position of the least significant set bit, or 0 if no bit is set.
									     *
									     * Both size branches return `int`, matching the `int{}` handed to ALPAKA_UNREACHABLE and the signed
									     * `int32_t` result declared by the generic `Ffs::Op` interface.
									     *
									     * @tparam T_Arg argument type; `sizeof(T_Arg)` must be 4 or 8, any other size triggers the static_assert
									     */
									    template<typename T_Arg>
									    struct Ffs::Op<alpaka::internal::StlIntrinsic, T_Arg>
									    {
									        constexpr auto operator()(alpaka::internal::StlIntrinsic const, T_Arg const& val) const
									        {
									            if constexpr(sizeof(T_Arg) == 4u)
									            {
									                auto value = std::bit_cast<unsigned int>(val);
									                // signed literal on purpose: with `0u` the conditional expression, and therefore the deduced
									                // return type, would be `unsigned int` and differ from the 8 byte branch
									                return value == 0u ? 0 : std::countr_zero(value) + 1;
									            }
									            else if constexpr(sizeof(T_Arg) == 8u)
									            {
									                auto value = std::bit_cast<unsigned long long>(val);
									                return value == 0u ? 0 : std::countr_zero(value) + 1;
									            }
									            else
									                static_assert(!sizeof(T_Arg), "Unsupported data type, sizeof() must be 4 or 8");

									            ALPAKA_UNREACHABLE(int{});
									        }
									    };

									    /** Count-leading-zeros for the StlIntrinsic implementation.
									     *
									     * The bits of `val` are reinterpreted as an unsigned integer of the same size and the number of leading
									     * zero bits is returned via `std::countl_zero`.
									     *
									     * @tparam T_Arg argument type; `sizeof(T_Arg)` must be 4 or 8, any other size triggers the static_assert
									     */
									    template<typename T_Arg>
									    struct Clz::Op<alpaka::internal::StlIntrinsic, T_Arg>
									    {
									        constexpr auto operator()(alpaka::internal::StlIntrinsic const, T_Arg const& val) const
									        {
									            if constexpr(sizeof(T_Arg) == 4u)
									            {
									                auto const bits = std::bit_cast<unsigned int>(val);
									                return std::countl_zero(bits);
									            }
									            else if constexpr(sizeof(T_Arg) == 8u)
									            {
									                auto const bits = std::bit_cast<unsigned long long>(val);
									                return std::countl_zero(bits);
									            }
									            else
									            {
									                // the condition depends on T_Arg, so it only fires when this branch is instantiated
									                static_assert(!sizeof(T_Arg), "Unsupported data type, sizeof() must be 4 or 8");
									            }

									            ALPAKA_UNREACHABLE(int{});
									        }
									    };
									} // namespace alpaka::internal::intrinsic
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/internal/stlIntrinsic.hpp ==
									// ============================================================================

									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/atomic.hpp ==
									// ==
									/* Copyright 2025 René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/interface.hpp ==
										// ==
										/* Copyright 2024 René Widera
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										// #include "alpaka/CVec.hpp"    // amalgamate: file already inlined
											// ============================================================================
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/UniqueId.hpp ==
											// ==
											/* Copyright 2024 René Widera
											 * SPDX-License-Identifier: MPL-2.0
											 */

											// #pragma once
											// #include <cstdint>    // amalgamate: file already included
											#include <source_location>
											#include <string_view>

											namespace alpaka
											{
											    class UniqueId
											    {
											    public:
											        static constexpr size_t getId(std::source_location const location = std::source_location::current())
											        {
											            return generate(location);
											        }

											    private:
											        static constexpr size_t generate(std::source_location const& location)
											        {
											            size_t hash = 0xc6a4'a793'5bd1'e995;
											            hashCombine(hash, location.file_name());
											            hashCombine(hash, location.function_name());
											            hashCombine(hash, location.line());
											            hashCombine(hash, static_cast<size_t>(location.column()) << 32u);
											            return hash;
											        }

											        static constexpr void hashCombine(size_t& seed, std::string_view value)
											        {
											            for(char c : value)
											            {
											                seed ^= static_cast<size_t>(c) + 0x9e37'79b9 + (seed << 6) + (seed >> 2);
											            }
											        }

											        static constexpr void hashCombine(size_t& seed, size_t value)
											        {
											            seed ^= value + 0x9e37'79b9 + (seed << 6) + (seed >> 2);
											        }
											    };

											    /** Creates an id that is unique per call site.
											     *
											     * The id is derived from the file name, function name, line, and column from where this function is called.
											     * If a class stores the compile time id and the file of the class is included within two compile units,
											     * the id is equal in both compile units.
											     * If this call is used as the default of a class template parameter, the number is generated only once
											     * and not each time the class is used.
											     *
											     * @param location The location is the base for the unique id. For the same location the same id is generated.
											     * @return unique id
											     */
											    inline consteval auto uniqueId(std::source_location const location = std::source_location::current())
											        -> size_t
											    {
											        auto const id = UniqueId::getId(location);
											        return id;
											    }
											} // namespace alpaka
											// ==
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/UniqueId.hpp ==
											// ============================================================================

										// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
										// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
											// ============================================================================
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/tag.hpp ==
											// ==
											/* Copyright 2024 René Widera
											 * SPDX-License-Identifier: MPL-2.0
											 */

											// #pragma once
											// #include "alpaka/core/PP.hpp"    // amalgamate: file already inlined
											// #include "alpaka/core/Tag.hpp"    // amalgamate: file already inlined
											// #include "alpaka/core/util.hpp"    // amalgamate: file already inlined

											// #include <cassert>    // amalgamate: file already included
											// #include <tuple>    // amalgamate: file already included

											namespace alpaka::onAcc
											{
											    /** Origin of index domains
											     *
											     * An origin is used to query the index domain within a block or grid.
											     *
											     * The tags declared here are `thread`, `warp`, `block` and `grid`. ALPAKA_TAG is defined elsewhere;
											     * presumably it declares a tag type together with an object of the given name -- confirm against
											     * alpaka/core/Tag.hpp.
											     */
											    namespace origin
											    {
											        ALPAKA_TAG(thread);
											        ALPAKA_TAG(warp);
											        ALPAKA_TAG(block);
											        ALPAKA_TAG(grid);
											    } // namespace origin

											    /** Unit of index domains
											     *
											     * A unit is used to describe the quantity of the index domain with respect to an origin
											     *
											     * The tags declared here are `warps`, `threads` and `blocks`. ALPAKA_TAG is defined elsewhere;
											     * presumably it declares a tag type together with an object of the given name -- confirm against
											     * alpaka/core/Tag.hpp.
											     */
											    namespace unit
											    {
											        ALPAKA_TAG(warps);
											        ALPAKA_TAG(threads);
											        ALPAKA_TAG(blocks);
											    } // namespace unit

											    namespace trait
											    {
											        /** Type trait telling whether `T` is an origin tag type.
											         *
											         * The primary template is `false`; the specializations below enable the types of `origin::warp`,
											         * `origin::block`, `origin::grid` and `origin::thread`.
											         */
											        template<typename T>
											        struct IsOrigin : std::false_type
											        {
											        };

											        template<>
											        struct IsOrigin<ALPAKA_TYPEOF(origin::warp)> : std::true_type
											        {
											        };

											        template<>
											        struct IsOrigin<ALPAKA_TYPEOF(origin::block)> : std::true_type
											        {
											        };

											        template<>
											        struct IsOrigin<ALPAKA_TYPEOF(origin::grid)> : std::true_type
											        {
											        };

											        template<>
											        struct IsOrigin<ALPAKA_TYPEOF(origin::thread)> : std::true_type
											        {
											        };

											        /** Type trait telling whether `T` is a unit tag type.
											         *
											         * The primary template is `false`; the specializations below enable the types of `unit::threads`,
											         * `unit::warps` and `unit::blocks`.
											         */
											        template<typename T>
											        struct IsUnit : std::false_type
											        {
											        };

											        template<>
											        struct IsUnit<ALPAKA_TYPEOF(unit::threads)> : std::true_type
											        {
											        };

											        template<>
											        struct IsUnit<ALPAKA_TYPEOF(unit::warps)> : std::true_type
											        {
											        };

											        template<>
											        struct IsUnit<ALPAKA_TYPEOF(unit::blocks)> : std::true_type
											        {
											        };
											    } // namespace trait

											    /** Shorthand for `trait::IsOrigin<T>::value`. `T` is used as given, without removing cv or reference. */
											    template<typename T>
											    constexpr bool isOrigin_v = trait::IsOrigin<T>::value;

											    /** Shorthand for `trait::IsUnit<T>::value`. `T` is used as given, without removing cv or reference. */
											    template<typename T>
											    constexpr bool isUnit_v = trait::IsUnit<T>::value;

											    namespace concepts
											    {
											        /** Satisfied if `isOrigin_v<T>` is true, i.e. `trait::IsOrigin` is enabled for `T`. */
											        template<typename T>
											        concept Origin = isOrigin_v<T>;

											        /** Satisfied if `isUnit_v<T>` is true, i.e. `trait::IsUnit` is enabled for `T`. */
											        template<typename T>
											        concept Unit = isUnit_v<T>;
											    } // namespace concepts

											} // namespace alpaka::onAcc
											// ==
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/tag.hpp ==
											// ============================================================================

										// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

										namespace alpaka::onAcc
										{
										    namespace internalCompute
										    {
										        /** Dispatch point for synchronization.
										         *
										         * The primary `Op` template only declares the call operator; it is invoked by `sync()` with the
										         * accelerator and the scope. Implementations are presumably supplied as specializations elsewhere --
										         * confirm in the api specific headers.
										         */
										        struct Sync
										        {
										            /**
										             * @tparam T_Acc accelerator type
										             * @tparam T_Scope scope type, must fulfill alpaka::concepts::Layer
										             */
										            template<typename T_Acc, alpaka::concepts::Layer T_Scope>
										            struct Op
										            {
										                constexpr auto operator()(T_Acc const& acc, T_Scope const scope) const;
										            };
										        };

										        constexpr void sync(auto const& acc, alpaka::concepts::Layer auto const scope)
										        {
										            Sync::Op<ALPAKA_TYPEOF(acc), ALPAKA_TYPEOF(scope)>{}(acc, scope);
										        }

										        /** Groups the functors used to obtain shared memory from an accelerator. */
										        struct SharedMemory
										        {
										            /** Obtains a variable of type `T` from the `layer::shared` entry of the accelerator.
										             *
										             * @tparam T type of the variable
										             * @tparam T_uniqueId id forwarded to `allocVar`
										             * @tparam T_Acc accelerator type; not used by this primary template
										             */
										            template<typename T, size_t T_uniqueId, typename T_Acc>
										            struct Static
										            {
										                /** @return whatever `allocVar<T, T_uniqueId>()` returns, with its value category preserved */
										                constexpr decltype(auto) operator()(auto const& acc) const
										                {
										                    return acc[layer::shared].template allocVar<T, T_uniqueId>();
										                }
										            };

										            /** Obtains a pointer to `T` from the `layer::dynShared` entry of the accelerator.
										             *
										             * @tparam T element type of the returned pointer
										             * @tparam T_Acc accelerator type; must report the key `object::dynSharedMemBytes`
										             */
										            template<typename T, typename T_Acc>
										            struct Dynamic
										            {
										                /** @return the result of `allocDynamic`, called with the byte count stored in the accelerator */
										                constexpr auto operator()(auto const& acc) const -> T*
										                {
										                    static_assert(
										                        T_Acc::hasKey(object::dynSharedMemBytes),
										                        "Dynamic shared memory not configured. Add member 'dynSharedMemBytes' to the kernel or "
										                        "specialize 'onHost::trait::BlockDynSharedMemBytes'!");
										                    uint32_t numBytes = acc[object::dynSharedMemBytes];
										                    // uniqueId() is evaluated at this source line, so all instantiations pass the same id
										                    return acc[layer::dynShared].template allocDynamic<T, uniqueId()>(numBytes);
										                }
										            };
										        };

										        template<typename T, size_t T_uniqueId>
										        constexpr decltype(auto) declareSharedVar(auto const& acc)
										        {
										            return SharedMemory::Static<T, T_uniqueId, std::decay_t<decltype(acc)>>{}(acc);
										        }

										        template<typename T>
										        constexpr auto declareDynamicSharedMem(auto const& acc) -> T*
										        {
										            return SharedMemory::Dynamic<T, std::decay_t<decltype(acc)>>{}(acc);
										        }

										        struct Atomic
										        {
										            /** Implements an atomic operation.
										             *
										             * Only declared here; specializations are selected by the operation functor, the atomic
										             * implementation tag, the value type and the scope. `TSfinae` defaults to `void`.
										             */
										            template<typename TOp, typename TAtomicImpl, typename T, typename T_Scope, typename TSfinae = void>
										            struct Op;
										        };

										        /** Get the index of an object within a layer in the selected units.
										         *
										         * `Op` only declares the call operator; the result is constrained to `alpaka::concepts::Vector`.
										         * The definition is not visible in this part of the file.
										         */
										        struct GetIdxWithin
										        {
										            template<typename T_Acc, typename T_Origin, typename T_Unit>
										            struct Op
										            {
										                constexpr alpaka::concepts::Vector auto operator()(T_Acc const& acc, T_Origin origin, T_Unit unit)
										                    const;
										            };
										        };

										        /** Get the number of elements in a layer in the selected units.
										         *
										         * `Op` only declares the call operator; the result is constrained to `alpaka::concepts::Vector`.
										         * The definition is not visible in this part of the file.
										         */
										        struct GetExtentsOf
										        {
										            template<typename T_Acc, typename T_Origin, typename T_Unit>
										            struct Op
										            {
										                constexpr alpaka::concepts::Vector auto operator()(T_Acc const& acc, T_Origin origin, T_Unit unit)
										                    const;
										            };
										        };

										        /** Dispatch point for memory fences, selected by accelerator, memory order and scope type. */
										        struct MemoryFence
										        {
										            // Backend specializations provide the definition.
										            template<typename T_Acc, typename T_MemoryOrder, typename T_Scope>
										            struct Op
										            {
										                constexpr void operator()(T_Acc const& acc, T_MemoryOrder const order, T_Scope const scope) const;
										            };
										        };
										    } // namespace internalCompute
										} // namespace alpaka::onAcc
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/interface.hpp ==
										// ============================================================================

										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/operation.hpp ==
										// ==
										/* Copyright 2020 Benjamin Worpitz, Bernhard Manfred Gruber
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
										// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

										// #include <algorithm>    // amalgamate: file already included
										#include <type_traits>

										/** Contains functors whose call interface follows the semantics of the corresponding atomic operations.
										 *
										 * @attention The operations themselves are not atomic; only the argument interface follows the
										 * corresponding atomic operations. The value to be updated is always handed in as a pointer.
										 *
										 */
										namespace alpaka::operation
										{
										    //! The addition function object.
										    struct Add
										    {
										        //! \return The old value of addr.
										        template<typename T>
										        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
										        {
										            auto const old = *addr;
										            auto& ref = *addr;
										#if ALPAKA_COMP_GNUC
										#    pragma GCC diagnostic push
										#    pragma GCC diagnostic ignored "-Wconversion"
										#endif
										            ref += value;
										            return old;
										#if ALPAKA_COMP_GNUC
										#    pragma GCC diagnostic pop
										#endif
										        }
										    };

										    //! Function object performing a subtraction on the pointed-to value.
										    struct Sub
										    {
										        //! Subtracts \p value from the object \p addr points to.
										        //!
										        //! \return The value stored at addr before the update.
										        ALPAKA_NO_HOST_ACC_WARNING
										        template<typename T>
										        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
										        {
										            auto const previous = *addr;
										#if ALPAKA_COMP_GNUC
										#    pragma GCC diagnostic push
										#    pragma GCC diagnostic ignored "-Wconversion"
										#endif
										            *addr -= value;
										#if ALPAKA_COMP_GNUC
										#    pragma GCC diagnostic pop
										#endif
										            return previous;
										        }
										    };

										    //! Function object storing the minimum of the pointed-to value and the argument.
										    struct Min
										    {
										        //! Replaces the object \p addr points to with `std::min` of itself and \p value.
										        //!
										        //! \return The value stored at addr before the update.
										        ALPAKA_NO_HOST_ACC_WARNING
										        template<typename T>
										        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
										        {
										            auto const previous = *addr;
										            *addr = std::min(*addr, value);
										            return previous;
										        }
										    };

										    //! Function object storing the maximum of the pointed-to value and the argument.
										    struct Max
										    {
										        //! Replaces the object \p addr points to with `std::max` of itself and \p value.
										        //!
										        //! \return The value stored at addr before the update.
										        ALPAKA_NO_HOST_ACC_WARNING
										        template<typename T>
										        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
										        {
										            auto const previous = *addr;
										            *addr = std::max(*addr, value);
										            return previous;
										        }
										    };

										    //! Function object exchanging the pointed-to value with the argument.
										    struct Exch
										    {
										        //! Overwrites the object \p addr points to with \p value.
										        //!
										        //! \return The value stored at addr before the update.
										        ALPAKA_NO_HOST_ACC_WARNING
										        template<typename T>
										        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
										        {
										            auto const previous = *addr;
										            *addr = value;
										            return previous;
										        }
										    };

										    //! Function object performing a wrapping increment.
										    struct Inc
										    {
										        //! Counts upwards by one; once the stored value has reached \p value (or lies above it) the
										        //! stored value is reset to 0.
										        //!
										        //! \return The value stored at addr before the update.
										        ALPAKA_NO_HOST_ACC_WARNING
										        template<typename T>
										        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
										        {
										            auto const previous = *addr;
										            if(previous >= value)
										            {
										                // limit reached: wrap around
										                *addr = static_cast<T>(0);
										            }
										            else
										            {
										                *addr = static_cast<T>(previous + static_cast<T>(1));
										            }
										            return previous;
										        }
										    };

										    //! Function object performing a wrapping decrement.
										    struct Dec
										    {
										        //! Counts downwards by one; if the stored value is 0 or lies above \p value the stored value
										        //! is reset to \p value.
										        //!
										        //! \return The value stored at addr before the update.
										        ALPAKA_NO_HOST_ACC_WARNING
										        template<typename T>
										        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
										        {
										            auto const previous = *addr;
										            bool const wrapAround = (previous == static_cast<T>(0)) || (previous > value);
										            if(wrapAround)
										            {
										                *addr = value;
										            }
										            else
										            {
										                *addr = static_cast<T>(previous - static_cast<T>(1));
										            }
										            return previous;
										        }
										    };

										    //! Function object performing a bitwise and on the pointed-to value.
										    struct And
										    {
										        //! Applies `&=` with \p value to the object \p addr points to.
										        //!
										        //! \return The value stored at addr before the update.
										        ALPAKA_NO_HOST_ACC_WARNING
										        template<typename T>
										        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
										        {
										            auto const previous = *addr;
										            *addr &= value;
										            return previous;
										        }
										    };

										    //! Function object performing a bitwise or on the pointed-to value.
										    struct Or
										    {
										        //! Applies `|=` with \p value to the object \p addr points to.
										        //!
										        //! \return The value stored at addr before the update.
										        ALPAKA_NO_HOST_ACC_WARNING
										        template<typename T>
										        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
										        {
										            auto const previous = *addr;
										            *addr |= value;
										            return previous;
										        }
										    };

										    //! Function object performing a bitwise exclusive or on the pointed-to value.
										    struct Xor
										    {
										        //! Applies `^=` with \p value to the object \p addr points to.
										        //!
										        //! \return The value stored at addr before the update.
										        ALPAKA_NO_HOST_ACC_WARNING
										        template<typename T>
										        ALPAKA_FN_HOST_ACC auto operator()(T* const addr, T const& value) const -> T
										        {
										            auto const previous = *addr;
										            *addr ^= value;
										            return previous;
										        }
										    };

										    //! The compare and swap function object.
										    //!
										    //! The stored value is replaced by \p value only if it matches \p compare; otherwise it is left
										    //! unchanged. Two overloads exist, selected by whether T is a floating point type.
										    struct Cas
										    {
										        //! Cas for non floating point values
										        //!
										        //! \param addr Pointer to the value that is conditionally updated.
										        //! \param compare Value the stored value is compared against.
										        //! \param value Value stored if the comparison succeeds.
										        //! \return The old value of addr.
										        ALPAKA_NO_HOST_ACC_WARNING
										        template<typename T, std::enable_if_t<!std::is_floating_point_v<T>, bool> = true>
										        ALPAKA_FN_HOST_ACC auto operator()(T* addr, T const& compare, T const& value) const -> T
										        {
										            auto const old = *addr;
										            auto& ref = *addr;

										            // compare with operator== (the bit pattern is only inspected in the floating point overload)
										            ref = ((old == compare) ? value : old);
										            return old;
										        }

										        //! Cas for floating point values
										        //!
										        //! The comparison is done on the bit representation instead of with operator==.
										        //!
										        //! \param addr Pointer to the value that is conditionally updated.
										        //! \param compare Value whose bit pattern the stored value is compared against.
										        //! \param value Value stored if the bit patterns are equal.
										        //! \return The old value of addr.
										        ALPAKA_NO_HOST_ACC_WARNING
										        template<typename T, std::enable_if_t<std::is_floating_point_v<T>, bool> = true>
										        ALPAKA_FN_HOST_ACC auto operator()(T* addr, T const& compare, T const& value) const -> T
										        {
										            static_assert(sizeof(T) == 4u || sizeof(T) == 8u, "Cas is supporting only 32bit and 64bit values!");
										            // Unsigned integer type of the same size as T, used to perform the bit comparison
										            using BitType = std::conditional_t<sizeof(T) == 4u, unsigned int, unsigned long long>;

										            // type used to reinterpret the data as another type
										            // std::variant can not be used because clang8 has issues to compile std::variant
										            // NOTE(review): `value` is initialized and `r` is read afterwards, i.e. this relies on the
										            // compiler supporting type punning through a union.
										            struct BitUnion
										            {
										                union
										                {
										                    T value;
										                    BitType r;
										                };
										            };

										            auto const old = *addr;
										            auto& ref = *addr;


										            BitUnion o{old};
										            BitUnion c{compare};

										            // swap only if both bit patterns are identical
										            ref = ((o.r == c.r) ? value : old);
										            return old;
										        }
										    };
										} // namespace alpaka::operation
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/operation.hpp ==
										// ============================================================================


									namespace alpaka::onAcc::internal
									{
									    /** Empty tag type identifying the non-atomic implementation of an operation. */
									    struct NonAtomic
									    {
									    };

									    /** Tag object: execute the operation as non-atomic operation */
									    constexpr NonAtomic nonAtomic{};

									} // namespace alpaka::onAcc::internal

									namespace alpaka::onAcc::internalCompute
									{
									    /** `Atomic::Op` for the `NonAtomic` implementation tag and an arbitrary operation functor.
									     *
									     * The operation functor is default constructed and called directly with (addr, value); no
									     * synchronization is performed here.
									     */
									    template<typename T, typename T_AtomicOp, typename T_Scope>
									    struct Atomic::Op<T_AtomicOp, onAcc::internal::NonAtomic, T, T_Scope>
									    {
									        //! \return The result of `T_AtomicOp{}(addr, value)`.
									        static auto atomicOp(onAcc::internal::NonAtomic const&, T* const addr, T const& value) -> T
									        {
									            return T_AtomicOp{}(addr, value);
									        }
									    };

									    /** `Atomic::Op` for the `NonAtomic` implementation tag and `operation::Cas`.
									     *
									     * Separate specialization because compare-and-swap takes an additional `compare` argument.
									     */
									    template<typename T, typename T_Scope>
									    struct Atomic::Op<operation::Cas, internal::NonAtomic, T, T_Scope>
									    {
									        //! \return The result of `operation::Cas{}(addr, compare, value)`.
									        static auto atomicOp(internal::NonAtomic const&, T* const addr, T const& compare, T const& value) -> T
									        {
									            return operation::Cas{}(addr, compare, value);
									        }
									    };
									} // namespace alpaka::onAcc::internalCompute
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/atomic.hpp ==
									// ============================================================================

									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/scope.hpp ==
									// ==
									/* Copyright 2025 Mehmet Yusufoglu, René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									// #include <string>    // amalgamate: file already included

									/**
									 * @brief Provides scopes for atomic and memory fence operations, analogous to NVIDIA CUDA's atomic and fence scopes.
									 *
									 * This namespace defines the visibility scopes for atomic operations and memory fences,
									 * which control the visibility of memory operations across threads, blocks, and devices.
									 * The provided scopes are:
									 * - Block: Visibility within a thread block.
									 * - Device: Visibility across all thread blocks on the same device.
									 * - System: System-wide visibility, mapped to the strongest available atomic/fence by the backend.
									 *
									 * @see alpaka::onAcc::atomicAdd, alpaka::onAcc::memFence
									 */
									namespace alpaka::onAcc
									{
									    namespace scope
									    {
									        /**
									         * @brief Common base of all scope types.
									         *
									         * APIs that should only accept valid scopes can constrain their arguments on this tag.
									         */
									        struct ScopeTag
									        {
									        };

									        /**
									         * @brief Block scope: atomic and fence operations are visible only within the same thread block.
									         *
									         * An atomic operation (e.g., atomicAdd) with this scope makes the updated value visible only to
									         * threads of the same block. A threadFence with this scope ensures that all writes of the current
									         * thread are visible to all other threads in the same block.
									         *
									         * @note Analogous to CUDA's `atomicAdd_block` and `threadFence_block`.
									         */
									        struct Block : ScopeTag
									        {
									            //! @return the name of the scope, "Block"
									            static auto getName() -> std::string
									            {
									                return std::string{"Block"};
									            }
									        };

									        inline constexpr auto block = Block{};

									        /**
									         * @brief Device scope: atomic and fence operations are visible across all thread blocks on the
									         * same device.
									         *
									         * An atomic operation with this scope makes the updated value visible to all threads on the same
									         * device. A threadFence with this scope ensures that all writes of the current thread are visible
									         * to all other threads on the same device.
									         *
									         * @note This scope is stronger than Block but weaker than System.
									         */
									        struct Device : ScopeTag
									        {
									            //! @return the name of the scope, "Device"
									            static auto getName() -> std::string
									            {
									                return std::string{"Device"};
									            }
									        };

									        inline constexpr auto device = Device{};

									        /**
									         * @brief System scope: atomic and fence operations have system-wide visibility.
									         *
									         * An atomic operation with this scope makes the updated value visible to all threads in the system
									         * (potentially across multiple devices). A threadFence with this scope ensures that all writes of
									         * the current thread are visible to all other threads in the system.
									         *
									         * @attention System operations are only visible to other threads of the same device kind.
									         * Operations executed on a host compute device will not be visible to threads in, for example,
									         * CUDA/HIP or oneAPI kernels, and vice versa.
									         *
									         * @note This is the strongest scope, analogous to CUDA's `atomicAdd_system` and the strongest fence.
									         */
									        struct System : ScopeTag
									        {
									            //! @return the name of the scope, "System"
									            static auto getName() -> std::string
									            {
									                return std::string{"System"};
									            }
									        };

									        inline constexpr auto system = System{};
									    } // namespace scope

									    namespace concepts
									    {
									        /** Satisfied by every type derived from `scope::ScopeTag` (as checked by `std::derived_from`). */
									        template<typename T>
									        concept Scope = std::derived_from<T, scope::ScopeTag>;
									    } // namespace concepts

									} // namespace alpaka::onAcc
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/scope.hpp ==
									// ============================================================================

								// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

								// #include <string>    // amalgamate: file already included

								namespace alpaka
								{
								    namespace exec
								    {
								        /** Executor tag type reporting the name "CpuSerial". */
								        struct CpuSerial
								        {
								            //! @return the name of the executor
								            static auto getName() -> std::string
								            {
								                return std::string{"CpuSerial"};
								            }
								        };

								        /** Tag object of type CpuSerial. */
								        constexpr auto cpuSerial = CpuSerial{};

								        /** Executor tag type reporting the name "CpuOmpBlocks". */
								        struct CpuOmpBlocks
								        {
								            //! @return the name of the executor
								            static auto getName() -> std::string
								            {
								                return std::string{"CpuOmpBlocks"};
								            }
								        };

								        /** Tag object of type CpuOmpBlocks. */
								        constexpr auto cpuOmpBlocks = CpuOmpBlocks{};

								        /** Executor tag type reporting the name "CpuTbbBlocks". */
								        struct CpuTbbBlocks
								        {
								            //! @return the name of the executor
								            static auto getName() -> std::string
								            {
								                return std::string{"CpuTbbBlocks"};
								            }
								        };

								        /** Tag object of type CpuTbbBlocks. */
								        constexpr auto cpuTbbBlocks = CpuTbbBlocks{};

								        namespace trait
								        {
								            //! `IsSeqExecutor` is true for CpuSerial.
								            template<>
								            struct IsSeqExecutor<CpuSerial> : std::true_type
								            {
								            };

								            //! `IsSeqExecutor` is true for CpuOmpBlocks.
								            template<>
								            struct IsSeqExecutor<CpuOmpBlocks> : std::true_type
								            {
								            };

								            //! `IsSeqExecutor` is true for CpuTbbBlocks.
								            template<>
								            struct IsSeqExecutor<CpuTbbBlocks> : std::true_type
								            {
								            };
								        } // namespace trait
								    } // namespace exec

								    namespace trait
								    {
								        //! Registers exec::CpuSerial as an executor.
								        template<>
								        struct IsExecutor<exec::CpuSerial> : std::true_type
								        {
								        };

								        //! Registers exec::CpuOmpBlocks as an executor.
								        template<>
								        struct IsExecutor<exec::CpuOmpBlocks> : std::true_type
								        {
								        };

								        //! Registers exec::CpuTbbBlocks as an executor.
								        template<>
								        struct IsExecutor<exec::CpuTbbBlocks> : std::true_type
								        {
								        };

								    } // namespace trait
								} // namespace alpaka

								/** Selection of the atomic implementation tag for the host executors.
								 *
								 * For each of CpuSerial, CpuOmpBlocks and CpuTbbBlocks the same pattern is used: the generic
								 * specialization returns `stlAtomic` for any scope, and the full specialization for `scope::Block`
								 * returns `nonAtomic`.
								 * NOTE(review): presumably block scope needs no real atomics because these executors are also marked
								 * with `IsSeqExecutor` - confirm.
								 */
								namespace alpaka::onAcc::trait
								{
								    //! CpuSerial, any scope without a more specific specialization: stlAtomic.
								    template<typename T_AtomicScope>
								    struct GetAtomicImpl::Op<alpaka::exec::CpuSerial, T_AtomicScope>
								    {
								        constexpr decltype(auto) operator()(alpaka::exec::CpuSerial const, T_AtomicScope const) const
								        {
								            return alpaka::onAcc::internal::stlAtomic;
								        }
								    };

								    //! CpuSerial, block scope: nonAtomic.
								    template<>
								    struct GetAtomicImpl::Op<alpaka::exec::CpuSerial, onAcc::scope::Block>
								    {
								        constexpr decltype(auto) operator()(alpaka::exec::CpuSerial const, onAcc::scope::Block const) const
								        {
								            return alpaka::onAcc::internal::nonAtomic;
								        }
								    };

								    //! CpuOmpBlocks, any scope without a more specific specialization: stlAtomic.
								    template<typename T_AtomicScope>
								    struct GetAtomicImpl::Op<alpaka::exec::CpuOmpBlocks, T_AtomicScope>
								    {
								        constexpr decltype(auto) operator()(alpaka::exec::CpuOmpBlocks const, T_AtomicScope const) const
								        {
								            return alpaka::onAcc::internal::stlAtomic;
								        }
								    };

								    //! CpuOmpBlocks, block scope: nonAtomic.
								    template<>
								    struct GetAtomicImpl::Op<alpaka::exec::CpuOmpBlocks, onAcc::scope::Block>
								    {
								        constexpr decltype(auto) operator()(alpaka::exec::CpuOmpBlocks const, onAcc::scope::Block const) const
								        {
								            return alpaka::onAcc::internal::nonAtomic;
								        }
								    };

								    //! CpuTbbBlocks, any scope without a more specific specialization: stlAtomic.
								    template<typename T_AtomicScope>
								    struct GetAtomicImpl::Op<alpaka::exec::CpuTbbBlocks, T_AtomicScope>
								    {
								        constexpr decltype(auto) operator()(alpaka::exec::CpuTbbBlocks const, T_AtomicScope const) const
								        {
								            return alpaka::onAcc::internal::stlAtomic;
								        }
								    };

								    //! CpuTbbBlocks, block scope: nonAtomic.
								    template<>
								    struct GetAtomicImpl::Op<alpaka::exec::CpuTbbBlocks, onAcc::scope::Block>
								    {
								        constexpr decltype(auto) operator()(alpaka::exec::CpuTbbBlocks const, onAcc::scope::Block const) const
								        {
								            return alpaka::onAcc::internal::nonAtomic;
								        }
								    };
								} // namespace alpaka::onAcc::trait
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/executor.hpp ==
								// ============================================================================

								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/executor.hpp ==
								// ==
								/* Copyright 2025 Simeon Ehrig
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/tag.hpp ==
									// ==
									/* Copyright 2025 Simeon Ehrig
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									namespace alpaka
									{
									    namespace onAcc::internal
									    {
									        /** Empty tag type identifying the SYCL atomic implementation. */
									        struct SyclAtomic
									        {
									        };

									        constexpr SyclAtomic syclAtomic{};
									    } // namespace onAcc::internal

									    namespace math::internal
									    {
									        /** Empty tag type identifying the SYCL math implementation. */
									        struct SyclMath
									        {
									        };

									        constexpr SyclMath syclMath{};
									    } // namespace math::internal

									    namespace internal
									    {
									        /** Empty tag type identifying the SYCL intrinsic implementation. */
									        struct SyclIntrinsic
									        {
									        };

									        constexpr SyclIntrinsic syclIntrinsic{};
									    } // namespace internal
									} // namespace alpaka
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/tag.hpp ==
									// ============================================================================

								// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined

								namespace alpaka
								{
								    namespace exec
								    {
								        /** Executor tag type reporting the name "OneApi". */
								        struct OneApi
								        {
								            //! @return the name of the executor
								            static auto getName() -> std::string
								            {
								                return std::string{"OneApi"};
								            }
								        };

								        /** Tag object of type OneApi. */
								        constexpr auto oneApi = OneApi{};
								    } // namespace exec

								    namespace trait
								    {
								        //! Registers exec::OneApi as an executor.
								        template<>
								        struct IsExecutor<exec::OneApi> : std::true_type
								        {
								        };
								    } // namespace trait

								    namespace onAcc::trait
								    {
								        //! OneApi executor: the atomic implementation tag is `syclAtomic` for every scope.
								        template<typename T_AtomicScope>
								        struct GetAtomicImpl::Op<alpaka::exec::OneApi, T_AtomicScope>
								        {
								            constexpr decltype(auto) operator()(alpaka::exec::OneApi const, T_AtomicScope const) const
								            {
								                return alpaka::onAcc::internal::syclAtomic;
								            }
								        };
								    } // namespace onAcc::trait
								} // namespace alpaka
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/executor.hpp ==
								// ============================================================================

							// #include "alpaka/core/PP.hpp"    // amalgamate: file already inlined

							namespace alpaka::exec
							{
							    /** List of all executors supported by alpaka.
							     *
							     * Executors that fall into the same category are ordered from high parallelism to low parallelism.
							     * This list is used at places where a function can be called without an executor. In this case the
							     * first available executor is used.
							     */
							    constexpr auto allExecutors = std::make_tuple(gpuCuda, gpuHip, oneApi, cpuOmpBlocks, cpuTbbBlocks, cpuSerial);

							    /** List of enabled executors.
							     *
							     * - executors can be dis-/enabled by the CMake define alpaka_EXEC_<ExecutorName>
							     * - alternatively an executor is disabled by defining the preprocessor symbol
							     *   ALPAKA_DISABLE_EXEC_<ExecutorName>; if the symbol is not defined the executor is enabled
							     *
							     * The result is a std::tuple with one entry per enabled executor.
							     * NOTE(review): the entries are listed CPU executors first, which differs from the order of
							     * `allExecutors` - confirm whether the order of this list is significant.
							     */
							    constexpr auto enabledExecutors = std::tuple_cat(
							        // empty tuple to avoid issues with the first comma
							        std::tuple<>{}
							#ifndef ALPAKA_DISABLE_EXEC_CpuOmpBlocks
							        ,
							        std::tuple{exec::cpuOmpBlocks}
							#endif
							#ifndef ALPAKA_DISABLE_EXEC_CpuTbbBlocks
							        ,
							        std::tuple{exec::cpuTbbBlocks}
							#endif
							#ifndef ALPAKA_DISABLE_EXEC_CpuSerial
							        ,
							        std::tuple{exec::cpuSerial}
							#endif
							#ifndef ALPAKA_DISABLE_EXEC_GpuCuda
							        ,
							        std::tuple{exec::gpuCuda}
							#endif
							#ifndef ALPAKA_DISABLE_EXEC_GpuHip
							        ,
							        std::tuple{exec::gpuHip}
							#endif
							#ifndef ALPAKA_DISABLE_EXEC_OneApi
							        ,
							        std::tuple{exec::oneApi}
							#endif
							    );
							} // namespace alpaka::exec
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/executor.hpp ==
							// ============================================================================

							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/meta/filter.hpp ==
							// ==
							/* Copyright 2024 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #include <functional>    // amalgamate: file already included
							// #include <tuple>    // amalgamate: file already included
							// #include <utility>    // amalgamate: file already included

							// #pragma once
							namespace alpaka::meta
							{
							    /** Keeps the elements of the tuple `list` for which `unaryConditionFn` yields true.
							     *
							     * The predicate result is used as template argument of `std::conditional_t`, therefore
							     * `unaryConditionFn(element)` must be usable in a constant expression for every element.
							     * A selected element is not copied from `list`: the result holds a value-initialized object of the
							     * element's type (`std::tuple<decltype(ts)>{}`).
							     *
							     * @param unaryConditionFn predicate called with each element of `list`
							     * @param list tuple handed to `std::apply`
							     * @return `std::tuple` with one value-initialized object per selected element, in the order of `list`
							     */
							    constexpr auto filter(auto const unaryConditionFn, auto const list)
							    {
							        return std::apply(
							            [=](auto... ts) constexpr
							            {
							                // each element contributes either a one-element tuple or an empty tuple
							                return std::tuple_cat(
							                    std::conditional_t<unaryConditionFn(ts), std::tuple<decltype(ts)>, std::tuple<>>{}...);
							            },
							            list);
							    }
							} // namespace alpaka::meta
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/meta/filter.hpp ==
							// ============================================================================

							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/concepts.hpp ==
							// ==
							/* Copyright 2024 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined
							// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/internal/interface.hpp ==
								// ==
								/* Copyright 2024 René Widera, Tim Hanel
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once								// #include "alpaka/KernelBundle.hpp"    // amalgamate: file already inlined
								// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/Assert.hpp ==
									// ==
									/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
									// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

									// #include <cassert>    // amalgamate: file already included
									#include <type_traits>

									//! Host-style assertion that forwards its arguments to the standard assert() macro.
									//! The assert can be explicitly disabled by defining NDEBUG.
									#define ALPAKA_ASSERT(...) assert(__VA_ARGS__)

									//! Macro which expands to a no-op; the arguments are discarded and never evaluated.
									//! The do/while(false) form enforces a semicolon after the call.
									#define ALPAKA_NOOP(...)                                                                                              \
									    do                                                                                                                \
									    {                                                                                                                 \
									    } while(false)

									//! ALPAKA_ASSERT_ACC_IMPL is an assert-like macro.
									//! It can be disabled by defining the ALPAKA_DISABLE_ASSERT_ACC preprocessor symbol (it then expands to
									//! ALPAKA_NOOP) or the NDEBUG preprocessor symbol (which disables the underlying assert).
									#if !defined(ALPAKA_DISABLE_ASSERT_ACC)
									#    define ALPAKA_ASSERT_ACC_IMPL(...) ALPAKA_ASSERT(__VA_ARGS__)
									#else
									#    define ALPAKA_ASSERT_ACC_IMPL(...) ALPAKA_NOOP(__VA_ARGS__)
									#endif

									//! ALPAKA_ASSERT_ACC is an assert-like macro.
									//!
									//! In device code for a GPU or SYCL backend it can be disabled by defining the ALPAKA_DISABLE_ASSERT_ACC
									//! preprocessor symbol or the NDEBUG preprocessor symbol. In device code for a native C++ CPU backend and in
									//! host code, it is equivalent to ALPAKA_ASSERT, and can be disabled by defining the NDEBUG preprocessor
									//! symbol.
									#if defined(ALPAKA_LANG_CUDA) && defined(__CUDA_ARCH__)
									// CUDA device code
									#    define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__)
									#elif defined(ALPAKA_LANG_HIP) && defined(__HIP_DEVICE_COMPILE__)
									// HIP/ROCm device code
									#    define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__)
									#elif defined(ALPAKA_LANG_SYCL) && defined(__SYCL_DEVICE_ONLY__)
									// SYCL/oneAPI device code: assertions are only emitted when SYCL_EXT_ONEAPI_ASSERT is defined,
									// otherwise the macro expands to a no-op.
									#    if defined(SYCL_EXT_ONEAPI_ASSERT)
									#        define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__)
									#    else
									#        define ALPAKA_ASSERT_ACC(...) ALPAKA_NOOP(__VA_ARGS__)
									#    endif
									// add here any other #elif conditions for non-CPU backends
									// ...
									#else
									// CPU backend, or host code
									#    define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT(__VA_ARGS__)
									#endif

									namespace alpaka::core
									{
									    namespace detail
									    {
									        //! Implementation helper behind alpaka::core::assertValueUnsigned.
									        //!
									        //! For a signed TArg the value is checked with ALPAKA_ASSERT_ACC. For an unsigned TArg no check is
									        //! emitted at all, so no "comparison is always true" warning can be triggered.
									        template<typename TArg>
									        struct AssertValueUnsigned
									        {
									            ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static constexpr auto assertValueUnsigned(
									                [[maybe_unused]] TArg const& arg) -> void
									            {
									                if constexpr(std::is_signed_v<TArg>)
									                {
									                    ALPAKA_ASSERT_ACC(arg >= 0);
									                }
									            }
									        };
									    } // namespace detail

									    //! Asserts that an integral value is greater than or equal to zero.
									    //! The check is skipped at compile time for unsigned types, which prevents compiler warnings.
									    ALPAKA_NO_HOST_ACC_WARNING
									    template<typename TArg>
									    ALPAKA_FN_HOST_ACC constexpr auto assertValueUnsigned(TArg const& arg) -> void
									    {
									        using Impl = detail::AssertValueUnsigned<TArg>;
									        Impl::assertValueUnsigned(arg);
									    }

									    namespace detail
									    {
									        //! Implementation helper behind alpaka::core::assertGreaterThan.
									        //!
									        //! The check `TLhs::value > rhs` is emitted unless TRhs is unsigned and TLhs::value is zero; in that
									        //! case nothing is checked.
									        template<typename TLhs, typename TRhs>
									        struct AssertGreaterThan
									        {
									            ALPAKA_NO_HOST_ACC_WARNING ALPAKA_FN_HOST_ACC static constexpr auto assertGreaterThan(
									                [[maybe_unused]] TRhs const& rhs) -> void
									            {
									                constexpr bool checkRequired = std::is_signed_v<TRhs> || (TLhs::value != 0u);
									                if constexpr(checkRequired)
									                {
									                    ALPAKA_ASSERT_ACC(TLhs::value > rhs);
									                }
									            }
									        };
									    } // namespace detail

									    //! Asserts that the compile-time integral value TLhs::value is greater than the runtime value rhs.
									    ALPAKA_NO_HOST_ACC_WARNING
									    template<typename TLhs, typename TRhs>
									    ALPAKA_FN_HOST_ACC constexpr auto assertGreaterThan(TRhs const& rhs) -> void
									    {
									        using Impl = detail::AssertGreaterThan<TLhs, TRhs>;
									        Impl::assertGreaterThan(rhs);
									    }
									} // namespace alpaka::core
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/Assert.hpp ==
									// ============================================================================

								// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/DeviceProperties.hpp ==
									// ==
									/* Copyright 2024 René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */


									// #pragma once
									// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined

									// #include <array>    // amalgamate: file already included
									// #include <cstdint>    // amalgamate: file already included
									// #include <functional>    // amalgamate: file already included
									#include <ostream>
									// #include <string>    // amalgamate: file already included

									namespace alpaka::onHost
									{
									    namespace internal
									    {
									        // Forward declaration; declared a friend of DeviceProperties below, which grants it access to the
									        // private per-dimension limit callbacks.
									        struct GetDeviceProperties;
									    } // namespace internal

									    /** Properties of a device
									     *
									     * Collection of static properties of a device.
									     *
									     * All scalar members are zero-initialized, so a default-constructed object never exposes indeterminate
									     * values (e.g. when streamed before being filled in).
									     */
									    struct DeviceProperties
									    {
									        /** Get the name of the device.
									         *
									         * @return copy of the `name` member
									         */
									        auto getName() const
									        {
									            return name;
									        }

									        /** The total amount of global device memory in bytes.
									         *
									         * @attention It is **not** the amount of free memory!
									         *
									         * Device memory is the physical memory of the compute device.
									         * On systems with a GPU which is sharing the memory with the host CPU, this value may be equal to the total
									         * amount of system memory.
									         */
									        size_t globalMemCapacityBytes{0};
									        /** The amount of shared memory per thread block in bytes. */
									        uint32_t sharedMemPerBlockBytes{0};
									        /** The name of the device. */
									        std::string name;
									        /** The number of multiprocessors.*/
									        uint32_t multiProcessorCount{0};
									        /** The warp size.
									         *
									         * Number of threads per thread block that are executed in lock-step.
									         */
									        uint32_t warpSize{0};
									        /** The maximum total number of threads per thread block. */
									        uint32_t maxThreadsPerBlock{0};

									        /** Maximum number of threads within a thread block for each dimension.
									         *
									         * @attention Do not assume that the limits are equal for any dimension.
									         * The product of two or more dimensions can exceed maxThreadsPerBlock, this will result in an invalid
									         * configuration when used for kernel execution. All values are 32-bit indexes, take care of overflows.
									         *
									         * The values are produced by the private callback fnMaxThreadsPerBlock; invoking this method while the
									         * callback is empty throws std::bad_function_call.
									         *
									         *  @tparam T_dim Number of dimensions used for a kernel call.
									         *  @return Maximum number of threads within a block, usable for ThreadSpec.
									         */
									        template<uint32_t T_dim>
									        Vec<uint32_t, T_dim> getMaxThreadsPerBlock() const
									        {
									            // zero-initialize so no indeterminate value escapes if the callback writes fewer elements
									            std::array<uint32_t, T_dim> res{};
									            fnMaxThreadsPerBlock(res.data(), T_dim);
									            return {res};
									        }

									        /** The maximum total number of blocks within a grid. */
									        uint32_t maxBlocksPerGrid{0};

									        /** Maximum number of blocks within a grid for each dimension.
									         *
									         * @attention Do not assume that the limits are equal for any dimension.
									         * The product of two or more dimensions can exceed maxBlocksPerGrid, this will result in an invalid
									         * configuration when used for kernel execution. All values are 32-bit indexes, take care of overflows.
									         *
									         * The values are produced by the private callback fnMaxBlocksPerGrid; invoking this method while the
									         * callback is empty throws std::bad_function_call.
									         *
									         *  @tparam T_dim Number of dimensions used for a kernel call.
									         *  @return Maximum number of blocks, usable for ThreadSpec.
									         */
									        template<uint32_t T_dim>
									        Vec<uint32_t, T_dim> getMaxBlocksPerGrid() const
									        {
									            // zero-initialize so no indeterminate value escapes if the callback writes fewer elements
									            std::array<uint32_t, T_dim> res{};
									            fnMaxBlocksPerGrid(res.data(), T_dim);
									            return {res};
									        }

									    private:
									        friend internal::GetDeviceProperties;

									        /** function to fill maximum number of threads per block per dimension
									         *
									         * result: pointer to vector data, follows alpaka index order
									         * numDims: number of dimensions of the result, elements in result
									         */
									        std::function<void(uint32_t* result, uint32_t numDims)> fnMaxThreadsPerBlock;

									        /** function to fill maximum number of blocks within a grid per dimension
									         *
									         * result: pointer to vector data, follows alpaka index order
									         * numDims: number of dimensions of the result, elements in result
									         */
									        std::function<void(uint32_t* result, uint32_t numDims)> fnMaxBlocksPerGrid;
									    };

									    /** Write the public scalar properties of a device to a stream.
									     *
									     * Each property is written as `key: value` followed by a newline. The per-dimension limits are not
									     * printed.
									     *
									     * @param s output stream
									     * @param p properties to print
									     * @return the stream `s`
									     */
									    inline std::ostream& operator<<(std::ostream& s, DeviceProperties const& p)
									    {
									        s << "name: " << p.name << "\n";
									        s << "multiProcessorCount: " << p.multiProcessorCount << "\n";
									        s << "warpSize: " << p.warpSize << "\n";
									        s << "maxThreadsPerBlock: " << p.maxThreadsPerBlock << "\n";
									        s << "maxBlocksPerGrid: " << p.maxBlocksPerGrid << "\n";
									        s << "globalMemCapacityBytes: " << p.globalMemCapacityBytes << "\n";
									        s << "sharedMemPerBlockBytes: " << p.sharedMemPerBlockBytes << "\n";
									        return s;
									    }
									} // namespace alpaka::onHost
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/DeviceProperties.hpp ==
									// ============================================================================

									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/FrameSpec.hpp ==
									// ==
									/* Copyright 2024 René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/executor.hpp ==
										// ==
										/* Copyright 2024 René Widera, Mehmet Yusufoglu
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										// #include "alpaka/api/cuda/executor.hpp"    // amalgamate: file already inlined
										// #include "alpaka/api/hip/executor.hpp"    // amalgamate: file already inlined
										// #include "alpaka/api/host/executor.hpp"    // amalgamate: file already inlined
										// #include "alpaka/api/oneApi/executor.hpp"    // amalgamate: file already inlined
										// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined

										// #include <string>    // amalgamate: file already included

										namespace alpaka
										{
										    namespace exec
										    {
										        /** Automatic executor selection
										         *
										         * If this executor is used in alpaka interfaces, the best fitting available executor is selected
										         * automatically. The selection is often based on the device or queue provided in the interfaces.
										         */
										        struct AnyExecutor
										        {
										            /** @return the name of this executor, "AnyExecutor" */
										            static std::string getName()
										            {
										                return "AnyExecutor";
										            }
										        };

										        /** @copydoc AnyExecutor
										         *
										         * Declared `inline` so that all translation units including this header share one object instead of
										         * each getting its own internal-linkage copy.
										         */
										        inline constexpr AnyExecutor anyExecutor{};
										    } // namespace exec

										    namespace trait
										    {
										        /** Marks exec::AnyExecutor as an executor type by specializing the IsExecutor trait to true. */
										        template<>
										        struct IsExecutor<exec::AnyExecutor> : std::true_type
										        {
										        };
										    } // namespace trait
										} // namespace alpaka
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/executor.hpp ==
										// ============================================================================

									// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined
									// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/ThreadSpec.hpp ==
										// ==
										/* Copyright 2024 René Widera
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
										// #include "alpaka/api/executor.hpp"    // amalgamate: file already inlined
										// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined
										// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

										// #include <cstdint>    // amalgamate: file already included
										// #include <ostream>    // amalgamate: file already included

										namespace alpaka::onHost
										{
										    /** @brief Backend-specific description of the actual block and thread launch shape.
										     *
										     * A thread specification directly describes the number of blocks and the number of threads per block that are
										     * passed to the backend. This is the closest alpaka equivalent to a CUDA grid/block launch configuration.
										     *
										     * In contrast to `alpaka::onHost::FrameSpec`, a `ThreadSpec` is not a logical frame decomposition. It is used
										     * when a kernel requires exact guarantees about the number of blocks and the size of each block.
										     *
										     * @tparam T_NumBlocks Vector type of the number of blocks; it also defines the index type.
										     * @tparam T_NumThreads Vector type of the number of threads per block; must have the same value type and
										     * dimensionality as T_NumBlocks.
										     * @tparam T_Executor If the executor is alpaka::exec::AnyExecutor alpaka will select a good fitting executor for
										     * the action where the ThreadSpec is used.
										     */
										    template<
										        alpaka::concepts::Vector T_NumBlocks,
										        alpaka::concepts::Vector<typename T_NumBlocks::type, T_NumBlocks::dim()> T_NumThreads,
										        alpaka::concepts::Executor T_Executor = alpaka::exec::AnyExecutor>
										    struct ThreadSpec
										    {
										        using index_type = typename T_NumBlocks::type;
										        // The number of blocks is stored as T_NumBlocks::UniVec, the threads in the type provided by the user.
										        using NumBlocksVecType = typename T_NumBlocks::UniVec;
										        using NumThreadsVecType = T_NumThreads;

										    private:
										        NumBlocksVecType m_numBlocks;
										        NumThreadsVecType m_numThreads;

										    public:
										        /** Create a thread specification.
										         *
										         * @param numBlocks number of blocks per dimension
										         * @param numThreadsPerBlock number of threads within a block per dimension
										         * @param executor only used to select T_Executor; the value itself is ignored
										         */
										        constexpr ThreadSpec(
										            T_NumBlocks const& numBlocks,
										            T_NumThreads const& numThreadsPerBlock,
										            T_Executor executor = T_Executor{})
										            : m_numBlocks(numBlocks)
										            , m_numThreads(numThreadsPerBlock)
										        {
										            alpaka::unused(executor);
										        }

										        /** @return a default-constructed executor of type T_Executor
										         *
										         * noexcept to match the signature of FrameSpec::getExecutor().
										         */
										        [[nodiscard]] static constexpr T_Executor getExecutor() noexcept
										        {
										            return T_Executor{};
										        }

										        /** @return number of threads per block for each dimension */
										        [[nodiscard]] constexpr NumThreadsVecType const& getNumThreads() const noexcept
										        {
										            return m_numThreads;
										        }

										        /** @return number of blocks for each dimension */
										        [[nodiscard]] constexpr NumBlocksVecType const& getNumBlocks() const noexcept
										        {
										            return m_numBlocks;
										        }

										        /** @return dimensionality of the specification, taken from T_NumThreads */
										        [[nodiscard]] static consteval uint32_t dim()
										        {
										            return T_NumThreads::dim();
										        }
										    };

										    // Deduction guide without executor: both arguments are mapped to their vector type via
										    // alpaka::trait::getVec_t; the executor falls back to the default template argument of ThreadSpec.
										    template<alpaka::concepts::VectorOrScalar T_NumBlocks, alpaka::concepts::VectorOrScalar T_NumThreads>
										    ThreadSpec(T_NumBlocks const&, T_NumThreads const&)
										        -> ThreadSpec<alpaka::trait::getVec_t<T_NumBlocks>, alpaka::trait::getVec_t<T_NumThreads>>;

										    // Deduction guide with an explicit executor: same argument mapping, the executor type is taken from the
										    // third argument.
										    template<
										        alpaka::concepts::VectorOrScalar T_NumBlocks,
										        alpaka::concepts::VectorOrScalar T_NumThreads,
										        alpaka::concepts::Executor T_Executor>
										    ThreadSpec(T_NumBlocks const&, T_NumThreads const&, T_Executor)
										        -> ThreadSpec<alpaka::trait::getVec_t<T_NumBlocks>, alpaka::trait::getVec_t<T_NumThreads>, T_Executor>;

										    namespace trait
										    {
										        /** Type trait which is true only for instantiations of onHost::ThreadSpec.
										         *
										         * The primary template yields false for every type.
										         */
										        template<typename T>
										        struct IsThreadSpec : std::false_type
										        {
										        };

										        /** Specialization matching any onHost::ThreadSpec instantiation. */
										        template<
										            alpaka::concepts::Vector T_NumBlocks,
										            alpaka::concepts::Vector T_NumThreads,
										            alpaka::concepts::Executor T_Executor>
										        struct IsThreadSpec<onHost::ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor>> : std::true_type
										        {
										        };
										    } // namespace trait

										    /** Shortcut for trait::IsThreadSpec<T>::value. */
										    template<typename T>
										    constexpr bool isThreadSpec_v = trait::IsThreadSpec<T>::value;

										    namespace concepts
										    {
										        /** Concept to check if a type is a ThreadSpec
										         *
										         * @tparam T Type to check
										         * @tparam T_IndexType enforce an index type of the thread specification; if not provided
										         * (alpaka::NotRequired) the type is not checked
										         * @tparam T_dim enforce a dimensionality of the thread specification; if not provided
										         * (alpaka::notRequiredDim) the value is not checked
										         */
										        template<typename T, typename T_IndexType = alpaka::NotRequired, uint32_t T_dim = alpaka::notRequiredDim>
										        concept ThreadSpec
										            = isThreadSpec_v<T>
										              && (std::same_as<T_IndexType, alpaka::NotRequired> || std::same_as<typename T::index_type, T_IndexType>)
										              && ((T_dim == alpaka::notRequiredDim) || (T::dim() == T_dim));
										    } // namespace concepts

										    /** Stream a thread specification in the form `ThreadSpec{ blocks=<numBlocks>, threads=<numThreads> }`.
										     *
										     * @param s output stream
										     * @param t thread specification to print
										     * @return the result of the chained stream insertions
										     */
										    auto operator<<(std::ostream& s, concepts::ThreadSpec auto const& t) -> std::ostream&
										    {
										        auto const& numBlocks = t.getNumBlocks();
										        auto const& numThreads = t.getNumThreads();
										        return s << "ThreadSpec{ blocks=" << numBlocks << ", threads=" << numThreads << " }";
										    }
										} // namespace alpaka::onHost
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/ThreadSpec.hpp ==
										// ============================================================================


									// #include <cstdint>    // amalgamate: file already included
									// #include <ostream>    // amalgamate: file already included

									namespace alpaka::onHost
									{
									    /** @brief Device/Api-agnostic description of the logical parallelism exposed to a kernel.
									     *
									     * A frame specification describes how a multidimensional index range [0; K) is divided into fixed-size chunks,
									     * called frames (NF), each with a frame extent (FE), where `K = NF * FE`.
									     * K does not need to match the problem size (P), e.g., the number of elements in a buffer you want to process in a
									     * kernel. Often, the best performance of a kernel can be achieved if `K <= P`, and if the
									     * kernel uses SIMD operations, `K <= P/(SIMD width)`.
									     * alpaka derives the onHost::ThreadSpec to launch the kernel, based on a heuristic and additional launch
									     * information from the `FrameSpec`. Therefore, a kernel enqueued with a frame specification should always be written
									     * to be executable with any onHost::ThreadSpec and should not depend on hard-coded thread numbers, to ensure
									     * portability between devices.
									     *
									     * A `FrameSpec` is therefore not equivalent to a CUDA-style grid description. It specifies only the maximum
									     * parallelism made available to the kernel. It does not guarantee the number of physical thread blocks, nor the
									     * number of physical threads per block used by the backend. If exact control over blocks and threads is required,
									     * use onHost::ThreadSpec.
									     *
									     * @tparam T_NumFrames The n-dimensional number of frames.
									     * @tparam T_FrameExtents The n-dimensional size of one logical frame.
									     * @tparam T_Executor The executor used to translate the onHost::ThreadSpec into a thread block hierarchy.
									     * If the executor is exec::AnyExecutor alpaka will select a good fitting executor for the action where the
									     * ThreadSpec is used.
									     */
									    template<
									        alpaka::concepts::Vector T_NumFrames,
									        alpaka::concepts::Vector<typename T_NumFrames::type, T_NumFrames::dim()> T_FrameExtents,
									        alpaka::concepts::Executor T_Executor = alpaka::exec::AnyExecutor>
									    struct FrameSpec
									    {
									        using index_type = typename T_NumFrames::type;

									        using NumFramesVecType = T_NumFrames;
									        using FrameExtentsVecType = T_FrameExtents;

									    private:
									        NumFramesVecType m_numFrames;
									        FrameExtentsVecType m_frameExtents;

									    public:
									        constexpr FrameSpec(
									            T_NumFrames const& numFrames,
									            T_FrameExtents const& frameExtent,
									            T_Executor executor = T_Executor{})
									            : m_numFrames(numFrames)
									            , m_frameExtents(frameExtent)
									        {
									            alpaka::unused(executor);
									        }

									        [[nodiscard]] static constexpr T_Executor getExecutor() noexcept
									        {
									            return T_Executor{};
									        }

									        [[nodiscard]] constexpr NumFramesVecType const& getNumFrames() const noexcept
									        {
									            return m_numFrames;
									        }

									        [[nodiscard]] constexpr FrameExtentsVecType const& getFrameExtents() const noexcept
									        {
									            return m_frameExtents;
									        }

									        [[nodiscard]] static consteval uint32_t dim()
									        {
									            return T_FrameExtents::dim();
									        }
									    };

									    // Deduction guide without executor: both arguments are mapped to their vector type via
									    // alpaka::trait::getVec_t and the executor is alpaka::exec::AnyExecutor.
									    template<alpaka::concepts::VectorOrScalar T_NumFrames, alpaka::concepts::VectorOrScalar T_FrameExtents>
									    FrameSpec(T_NumFrames const&, T_FrameExtents const&) -> FrameSpec<
									        alpaka::trait::getVec_t<T_NumFrames>,
									        alpaka::trait::getVec_t<T_FrameExtents>,
									        alpaka::exec::AnyExecutor>;

									    // Deduction guide with an explicit executor: same argument mapping, the executor type is taken from the
									    // third argument.
									    template<
									        alpaka::concepts::VectorOrScalar T_NumFrames,
									        alpaka::concepts::VectorOrScalar T_FrameExtents,
									        alpaka::concepts::Executor T_Executor>
									    FrameSpec(T_NumFrames const&, T_FrameExtents const&, T_Executor)
									        -> FrameSpec<alpaka::trait::getVec_t<T_NumFrames>, alpaka::trait::getVec_t<T_FrameExtents>, T_Executor>;

									    namespace trait
									    {
									        /** Type trait which is true only for instantiations of onHost::FrameSpec.
									         *
									         * The primary template yields false for every type.
									         */
									        template<typename T>
									        struct IsFrameSpec : std::false_type
									        {
									        };

									        /** Specialization matching any onHost::FrameSpec instantiation. */
									        template<
									            alpaka::concepts::Vector T_NumFrames,
									            alpaka::concepts::Vector T_FrameExtents,
									            alpaka::concepts::Executor T_Executor>
									        struct IsFrameSpec<onHost::FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>> : std::true_type
									        {
									        };
									    } // namespace trait

									    /** Shortcut for trait::IsFrameSpec<T>::value. */
									    template<typename T>
									    constexpr bool isFrameSpec_v = trait::IsFrameSpec<T>::value;

									    namespace concepts
									    {
									        /** Concept to check if a type is a FrameSpec
									         *
									         * @tparam T Type to check
									         * @tparam T_IndexType enforce an index type of the frame specification; if not provided
									         * (alpaka::NotRequired) the type is not checked
									         * @tparam T_dim enforce a dimensionality of the frame specification; if not provided
									         * (alpaka::notRequiredDim) the value is not checked
									         */
									        template<typename T, typename T_IndexType = alpaka::NotRequired, uint32_t T_dim = alpaka::notRequiredDim>
									        concept FrameSpec
									            = isFrameSpec_v<T>
									              && (std::same_as<T_IndexType, alpaka::NotRequired> || std::same_as<typename T::index_type, T_IndexType>)
									              && ((T_dim == alpaka::notRequiredDim) || (T::dim() == T_dim));

									        /** Concept to check if a type is a ThreadSpec or a FrameSpec
									         *
									         * Neither the index type nor the dimensionality is checked.
									         *
									         * @tparam T Type to check
									         */
									        template<typename T>
									        concept ThreadOrFrameSpec = isFrameSpec_v<T> || isThreadSpec_v<T>;
									    } // namespace concepts

									    /** Stream a frame specification in the form `FrameSpec{ frames=<numFrames>, frameExtent=<frameExtents> }`.
									     *
									     * @param s output stream
									     * @param d frame specification to print
									     * @return the result of the chained stream insertions
									     */
									    auto operator<<(std::ostream& s, concepts::FrameSpec auto const& d) -> std::ostream&
									    {
									        auto const& numFrames = d.getNumFrames();
									        auto const& frameExtents = d.getFrameExtents();
									        return s << "FrameSpec{ frames=" << numFrames << ", frameExtent=" << frameExtents << " }";
									    }

									} // namespace alpaka::onHost
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/FrameSpec.hpp ==
									// ============================================================================

								// #include "alpaka/onHost/Handle.hpp"    // amalgamate: file already inlined
								// #include "alpaka/onHost/ThreadSpec.hpp"    // amalgamate: file already inlined
								// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

								namespace alpaka::onAcc::internal
								{
								    // Forward declaration to avoid cyclic includes; only the name is introduced here, the definition is
								    // provided elsewhere.
								    template<typename T_Storage, typename T_Type>
								    struct GlobalDeviceMemoryWrapper;
								} // namespace alpaka::onAcc::internal

								namespace alpaka::onHost
								{
								    namespace internal
								    {
								        /** Functor family used to create a platform for a combination of API and device kind.
								         *
								         * Op::operator() is only declared here; a definition must be supplied elsewhere for each supported
								         * T_Api / T_DeviceKind combination.
								         */
								        struct MakePlatform
								        {
								            template<typename T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
								            struct Op
								            {
								                auto operator()(T_Api api, T_DeviceKind deviceType) const;
								            };
								        };

								        /** Create a platform by dispatching to MakePlatform::Op for the types of both arguments.
								         *
								         * Declared `inline` like the sibling wrappers in this namespace instead of `static`, so the header
								         * does not create an internal-linkage copy of the function in every translation unit.
								         *
								         * @param api API the platform is created for
								         * @param deviceType kind of device the platform provides
								         * @return whatever MakePlatform::Op<...>::operator() returns
								         */
								        inline auto makePlatform(auto api, alpaka::concepts::DeviceKind auto deviceType)
								        {
								            return MakePlatform::Op<ALPAKA_TYPEOF(api), ALPAKA_TYPEOF(deviceType)>{}(api, deviceType);
								        }

								        /** Functor family to query the number of devices of a platform.
								         *
								         * The generic implementation forwards to `platform.getDeviceCount()`.
								         */
								        struct GetDeviceCount
								        {
								            template<typename T_Platform>
								            struct Op
								            {
								                /** @return number of devices reported by the platform */
								                auto operator()(T_Platform& platform) const -> uint32_t
								                {
								                    return platform.getDeviceCount();
								                }
								            };
								        };

								        /** Functor family to create a device from a platform.
								         *
								         * The generic implementation forwards to `platform.makeDevice(idx)`.
								         */
								        struct MakeDevice
								        {
								            template<typename T_Platform>
								            struct Op
								            {
								                // NOTE(review): the platform is taken as `auto&`, T_Platform is not used in the signature.
								                /** @param platform platform which creates the device
								                 *  @param idx index of the device passed on to the platform
								                 *  @return result of `platform.makeDevice(idx)`
								                 */
								                auto operator()(auto& platform, uint32_t idx) const
								                {
								                    return platform.makeDevice(idx);
								                }
								            };
								        };

								        /** Functor family to query the device of an object.
								         *
								         * The generic implementation forwards to `any.getDevice()`.
								         */
								        struct GetDevice
								        {
								            template<typename T_Any>
								            struct Op
								            {
								                /** @return result of `any.getDevice()` */
								                auto operator()(T_Any const& any) const
								                {
								                    return any.getDevice();
								                }
								            };
								        };

								        /** Query the device of an object by dispatching to GetDevice::Op.
								         *
								         * The Op is selected with ALPAKA_TYPEOF(any) (presumably the decayed argument type -- the macro is
								         * defined outside this section).
								         */
								        inline constexpr auto getDevice(auto&& any)
								        {
								            return GetDevice::Op<ALPAKA_TYPEOF(any)>{}(any);
								        }

								        /** Functor family to query the native handle of an object.
								         *
								         * The generic implementation forwards to `any.getNativeHandle()`.
								         */
								        struct GetNativeHandle
								        {
								            template<typename T_Any>
								            struct Op
								            {
								                /** @return result of `any.getNativeHandle()` */
								                auto operator()(T_Any const& any) const
								                {
								                    return any.getNativeHandle();
								                }
								            };
								        };

								        /** Query the native handle of an object by dispatching to GetNativeHandle::Op.
								         *
								         * The Op is selected with ALPAKA_TYPEOF(any).
								         */
								        inline auto getNativeHandle(auto&& any)
								        {
								            return GetNativeHandle::Op<ALPAKA_TYPEOF(any)>{}(any);
								        }

								        /** Functor family to create a queue on a device.
								         *
								         * The generic implementation forwards to `device.makeQueue(...)`.
								         */
								        struct MakeQueue
								        {
								            template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
								            struct Op
								            {
								                // The queue kind argument is only used for its type; a default-constructed T_QueueKind is
								                // passed to the device.
								                auto operator()(T_Device& device, T_QueueKind) const
								                {
								                    return device.makeQueue(T_QueueKind{});
								                }
								            };
								        };

								        /** Functor family to create an event on a device.
								         *
								         * The generic implementation forwards to `device.makeEvent()`.
								         */
								        struct MakeEvent
								        {
								            template<typename T_Device>
								            struct Op
								            {
								                /** @return result of `device.makeEvent()` */
								                auto operator()(T_Device& device) const
								                {
								                    return device.makeEvent();
								                }
								            };
								        };

								        /** Functor family to wait on an object.
								         *
								         * The generic implementation forwards to `any.wait()`.
								         */
								        struct Wait
								        {
								            template<typename T_Any>
								            struct Op
								            {
								                // const like the call operators of the other functors in this namespace: the functor is
								                // stateless, so it must also be invocable through a const instance.
								                void operator()(T_Any& any) const
								                {
								                    any.wait();
								                }
								            };
								        };

								        /** Wait on an object by dispatching to Wait::Op, selected with ALPAKA_TYPEOF(any). */
								        inline void wait(auto&& any)
								        {
								            Wait::Op<ALPAKA_TYPEOF(any)>{}(any);
								        }

								        /** Functor family to let a queue wait for an event.
								         *
								         * The generic implementation forwards to `queue.waitFor(event)`.
								         */
								        struct WaitFor
								        {
								            template<typename T_Queue, typename T_Event>
								            struct Op
								            {
								                // const like the call operators of the other functors in this namespace: the functor is
								                // stateless, so it must also be invocable through a const instance.
								                void operator()(T_Queue& queue, T_Event& event) const
								                {
								                    queue.waitFor(event);
								                }
								            };
								        };

								        inline void waitFor(auto& queue, auto& event)
								        {
								            WaitFor::Op<ALPAKA_TYPEOF(queue), ALPAKA_TYPEOF(event)>{}(queue, event);
								        }

								        /** Functor wrapper that returns the result of `any.isEventComplete()`.
								         *
								         * @tparam T_Any type of the object the generic Op operates on
								         */
								        struct IsEventComplete
								        {
								            template<typename T_Any>
								            struct Op
								            {
								                // const-qualified like the other stateless functors in this file, so a const Op instance can be
								                // invoked as well
								                bool operator()(T_Any& any) const
								                {
								                    return any.isEventComplete();
								                }
								            };
								        };

								        inline bool isEventComplete(auto&& any)
								        {
								            return IsEventComplete::Op<ALPAKA_TYPEOF(any)>{}(any);
								        }

								        /** Functor wrapper that returns the result of `queue.isQueueEmpty()`.
								         *
								         * @tparam T_Queue type of the queue the generic Op operates on
								         */
								        struct IsQueueEmpty
								        {
								            template<typename T_Queue>
								            struct Op
								            {
								                // const-qualified like the other stateless functors in this file, so a const Op instance can be
								                // invoked as well
								                bool operator()(T_Queue& queue) const
								                {
								                    return queue.isQueueEmpty();
								                }
								            };
								        };

								        inline bool isQueueEmpty(auto& queue)
								        {
								            return IsQueueEmpty::Op<ALPAKA_TYPEOF(queue)>{}(queue);
								        }

								        /** Collection of functors that hand work items to a queue.
								         *
								         * Each nested functor forwards its arguments to one member function of the queue.
								         */
								        struct Enqueue
								        {
								            /** Enqueue a kernel bundle with a launch configuration via `queue.enqueue(launchCfg, kernelBundle)`. */
								            template<
								                typename T_Queue,
								                onHost::concepts::ThreadOrFrameSpec T_LaunchCfg,
								                alpaka::concepts::KernelBundle T_KernelBundle>
								            struct Kernel
								            {
								                void operator()(T_Queue& queue, T_LaunchCfg const& launchCfg, T_KernelBundle const& kernelBundle) const
								                {
								                    queue.enqueue(launchCfg, kernelBundle);
								                }
								            };

								            /** Enqueue a host task via `queue.enqueueHostFn(task)`. */
								            template<typename T_Queue, typename T_Task>
								            struct HostTask
								            {
								                void operator()(T_Queue& queue, T_Task const& task) const
								                {
								                    queue.enqueueHostFn(task);
								                }
								            };

								            /** Enqueue a host task via `queue.enqueueHostFnDeferred(task)`. */
								            template<typename T_Queue, typename T_Task>
								            struct HostTaskDeferred
								            {
								                void operator()(T_Queue& queue, T_Task const& task) const
								                {
								                    queue.enqueueHostFnDeferred(task);
								                }
								            };

								            /** Enqueue an event via `queue.enqueue(event)`. */
								            template<typename T_Queue, typename T_Event>
								            struct Event
								            {
								                void operator()(T_Queue& queue, T_Event& event) const
								                {
								                    queue.enqueue(event);
								                }
								            };
								        };

								        inline void enqueueHostFn(auto& queue, auto const& task)
								        {
								            Enqueue::HostTask<ALPAKA_TYPEOF(queue), ALPAKA_TYPEOF(task)>{}(queue, task);
								        }

								        inline void enqueueHostFnDeferred(auto& queue, auto const& task)
								        {
								            Enqueue::HostTaskDeferred<ALPAKA_TYPEOF(queue), ALPAKA_TYPEOF(task)>{}(queue, task);
								        }

								        template<typename TKernelFn, typename... TArgs>
								        inline void enqueue(
								            auto& queue,
								            onHost::concepts::ThreadOrFrameSpec auto const& launchCfg,
								            KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
								        {
								            Enqueue::Kernel<ALPAKA_TYPEOF(queue), ALPAKA_TYPEOF(launchCfg), KernelBundle<TKernelFn, TArgs...>>{}(
								                queue,
								                launchCfg,
								                kernelBundle);
								        }

								        /** Functor wrapper that turns a frame specification into a thread specification.
								         *
								         * The generic Op ignores the device and the kernel bundle and copies number of frames, frame extents and
								         * executor of the frame specification into a ThreadSpec.
								         */
								        struct AdjustThreadSpec
								        {
								            template<
								                typename T_Device,
								                onHost::concepts::FrameSpec T_FrameSpec,
								                alpaka::concepts::KernelBundle T_KernelBundle>
								            struct Op
								            {
								                auto operator()(
								                    T_Device const& device,
								                    T_FrameSpec const& frameSpec,
								                    T_KernelBundle const& kernelBundle) const
								                {
								                    // device and kernelBundle are not used by the generic implementation
								                    alpaka::unused(device, frameSpec.getExecutor(), kernelBundle);
								                    return ThreadSpec{frameSpec.getNumFrames(), frameSpec.getFrameExtents(), frameSpec.getExecutor()};
								                }
								            };
								        };

								        template<typename TKernelFn, typename... TArgs>
								        static auto adjustThreadSpec(
								            auto const& device,
								            onHost::concepts::FrameSpec auto const& frameSpec,
								            KernelBundle<TKernelFn, TArgs...> const& kernelBundle)
								        {
								            return AdjustThreadSpec::
								                Op<ALPAKA_TYPEOF(device), ALPAKA_TYPEOF(frameSpec), KernelBundle<TKernelFn, TArgs...>>{}(
								                    device,
								                    frameSpec,
								                    kernelBundle);
								        }

								        /** Functor wrapper for accessing the data of an object.
								         *
								         * The generic Op forwards to `std::data()`. The static `data()` helpers select the Op via ALPAKA_TYPEOF.
								         */
								        struct Data
								        {
								            template<typename T_Any>
								            struct Op
								            {
								                decltype(auto) operator()(auto&& any) const
								                {
								                    return std::data(any);
								                }
								            };

								            /** Apply the Op selected by the type of @p any to @p any. */
								            static decltype(auto) data(auto&& any)
								            {
								                return Op<ALPAKA_TYPEOF(any)>{}(any);
								            }

								            /** Apply the Op to the object referenced by the handle instead of the handle itself.
								             *
								             * NOTE(review): `Handle<T_Any>&&` is a plain rvalue reference, so lvalue handles select the generic
								             * overload above — confirm this is intended.
								             */
								            template<typename T_Any>
								            static decltype(auto) data(Handle<T_Any>&& anyHandle)
								            {
								                return Op<ALPAKA_TYPEOF(*anyHandle.get())>{}(*anyHandle.get());
								            }
								        };

								        /** Functor wrapper for allocation.
								         *
								         * The generic Op only declares the call operator; no definition is visible in this section.
								         * NOTE(review): presumably implemented by specializations elsewhere, and T_Type is presumably the element
								         * type to allocate — confirm.
								         */
								        struct Alloc
								        {
								            template<typename T_Type, typename T_Any, typename T_Extents>
								            struct Op
								            {
								                void operator()(T_Any& any, T_Extents const&) const;
								            };
								        };

								        /** Functor wrapper for deferred allocation.
								         *
								         * The generic Op only declares the call operator; no definition is visible in this section.
								         * NOTE(review): presumably implemented by specializations elsewhere — confirm.
								         */
								        struct AllocDeferred
								        {
								            template<typename T_Type, typename T_Any, typename T_Extents>
								            struct Op
								            {
								                void operator()(T_Any& any, T_Extents const&) const;
								            };
								        };

								        /** Functor wrapper for unified allocation.
								         *
								         * The generic Op only declares the call operator; no definition is visible in this section.
								         * NOTE(review): presumably implemented by specializations elsewhere — confirm.
								         */
								        struct AllocUnified
								        {
								            template<typename T_Type, typename T_Any, typename T_Extents>
								            struct Op
								            {
								                void operator()(T_Any& any, T_Extents const&) const;
								            };
								        };

								        /** Functor wrapper for mapped allocation.
								         *
								         * The generic Op only declares the call operator; no definition is visible in this section.
								         * NOTE(review): presumably implemented by specializations elsewhere — confirm.
								         */
								        struct AllocMapped
								        {
								            template<typename T_Type, typename T_Any, typename T_Extents>
								            struct Op
								            {
								                void operator()(T_Any& any, T_Extents const&) const;
								            };
								        };

								        /** Checks if a view can be accessed from the given device.
								         *
								         * There are two paths to check if a view is accessible:
								         *   - first: Try to validate the view in the scope of the device.
								         *   - second: Try to validate based on soft criteria in the scope of the view's API.
								         *             This path is required because the host API does not know about view data locations.
								         *             The second path is optional and always returns false if it is not specialized.
								         */
								        struct IsDataAccessible
								        {
								            /** First path: only the call operator is declared here, no generic definition is visible. */
								            template<typename T_Device, typename T_Any>
								            struct FirstPath
								            {
								                bool operator()(T_Device& device, T_Any const& any) const;
								            };

								            /** Second path: the generic implementation reports "not accessible". */
								            template<typename T_DataApi, alpaka::concepts::DeviceKind T_DeviceKind, typename T_Any>
								            struct SecondPath
								            {
								                bool operator()(T_DataApi, T_DeviceKind, T_Any const&) const
								                {
								                    return false;
								                }
								            };
								        };

								        /** Functor wrapper for memory copies issued through a queue.
								         *
								         * The generic Op only declares the call operator (queue, destination, source, extents); no definition is
								         * visible in this section.
								         */
								        struct Memcpy
								        {
								            template<typename T_Queue, typename T_Dest, typename T_Source, typename T_Extents>
								            struct Op
								            {
								                void operator()(T_Queue& queue, auto&&, T_Source const&, T_Extents const&) const;
								            };
								        };

								        /** Functor wrapper for copies from or to device global memory.
								         *
								         * The generic Op only declares the call operator; no definition is visible in this section.
								         */
								        struct MemcpyDeviceGlobal
								        {
								            template<typename T_Queue, typename T_Dest, typename T_Source>
								            struct Op
								            {
								                /** Copy data from or to the device global memory.
								                 *
								                 * It is only allowed to copy data from or to the host.
								                 * Copying from a device global variable to another device global variable is not supported.
								                 * The host data is allowed to be a host accessible pointer.
								                 */
								                void operator()(T_Queue& queue, T_Dest&&, T_Source&&) const;
								            };
								        };

								        /** Functor wrapper for a memset issued through a queue.
								         *
								         * The generic Op only declares the call operator (queue, destination, byte value, extents); no definition
								         * is visible in this section.
								         */
								        struct Memset
								        {
								            template<typename T_Queue, typename T_Dest, typename T_Extents>
								            struct Op
								            {
								                void operator()(T_Queue& queue, auto&&, uint8_t, T_Extents const&) const;
								            };
								        };

								        /** Functor wrapper for filling a destination with a value through a queue.
								         *
								         * The generic Op only declares the call operator (queue, destination, value, extents); no definition is
								         * visible in this section.
								         */
								        struct Fill
								        {
								            template<typename T_Queue, typename T_Dest, typename T_Value, typename T_Extents>
								            struct Op
								            {
								                void operator()(T_Queue& queue, auto&&, T_Value, T_Extents const&) const;
								            };
								        };

								        /** Functor wrapper for querying DeviceProperties.
								         *
								         * The generic Op only declares two call operators; no definitions are visible in this section.
								         */
								        struct GetDeviceProperties
								        {
								            template<typename T_Any>
								            struct Op
								            {
								                // query by platform and device index
								                DeviceProperties operator()(auto const& platform, uint32_t idx) const;

								                // query directly from a device
								                DeviceProperties operator()(auto const& device) const;
								            };
								        };

								        /** Functor wrapper that returns the result of `device.getFreeGlobalMemBytes()` as size_t. */
								        struct GetFreeGlobalMemBytes
								        {
								            template<typename T_Any>
								            struct Op
								            {
								                size_t operator()(auto const& device) const
								                {
								                    return device.getFreeGlobalMemBytes();
								                }
								            };
								        };

								        /** Query the properties of the device with index @p idx of @p platform.
								         *
								         * Dispatches to the GetDeviceProperties::Op functor selected via ALPAKA_TYPEOF(platform).
								         */
								        inline DeviceProperties getDeviceProperties(auto const& platform, uint32_t idx)
								        {
								            using PropertiesOp = GetDeviceProperties::Op<ALPAKA_TYPEOF(platform)>;
								            return PropertiesOp{}(platform, idx);
								        }

								        /** Functor wrapper that returns the result of `any.getExtents()`. */
								        struct GetExtents
								        {
								            template<typename T_Any>
								            struct Op
								            {
								                // decltype(auto) keeps the exact return type of getExtents(), including references
								                decltype(auto) operator()(auto&& any) const
								                {
								                    return any.getExtents();
								                }
								            };
								        };

								        inline auto getExtents(auto&& any)
								        {
								            return GetExtents::Op<ALPAKA_TYPEOF(any)>{}(any);
								        }

								        /** Return the extents of the object referenced by the handle @p any by value.
								         *
								         * The GetExtents::Op functor is selected by the type of the referenced object, not by the handle type.
								         */
								        template<typename T_Any>
								        inline auto getExtents(Handle<T_Any>&& any)
								        {
								            auto& referenced = *any.get();
								            using ExtentsOp = GetExtents::Op<ALPAKA_TYPEOF(referenced)>;
								            return ExtentsOp{}(referenced);
								        }

								        /** Functor wrapper that returns the result of `any.getPitches()`. */
								        struct GetPitches
								        {
								            template<typename T_Any>
								            struct Op
								            {
								                // decltype(auto) keeps the exact return type of getPitches(), including references
								                decltype(auto) operator()(auto&& any) const
								                {
								                    return any.getPitches();
								                }
								            };
								        };

								        inline auto getPitches(auto&& any)
								        {
								            return GetPitches::Op<ALPAKA_TYPEOF(any)>{}(any);
								        }

								        /** Return the pitches of the object referenced by the handle @p any by value.
								         *
								         * The GetPitches::Op functor is selected by the type of the referenced object, not by the handle type.
								         */
								        template<typename T_Any>
								        inline auto getPitches(Handle<T_Any>&& any)
								        {
								            auto& referenced = *any.get();
								            using PitchesOp = GetPitches::Op<ALPAKA_TYPEOF(referenced)>;
								            return PitchesOp{}(referenced);
								        }

								        /** Provide a frame specification for the given extents
								         *
								         * The frame extents start with a power-of-two value in the x component (bounded by the device warp size and
								         * the x extent) and are then doubled dimension by dimension until a budget of 512 frame elements is used
								         * up or no dimension can be doubled without exceeding half of the remaining extent.
								         *
								         * @param internalDevice must be an alpaka internal device implementation
								         * @param executor executor stored in the result; `exec::anyExecutor` is rejected at compile time
								         * @param extents problem extents; every component is asserted to be greater than zero
								         * @return FrameSpec built from the number of frames, the frame extents and the executor
								         */
								        inline constexpr auto getFrameSpec(
								            auto const& internalDevice,
								            alpaka::concepts::Executor auto executor,
								            alpaka::concepts::VectorOrScalar auto const& extents)
								        {
								            static_assert(executor != exec::anyExecutor, "'exec::anyExecutor' can not be used here");
								            // normalize scalar or vector extents into a Vec
								            Vec extentMd = extents;
								            using ExtentVecType = ALPAKA_TYPEOF(extentMd);
								            // check that all extent dimensions are greater than zero
								            ALPAKA_ASSERT((extentMd > ExtentVecType::fill(0u)).reduce(std::logical_and{}));
								            using IndexType = alpaka::trait::GetValueType_t<ExtentVecType>;
								            auto props = internal::GetDeviceProperties::Op<ALPAKA_TYPEOF(internalDevice)>{}(internalDevice);
								            IndexType warpSize = static_cast<IndexType>(props.warpSize);
								            // try to create a specification with a frame size of 512 elements
								            IndexType numFrameElements = 512;
								            // avoid non-power of two values
								            IndexType fastDimensionValue = roundDownToPowerOfTwo(std::min(warpSize, extentMd.x()));
								            ExtentVecType frameExtents = ExtentVecType::fill(1).rAssign(fastDimensionValue);
								            // the x component already consumes part of the element budget
								            numFrameElements /= frameExtents.x();
								            // distribute remainder frame elements
								            while(numFrameElements > IndexType{1})
								            {
								                // find the dimension with the largest value of extent / frameExtent / 2; falls back to the last
								                // dimension if no value is greater than zero
								                uint32_t maxIdx = ExtentVecType::dim() - 1u;
								                IndexType maxValue = 0;
								                for(auto i = 0u; i < ExtentVecType::dim(); ++i)
								                {
								                    auto v = extentMd[i] / frameExtents[i] / IndexType{2};
								                    if(maxValue < v)
								                    {
								                        maxIdx = i;
								                        maxValue = v;
								                    }
								                }
								                // apply the change only if we not oversubscribe the extents
								                auto v = extentMd[maxIdx] / frameExtents[maxIdx] / IndexType{2};
								                if(v >= IndexType{1})
								                    frameExtents[maxIdx] *= IndexType{2};
								                else
								                    break;
								                // each doubling of a frame dimension halves the remaining element budget
								                numFrameElements /= IndexType{2};
								            }

								            ExtentVecType numFrames = divExZero(extentMd, frameExtents);
								            auto frameSpec = FrameSpec{numFrames, frameExtents, executor};
								            return frameSpec;
								        }

								        /** Provides a SIMD optimized frame specification
								         *
								         * The frame specification is optimized for a flat non-hierarchical execution via onAcc::worker::threadsInGrid.
								         *
								         * The frame extents are taken from getFrameSpec(); only the number of frames is reduced to account for
								         * multiple elements computed per frame item.
								         *
								         * @tparam T_DataType the data type for which you would like to SIMD optimize
								         * @param internalDevice must be an alpaka internal device implementation
								         * @param executor executor stored in the result; `exec::anyExecutor` is rejected at compile time
								         * @param extents problem extents; every component is asserted to be greater than zero
								         * @return FrameSpec built from the number of frames, the frame extents and the executor
								         */
								        template<typename T_DataType>
								        inline constexpr auto getSimdFrameSpec(
								            auto const& internalDevice,
								            alpaka::concepts::Executor auto executor,
								            alpaka::concepts::VectorOrScalar auto const& extents)
								        {
								            static_assert(executor != exec::anyExecutor, "'exec::anyExecutor' can not be used here");
								            Vec extentMd = extents;
								            auto deviceKind = alpaka::internal::getDeviceKind(internalDevice);
								            auto deviceApi = alpaka::internal::getApi(internalDevice);
								            using ExtentVecType = ALPAKA_TYPEOF(extentMd);
								            // check that all extent dimensions are greater than zero
								            ALPAKA_ASSERT((extentMd > ExtentVecType::fill(0u)).reduce(std::logical_and{}));
								            using IndexType = alpaka::trait::GetValueType_t<ExtentVecType>;

								            // reuse the frame extents of the non-SIMD frame specification
								            ExtentVecType frameExtents = getFrameSpec(internalDevice, executor, extents).getFrameExtents();

								            IndexType elementsPerFrameItem
								                = static_cast<IndexType>(getNumElemPerThread<T_DataType>(deviceApi, deviceKind));

								            /* The number of frames depends on an imaginary frame extent where each frame item is computing multiple
								             * elements from the problem extents.
								             */
								            ExtentVecType numFrames
								                = divExZero(extentMd, frameExtents * frameExtents.fill(1).rAssign(elementsPerFrameItem));
								            // The frame specification is not required to be a multiple of the extent, it can be smaller.
								            FrameSpec frameSpec = FrameSpec{numFrames, frameExtents, executor};
								            return frameSpec;
								        }
								    } // namespace internal
								} // namespace alpaka::onHost
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/internal/interface.hpp ==
								// ============================================================================


							// #include <concepts>    // amalgamate: file already included
							// #include <string>    // amalgamate: file already included

							namespace alpaka::onHost
							{
							    // Concepts describing the internal implementation types (not the handles passed around by users).
							    namespace internal::concepts
							    {
							        /** A type for which the name, make-event, native-handle and device-properties functors are callable.
							         *
							         * The name functor must yield something convertible to std::string.
							         */
							        template<typename T>
							        concept Device = requires(T device) {
							            { alpaka::internal::GetName::Op<T>{}(device) } -> std::convertible_to<std::string>;
							            { internal::MakeEvent::Op<T>{}(device) };
							            { internal::GetNativeHandle::Op<T>{}(device) };
							            { internal::GetDeviceProperties::Op<T>{}(device) };
							        };

							        /** A type for which the name functor is callable. */
							        template<typename T>
							        concept Platform = requires(T platform) {
							            { alpaka::internal::GetName::Op<T>{}(platform) };
							        };

							        /** A type for which the name and native-handle functors are callable.
							         *
							         * The requires-parameter is named `device` but stands for the queue object.
							         */
							        template<typename T>
							        concept Queue = requires(T device) {
							            { alpaka::internal::GetName::Op<T>{}(device) } -> std::convertible_to<std::string>;
							            { internal::GetNativeHandle::Op<T>{}(device) };
							        };

							        /** A type exposing `element_type` where the element type satisfies Queue. */
							        template<typename T>
							        concept QueueHandle = requires(T t) {
							            typename T::element_type;
							            requires Queue<typename T::element_type>;
							        };

							        /** A type exposing `element_type` where the element type satisfies Platform. */
							        template<typename T>
							        concept PlatformHandle = requires(T t) {
							            typename T::element_type;
							            requires Platform<typename T::element_type>;
							        };

							        /** A type exposing `element_type` where the element type satisfies Device. */
							        template<typename T>
							        concept DeviceHandle = requires(T t) {
							            typename T::element_type;
							            requires Device<typename T::element_type>;
							        };
							    } // namespace internal::concepts

							    namespace concepts
							    {
							        /** A type exposing `element_type` where the element type satisfies alpaka::concepts::HasName. */
							        template<typename T>
							        concept NameHandle = requires(T t) {
							            typename T::element_type;
							            requires alpaka::concepts::HasName<typename T::element_type>;
							        };

							        /** A type exposing `element_type` where the element type satisfies alpaka::concepts::HasStaticName. */
							        template<typename T>
							        concept StaticNameHandle = requires(T t) {
							            typename T::element_type;
							            requires alpaka::concepts::HasStaticName<typename T::element_type>;
							        };
							    } // namespace concepts

							} // namespace alpaka::onHost
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/concepts.hpp ==
							// ============================================================================

						// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

						#include <type_traits>

						namespace alpaka::onHost
						{
						    namespace trait
						    {
						        /** Reports whether a platform exists for an API; false unless Op is specialized. */
						        struct IsPlatformAvailable
						        {
						            template<alpaka::concepts::Api T_Api>
						            struct Op : std::false_type
						            {
						            };
						        };

						        /** Reports whether an executor is supported by a device; false unless Op is specialized. */
						        struct IsExecutorSupportedBy
						        {
						            template<alpaka::concepts::Executor T_Executor, typename T_Device>
						            struct Op : std::false_type
						            {
						            };
						        };

						        // A device handle inherits the answer of the device type it refers to.
						        template<alpaka::concepts::Executor T_Executor, internal::concepts::DeviceHandle T_DeviceHandle>
						        struct IsExecutorSupportedBy::Op<T_Executor, T_DeviceHandle>
						            : IsExecutorSupportedBy::Op<T_Executor, typename T_DeviceHandle::element_type>
						        {
						        };

						        /** Reports whether a device kind is supported by an API; false unless Op is specialized. */
						        struct IsDeviceSupportedBy
						        {
						            template<alpaka::concepts::DeviceKind T_DeviceKind, typename T_Api>
						            struct Op : std::false_type
						            {
						            };
						        };

						        /** User specializable trait to provide the dynamic shared memory size of a kernel.
						         *
						         * The generic version ignores kernel and specification; its call operator is disabled.
						         */
						        template<typename T_Kernel, concepts::ThreadSpec T_Spec>
						        struct BlockDynSharedMemBytes
						        {
						            BlockDynSharedMemBytes(T_Kernel kernel, T_Spec spec)
						            {
						                alpaka::unused(kernel, spec);
						            }

						            /** Get amount of dynamic shared memory in bytes.
						             *
						             * @attention `requires(false)` disables this function. If you specialize this trait, remove the
						             * requires clause. Disabling is required so that the trait is only evaluated in cases where the user
						             * defines it.
						             */
						            uint32_t operator()(auto const&... args) const requires(false)
						            {
						                alpaka::unused(args...);
						                return 0;
						            }
						        };

						        /** Fallback: reports zero bytes of dynamic shared memory.
						         *
						         * `zeroSharedMemory` marks this fallback; it is checked by HasUserDefinedDynSharedMemBytes.
						         */
						        template<onHost::concepts::ThreadSpec T_ThreadSpec, alpaka::concepts::KernelBundle T_KernelBundle>
						        struct GetDynSharedMemBytes
						        {
						            static constexpr bool zeroSharedMemory = true;

						            uint32_t operator()(T_ThreadSpec const spec, [[maybe_unused]] T_KernelBundle const& kernelBundle) const
						            {
						                alpaka::unused(spec);
						                return 0u;
						            }
						        };

						        /** Selected if the kernel functor has a member `dynSharedMemBytes` or if BlockDynSharedMemBytes is
						         * callable with the kernel arguments.
						         */
						        template<concepts::ThreadSpec T_Spec, typename T_KernelFn, typename... T_Args>
						        requires requires() { std::declval<T_KernelFn>().dynSharedMemBytes; } || requires() {
						            BlockDynSharedMemBytes<T_KernelFn, T_Spec>{std::declval<T_KernelFn>(), std::declval<T_Spec>()}(
						                std::declval<remove_restrict_t<std::decay_t<T_Args>>>()...);
						        }
						        struct GetDynSharedMemBytes<T_Spec, KernelBundle<T_KernelFn, T_Args...>>
						        {
						            uint32_t operator()(
						                T_Spec const spec,
						                [[maybe_unused]] KernelBundle<T_KernelFn, T_Args...> const& kernelBundle) const
						            {
						                // the BlockDynSharedMemBytes trait takes precedence over the kernel member variable
						                if constexpr(requires {
						                                 BlockDynSharedMemBytes<T_KernelFn, T_Spec>{kernelBundle.m_kernelFn, spec}(
						                                     std::declval<remove_restrict_t<std::decay_t<T_Args>>>()...);
						                             })
						                {
						                    // call the trait with the arguments stored in the kernel bundle
						                    return alpaka::apply(
						                        [&](auto const&... args)
						                        { return BlockDynSharedMemBytes<T_KernelFn, T_Spec>{kernelBundle.m_kernelFn, spec}(args...); },
						                        kernelBundle.m_args);
						                }
						                else
						                {
						                    return kernelBundle.m_kernelFn.dynSharedMemBytes;
						                }
						            }
						        };

						        /** True by default; the constrained specialization below turns it to false for the zero-byte fallback. */
						        template<onHost::concepts::ThreadSpec T_ThreadSpec, alpaka::concepts::KernelBundle T_KernelBundle>
						        struct HasUserDefinedDynSharedMemBytes : std::true_type
						        {
						        };

						        // Only the fallback GetDynSharedMemBytes provides the member zeroSharedMemory.
						        template<onHost::concepts::ThreadSpec T_ThreadSpec, alpaka::concepts::KernelBundle T_KernelBundle>
						        requires(trait::GetDynSharedMemBytes<T_ThreadSpec, T_KernelBundle>::zeroSharedMemory == true)
						        struct HasUserDefinedDynSharedMemBytes<T_ThreadSpec, T_KernelBundle> : std::false_type
						        {
						        };

						        // required to return a compile time constant
						        struct GetMaxThreadsPerBlock
						        {
						            template<
						                alpaka::concepts::Api T_Api,
						                alpaka::concepts::DeviceKind T_DeviceKind,
						                alpaka::concepts::Executor T_Exec>
						            struct Op
						            {
						                consteval uint32_t operator()(T_Api const, T_DeviceKind const, T_Exec const) const
						                {
						                    // the condition depends on T_Api so that the assert only fires when this generic version is
						                    // instantiated
						                    static_assert(
						                        sizeof(T_Api) && false,
						                        "Missing definition of GetMaxThreadsPerBlock for this combination of API, device kind, "
						                        "and executor.");
						                    return 1u;
						                }
						            };
						        };

						    } // namespace trait

						    /** Compile-time query whether a platform is available for the given API.
						     *
						     * Evaluates trait::IsPlatformAvailable::Op for the decayed type of @p api.
						     * The spelling of the function name is kept as is to not break existing callers.
						     */
						    consteval bool isPlatformAvaiable(alpaka::concepts::Api auto api)
						    {
						        using ApiType = std::decay_t<decltype(api)>;
						        return trait::IsPlatformAvailable::Op<ApiType>::value;
						    }

						    /** Compile-time query whether @p executor is supported by the device behind @p deviceHandle.
						     *
						     * Only the argument types are inspected; the result is trait::IsExecutorSupportedBy::Op<...>::value.
						     */
						    consteval bool isExecutorSupportedBy(auto executor, internal::concepts::DeviceHandle auto const& deviceHandle)
						    {
						        using SupportTrait
						            = trait::IsExecutorSupportedBy::Op<ALPAKA_TYPEOF(executor), ALPAKA_TYPEOF(deviceHandle)>;
						        return SupportTrait::value;
						    }

						    /** Filter @p listOfExecutors down to the executors supported by the device behind @p deviceHandle.
						     *
						     * @return the result of meta::filter applied with trait::IsExecutorSupportedBy as predicate
						     */
						    constexpr auto supportedExecutors(internal::concepts::DeviceHandle auto deviceHandle, auto const listOfExecutors)
						    {
						        return meta::filter(
						            // we can not use isExecutorSupportedBy() because gcc14 is stricter in the detection which functions can
						            // be evaluated at compile time
						            [&](auto executor) constexpr
						            { return trait::IsExecutorSupportedBy::Op<ALPAKA_TYPEOF(executor), ALPAKA_TYPEOF(deviceHandle)>::value; },
						            listOfExecutors);
						    }

						    /** Select a default executor for the given device.
						     *
						     * Picks the first executor (with the most parallelism) supported by the device out of all known executors.
						     *
						     * @return element 0 of supportedExecutors(deviceHandle, exec::allExecutors)
						     */
						    constexpr auto defaultExecutor(internal::concepts::DeviceHandle auto deviceHandle)
						    {
						        return std::get<0>(supportedExecutors(deviceHandle, exec::allExecutors));
						    }

						    /** Filter deviceKind::allDevices down to the device kinds supported by the given API.
						     *
						     * @return the result of meta::filter applied with trait::IsDeviceSupportedBy as predicate
						     */
						    constexpr auto supportedDevices(auto const api)
						    {
						        return meta::filter(
						            // the trait is evaluated directly inside the lambda, same as in supportedExecutors(), because gcc14 is
						            // stricter in the detection which functions can be evaluated at compile time
						            [&](auto devTag) constexpr
						            { return trait::IsDeviceSupportedBy::Op<ALPAKA_TYPEOF(devTag), ALPAKA_TYPEOF(api)>::value; },
						            deviceKind::allDevices);
						    }

						    /** Query the dynamic shared memory size in bytes for a kernel bundle launched with @p spec.
						     *
						     * Evaluates trait::GetDynSharedMemBytes for the given thread specification and kernel bundle types.
						     */
						    template<onHost::concepts::ThreadSpec T_ThreadSpec, alpaka::concepts::KernelBundle T_KernelBundle>
						    constexpr uint32_t getDynSharedMemBytes(T_ThreadSpec spec, T_KernelBundle const& kernelBundle)
						    {
						        using SharedMemTrait = trait::GetDynSharedMemBytes<T_ThreadSpec, T_KernelBundle>;
						        return SharedMemTrait{}(spec, kernelBundle);
						    }

						    /** Compile-time query whether a dynamic shared memory size is defined for the kernel bundle.
						     *
						     * Only the argument types are inspected; the result is trait::HasUserDefinedDynSharedMemBytes<...>::value.
						     */
						    template<onHost::concepts::ThreadSpec T_ThreadSpec, alpaka::concepts::KernelBundle T_KernelBundle>
						    consteval bool hasUserDefinedDynSharedMemBytes(T_ThreadSpec spec, T_KernelBundle const& kernelBundle)
						    {
						        alpaka::unused(spec, kernelBundle);
						        using UserDefinedTrait = trait::HasUserDefinedDynSharedMemBytes<T_ThreadSpec, T_KernelBundle>;
						        return UserDefinedTrait::value;
						    }

						    /** Conservative compile-time lower bound of the maximum number of threads per block for a combination of
						     * API, device kind and executor.
						     *
						     * The value is the minimum number of threads per block guaranteed to be supported across all devices of
						     * the executor's backend family. An actual device may support more; the bound is meant for clamping block
						     * sizes at compile time.
						     *
						     * @attention Due to lmem, shared memory or register usage the actual limit could be lower. In that case a
						     * kernel launched with this compile-time maximum fails at runtime with an invalid kernel configuration.
						     * This can not be detected at compile time.
						     */
						    template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind, alpaka::concepts::Executor T_Exec>
						    consteval uint32_t getMaxThreadsPerBlock(T_Api api, T_DeviceKind deviceKind, T_Exec exec)
						    {
						        using MaxThreadsOp = trait::GetMaxThreadsPerBlock::Op<T_Api, T_DeviceKind, T_Exec>;
						        return MaxThreadsOp{}(api, deviceKind, exec);
						    }

						} // namespace alpaka::onHost
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/trait.hpp ==
						// ============================================================================

					// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

					// #include <memory>    // amalgamate: file already included
					// #include <sstream>    // amalgamate: file already included

					namespace alpaka
					{
					    namespace api
					    {
					        /** API tag type for CUDA. */
					        struct Cuda : detail::ApiBase
					        {
					            // the tag names itself as element type and hands out a pointer to itself via get()
					            using element_type = Cuda;

					            auto get() const
					            {
					                return this;
					            }

					            // never called; only hosts the compile-time check that Cuda fulfills concepts::Api
					            void _()
					            {
					                static_assert(concepts::Api<Cuda>);
					            }

					            static std::string getName()
					            {
					                return "Cuda";
					            }
					        };

					        /** Instance of the CUDA API tag. */
					        constexpr auto cuda = Cuda{};

					    } // namespace api

					    namespace onHost::trait
					    {
					// platform and device availability are only announced if ALPAKA_LANG_CUDA is set
					#if ALPAKA_LANG_CUDA
					        template<>
					        struct IsPlatformAvailable::Op<api::Cuda> : std::true_type
					        {
					        };

					        template<>
					        struct IsDeviceSupportedBy::Op<deviceKind::NvidiaGpu, api::Cuda> : std::true_type
					        {
					        };
					#endif

					        /** All modern NVIDIA GPUs support at least 1024 threads per block. */
					        template<>
					        struct GetMaxThreadsPerBlock::Op<api::Cuda, deviceKind::NvidiaGpu, exec::GpuCuda>
					        {
					            consteval uint32_t operator()(api::Cuda const, deviceKind::NvidiaGpu const, exec::GpuCuda const) const
					            {
					                return 1024u;
					            }
					        };
					    } // namespace onHost::trait

					    namespace unifiedCudaHip::trait
					    {
					        template<>
					        struct IsUnifiedApi<api::Cuda> : std::true_type
					        {
					        };
					    } // namespace unifiedCudaHip::trait

					    namespace trait
					    {
					        /** SIMD width for T_Type on NVIDIA GPUs used through CUDA.
					         *
					         * The result is derived from a 16 byte vector load/store width and the size of T_Type.
					         */
					        template<typename T_Type>
					        struct GetArchSimdWidth::Op<T_Type, api::Cuda, deviceKind::NvidiaGpu>
					        {
					            constexpr uint32_t operator()(api::Cuda const, deviceKind::NvidiaGpu const) const
					            {
					                // vector load and store width in bytes
					                constexpr size_t vectorAccessBytes = 16u;
					                return alpaka::divExZero(vectorAccessBytes, sizeof(T_Type));
					            }
					        };

					        /** Number of pipelines assumed for NVIDIA GPUs used through CUDA. */
					        template<>
					        struct GetNumPipelines::Op<api::Cuda, deviceKind::NvidiaGpu>
					        {
					            constexpr uint32_t operator()(api::Cuda const, deviceKind::NvidiaGpu const) const
					            {
					                // NVIDIA GPUs have two schedulers, which are interpreted as pipelines.
					                return 2u;
					            }
					        };

					        /** Cacheline size in bytes reported for NVIDIA GPUs used through CUDA. */
					        template<>
					        struct GetCachelineSize::Op<api::Cuda, deviceKind::NvidiaGpu>
					        {
					            constexpr uint32_t operator()(api::Cuda const, deviceKind::NvidiaGpu const) const
					            {
					                // loading 16 bytes per thread will result in optimal memory bandwidth
					                constexpr uint32_t bytesPerThread = 16u;
					                return bytesPerThread;
					            }
					        };
					    } // namespace trait
					} // namespace alpaka
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/Api.hpp ==
					// ============================================================================

					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/Api.hpp ==
					// ==
					/* Copyright 2024 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */


					// #pragma once
					// #include "alpaka/api/unifiedCudaHip/trait.hpp"    // amalgamate: file already inlined
					// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined
					// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/trait.hpp"    // amalgamate: file already inlined
					// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

					// #include <memory>    // amalgamate: file already included
					// #include <sstream>    // amalgamate: file already included

					namespace alpaka
					{
					    namespace api
					    {
					        struct Hip : detail::ApiBase
					        {
					            using element_type = Hip;

					            auto get() const
					            {
					                return this;
					            }

					            void _()
					            {
					                static_assert(concepts::Api<Hip>);
					            }

					            static std::string getName()
					            {
					                return "Hip";
					            }
					        };

					        constexpr auto hip = Hip{};
					    } // namespace api

					    namespace onHost::trait
					    {
					#if ALPAKA_LANG_HIP
					        /** The HIP platform is only flagged as available if ALPAKA_LANG_HIP is enabled. */
					        template<>
					        struct IsPlatformAvailable::Op<api::Hip> : std::true_type
					        {
					        };

					        /** AMD GPUs are flagged as supported by the HIP API. */
					        template<>
					        struct IsDeviceSupportedBy::Op<deviceKind::AmdGpu, api::Hip> : std::true_type
					        {
					        };
					#endif

					        /** Compile time thread limit per block for the HIP executor on AMD GPUs.
					         *
					         * All modern AMD GPUs support at least 1024 threads per block.
					         */
					        template<>
					        struct GetMaxThreadsPerBlock::Op<api::Hip, deviceKind::AmdGpu, exec::GpuHip>
					        {
					            consteval uint32_t operator()(api::Hip const, deviceKind::AmdGpu const, exec::GpuHip const) const
					            {
					                constexpr uint32_t threadLimit = 1024u;
					                return threadLimit;
					            }
					        };
					    } // namespace onHost::trait

					    namespace unifiedCudaHip::trait
					    {
					        /** Flags api::Hip as unified CUDA/HIP API. */
					        template<>
					        struct IsUnifiedApi<api::Hip> : std::true_type
					        {
					        };
					    } // namespace unifiedCudaHip::trait

					    namespace trait
					    {
					        /** SIMD width for T_Type on AMD GPUs used through HIP.
					         *
					         * The result is derived from a 16 byte vector load/store width and the size of T_Type.
					         */
					        template<typename T_Type>
					        struct GetArchSimdWidth::Op<T_Type, api::Hip, deviceKind::AmdGpu>
					        {
					            constexpr uint32_t operator()(api::Hip const, deviceKind::AmdGpu const) const
					            {
					                // vector load/store width in bytes
					                constexpr size_t vectorAccessBytes = 16u;
					                return alpaka::divExZero(vectorAccessBytes, sizeof(T_Type));
					            }
					        };

					        /** Number of pipelines assumed for AMD GPUs used through HIP. */
					        template<>
					        struct GetNumPipelines::Op<api::Hip, deviceKind::AmdGpu>
					        {
					            constexpr uint32_t operator()(api::Hip const, deviceKind::AmdGpu const) const
					            {
					                // The SIMD units of AMD GPUs are interpreted as pipelines.
					                return 4u;
					            }
					        };

					        /** Cacheline size in bytes reported for AMD GPUs used through HIP. */
					        template<>
					        struct GetCachelineSize::Op<api::Hip, deviceKind::AmdGpu>
					        {
					            constexpr uint32_t operator()(api::Hip const, deviceKind::AmdGpu const) const
					            {
					                // loading 16 bytes per thread will result in optimal memory bandwidth
					                constexpr uint32_t bytesPerThread = 16u;
					                return bytesPerThread;
					            }
					        };
					    } // namespace trait
					} // namespace alpaka
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/Api.hpp ==
					// ============================================================================

					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/Api.hpp ==
					// ==
					/* Copyright 2024 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/cpuArchSize.hpp ==
						// ==
						/* Copyright 2025 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/simdConfig.hpp ==
							// ==
							/* Copyright 2026 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

							/* Selects the std::simd implementation and publishes the result as ALPAKA_HAS_STD_SIMD (1 or 0).
							 *
							 * If a usable header is found, the namespace alias `alpakaStdSimd` points to the namespace providing
							 * the implementation (`std` for <simd>, `std::experimental` for <experimental/simd>).
							 * Defining ALPAKA_DISABLE_STD_SIMD skips the header detection.
							 *
							 * We can not include 'experimental/simd' with NVCC, else we will trigger the compiler error:
							 * experimental/bits/simd.h(1537): error: invalid type conversion
							 * reinterpret_cast<__vector_type_t<float, 4>>(__v)));
							 */
							#if !ALPAKA_COMP_NVCC

							#    if !defined(ALPAKA_DISABLE_STD_SIMD)
							#        if __has_include(<simd>)
							#            include <simd>
							namespace alpakaStdSimd = std;
							#            if !defined(ALPAKA_HAS_STD_SIMD)
							#                define ALPAKA_HAS_STD_SIMD 1
							#            endif
							#        elif __has_include(<experimental/simd>)
							#            include <experimental/simd>
							namespace alpakaStdSimd = std::experimental;
							#            if !defined(ALPAKA_HAS_STD_SIMD)
							#                define ALPAKA_HAS_STD_SIMD 1
							#            endif
							#        endif
							#    endif

							#endif

							// In case it is not already set, set it to disabled. This ensures that this header is included wherever the
							// macro is used: if this header is not included, the compiler flag `-Wundef` will report the undefined macro.
							#if !defined(ALPAKA_HAS_STD_SIMD)
							#    define ALPAKA_HAS_STD_SIMD 0
							#endif
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/simdConfig.hpp ==
							// ============================================================================

						// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

						// #include <cstdint>    // amalgamate: file already included

						namespace alpaka::onHost::internal
						{

						    /** SIMD width in bytes as defined by std::simd.
						     *
						     * This unconstrained overload is the fallback; if ALPAKA_HAS_STD_SIMD is enabled, a constrained overload
						     * is additionally provided for every T_Type for which `alpakaStdSimd::native_simd<T_Type>::size()` is a
						     * valid expression.
						     *
						     * @return 0 if std::simd is not supported or the T_Type is unsupported, else the SIMD width in bytes
						     */
						    template<typename T_Type>
						    constexpr size_t stdSimdWidth()
						    {
						        return size_t{0};
						    }
						#if ALPAKA_HAS_STD_SIMD
						    template<typename T_Type>
						    requires requires { alpakaStdSimd::native_simd<T_Type>::size(); }
						    constexpr size_t stdSimdWidth()
						    {
						        // size of one element times the number of elements of the native SIMD type
						        return sizeof(T_Type) * alpakaStdSimd::native_simd<T_Type>::size();
						    }
						#endif


						    /** SIMD width of the CPU the code is compiled for, expressed in elements of T_Type.
						     *
						     * A width in bytes is first guessed from the instruction set macros predefined by the compiler (or from
						     * the user provided macro SVE_VECTOR_BITS). If stdSimdWidth<T_Type>() reports a non-zero width, that value
						     * takes precedence over the guess. The byte width is finally converted into a number of elements with
						     * alpaka::divExZero(widthInBytes, sizeof(T_Type)).
						     *
						     * NOTE(review): divExZero is defined outside of this section; presumably it is a division which never
						     * yields zero - confirm in alpaka/utility.hpp.
						     *
						     * @tparam T_Type element type the width is calculated for
						     * @return SIMD width in number of elements of T_Type
						     */
						    template<typename T_Type>
						    constexpr uint32_t getCPUSimdWidth()
						    {
						        // The first matching branch wins, therefore the order of the checks is significant.
						        constexpr size_t possibleSimdWidthBytes =
						#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX512DQ__) || defined(__AVX512VL__)
						            64u;
						#elif defined(__riscv_vector)
						            64u;
						#elif defined(__riscv)
						            // do not use vectors if the vector extension is not set
						            sizeof(T_Type);
						#elif defined(__AVX2__)
						            32u;
						#elif defined(__SSE__) || defined(__SSE2__) || defined(__SSE4_1__) || defined(__SSE4_2__)
						            16u;
						// Macro to be defined by the user to enable the SVE backend and to specify the SVE size in bits
						#elif defined(SVE_VECTOR_BITS)
						            SVE_VECTOR_BITS / 8;
						// If the user has specified the SVE vector length using the flag -msve-vector-bits
						#elif defined(__ARM_FEATURE_SVE_BITS)
						            __ARM_FEATURE_SVE_BITS / 8;
						// ARM e.g. NVIDIA Grace Hopper
						#elif defined(__ARM_FEATURE_SVE2_AES)
						            16u;
						// ARM e.g. AWS Graviton 3
						#elif defined(__ARM_FEATURE_SVE)
						            32u;
						#elif defined(__ARM_NEON)
						            16u;
						#elif defined(__ALTIVEC__)
						            16u;
						#else
						            // unknown architecture: fall back to the size of a single element
						            sizeof(T_Type);
						#endif

						        // We assume that the standard library maintains the vector length better than we do, therefore take
						        // its value if vector types are supported (non-zero result).
						        constexpr size_t simdWidthInByte = stdSimdWidth<T_Type>() ? stdSimdWidth<T_Type>() : possibleSimdWidthBytes;

						        return alpaka::divExZero(simdWidthInByte, sizeof(T_Type));
						    }

						    constexpr uint32_t getCPUNumPipelines()
						    {
						        /* Intel CPUs can issue 4 commands and AMD CPUs typically 2. Since we can not distinguish between
						         * both, the higher value is used.
						         * ARM SVE can typically issue 4 commands too.
						         *
						         * Therefore 4 is used as default at the moment.
						         */
						        return 4u;
						    }

						    /** Cacheline size of the CPU in bytes.
						     *
						     * @return std::hardware_constructive_interference_size if the standard library announces the feature,
						     * else 64
						     */
						    constexpr uint32_t getCPUCachelineSize()
						    {
						#ifdef __cpp_lib_hardware_interference_size
						        return static_cast<uint32_t>(std::hardware_constructive_interference_size);
						#else
						        // Fallback value, typically 64 bytes
						        return 64u;
						#endif
						    }

						} // namespace alpaka::onHost::internal
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/cpuArchSize.hpp ==
						// ============================================================================

					// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
					// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/trait.hpp ==
						// ==
						/* Copyright 2025 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once

						// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/layout.hpp ==
							// ==
							/* Copyright 2024 Andrea Bocci, René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							namespace alpaka::onAcc::layout
							{
							    /** Layout tag: generates indices scattered based on the number of worker threads for each dimension. */
							    struct Strided
							    {
							    };

							    /** Layout tag: indices will be contiguous within each dimension for each worker thread. */
							    struct Contiguous
							    {
							    };

							    /** Layout tag: the index layout will be selected automatically based on the executor. */
							    struct Optimized
							    {
							    };

							    // One instance per layout tag.
							    constexpr Strided strided{};
							    constexpr Contiguous contiguous{};
							    constexpr Optimized optimized{};
							} // namespace alpaka::onAcc::layout
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/layout.hpp ==
							// ============================================================================

						// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

						// #include <cstdint>    // amalgamate: file already included

						namespace alpaka
						{
						    namespace onAcc::internal
						    {
						        namespace trait
						        {
						            struct AutoIndexMapping
						            {
						                template<typename T_Acc, typename T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
						                struct Op
						                {
						                    constexpr auto operator()(T_Acc const&, T_Api, T_DeviceKind) const
						                    {
						                        return layout::Strided{};
						                    }
						                };
						            };
						        } // namespace trait

						        constexpr auto adjustMapping(auto const& acc)
						        {
						            return trait::AutoIndexMapping::
						                Op<ALPAKA_TYPEOF(acc), ALPAKA_TYPEOF(acc.getApi()), ALPAKA_TYPEOF(acc.getDeviceKind())>{}(
						                    acc,
						                    acc.getApi(),
						                    acc.getDeviceKind());
						        }

						    } // namespace onAcc::internal

						    namespace internal
						    {
						        /** Specialize the trait for a DataSource class if the object is copyable.
						         *
						         * @tparam TDataSource The DataSource class.
						         *
						         * @details
						         *
						         * The trait is used in the alpaka::internal::concepts::CopyConstructableDataSource concept to check whether
						         * the copy constructor respects the const correctness of the data type.
						         * The primary template derives from std::false_type, therefore a DataSource class is only handled as
						         * copy constructible if a specialization exists.
						         *
						         * Example specialization:
						         *
						         * @code
						         * template<typename T_Type>
						         * struct CopyConstructableDataSource<Storage<T_Type>> : std::true_type {
						         *      using InnerMutable = Storage<std::remove_const_t<T_Type>>;
						         *      using InnerConst = Storage<std::add_const_t<T_Type>>;
						         * };
						         * @endcode
						         */
						        template<typename TDataSource>
						        struct CopyConstructableDataSource : std::false_type
						        {
						        };

						    } // namespace internal
						} // namespace alpaka
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/trait.hpp ==
						// ============================================================================

					// #include "alpaka/onHost/trait.hpp"    // amalgamate: file already inlined

					// #include <string>    // amalgamate: file already included

					namespace alpaka
					{
					    namespace api
					    {
					        struct Host : detail::ApiBase
					        {
					            using element_type = Host;

					            auto get() const
					            {
					                return this;
					            }

					            void _()
					            {
					                static_assert(concepts::Api<Host>);
					            }

					            static std::string getName()
					            {
					                return "Host";
					            }
					        };

					        constexpr auto host = Host{};
					    } // namespace api

					    namespace onHost::trait
					    {
					        /** The host platform is flagged as available without any precondition. */
					        template<>
					        struct IsPlatformAvailable::Op<api::Host> : std::true_type
					        {
					        };

					        /** CPU devices are flagged as supported by the host API. */
					        template<>
					        struct IsDeviceSupportedBy::Op<deviceKind::Cpu, api::Host> : std::true_type
					        {
					        };

					        /** NUMA CPU devices are flagged as supported by the host API. */
					        template<>
					        struct IsDeviceSupportedBy::Op<deviceKind::NumaCpu, api::Host> : std::true_type
					        {
					        };

					        /** The serial CPU executor is limited to one thread per block. */
					        template<typename T_DeviceKind>
					        struct GetMaxThreadsPerBlock::Op<api::Host, T_DeviceKind, exec::CpuSerial>
					        {
					            consteval uint32_t operator()(api::Host const, T_DeviceKind const, exec::CpuSerial const) const
					            {
					                constexpr uint32_t singleThread = 1u;
					                return singleThread;
					            }
					        };

					        /** The OpenMP blocks executor is limited to one thread per block. */
					        template<typename T_DeviceKind>
					        struct GetMaxThreadsPerBlock::Op<api::Host, T_DeviceKind, exec::CpuOmpBlocks>
					        {
					            consteval uint32_t operator()(api::Host const, T_DeviceKind const, exec::CpuOmpBlocks const) const
					            {
					                constexpr uint32_t singleThread = 1u;
					                return singleThread;
					            }
					        };

					        /** The TBB blocks executor is limited to one thread per block. */
					        template<typename T_DeviceKind>
					        struct GetMaxThreadsPerBlock::Op<api::Host, T_DeviceKind, exec::CpuTbbBlocks>
					        {
					            consteval uint32_t operator()(api::Host const, T_DeviceKind const, exec::CpuTbbBlocks const) const
					            {
					                constexpr uint32_t singleThread = 1u;
					                return singleThread;
					            }
					        };
					    } // namespace onHost::trait

					    namespace trait
					    {

					        // --- deviceKind::Cpu: all values are forwarded to the CPU helpers in alpaka::onHost::internal ---

					        template<typename T_Type>
					        struct GetArchSimdWidth::Op<T_Type, api::Host, deviceKind::Cpu>
					        {
					            constexpr uint32_t operator()(api::Host const, deviceKind::Cpu const) const
					            {
					                using alpaka::onHost::internal::getCPUSimdWidth;
					                return getCPUSimdWidth<T_Type>();
					            }
					        };

					        template<>
					        struct GetNumPipelines::Op<api::Host, deviceKind::Cpu>
					        {
					            constexpr uint32_t operator()(api::Host const, deviceKind::Cpu const) const
					            {
					                using alpaka::onHost::internal::getCPUNumPipelines;
					                return getCPUNumPipelines();
					            }
					        };

					        template<>
					        struct GetCachelineSize::Op<api::Host, deviceKind::Cpu>
					        {
					            constexpr uint32_t operator()(api::Host const, deviceKind::Cpu const) const
					            {
					                using alpaka::onHost::internal::getCPUCachelineSize;
					                return getCPUCachelineSize();
					            }
					        };

					        // --- deviceKind::NumaCpu: same values as for deviceKind::Cpu ---

					        template<typename T_Type>
					        struct GetArchSimdWidth::Op<T_Type, api::Host, deviceKind::NumaCpu>
					        {
					            constexpr uint32_t operator()(api::Host const, deviceKind::NumaCpu const) const
					            {
					                using alpaka::onHost::internal::getCPUSimdWidth;
					                return getCPUSimdWidth<T_Type>();
					            }
					        };

					        template<>
					        struct GetNumPipelines::Op<api::Host, deviceKind::NumaCpu>
					        {
					            constexpr uint32_t operator()(api::Host const, deviceKind::NumaCpu const) const
					            {
					                using alpaka::onHost::internal::getCPUNumPipelines;
					                return getCPUNumPipelines();
					            }
					        };

					        template<>
					        struct GetCachelineSize::Op<api::Host, deviceKind::NumaCpu>
					        {
					            constexpr uint32_t operator()(api::Host const, deviceKind::NumaCpu const) const
					            {
					                using alpaka::onHost::internal::getCPUCachelineSize;
					                return getCPUCachelineSize();
					            }
					        };

					    } // namespace trait

					    namespace onAcc::internal::trait
					    {
					        /** Host API on CPU devices: the contiguous index layout is selected. */
					        template<typename T_Acc>
					        struct AutoIndexMapping::Op<T_Acc, api::Host, deviceKind::Cpu>
					        {
					            constexpr auto operator()(T_Acc const&, api::Host, deviceKind::Cpu) const
					            {
					                using SelectedLayout = layout::Contiguous;
					                return SelectedLayout{};
					            }
					        };

					        /** Host API on NUMA CPU devices: the contiguous index layout is selected. */
					        template<typename T_Acc>
					        struct AutoIndexMapping::Op<T_Acc, api::Host, deviceKind::NumaCpu>
					        {
					            constexpr auto operator()(T_Acc const&, api::Host, deviceKind::NumaCpu) const
					            {
					                using SelectedLayout = layout::Contiguous;
					                return SelectedLayout{};
					            }
					        };
					    } // namespace onAcc::internal::trait
					} // namespace alpaka
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/Api.hpp ==
					// ============================================================================

					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/Api.hpp ==
					// ==
					/* Copyright 2024 René Widera, Simeon Ehrig
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
					// #include "alpaka/api/host/cpuArchSize.hpp"    // amalgamate: file already inlined
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/Api.hpp ==
						// ==
						/* Copyright 2024 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined

						// #include <memory>    // amalgamate: file already included
						// #include <string>    // amalgamate: file already included

						namespace alpaka
						{
						    namespace api
						    {
						        /** Base class for SYCL based API tags.
						         *
						         * @tparam TApiInterface the API type deriving from this class; get() casts `this` to that type
						         */
						        template<typename TApiInterface>
						        struct GenericSycl : detail::ApiBase
						        {
						            using element_type = TApiInterface;

						            /** @return the name of this API as string */
						            static std::string getName()
						            {
						                return std::string("GenericSycl");
						            }

						            /** @return pointer to this object, cast to the (const) derived API type */
						            auto get() const
						            {
						                auto const* const derived = static_cast<element_type const*>(this);
						                return derived;
						            }

						            /** Compile time check only: evaluates the Api concept for this class. */
						            void _()
						            {
						                static_assert(concepts::Api<GenericSycl>);
						            }
						        };
						    } // namespace api
						} // namespace alpaka
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/Api.hpp ==
						// ============================================================================

					// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
					// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined
					// #include "alpaka/mem/trait.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/trait.hpp"    // amalgamate: file already inlined
					// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

					// #include <string>    // amalgamate: file already included

					namespace alpaka
					{
					    namespace api
					    {
					        struct OneApi : public GenericSycl<OneApi>
					        {
					            static std::string getName()
					            {
					                return "OneApi";
					            }
					        };

					        constexpr auto oneApi = OneApi{};
					    } // namespace api

					#if ALPAKA_LANG_ONEAPI

					    namespace onHost::trait
					    {
					        /** The oneAPI platform is only flagged as available if ALPAKA_LANG_ONEAPI is enabled. */
					        template<>
					        struct IsPlatformAvailable::Op<api::OneApi> : std::true_type
					        {
					        };

					        // Device kinds flagged as supported by oneAPI: Intel, NVIDIA and AMD GPUs as well as CPUs.

					        template<>
					        struct IsDeviceSupportedBy::Op<deviceKind::IntelGpu, api::OneApi> : std::true_type
					        {
					        };

					        template<>
					        struct IsDeviceSupportedBy::Op<deviceKind::NvidiaGpu, api::OneApi> : std::true_type
					        {
					        };

					        template<>
					        struct IsDeviceSupportedBy::Op<deviceKind::AmdGpu, api::OneApi> : std::true_type
					        {
					        };

					        template<>
					        struct IsDeviceSupportedBy::Op<deviceKind::Cpu, api::OneApi> : std::true_type
					        {
					        };

					        /** Compile time thread limit per block for the oneAPI executor, independent of the device kind.
					         *
					         * This limit is not exact, but for typical CPUs and GPUs from Intel, NVIDIA and AMD we can at least use
					         * 1024 threads per block.
					         *
					         * @todo Check if this produces issues on FPGAs, in this case the deviceKind should be used and the
					         * limit should be different for each deviceKind.
					         */
					        template<alpaka::concepts::DeviceKind T_DeviceKind>
					        struct GetMaxThreadsPerBlock::Op<api::OneApi, T_DeviceKind, exec::OneApi>
					        {
					            consteval uint32_t operator()(api::OneApi const, T_DeviceKind const, exec::OneApi const) const
					            {
					                constexpr uint32_t threadLimit = 1024u;
					                return threadLimit;
					            }
					        };
					    } // namespace onHost::trait

					#endif
					    namespace trait
					    {

					        // --- CPU devices: all values are forwarded to the CPU helpers in onHost::internal ---

					        template<typename T_Type>
					        struct GetArchSimdWidth::Op<T_Type, api::OneApi, deviceKind::Cpu>
					        {
					            constexpr uint32_t operator()(api::OneApi const, deviceKind::Cpu const) const
					            {
					                using onHost::internal::getCPUSimdWidth;
					                return getCPUSimdWidth<T_Type>();
					            }
					        };

					        template<>
					        struct GetNumPipelines::Op<api::OneApi, deviceKind::Cpu>
					        {
					            constexpr uint32_t operator()(api::OneApi const, deviceKind::Cpu const) const
					            {
					                using onHost::internal::getCPUNumPipelines;
					                return getCPUNumPipelines();
					            }
					        };

					        template<>
					        struct GetCachelineSize::Op<api::OneApi, deviceKind::Cpu>
					        {
					            constexpr uint32_t operator()(api::OneApi const, deviceKind::Cpu const) const
					            {
					                using onHost::internal::getCPUCachelineSize;
					                return getCPUCachelineSize();
					            }
					        };

					        // --- GPU devices: every device kind fulfilling concepts::GpuType ---

					        template<typename T_Type, concepts::GpuType T_DeviceKind>
					        struct GetArchSimdWidth::Op<T_Type, api::OneApi, T_DeviceKind>
					        {
					            constexpr uint32_t operator()(api::OneApi const, T_DeviceKind const) const
					            {
					                // vector load and store width in bytes
					                // copied from CUDA/HIP -> not verified if this is the optimal value
					                constexpr std::size_t vectorAccessBytes = 16u;
					                return alpaka::divExZero(vectorAccessBytes, sizeof(T_Type));
					            }
					        };

					        template<concepts::GpuType T_DeviceKind>
					        struct GetNumPipelines::Op<api::OneApi, T_DeviceKind>
					        {
					            constexpr uint32_t operator()(api::OneApi const, T_DeviceKind const) const
					            {
					                /* The SIMD units of AMD GPUs are interpreted as pipelines, CUDA GPUs are using 2 pipelines
					                 * (2 warp schedulers).
					                 * @TODO check INTEL GPUs
					                 */
					                return 4u;
					            }
					        };

					        template<concepts::GpuType T_DeviceKind>
					        struct GetCachelineSize::Op<api::OneApi, T_DeviceKind>
					        {
					            constexpr uint32_t operator()(api::OneApi const, T_DeviceKind const) const
					            {
					                // loading 16 bytes per thread will result in optimal memory bandwidth
					                // copied from CUDA/HIP -> not verified if this is the optimal value
					                constexpr uint32_t bytesPerThread = 16u;
					                return bytesPerThread;
					            }
					        };

					        /** Adjusts a requested alignment for Intel GPUs used through oneAPI.
					         *
					         * The requested alignment is halved as long as it is larger than 64 KiB; halving stops early if a
					         * further step would result in a value smaller than alignof(T_Type).
					         */
					        template<typename T_Type>
					        struct GetAdjustedAlignment::Op<T_Type, api::OneApi, deviceKind::IntelGpu>
					        {
					            consteval uint32_t operator()(api::OneApi const, deviceKind::IntelGpu const, uint32_t const alignmentBytes)
					                const
					            {
					                /* Level Zero imposes a 64 KiB alignment limit.
					                 @see https://www.intel.com/content/www/us/en/developer/articles/release-notes/oneapi-dpcpp/2024.html
					                 Quote: "Limit alignment of allocation requests at 64KB which is the only alignment supported by Level
					                 Zero."
					                */
					                constexpr uint32_t maxSupportedAlignment = 64u * 1024u;
					                uint32_t adjusted = alignmentBytes;
					                while(adjusted > maxSupportedAlignment && adjusted >= alignof(T_Type) * 2)
					                {
					                    adjusted /= 2;
					                }
					                return adjusted;
					            }
					        };
					    } // namespace trait

					    namespace onAcc::internal::trait
					    {
					        /** oneAPI on CPU devices: the contiguous index layout is selected. */
					        template<typename T_Acc>
					        struct AutoIndexMapping::Op<T_Acc, api::OneApi, deviceKind::Cpu>
					        {
					            constexpr auto operator()(T_Acc const&, api::OneApi, deviceKind::Cpu) const
					            {
					                using SelectedLayout = layout::Contiguous;
					                return SelectedLayout{};
					            }
					        };
					    } // namespace onAcc::internal::trait
					} // namespace alpaka
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/Api.hpp ==
					// ============================================================================

				// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
				// #include "alpaka/meta/filter.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/trait.hpp"    // amalgamate: file already inlined

				// #include <algorithm>    // amalgamate: file already included
				#include <type_traits>

				namespace alpaka
				{
				    /** Provides the API used during the execution of the current code path.
				     *
				     * The API is selected by the preprocessor: the SYCL, CUDA and HIP device compile passes are detected in
				     * this order, every other compile pass results in api::host.
				     *
				     * @attention If api::host is returned it can also mean that this method was called within the host
				     * controlling workflow and not within a kernel running on a CPU device.
				     *
				     * @return api::oneApi, api::cuda or api::hip within the corresponding device compile pass, else api::host
				     */
				    constexpr auto thisApi()
				    {
				#if ALPAKA_LANG_SYCL && ALPAKA_LANG_ONEAPI && defined(__SYCL_DEVICE_ONLY__)
				        return api::oneApi;
				// __CUDA_ARCH__ is only defined within device compile passes, therefore it is checked with defined() first to
				// avoid evaluating an undefined macro (-Wundef) within the host compile pass.
				#elif ALPAKA_LANG_CUDA && (ALPAKA_COMP_CLANG_CUDA || ALPAKA_COMP_NVCC) && defined(__CUDA_ARCH__) && __CUDA_ARCH__
				        return api::cuda;
				#elif ALPAKA_LANG_HIP && defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1
				        return api::hip;
				#else
				        return api::host;
				#endif
				    }

				    namespace onHost
				    {
				        /** Tuple holding one tag instance for each API listed here: host, cuda, hip and oneApi. */
				        constexpr auto apis = std::make_tuple(api::host, api::cuda, api::hip, api::oneApi);

				        /** Result of passing `apis` through meta::filter with isPlatformAvaiable() as the predicate.
				         *
				         * NOTE(review): presumably this keeps exactly the APIs for which the predicate is true; confirm against
				         * meta::filter.
				         */
				        constexpr auto enabledApis = meta::filter([](auto api) constexpr { return isPlatformAvaiable(api); }, apis);
				    } // namespace onHost

				    namespace api
				    {
				        /** Two API tags compare equal if, and only if, they have the same type.
				         *
				         * The values of the arguments are never inspected, only their types.
				         */
				        constexpr bool operator==(alpaka::concepts::Api auto lhs, alpaka::concepts::Api auto rhs)
				        {
				            using LhsApi = ALPAKA_TYPEOF(lhs);
				            using RhsApi = ALPAKA_TYPEOF(rhs);
				            return std::is_same_v<LhsApi, RhsApi>;
				        }

				        /** Negation of the equality comparison of two API tags. */
				        constexpr bool operator!=(alpaka::concepts::Api auto lhs, alpaka::concepts::Api auto rhs)
				        {
				            bool const isSameApi = (lhs == rhs);
				            return !isSameApi;
				        }
				    } // namespace api
				} // namespace alpaka
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/api.hpp ==
				// ============================================================================

			// #include "alpaka/mem/Alignment.hpp"    // amalgamate: file already inlined
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/StdSimd.hpp ==
				// ==
				/* Copyright 2026 René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				/** @file This file provides a basic implementation of a SIMD vector.
				 *
				 * The implementation is based on the class Vec:
				 *   - the storage policy should become the native SIMD implementation e.g. std::simd
				 *   - load/store and SIMD specifics should be implemented in the storage policy
				 *   - the name of storage policy should be changed
				 *
				 *   The current operator operations rely on the compiler's auto vectorization.
				 */

				// #pragma once
				// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
				// #include "alpaka/mem/Alignment.hpp"    // amalgamate: file already inlined
				// #include "alpaka/simd/concepts.hpp"    // amalgamate: file already inlined
				// #include "alpaka/simd/simdConfig.hpp"    // amalgamate: file already inlined
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/trait.hpp ==
					// ==
					/* Copyright 2025 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					/** @file This file provides a basic implementation of a SIMD vector.
					 *
					 * The implementation is based on the class Vec:
					 *   - the storage policy should become the native SIMD implementation e.g. std::simd
					 *   - load/store and SIMD specifics should be implemented in the storage policy
					 *   - the name of storage policy should be changed
					 *
					 *   The current operator operations rely on the compiler's auto vectorization.
					 */

					// #pragma once
					// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
					// #include "alpaka/api/concepts/api.hpp"    // amalgamate: file already inlined

					// #include <concepts>    // amalgamate: file already included
					#include <type_traits>

					namespace alpaka
					{
					    namespace trait
					    {
					        /** Get the storage type for a SIMD pack
					         *
					         * Only declared here; a specialization has to provide the storage type as nested alias `type`.
					         *
					         * @tparam T_Api API the storage type is selected for
					         * @tparam T_Type element type of the pack
					         * @tparam T_width number of elements in the pack
					         */
					        template<concepts::Api T_Api, typename T_Type, uint32_t T_width>
					        struct GetSimdStorageType;

					        /** Get the storage type for a SIMD mask pack
					         *
					         * Only declared here; a specialization has to provide the storage type as nested alias `type`.
					         *
					         * @tparam T_Api API the storage type is selected for
					         * @tparam T_Type element type of the pack the mask belongs to
					         * @tparam T_width number of elements in the mask
					         */
					        template<concepts::Api T_Api, typename T_Type, uint32_t T_width>
					        struct GetSimdMaskStorageType;
					    } // namespace trait
					} // namespace alpaka
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/trait.hpp ==
					// ============================================================================

				// #include "alpaka/vecConcepts.hpp"    // amalgamate: file already inlined

				#include <type_traits>

				#if ALPAKA_HAS_STD_SIMD

				namespace alpaka
				{
				    namespace internal
				    {
				        /** SIMD pack storage implemented on top of the native alpakaStdSimd fixed-size vector.
				         *
				         * The native type is a protected base class: only the members re-exported below are reachable from
				         * outside, everything else goes through asNativeType().
				         *
				         * @tparam T_Type element type of a lane
				         * @tparam T_width number of lanes
				         */
				        template<typename T_Type, uint32_t T_width>
				        struct StdSimd
				            : protected alpakaStdSimd::rebind_simd_t<T_Type, alpakaStdSimd::fixed_size_simd<T_Type, T_width>>
				        {
				            using BaseType = alpakaStdSimd::rebind_simd_t<T_Type, alpakaStdSimd::fixed_size_simd<T_Type, T_width>>;

				            using value_type = typename BaseType::value_type;
				            using reference = typename BaseType::reference;

				            using BaseType::operator[];

				            constexpr StdSimd() = default;
				            constexpr StdSimd(StdSimd const&) = default;
				            constexpr StdSimd(StdSimd&&) = default;
				            constexpr StdSimd& operator=(StdSimd&& rhs) = default;

				            constexpr StdSimd& operator=(StdSimd const& rhs) = default;

				            /** Assign a scalar value to the native pack. */
				            constexpr StdSimd& operator=(T_Type const value)
				            {
				                this->asNativeType() = value;
				                return *this;
				            }

				            /** Construct the pack from exactly T_width lane values.
				             *
				             * Every argument must be of type T_Type once references and cv-qualifiers are stripped. The
				             * arguments are deduced as forwarding references, therefore an lvalue deduces to `T_Type&`; the
				             * constraint has to look at the decayed type, otherwise only temporaries would be accepted.
				             *
				             * This constructor is required because exposing the array constructors does not work.
				             *
				             * @param args lane values, args[i] initializes lane i
				             */
				            template<typename... T_Args>
				            requires(sizeof...(T_Args) == T_width && (std::same_as<std::remove_cvref_t<T_Args>, T_Type> && ...))
				            ALPAKA_FN_HOST_ACC StdSimd(T_Args&&... args)
				                : BaseType([=](int i) constexpr { return std::array<T_Type, T_width>{args...}[i]; })
				            {
				            }

				            /** Wrap an already existing native pack. */
				            constexpr StdSimd(BaseType const& nativeSimd) : BaseType{nativeSimd}
				            {
				            }

				            /** static cast the instance to the parent std::simd class
				             *
				             * This method is mostly used to get access to native arithmetic and comparison operators.
				             * @{
				             */
				            constexpr auto& asNativeType()
				            {
				                return static_cast<BaseType&>(*this);
				            }

				            constexpr auto const& asNativeType() const
				            {
				                return static_cast<BaseType const&>(*this);
				            }

				            /** @} */

				            /** Forward to the native where() using the native representation of the mask and of this pack.
				             *
				             * @param mask SIMD mask, must provide asNativeType()
				             * @return whatever alpakaStdSimd::where() returns for the native mask and pack
				             * @{
				             */
				            constexpr decltype(auto) where(alpaka::concepts::SimdMask auto const& mask) const
				            {
				                return alpakaStdSimd::where(mask.asNativeType(), asNativeType());
				            }

				            constexpr decltype(auto) where(alpaka::concepts::SimdMask auto const& mask)
				            {
				                return alpakaStdSimd::where(mask.asNativeType(), asNativeType());
				            }

				            /** @} */

				            /** Create a pack from a native pack constructed from the single value. */
				            static constexpr auto fill(T_Type value)
				            {
				                return StdSimd{BaseType(value)};
				            }

				            /** Load the pack from memory.
				             *
				             * The vector aligned native load is used if the alignment reported for T_Type is a multiple of the
				             * native memory alignment, else the element aligned load is used.
				             *
				             * @param data pointer to the first element to load
				             * @param alignment alignment description, queried via `get<T_Type>()`
				             */
				            constexpr void copyFrom(T_Type const* data, alpaka::concepts::Alignment auto alignment)
				            {
				                if constexpr((alignment.template get<T_Type>() % alpakaStdSimd::memory_alignment_v<BaseType>) == 0u)
				                    this->asNativeType().copy_from(data, alpakaStdSimd::vector_aligned);
				                else
				                    this->asNativeType().copy_from(data, alpakaStdSimd::element_aligned);
				            }

				            /** Store the pack to memory.
				             *
				             * The alignment dispatch is the same as for copyFrom().
				             *
				             * @param data pointer to the first element to write
				             * @param alignment alignment description, queried via `get<T_Type>()`
				             */
				            constexpr void copyTo(auto* data, alpaka::concepts::Alignment auto alignment) const
				            {
				                if constexpr((alignment.template get<T_Type>() % alpakaStdSimd::memory_alignment_v<BaseType>) == 0u)
				                    this->asNativeType().copy_to(data, alpakaStdSimd::vector_aligned);
				                else
				                    this->asNativeType().copy_to(data, alpakaStdSimd::element_aligned);
				            }

				            /** assign operator
				             *
				             * For each operator two overloads are generated, one taking another pack and one taking a scalar.
				             * Both forward to the native compound assignment.
				             */
				#    define ALPAKA_VECTOR_ASSIGN_OP(op)                                                                               \
				        constexpr StdSimd& operator op(StdSimd const& rhs)                                                            \
				        {                                                                                                             \
				            this->asNativeType() op rhs.asNativeType();                                                               \
				            return *this;                                                                                             \
				        }                                                                                                             \
				        constexpr StdSimd& operator op(T_Type const value)                                                            \
				        {                                                                                                             \
				            this->asNativeType() op value;                                                                            \
				            return *this;                                                                                             \
				        }

				            ALPAKA_VECTOR_ASSIGN_OP(+=)
				            ALPAKA_VECTOR_ASSIGN_OP(-=)
				            ALPAKA_VECTOR_ASSIGN_OP(/=)
				            ALPAKA_VECTOR_ASSIGN_OP(*=)

				#    undef ALPAKA_VECTOR_ASSIGN_OP
				        };

				        /* Generates the three free overloads of a binary operator for StdSimd:
				         *   pack op pack, pack op scalar and scalar op pack.
				         * Each overload applies the operator to the native representation and wraps the result into a new
				         * StdSimd<T_Type, T_width>.
				         *
				         * typenameOrConcept: `typename` or a concept name used to constrain T_Type
				         * op: the operator token
				         */
				#    define ALPAKA_VECTOR_BINARY_OP(typenameOrConcept, op)                                                            \
				        template<typenameOrConcept T_Type, uint32_t T_width>                                                          \
				        constexpr auto operator op(const StdSimd<T_Type, T_width>& lhs, const StdSimd<T_Type, T_width>& rhs)          \
				        {                                                                                                             \
				            return StdSimd<T_Type, T_width>{lhs.asNativeType() op rhs.asNativeType()};                                \
				        }                                                                                                             \
				        template<typenameOrConcept T_Type, uint32_t T_width>                                                          \
				        constexpr auto operator op(const StdSimd<T_Type, T_width>& lhs, T_Type rhs)                                   \
				        {                                                                                                             \
				            return StdSimd<T_Type, T_width>{lhs.asNativeType() op rhs};                                               \
				        }                                                                                                             \
				        template<typenameOrConcept T_Type, uint32_t T_width>                                                          \
				        constexpr auto operator op(T_Type lhs, const StdSimd<T_Type, T_width>& rhs)                                   \
				        {                                                                                                             \
				            return StdSimd<T_Type, T_width>{lhs op rhs.asNativeType()};                                               \
				        }

				        // arithmetic operators are available for every element type
				        ALPAKA_VECTOR_BINARY_OP(typename, +)
				        ALPAKA_VECTOR_BINARY_OP(typename, -)
				        ALPAKA_VECTOR_BINARY_OP(typename, *)
				        ALPAKA_VECTOR_BINARY_OP(typename, /)
				        // shift and bitwise operators are restricted to integral element types
				        ALPAKA_VECTOR_BINARY_OP(std::integral, <<)
				        ALPAKA_VECTOR_BINARY_OP(std::integral, >>)
				        ALPAKA_VECTOR_BINARY_OP(std::integral, &)
				        ALPAKA_VECTOR_BINARY_OP(std::integral, |)
				        ALPAKA_VECTOR_BINARY_OP(std::integral, ^)

				        /** Workaround clang + glibc 12 issue with std::simd modulo operator
				         *
				         * /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/experimental/bits/simd_x86.h:1492:51: error:
				         * explicit qualification required to use member '_S_divides' from dependent base class 1492 |           return
				         * _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y)));
				         *
				         * This workaround executes the operation lane by lane, which can break SIMD usage if the auto vectorizer
				         * does not understand the code.
				         *
				         * The workaround is also selected if _GLIBCXX_RELEASE is not defined at all.
				         */
				#    if defined(__clang__) && defined(__GLIBCXX__) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE == 12)
				        // pack % pack
				        template<std::integral T_Type, uint32_t T_width>
				        constexpr auto operator%(const StdSimd<T_Type, T_width>& lhs, const StdSimd<T_Type, T_width>& rhs)
				        {
				            using BaseType = typename StdSimd<T_Type, T_width>::BaseType;
				            return StdSimd<T_Type, T_width>(
				                BaseType([&](int i) { return lhs.asNativeType()[i] % rhs.asNativeType()[i]; }));
				        }

				        // pack % scalar
				        template<std::integral T_Type, uint32_t T_width>
				        constexpr auto operator%(StdSimd<T_Type, T_width> const& lhs, T_Type rhs)
				        {
				            using BaseType = typename StdSimd<T_Type, T_width>::BaseType;
				            return StdSimd<T_Type, T_width>(BaseType([&](int i) { return lhs.asNativeType()[i] % rhs; }));
				        }

				        // scalar % pack
				        template<std::integral T_Type, uint32_t T_width>
				        constexpr auto operator%(T_Type lhs, StdSimd<T_Type, T_width> const& rhs)
				        {
				            using BaseType = typename StdSimd<T_Type, T_width>::BaseType;
				            return StdSimd<T_Type, T_width>(BaseType([&](int i) { return lhs % rhs.asNativeType()[i]; }));
				        }
				#    else
				        // no workaround required: use the native modulo operator
				        ALPAKA_VECTOR_BINARY_OP(std::integral, %)
				#    endif
				#    undef ALPAKA_VECTOR_BINARY_OP

				    } // namespace internal

				    namespace trait
				    {
				        /** Select internal::StdSimd as SIMD pack storage for the host API.
				         *
				         * The specialization is only enabled if T_width and sizeof(T_Type) are both a power of two and the
				         * native fixed size pack reports a size larger than zero.
				         */
				        template<typename T_Type, uint32_t T_width>
				        requires(
				            std::has_single_bit(T_width) && std::has_single_bit(sizeof(T_Type))
				            && alpakaStdSimd::fixed_size_simd<T_Type, T_width>::size() > 0)
				        struct GetSimdStorageType<alpaka::api::Host, T_Type, T_width>
				        {
				            using type = internal::StdSimd<T_Type, T_width>;
				        };

				    } // namespace trait
				} // namespace alpaka
				#endif
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/StdSimd.hpp ==
				// ============================================================================

			// #include "alpaka/simd/simdConfig.hpp"    // amalgamate: file already inlined
			// #include "alpaka/simd/trait.hpp"    // amalgamate: file already inlined

			#include <type_traits>

			#if ALPAKA_HAS_STD_SIMD

			namespace alpaka
			{
			    namespace internal
			    {
			        /** SIMD mask storage implemented on top of the native alpakaStdSimd fixed-size mask.
			         *
			         * The native mask is a protected base class: only the members re-exported below are reachable from
			         * outside, everything else goes through asNativeType().
			         *
			         * @tparam T_Type element type of the SIMD pack the mask belongs to
			         * @tparam T_width number of lanes
			         */
			        template<typename T_Type, uint32_t T_width>
			        struct StdSimdMask
			            : protected alpakaStdSimd::
			                  rebind_simd_t<T_Type, alpakaStdSimd::simd_mask<T_Type, alpakaStdSimd::simd_abi::fixed_size<T_width>>>
			        {
			            using BaseType = alpakaStdSimd::
			                rebind_simd_t<T_Type, alpakaStdSimd::simd_mask<T_Type, alpakaStdSimd::simd_abi::fixed_size<T_width>>>;

			            using value_type = typename BaseType::value_type;
			            using reference = typename BaseType::reference;

			            using BaseType::operator[];

			            constexpr StdSimdMask() = default;
			            constexpr StdSimdMask(StdSimdMask const&) = default;
			            constexpr StdSimdMask(StdSimdMask&&) = default;
			            constexpr StdSimdMask& operator=(StdSimdMask&& rhs) = default;

			            constexpr StdSimdMask& operator=(StdSimdMask const& rhs) = default;

			            /** Assign a scalar value to the native mask. */
			            constexpr StdSimdMask& operator=(T_Type const value)
			            {
			                this->asNativeType() = value;
			                return *this;
			            }

			            /** Construct the mask from exactly T_width values of type T_Type.
			             *
			             * Each value is converted to bool and written to the lane with the same index.
			             *
			             * This constructor is required because exposing the array constructors does not work.
			             */
			            template<typename... T_Args>
			            requires(sizeof...(T_Args) == T_width && (std::same_as<T_Args, T_Type> && ...))
			            constexpr StdSimdMask(T_Args const&... args) : BaseType{}
			            {
			                std::array<T_Type, T_width> const initArgs{ALPAKA_FORWARD(args)...};
			                for(uint32_t i = 0u; i < T_width; ++i)
			                    this->asNativeType()[i] = static_cast<bool>(initArgs[i]);
			            }

			            /** Construct the mask from exactly T_width bool values, args[i] initializes lane i. */
			            template<typename... T_Args>
			            requires(sizeof...(T_Args) == T_width && (std::same_as<T_Args, bool> && ...))
			            constexpr StdSimdMask(T_Args... args) : BaseType{}
			            {
			                std::array<bool, T_width> const initArgs{args...};
			                for(uint32_t i = 0u; i < T_width; ++i)
			                    this->asNativeType()[i] = initArgs[i];
			            }

			            /** Wrap an already existing native mask. */
			            constexpr StdSimdMask(BaseType const& nativeSimd) : BaseType{nativeSimd}
			            {
			            }

			            /** static cast the instance to the parent std::simd_mask class
			             *
			             * This method is mostly used to get access to native comparison operators.
			             *
			             * @{
			             */
			            constexpr auto& asNativeType()
			            {
			                return static_cast<BaseType&>(*this);
			            }

			            constexpr auto const& asNativeType() const
			            {
			                return static_cast<BaseType const&>(*this);
			            }

			            /** @} */

			            /** Create a mask from a native mask constructed from the single value. */
			            static constexpr auto fill(bool value)
			            {
			                return StdSimdMask{BaseType(value)};
			            }

			            /** Load the mask from memory.
			             *
			             * The vector aligned native load is used if the alignment reported for T_Type is a multiple of the
			             * native memory alignment, else the element aligned load is used.
			             *
			             * @param data pointer to the first element to load
			             * @param alignment alignment description, queried via `get<T_Type>()`
			             */
			            constexpr void copyFrom(T_Type const* data, alpaka::concepts::Alignment auto alignment)
			            {
			                if constexpr((alignment.template get<T_Type>() % alpakaStdSimd::memory_alignment_v<BaseType>) == 0u)
			                    this->asNativeType().copy_from(data, alpakaStdSimd::vector_aligned);
			                else
			                    this->asNativeType().copy_from(data, alpakaStdSimd::element_aligned);
			            }

			            /** Store the mask to memory.
			             *
			             * The alignment dispatch is the same as for copyFrom().
			             *
			             * @param data pointer to the first element to write
			             * @param alignment alignment description, queried via `get<T_Type>()`
			             */
			            constexpr void copyTo(auto* data, alpaka::concepts::Alignment auto alignment) const
			            {
			                if constexpr((alignment.template get<T_Type>() % alpakaStdSimd::memory_alignment_v<BaseType>) == 0u)
			                    this->asNativeType().copy_to(data, alpakaStdSimd::vector_aligned);
			                else
			                    this->asNativeType().copy_to(data, alpakaStdSimd::element_aligned);
			            }

			            /** assign operator
			             *
			             * For each operator two overloads are generated, one taking another mask and one taking a scalar.
			             * Both forward to the native compound assignment.
			             *
			             * The mask overload is intentionally NOT a template: a template parameter which is not part of the
			             * function parameters can not be deduced and would remove the overload from every `a op b` call.
			             */
			#    define ALPAKA_VECTOR_ASSIGN_OP(op)                                                                               \
			        constexpr StdSimdMask& operator op(StdSimdMask const& rhs)                                                    \
			        {                                                                                                             \
			            this->asNativeType() op rhs.asNativeType();                                                               \
			            return *this;                                                                                             \
			        }                                                                                                             \
			        constexpr StdSimdMask& operator op(T_Type const value)                                                        \
			        {                                                                                                             \
			            this->asNativeType() op value;                                                                            \
			            return *this;                                                                                             \
			        }

			            ALPAKA_VECTOR_ASSIGN_OP(&=)
			            ALPAKA_VECTOR_ASSIGN_OP(|=)
			            ALPAKA_VECTOR_ASSIGN_OP(^=)

			#    undef ALPAKA_VECTOR_ASSIGN_OP
			        };

			        /* Generates the three free overloads of a comparison/logical operator:
			         *   arg op arg, arg op scalar and scalar op arg.
			         * Each overload applies the operator to the native representation and wraps the result into
			         * returnSimdType<T_Type, T_width>.
			         *
			         * returnSimdType: class template of the result
			         * argSimdType: class template of the non scalar operands
			         * typenameOrConcept: `typename` or a concept name used to constrain T_Type
			         * op: the operator token
			         */
			#    define ALPAKA_VECTOR_BINARY_CMP_OP(returnSimdType, argSimdType, typenameOrConcept, op)                           \
			        template<typenameOrConcept T_Type, uint32_t T_width>                                                          \
			        constexpr auto operator op(const argSimdType<T_Type, T_width>& lhs, const argSimdType<T_Type, T_width>& rhs)  \
			        {                                                                                                             \
			            return returnSimdType<T_Type, T_width>{lhs.asNativeType() op rhs.asNativeType()};                         \
			        }                                                                                                             \
			        template<typenameOrConcept T_Type, uint32_t T_width>                                                          \
			        constexpr auto operator op(const argSimdType<T_Type, T_width>& lhs, T_Type rhs)                               \
			        {                                                                                                             \
			            return returnSimdType<T_Type, T_width>{lhs.asNativeType() op rhs};                                        \
			        }                                                                                                             \
			        template<typenameOrConcept T_Type, uint32_t T_width>                                                          \
			        constexpr auto operator op(T_Type lhs, const argSimdType<T_Type, T_width>& rhs)                               \
			        {                                                                                                             \
			            return returnSimdType<T_Type, T_width>{lhs op rhs.asNativeType()};                                        \
			        }

			        // comparing SIMD packs results in a mask
			        ALPAKA_VECTOR_BINARY_CMP_OP(StdSimdMask, StdSimd, typename, >=)
			        ALPAKA_VECTOR_BINARY_CMP_OP(StdSimdMask, StdSimd, typename, >)
			        ALPAKA_VECTOR_BINARY_CMP_OP(StdSimdMask, StdSimd, typename, <=)
			        ALPAKA_VECTOR_BINARY_CMP_OP(StdSimdMask, StdSimd, typename, <)
			        ALPAKA_VECTOR_BINARY_CMP_OP(StdSimdMask, StdSimd, typename, ==)
			        ALPAKA_VECTOR_BINARY_CMP_OP(StdSimdMask, StdSimd, typename, !=)

			        // comparing and logically combining masks results in a mask
			        ALPAKA_VECTOR_BINARY_CMP_OP(StdSimdMask, StdSimdMask, typename, ==)
			        ALPAKA_VECTOR_BINARY_CMP_OP(StdSimdMask, StdSimdMask, typename, !=)
			        ALPAKA_VECTOR_BINARY_CMP_OP(StdSimdMask, StdSimdMask, typename, &&)
			        ALPAKA_VECTOR_BINARY_CMP_OP(StdSimdMask, StdSimdMask, typename, ||)

			#    undef ALPAKA_VECTOR_BINARY_CMP_OP

			    } // namespace internal

			    namespace trait
			    {
			        /** Select internal::StdSimdMask as SIMD mask storage for the host API.
			         *
			         * The specialization is only enabled if T_width and sizeof(T_Type) are both a power of two and the
			         * native fixed size mask reports a size larger than zero.
			         */
			        template<typename T_Type, uint32_t T_width>
			        requires(
			            std::has_single_bit(T_width) && std::has_single_bit(sizeof(T_Type))
			            && alpakaStdSimd::fixed_size_simd_mask<T_Type, T_width>::size() > 0u)
			        struct GetSimdMaskStorageType<alpaka::api::Host, T_Type, T_width>
			        {
			            using type = internal::StdSimdMask<T_Type, T_width>;
			        };
			    } // namespace trait
			} // namespace alpaka
			#endif
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/StdSimdMask.hpp ==
			// ============================================================================

		// #include "alpaka/simd/trait.hpp"    // amalgamate: file already inlined
		// #include "alpaka/trait.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/EmuSimdMask.hpp ==
			// ==
			/* Copyright 2026 René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			/** @file This file provides a basic implementation of a SIMD vector.
			 *
			 * The implementation is based on the class Vec:
			 *   - the storage policy should become the native SIMD implementation e.g. std::simd
			 *   - load/store and SIMD specifics should be implemented in the storage policy
			 *   - the name of storage policy should be changed
			 *
			 *   The current operator operations rely on the compiler's auto vectorization.
			 */

			// #pragma once
			// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/EmuSimd.hpp ==
				// ==
				/* Copyright 2025 René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				/** @file This file provides a basic implementation of a SIMD vector.
				 *
				 * The implementation is based on the class Vec:
				 *   - the storage policy should become the native SIMD implementation e.g. std::simd
				 *   - load/store and SIMD specifics should be implemented in the storage policy
				 *   - the name of storage policy should be changed
				 *
				 *   The current operator operations rely on the compiler's auto vectorization.
				 */

				// #pragma once
				// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
				// #include "alpaka/simd/concepts.hpp"    // amalgamate: file already inlined
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/SmartMaskValueRef.hpp ==
					// ==
					/* Copyright 2026 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/utility.hpp ==
						// ==
						/* Copyright 2025 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include <cstdint>    // amalgamate: file already included
						#include <type_traits>

						namespace alpaka::internal
						{

						    /** Convert a bool value into a mask value for SIMD
						     *
						     * Generic fallback, used for every T which is not 4 or 8 bytes wide.
						     *
						     * @tparam T type the mask value is requested for
						     * @param condition state to convert
						     * @return the condition itself, as bool
						     */
						    template<typename T>
						    constexpr auto valueMaskCast(bool condition)
						    {
						        return condition;
						    }

						    /** Overload for 4 and 8 byte types
						     *
						     * @tparam T type the mask value is requested for, selects the width of the result
						     * @param condition state to convert
						     * @return unsigned integer with the same size as T where all bits are 1 if condition is true, else
						     *         all bits are zero
						     */
						    template<typename T>
						    requires(sizeof(T) == 4u || sizeof(T) == 8u)
						    constexpr auto valueMaskCast(bool condition)
						    {
						        using ValueMaskType = std::conditional_t<sizeof(T) == 4u, uint32_t, uint64_t>;
						        constexpr ValueMaskType noBitSet = ValueMaskType{0};
						        // inverting zero sets every bit of the mask value
						        constexpr ValueMaskType allBitsSet = ~noBitSet;
						        return condition ? allBitsSet : noBitSet;
						    }
						} // namespace alpaka::internal
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/utility.hpp ==
						// ============================================================================


					#include <type_traits>

					namespace alpaka::internal
					{

					    /** Simd mask reference
					     *
					     * A SIMD mask is not required to store its values as bool, it can store the values as a representable value type
					     * where all bits are 1 for true and zero for false. To be able to assign values to a SIMD mask we can not return a
					     * reference to the stored value because we need to cast the value during the write. For the read we need to cast
					     * the value to bool.
					     */
					    template<typename T, typename T_ValueMask>
					    struct SmartMaskValueRef
					    {
					        using value_type = T;
					        using ValueMaskType = T_ValueMask;

					        constexpr SmartMaskValueRef(ValueMaskType& ref) noexcept : valueRef(ref)
					        {
					        }

					        // Convert to bool
					        constexpr operator bool() const noexcept
					        {
					            if constexpr(std::is_same_v<ValueMaskType, bool>)
					            {
					                return valueRef;
					            }
					            else
					            {
					                return valueRef != ValueMaskType{0};
					            }
					        }

					        // Optional: convert to raw storage type
					        constexpr ValueMaskType value() const noexcept
					        {
					            return valueRef;
					        }

					        // Unary operators
					        constexpr bool operator!() const noexcept
					        {
					            return !static_cast<bool>(*this);
					        }

					        constexpr ValueMaskType operator~() const noexcept
					        {
					            if constexpr(std::is_same_v<ValueMaskType, bool>)
					                return !valueRef;
					            else
					                return ~valueRef;
					        }

					        // Binary operators returning values (not references)
					        constexpr ValueMaskType operator|(SmartMaskValueRef const& rhs) const noexcept
					        {
					            return valueRef | rhs.valueRef;
					        }

					        constexpr ValueMaskType operator&(SmartMaskValueRef const& rhs) const noexcept
					        {
					            return valueRef & rhs.valueRef;
					        }

					        constexpr ValueMaskType operator^(SmartMaskValueRef const& rhs) const noexcept
					        {
					            if constexpr(std::is_same_v<ValueMaskType, bool>)
					                return static_cast<bool>(*this) != static_cast<bool>(rhs);
					            else
					                return valueRef ^ rhs.valueRef;
					        }

					        // Comparison operators
					        constexpr bool operator==(SmartMaskValueRef const& rhs) const noexcept
					        {
					            return static_cast<bool>(*this) == static_cast<bool>(rhs);
					        }

					        constexpr bool operator!=(SmartMaskValueRef const& rhs) const noexcept
					        {
					            return !(*this == rhs);
					        }

					/** Generates the assignment operator `OP` of SmartMaskValueRef for both supported right-hand side types.
					 *
					 * - `operator OP(bool)` converts the argument with internal::valueMaskCast<ValueMaskType>() and forwards
					 *   to the ValueMaskType overload.
					 * - `operator OP(ValueMaskType)` updates the referenced value: if ValueMaskType is bool the result of
					 *   `valueRef BOOL_FALLBACK v` is stored, otherwise `valueRef OP v` is applied directly.
					 *
					 * The trailing `static_assert(true)` consumes the semicolon written after each macro invocation.
					 *
					 * @param OP assignment operator token to define
					 * @param BOOL_FALLBACK binary operator token used to compute the new value if ValueMaskType is bool
					 */
					#define SIMD_MASK_REF_ASSIGN_OP(OP, BOOL_FALLBACK)                                                                    \
					    constexpr SmartMaskValueRef& operator OP(bool b) noexcept                                                         \
					    {                                                                                                                 \
					        return (*this OP internal::valueMaskCast<ValueMaskType>(b));                                                  \
					    }                                                                                                                 \
					                                                                                                                      \
					    constexpr SmartMaskValueRef& operator OP(ValueMaskType v) noexcept                                                \
					    {                                                                                                                 \
					        if constexpr(std::is_same_v<ValueMaskType, bool>)                                                             \
					        {                                                                                                             \
					            valueRef = valueRef BOOL_FALLBACK static_cast<bool>(v);                                                   \
					        }                                                                                                             \
					        else                                                                                                          \
					        {                                                                                                             \
					            valueRef OP v;                                                                                            \
					        }                                                                                                             \
					        return *this;                                                                                                 \
					    }                                                                                                                 \
					    static_assert(true)

					        // logical fallbacks for bool storage: or -> ||, and -> &&, xor -> != ; plain assignment uses `=`
					        SIMD_MASK_REF_ASSIGN_OP(|=, ||);
					        SIMD_MASK_REF_ASSIGN_OP(&=, &&);
					        SIMD_MASK_REF_ASSIGN_OP(^=, !=);
					        SIMD_MASK_REF_ASSIGN_OP(=, =);

					#undef SIMD_MASK_REF_ASSIGN_OP

					    private:
					        ValueMaskType& valueRef;
					    };

					} // namespace alpaka::internal
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/SmartMaskValueRef.hpp ==
					// ============================================================================

					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/alignment.hpp ==
					// ==
					/* Copyright 2025 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					/** @file This file provides a basic implementation of a SIMD vector.
					 *
					 * The implementation is based on the class Vec:
					 *   - the storage policy should become the native SIMD implementation e.g. std::simd
					 *   - load/store and SIMD specifics should be implemented in the storage policy
					 *   - the name of the storage policy should be changed
					 *
					 *   The current operator implementations rely on the compiler's auto-vectorization.
					 */

					// #pragma once
					// #include "alpaka/mem/Alignment.hpp"    // amalgamate: file already inlined

					// #include <bit>    // amalgamate: file already included
					// #include <cstdint>    // amalgamate: file already included
					// #include <functional>    // amalgamate: file already included

					namespace alpaka::internal
					{
					    /** Computes the alignment in bytes for a storage of T_numElements values of type T_ValueType.
					     *
					     * The candidate is the smaller of the alignment provided by T_Alignment and the total data size in
					     * bytes, therefore the result never exceeds T_Alignment.
					     * If the number of elements is odd, or the candidate is not a power of two, the result falls back to
					     * the alignment of the component type.
					     *
					     * @tparam T_ValueType component type
					     * @tparam T_numElements number of components
					     * @tparam T_Alignment alignment policy, queried via `T_Alignment::template get<T_ValueType>()`
					     * @return alignment in bytes
					     */
					    template<typename T_ValueType, uint32_t T_numElements, alpaka::concepts::Alignment T_Alignment>
					    consteval uint32_t optimalAlignment()
					    {
					        constexpr auto naturalAlignment = static_cast<uint32_t>(alignof(T_ValueType));
					        if constexpr((T_numElements % 2) != 0u)
					            return naturalAlignment;

					        constexpr auto totalBytes = static_cast<uint32_t>(sizeof(T_ValueType) * T_numElements);
					        constexpr uint32_t candidate = std::min(T_Alignment::template get<T_ValueType>(), totalBytes);

					        // only a power of two is accepted, otherwise use the alignment of the component type
					        return std::has_single_bit(candidate) ? candidate : naturalAlignment;
					    }
					} // namespace alpaka::internal
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/alignment.hpp ==
					// ============================================================================

				// #include "alpaka/simd/internal/utility.hpp"    // amalgamate: file already inlined
				// #include "alpaka/simd/trait.hpp"    // amalgamate: file already inlined

				// #include <concepts>    // amalgamate: file already included
				#include <type_traits>

				namespace alpaka
				{
				    namespace internal
				    {
				        /** Emulated SIMD storage for vector data
				         *
				         * The lanes are held in a std::array whose alignment is computed by optimalAlignment(), so that
				         * the storage is aligned for native SIMD usage. All lane-wise operations are written as plain
				         * loops over the T_width elements.
				         *
				         * @tparam T_Type value type of a single lane
				         * @tparam T_width number of lanes
				         */
				        template<typename T_Type, uint32_t T_width>
				        struct alignas(alpaka::internal::optimalAlignment<T_Type, T_width, Alignment<sizeof(T_Type) * T_width>>())
				            EmuSimd : protected std::array<T_Type, T_width>
				        {
				            using BaseType = std::array<T_Type, T_width>;

				            using value_type = typename BaseType::value_type;
				            using reference = typename BaseType::reference;

				            // expose the element access of the array base class
				            using BaseType::operator[];

				            constexpr EmuSimd() = default;

				            /** Copy constructor, copies lane by lane. */
				            constexpr EmuSimd(EmuSimd const& other)
				            {
				                // attention: using the defaulted copy constructor results in bad performance
				                for(uint32_t i = 0u; i < T_width; ++i)
				                    BaseType::operator[](i) = other[i];
				            }

				            constexpr EmuSimd(EmuSimd&&) = default;
				            constexpr EmuSimd& operator=(EmuSimd&& rhs) = default;

				            constexpr EmuSimd& operator=(EmuSimd const& rhs) = default;

				            /** Broadcast assignment: every lane is set to value. */
				            constexpr EmuSimd& operator=(T_Type const value)
				            {
				                for(uint32_t i = 0u; i < T_width; i++)
				                {
				                    asNativeType()[i] = value;
				                }
				                return *this;
				            }

				            /** Construct from exactly T_width lane values.
				             *
				             * The constructor is required because exposing the array constructors does not work.
				             *
				             * The arguments are taken as forwarding references, therefore the type check must ignore
				             * references and cv-qualifiers. Otherwise lvalue arguments (deduced as `T_Type&`) would be
				             * rejected although they are valid lane values.
				             */
				            template<typename... T_Args>
				            requires(sizeof...(T_Args) == T_width && (std::same_as<std::remove_cvref_t<T_Args>, T_Type> && ...))
				            constexpr EmuSimd(T_Args&&... args) : BaseType{std::forward<T_Args>(args)...}
				            {
				            }

				            /** Construct from an array holding the lane values. */
				            constexpr EmuSimd(BaseType const& base) : BaseType{base}
				            {
				            }

				            /** Access to the native representation of the storage.
				             *
				             * For the emulated storage this returns a reference to the instance itself.
				             * The method is mostly used to get access to element-wise read and write operations.
				             * @{
				             */
				            constexpr auto& asNativeType()
				            {
				                return static_cast<EmuSimd&>(*this);
				            }

				            constexpr auto const& asNativeType() const
				            {
				                return static_cast<EmuSimd const&>(*this);
				            }

				            /** @} */

				            /** Create a storage where all lanes hold value. */
				            static constexpr auto fill(T_Type value)
				            {
				                return EmuSimd([&value](uint32_t const) { return value; });
				            }

				            /** Initialize via a generator expression.
				             *
				             * The generator is called once per lane with `std::integral_constant<uint32_t, laneIdx>` and must
				             * return the value of that lane.
				             */
				            template<typename F>
				            requires(std::is_invocable_v<F, std::integral_constant<uint32_t, 0u>>)
				            constexpr explicit EmuSimd(F&& generator)
				                : EmuSimd(std::forward<F>(generator), std::make_integer_sequence<uint32_t, T_width>{})
				            {
				            }

				            /** Load T_width values from data.
				             *
				             * If the alignment provided for T_Type is a multiple of the alignment of this type, the memory is
				             * reinterpreted as EmuSimd and copied at once, otherwise the values are copied lane by lane.
				             *
				             * @param data pointer to at least T_width readable values
				             * @param alignment alignment description of data
				             */
				            constexpr void copyFrom(T_Type const* data, alpaka::concepts::Alignment auto alignment)
				            {
				                if constexpr((alignment.template get<T_Type>() % alignof(ALPAKA_TYPEOF(*this))) == 0u)
				                    *(this) = *reinterpret_cast<ALPAKA_TYPEOF(*this) const*>(data);
				                else
				                {
				                    for(uint32_t i = 0u; i < T_width; ++i)
				                        asNativeType()[i] = data[i];
				                }
				            }

				            /** Store all lanes to data.
				             *
				             * If the alignment provided for T_Type is a multiple of the alignment of this type, the memory is
				             * reinterpreted as EmuSimd and written at once, otherwise the values are written lane by lane.
				             *
				             * @param data pointer to at least T_width writable values
				             * @param alignment alignment description of data
				             */
				            constexpr void copyTo(auto* data, alpaka::concepts::Alignment auto alignment) const
				            {
				                if constexpr((alignment.template get<T_Type>() % alignof(ALPAKA_TYPEOF(*this))) == 0u)
				                    *reinterpret_cast<std::remove_const_t<ALPAKA_TYPEOF(*this)>*>(data) = (*this);
				                else
				                {
				                    for(uint32_t i = 0u; i < T_width; ++i)
				                        data[i] = asNativeType()[i];
				                }
				            }

				            // NOTE(review): this unqualified friend declaration is resolved within the enclosing namespace
				            // (alpaka::internal) - confirm that it refers to the intended SimdWhereExpr.
				            template<alpaka::concepts::SimdMask Mask, alpaka::concepts::Simd T_Simd>
				            friend struct SimdWhereExpr;

				            /** element wise conditional value update where t is a SIMD vector
				             *
				             * Lane i takes the value t[i] if the mask selects the lane, otherwise it keeps its value.
				             * If the mask value type is not bool, the selection is performed with bit operations on the
				             * mask representation.
				             * NOTE(review): the bitwise branch presumably expects every mask lane to be all-ones or
				             * all-zeros - confirm against valueMaskCast.
				             */
				            constexpr void update(alpaka::concepts::SimdMask auto const& mask, alpaka::concepts::Simd auto const& t)
				            {
				                using MaskType = ALPAKA_TYPEOF(valueMaskCast<T_Type>(t[0]));
				                if constexpr(std::same_as<MaskType, bool>)
				                {
				                    for(uint32_t i = 0u; i < T_width; ++i)
				                        asNativeType()[i] = (mask[i] ? t[i] : asNativeType()[i]);
				                }
				                else
				                {
				                    for(uint32_t i = 0u; i < T_width; ++i)
				                        asNativeType()[i] = std::bit_cast<T_Type>(
				                            (mask.asNativeType()[i] & std::bit_cast<MaskType>(t[i]))
				                            | (~mask.asNativeType()[i] & std::bit_cast<MaskType>(asNativeType()[i])));
				                }
				            }

				            /** element wise conditional value update where t is a scalar
				             *
				             * Lane i takes the value t if the mask selects the lane, otherwise it keeps its value.
				             * If the mask value type is not bool, the selection is performed with bit operations on the
				             * mask representation.
				             */
				            constexpr void update(alpaka::concepts::SimdMask auto const& mask, T_Type const& t)
				            {
				                using MaskType = ALPAKA_TYPEOF(valueMaskCast<T_Type>(t));
				                if constexpr(std::same_as<MaskType, bool>)
				                {
				                    for(uint32_t i = 0u; i < T_width; ++i)
				                        asNativeType()[i] = (mask[i] ? t : asNativeType()[i]);
				                }
				                else
				                {
				                    for(uint32_t i = 0u; i < T_width; ++i)
				                        asNativeType()[i] = std::bit_cast<T_Type>(
				                            (mask.asNativeType()[i] & std::bit_cast<MaskType>(t))
				                            | (~mask.asNativeType()[i] & std::bit_cast<MaskType>(asNativeType()[i])));
				                }
				            }

				            /** compound assignment operators
				             *
				             * For each operator two overloads are generated: lane-wise with another EmuSimd and with a
				             * scalar applied to every lane.
				             */
				#define ALPAKA_VECTOR_ASSIGN_OP(op)                                                                                   \
				    constexpr EmuSimd& operator op(EmuSimd const& rhs)                                                                \
				    {                                                                                                                 \
				        for(uint32_t i = 0u; i < T_width; i++)                                                                        \
				        {                                                                                                             \
				            asNativeType()[i] op rhs[i];                                                                              \
				        }                                                                                                             \
				        return *this;                                                                                                 \
				    }                                                                                                                 \
				    constexpr EmuSimd& operator op(T_Type const value)                                                                \
				    {                                                                                                                 \
				        for(uint32_t i = 0u; i < T_width; i++)                                                                        \
				        {                                                                                                             \
				            asNativeType()[i] op value;                                                                               \
				        }                                                                                                             \
				        return *this;                                                                                                 \
				    }

				            ALPAKA_VECTOR_ASSIGN_OP(+=)
				            ALPAKA_VECTOR_ASSIGN_OP(-=)
				            ALPAKA_VECTOR_ASSIGN_OP(/=)
				            ALPAKA_VECTOR_ASSIGN_OP(*=)

				#undef ALPAKA_VECTOR_ASSIGN_OP

				        private:
				            /** Implementation of the generator constructor: calls the generator once for each lane index. */
				            template<typename F, uint32_t... Is>
				            constexpr explicit EmuSimd(F&& generator, std::integer_sequence<uint32_t, Is...>)
				                : BaseType{generator(std::integral_constant<uint32_t, Is>{})...}
				            {
				            }
				        };

				#define ALPAKA_VECTOR_BINARY_OP(typenameOrConcept, op)                                                                \
				    template<typenameOrConcept T_Type, uint32_t T_width>                                                              \
				    constexpr auto operator op(const EmuSimd<T_Type, T_width>& lhs, const EmuSimd<T_Type, T_width>& rhs)              \
				    {                                                                                                                 \
				        EmuSimd<T_Type, T_width> ret{};                                                                               \
				        for(uint32_t i = 0u; i < T_width; i++)                                                                        \
				            ret[i] = lhs[i] op rhs[i];                                                                                \
				        return ret;                                                                                                   \
				    }                                                                                                                 \
				    template<typenameOrConcept T_Type, uint32_t T_width>                                                              \
				    constexpr auto operator op(const EmuSimd<T_Type, T_width>& lhs, T_Type rhs)                                       \
				    {                                                                                                                 \
				        EmuSimd<T_Type, T_width> ret{};                                                                               \
				        for(uint32_t i = 0u; i < T_width; i++)                                                                        \
				            ret[i] = lhs[i] op rhs;                                                                                   \
				        return ret;                                                                                                   \
				    }                                                                                                                 \
				    template<typenameOrConcept T_Type, uint32_t T_width>                                                              \
				    constexpr auto operator op(T_Type lhs, const EmuSimd<T_Type, T_width>& rhs)                                       \
				    {                                                                                                                 \
				        EmuSimd<T_Type, T_width> ret{};                                                                               \
				        for(uint32_t i = 0u; i < T_width; i++)                                                                        \
				            ret[i] = lhs op rhs[i];                                                                                   \
				        return ret;                                                                                                   \
				    }

				        ALPAKA_VECTOR_BINARY_OP(typename, +)
				        ALPAKA_VECTOR_BINARY_OP(typename, -)
				        ALPAKA_VECTOR_BINARY_OP(typename, *)
				        ALPAKA_VECTOR_BINARY_OP(typename, /)
				        ALPAKA_VECTOR_BINARY_OP(std::integral, %)
				        ALPAKA_VECTOR_BINARY_OP(std::integral, <<)
				        ALPAKA_VECTOR_BINARY_OP(std::integral, >>)
				        ALPAKA_VECTOR_BINARY_OP(std::integral, &)
				        ALPAKA_VECTOR_BINARY_OP(std::integral, |)
				        ALPAKA_VECTOR_BINARY_OP(std::integral, ^)

				#undef ALPAKA_VECTOR_BINARY_OP

				    } // namespace internal

				    namespace trait
				    {
				        /** Provides the storage type used for SIMD vector data.
				         *
				         * This primary template selects the emulated storage internal::EmuSimd for every API.
				         *
				         * @tparam T_Api API the storage type is requested for
				         * @tparam T_Type value type of a single lane
				         * @tparam T_width number of lanes
				         */
				        template<concepts::Api T_Api, typename T_Type, uint32_t T_width>
				        struct GetSimdStorageType
				        {
				            using type = internal::EmuSimd<T_Type, T_width>;
				        };
				    } // namespace trait
				} // namespace alpaka
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/EmuSimd.hpp ==
				// ============================================================================

			// #include "alpaka/simd/internal/SmartMaskValueRef.hpp"    // amalgamate: file already inlined
			// #include "alpaka/simd/internal/alignment.hpp"    // amalgamate: file already inlined
			// #include "alpaka/simd/internal/utility.hpp"    // amalgamate: file already inlined
			// #include "alpaka/simd/trait.hpp"    // amalgamate: file already inlined

			// #include <concepts>    // amalgamate: file already included
			#include <type_traits>

			namespace alpaka
			{
			    namespace internal
			    {
			        /** Emulated SIMD mask storage
			         *
			         * Every lane is stored as the type returned by `internal::valueMaskCast<T_Type>(bool)`
			         * (ValueMaskType). Mutable lane access returns the proxy SmartMaskValueRef, read-only lane access
			         * returns the stored mask value. The alignment of the storage is computed by optimalAlignment().
			         *
			         * @tparam T_Type data value type the mask belongs to
			         * @tparam T_width number of lanes
			         */
			        template<typename T_Type, uint32_t T_width>
			        struct alignas(alpaka::internal::optimalAlignment<
			                       ALPAKA_TYPEOF(internal::valueMaskCast<T_Type>(true)),
			                       T_width,
			                       Alignment<sizeof(ALPAKA_TYPEOF(internal::valueMaskCast<T_Type>(true))) * T_width>>())
			            EmuSimdMask : protected std::array<ALPAKA_TYPEOF(internal::valueMaskCast<T_Type>(true)), T_width>
			        {
			            using ValueMaskType = ALPAKA_TYPEOF(internal::valueMaskCast<T_Type>(true));

			            using BaseType = std::array<ValueMaskType, T_width>;

			            using value_type = bool;
			            using reference = SmartMaskValueRef<bool, ValueMaskType>;

			            // expose the element access of the array base class (used for read-only access)
			            using BaseType::operator[];

			            /** Mutable lane access, returns a proxy which wraps the stored mask value. */
			            constexpr reference operator[](std::integral auto const idx)
			            {
			                return reference(BaseType::operator[](idx));
			            }

			            constexpr EmuSimdMask() = default;

			            /** Copy constructor, copies lane by lane. */
			            constexpr EmuSimdMask(EmuSimdMask const& other)
			            {
			                // attention: using the defaulted copy constructor results in bad performance
			                for(uint32_t i = 0u; i < T_width; ++i)
			                    BaseType::operator[](i) = other[i];
			            }

			            constexpr EmuSimdMask(EmuSimdMask&&) = default;

			            constexpr EmuSimdMask& operator=(EmuSimdMask&& rhs) = default;

			            constexpr EmuSimdMask& operator=(EmuSimdMask const& rhs) = default;

			            /** Broadcast assignment: value is assigned to every lane. */
			            constexpr EmuSimdMask& operator=(T_Type const value)
			            {
			                for(uint32_t i = 0u; i < T_width; i++)
			                {
			                    asNativeType()[i] = value;
			                }
			                return *this;
			            }

			            /** Construct from exactly T_width values of type T_Type.
			             *
			             * The constructor is required because exposing the array constructors does not work.
			             */
			            template<typename... T_Args>
			            requires(sizeof...(T_Args) == T_width && (std::same_as<T_Args, T_Type> && ...))
			            constexpr EmuSimdMask(T_Args const&... args) : BaseType{args...}
			            {
			            }

			            /** Construct from exactly T_width bool values, each is converted with valueMaskCast<T_Type>(). */
			            template<typename... T_Args>
			            requires(sizeof...(T_Args) == T_width && (std::same_as<T_Args, bool> && ...))
			            constexpr EmuSimdMask(T_Args... args) : BaseType{valueMaskCast<T_Type>(args)...}
			            {
			            }

			            /** Construct from an array holding the stored mask values. */
			            constexpr EmuSimdMask(BaseType const& base) : BaseType{base}
			            {
			            }

			            /** Access to the native representation of the mask.
			             *
			             * For the emulated storage this returns a reference to the instance itself.
			             * The method is mostly used to get access to element-wise read and write operations.
			             * @{
			             */
			            constexpr auto& asNativeType()
			            {
			                return static_cast<EmuSimdMask&>(*this);
			            }

			            constexpr auto const& asNativeType() const
			            {
			                return static_cast<EmuSimdMask const&>(*this);
			            }

			            /** @} */

			            /** Create a mask where all lanes hold the mask representation of value. */
			            static constexpr auto fill(bool value)
			            {
			                auto maskValue = valueMaskCast<T_Type>(value);
			                BaseType ret{};
			                for(uint32_t i = 0u; i < T_width; ++i)
			                    ret[i] = maskValue;

			                return EmuSimdMask(ret);
			            }

			            /** Load T_width values from data.
			             *
			             * If the alignment provided for T_Type is a multiple of the alignment of this type, the memory is
			             * reinterpreted as EmuSimdMask and copied at once, otherwise the values are assigned lane by lane.
			             * NOTE(review): data is typed as T_Type while the lanes are stored as ValueMaskType - confirm the
			             * intended memory layout.
			             *
			             * @param data pointer to at least T_width readable values
			             * @param alignment alignment description of data
			             */
			            constexpr void copyFrom(T_Type const* data, alpaka::concepts::Alignment auto alignment)
			            {
			                if constexpr((alignment.template get<T_Type>() % alignof(ALPAKA_TYPEOF(*this))) == 0u)
			                    *(this) = *reinterpret_cast<ALPAKA_TYPEOF(*this) const*>(data);
			                else
			                {
			                    for(uint32_t i = 0u; i < T_width; ++i)
			                        asNativeType()[i] = data[i];
			                }
			            }

			            /** Store all lanes to data.
			             *
			             * If the alignment provided for T_Type is a multiple of the alignment of this type, the memory is
			             * reinterpreted as EmuSimdMask and written at once, otherwise the values are written lane by lane.
			             *
			             * @param data pointer to at least T_width writable values
			             * @param alignment alignment description of data
			             */
			            constexpr void copyTo(auto* data, alpaka::concepts::Alignment auto alignment) const
			            {
			                // the destination must be addressed through a pointer to non-const, assigning through a
			                // pointer to const is ill-formed
			                if constexpr((alignment.template get<T_Type>() % alignof(ALPAKA_TYPEOF(*this))) == 0u)
			                    *reinterpret_cast<std::remove_const_t<ALPAKA_TYPEOF(*this)>*>(data) = (*this);
			                else
			                {
			                    for(uint32_t i = 0u; i < T_width; ++i)
			                        data[i] = asNativeType()[i];
			                }
			            }

			            /** compound assignment operators
			             *
			             * For each operator two overloads are generated: lane-wise with another EmuSimdMask and with a
			             * single value applied to every lane.
			             * The mask overload is intentionally not a template: a template parameter which does not appear in
			             * the signature can not be deduced and would make `a op b` unusable.
			             */
			#define ALPAKA_VECTOR_ASSIGN_OP(op)                                                                                   \
			    constexpr EmuSimdMask& operator op(EmuSimdMask const& rhs)                                                        \
			    {                                                                                                                 \
			        for(uint32_t i = 0u; i < T_width; i++)                                                                        \
			        {                                                                                                             \
			            asNativeType()[i] op rhs[i];                                                                              \
			        }                                                                                                             \
			        return *this;                                                                                                 \
			    }                                                                                                                 \
			    constexpr EmuSimdMask& operator op(T_Type const value)                                                            \
			    {                                                                                                                 \
			        for(uint32_t i = 0u; i < T_width; i++)                                                                        \
			        {                                                                                                             \
			            asNativeType()[i] op value;                                                                               \
			        }                                                                                                             \
			        return *this;                                                                                                 \
			    }
			            ALPAKA_VECTOR_ASSIGN_OP(&=)
			            ALPAKA_VECTOR_ASSIGN_OP(|=)
			            ALPAKA_VECTOR_ASSIGN_OP(^=)

			#undef ALPAKA_VECTOR_ASSIGN_OP
			        };

			#define ALPAKA_VECTOR_BINARY_CMP_OP(returnSimdType, argSimdType, typenameOrConcept, op)                               \
			    template<typenameOrConcept T_Type, uint32_t T_width>                                                              \
			    constexpr auto operator op(const argSimdType<T_Type, T_width>& lhs, const argSimdType<T_Type, T_width>& rhs)      \
			    {                                                                                                                 \
			        returnSimdType<T_Type, T_width> ret{};                                                                        \
			        for(uint32_t i = 0u; i < T_width; i++)                                                                        \
			            ret[i] = valueMaskCast<T_Type>(lhs[i] op rhs[i]);                                                         \
			        return ret;                                                                                                   \
			    }                                                                                                                 \
			    template<typenameOrConcept T_Type, uint32_t T_width>                                                              \
			    constexpr auto operator op(const argSimdType<T_Type, T_width>& lhs, T_Type rhs)                                   \
			    {                                                                                                                 \
			        returnSimdType<T_Type, T_width> ret{};                                                                        \
			        for(uint32_t i = 0u; i < T_width; i++)                                                                        \
			            ret[i] = valueMaskCast<T_Type>(lhs[i] op rhs);                                                            \
			        return ret;                                                                                                   \
			    }                                                                                                                 \
			    template<typenameOrConcept T_Type, uint32_t T_width>                                                              \
			    constexpr auto operator op(T_Type lhs, const argSimdType<T_Type, T_width>& rhs)                                   \
			    {                                                                                                                 \
			        returnSimdType<T_Type, T_width> ret{};                                                                        \
			        for(uint32_t i = 0u; i < T_width; i++)                                                                        \
			            ret[i] = valueMaskCast<T_Type>(lhs op rhs[i]);                                                            \
			        return ret;                                                                                                   \
			    }

			        ALPAKA_VECTOR_BINARY_CMP_OP(EmuSimdMask, EmuSimd, typename, >=)
			        ALPAKA_VECTOR_BINARY_CMP_OP(EmuSimdMask, EmuSimd, typename, >)
			        ALPAKA_VECTOR_BINARY_CMP_OP(EmuSimdMask, EmuSimd, typename, <=)
			        ALPAKA_VECTOR_BINARY_CMP_OP(EmuSimdMask, EmuSimd, typename, <)
			        ALPAKA_VECTOR_BINARY_CMP_OP(EmuSimdMask, EmuSimd, typename, ==)
			        ALPAKA_VECTOR_BINARY_CMP_OP(EmuSimdMask, EmuSimd, typename, !=)

			        ALPAKA_VECTOR_BINARY_CMP_OP(EmuSimdMask, EmuSimdMask, typename, ==)
			        ALPAKA_VECTOR_BINARY_CMP_OP(EmuSimdMask, EmuSimdMask, typename, !=)
			        ALPAKA_VECTOR_BINARY_CMP_OP(EmuSimdMask, EmuSimdMask, typename, &&)
			        ALPAKA_VECTOR_BINARY_CMP_OP(EmuSimdMask, EmuSimdMask, typename, ||)

			#undef ALPAKA_VECTOR_BINARY_CMP_OP
			    } // namespace internal

			    namespace trait
			    {
			        /** Provides the storage type used for SIMD masks.
			         *
			         * This primary template selects the emulated mask storage internal::EmuSimdMask for every API.
			         *
			         * @tparam T_Api API the storage type is requested for
			         * @tparam T_Type data value type the mask belongs to
			         * @tparam T_width number of lanes
			         */
			        template<concepts::Api T_Api, typename T_Type, uint32_t T_width>
			        struct GetSimdMaskStorageType
			        {
			            using type = internal::EmuSimdMask<T_Type, T_width>;
			        };
			    } // namespace trait
			} // namespace alpaka
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/internal/EmuSimdMask.hpp ==
			// ============================================================================

		// #include "simd/internal/utility.hpp"    // amalgamate: file already inlined

		// #include <array>    // amalgamate: file already included
		// #include <bit>    // amalgamate: file already included
		// #include <concepts>    // amalgamate: file already included
		#include <cstddef>
		// #include <cstdint>    // amalgamate: file already included
		// #include <functional>    // amalgamate: file already included
		// #include <iosfwd>    // amalgamate: file already included
		// #include <ranges>    // amalgamate: file already included
		// #include <sstream>    // amalgamate: file already included
		// #include <string>    // amalgamate: file already included
		#include <type_traits>

		namespace alpaka
		{
		    /** SIMD mask vector
		     *
		     * @attention You should not use this type to create a buffer of SIMD masks.
		     * The implementation is not ABI compatible between different APIs.
		     * Using SIMD masks created on the host within a compute kernel is undefined behaviour.
		     *
		     * This class is designed to be used within a kernel together with the `where()` operation.
		     *
		     * @tparam T_Type data value type the mask should be applied to
		     * @tparam T_width number of lanes in the SIMD mask vector
		     * @tparam T_Storage wrapped native representation of the SIMD mask, defaults to the type provided by
		     *         trait::GetSimdMaskStorageType for the API returned by thisApi()
		     */
		    template<
		        typename T_Type,
		        uint32_t T_width,
		        /** Do not use ALPAKA_TYPEOF(thisApi()) here, otherwise nvcc + gcc can trigger the compile error
		         * error: use of built-in trait '__decay(alpaka::api::Host)' in function signature;
		         */
		        typename T_Storage = typename trait::GetSimdMaskStorageType<decltype(thisApi()), T_Type, T_width>::type>
		    struct SimdMask;

		    namespace trait
		    {
		        /** Marks every instantiation of SimdMask as a SIMD mask type (derives from std::true_type). */
		        template<typename T_Type, uint32_t T_width, typename T_Storage>
		        struct IsSimdMask<SimdMask<T_Type, T_width, T_Storage>> : std::true_type
		        {
		        };
		    } // namespace trait

		    // Forward declaration of SimdWhereExpr, so that it can be named (e.g. in a friend declaration) before
		    // its definition.
		    template<concepts::SimdMask T_Mask, concepts::Simd T_Simd>
		    struct SimdWhereExpr;

		    template<typename T_Type, uint32_t T_width, typename T_Storage>
		    struct SimdMask : private T_Storage
		    {
		        using Storage = T_Storage;
		        using type = bool;
		        /** type is an implementation detail, can be a proxy type. */
		        using reference = typename Storage::reference;

		        using index_type = uint32_t;
		        using size_type = uint32_t;
		        using rank_type = uint32_t;

		        // universal vec used as fallback if T_Storage is holding the state in the template signature
		        using UniSimdMask = SimdMask<T_Type, T_width>;

		        /*Simds without elements are not allowed*/
		        static_assert(T_width > 0u);

		        SimdMask() = default;

		        /** Initialize via a generator expression
		         *
		         * The generator is called once per lane and must return the value for the lane index it is given. The
		         * results are handed, in lane order, to the storage constructor.
		         *
		         * @note The lane index is passed as `std::integral_constant<uint32_t, idx>`. A callable with the function
		         * interface `bool generator(uint32_t id)` is accepted as well, because the integral constant converts
		         * implicitly to its value.
		         *
		         * @param generator callable, must be invocable with `std::integral_constant<uint32_t, 0u>`
		         */
		        template<typename F>
		        requires(std::is_invocable_v<F, std::integral_constant<uint32_t, 0u>>)
		        ALPAKA_FN_HOST_ACC explicit SimdMask(F&& generator)
		            : SimdMask(std::forward<F>(generator), std::make_integer_sequence<uint32_t, T_width>{})
		        {
		        }

		        /** Constructor for a SIMD mask from one value per lane
		         *
		         * @attention This constructor allows implicit casts: each argument is converted to T_Type with a
		         * static_cast before it is passed to the storage constructor.
		         *
		         * The constructor is only selected if the number of arguments equals T_width, each argument is
		         * convertible to T_Type and no argument is of type bool.
		         *
		         * @param args value of each lane index, x,y,z,...
		         *
		         * A constexpr vector should be initialized with {} instead of () because at least
		         * CUDA 11.6 has problems in cases where a compile time evaluation is required.
		         * NOTE(review): the example below uses Simd and not SimdMask, it looks like it was taken over from the
		         * Simd documentation -- confirm that it applies to masks in the same way.
		         * @code{.cpp}
		         *   constexpr auto vec1 = Simd{ 1 };
		         *   constexpr auto vec2 = Simd{ 1, 2 };
		         *   //or explicit
		         *   constexpr auto vec3 = Simd<int, 3u>{ 1, 2, 3 };
		         *   constexpr auto vec4 = Simd<int, 3u>{ {1, 2, 3} };
		         * @endcode
		         */
		        template<typename... T_Args>
		        requires(
		            ((std::is_convertible_v<T_Args, T_Type> && !std::same_as<bool, T_Args>) && ...)
		            && (sizeof...(T_Args) == T_width))
		        ALPAKA_FN_HOST_ACC SimdMask(T_Args const&... args) : Storage(static_cast<T_Type>(args)...)
		        {
		        }

		        /** Constructor taking one bool per lane
		         *
		         * Only selected if all arguments are of type bool and their number equals T_width.
		         *
		         * @param args state of each lane, passed unchanged to the storage constructor
		         */
		        template<typename... T_Args>
		        requires((std::same_as<T_Args, bool> && ...) && (sizeof...(T_Args) == T_width))
		        ALPAKA_FN_HOST_ACC SimdMask(T_Args const&... args) : Storage(args...)
		        {
		        }

		        /** Copy constructor, compiler generated. */
		        SimdMask(SimdMask const& other) = default;

		        /** Create a mask from an instance of its storage type; the storage is copied. */
		        ALPAKA_FN_HOST_ACC SimdMask(T_Storage const& other) : T_Storage{other}
		        {
		        }

		        /** Create a mask from the type the storage names `BaseType`.
		         *
		         * NOTE(review): BaseType is defined by the storage policy, presumably it is the native representation
		         * wrapped by the storage -- confirm against the storage implementations.
		         */
		        ALPAKA_FN_HOST_ACC SimdMask(typename T_Storage::BaseType const& base) : T_Storage{base}
		        {
		        }

		        /** constructor allows changing the storage policy
		         *
		         * The own storage is constructed from the storage object of the other mask, therefore T_Storage must be
		         * constructible from T_OtherStorage.
		         */
		        template<typename T_OtherStorage>
		        ALPAKA_FN_HOST_ACC SimdMask(SimdMask<T_Type, T_width, T_OtherStorage> const& other)
		            : T_Storage(other.asStorage())
		        {
		        }

		        /** Allow static_cast / explicit cast to member type for 1D vector */
		        constexpr explicit operator bool() requires(T_width == 1u)
		        {
		            return static_cast<bool>(Storage::operator[](0));
		        }

		        /** Number of components/lanes in the SIMD mask, evaluated at compile time. */
		        static consteval uint32_t width()
		        {
		            return T_width;
		        }

		        /** Forwards to Storage::copyFrom with the same arguments.
		         *
		         * NOTE(review): presumably loads the lane states from `data` -- the exact semantics are defined by the
		         * storage policy.
		         *
		         * @param data pointer to values of type T_Type
		         * @param alignment alignment description of `data`
		         */
		        constexpr void copyFrom(T_Type const* data, concepts::Alignment auto alignment)
		        {
		            Storage::copyFrom(data, alignment);
		        }

		        /** Forwards to Storage::copyTo with the same arguments.
		         *
		         * NOTE(review): presumably stores the lane states to `data` -- the exact semantics are defined by the
		         * storage policy.
		         *
		         * @param data destination pointer
		         * @param alignment alignment description of `data`
		         */
		        constexpr void copyTo(auto* data, concepts::Alignment auto alignment) const
		        {
		            Storage::copyTo(data, alignment);
		        }

		        /**
		         * Creates a SimdMask where all lanes are set to the same value
		         *
		         * @param value Value which is set for all lanes
		         * @return new SimdMask, built from the result of Storage::fill(value)
		         */
		        static constexpr auto fill(bool value)
		        {
		            return SimdMask{Storage::fill(value)};
		        }

		        /** Returns a copy of this mask.
		         *
		         * NOTE(review): the name presumably stands for "to runtime" to match other alpaka vector types; for
		         * SimdMask no conversion takes place -- confirm the intended contract.
		         */
		        constexpr SimdMask toRT() const
		        {
		            return *this;
		        }

		        // copy and move assignment are compiler generated
		        constexpr SimdMask& operator=(SimdMask const&) = default;
		        constexpr SimdMask& operator=(SimdMask&&) = default;

		        /** Unary minus, applied lane by lane
		         *
		         * Each lane is negated and the result is converted back to the element type of the mask (bool). The
		         * explicit conversion avoids a narrowing conversion when the values are passed to the storage.
		         *
		         * @return new SimdMask, created via the generator constructor
		         */
		        constexpr SimdMask operator-() const
		        {
		            // construct a SimdMask: Simd is a different class template which is not declared at this point
		            return SimdMask(
		                [this](uint32_t const i) constexpr { return static_cast<type>(-Storage::operator[](i)); });
		        }

		        using Storage::asNativeType;

		        /** static cast the instance to the storage type
		         *
		         * @attention: Do not use this method in user code, it is an implementation detail and can cause undefined
		         * behaviour if used wrong.
		         *
		         * @{
		         */
		        constexpr auto& asStorage()
		        {
		            return static_cast<Storage&>(*this);
		        }

		        constexpr auto const& asStorage() const
		        {
		            return static_cast<Storage const&>(*this);
		        }

		        /** @} */

		        /** assign operator
		         *
		         * ALPAKA_VECTOR_ASSIGN_OP(op) generates two overloads of `operator op`:
		         *   - for a SimdMask with the same value type and width but any storage policy, the operator is applied
		         *     to the two storage objects
		         *   - for a scalar which fulfills concepts::LosslesslyConvertible<T_Type>, the scalar is cast to T_Type
		         *     and the operator is applied to the own storage object
		         * Both overloads return a reference to this mask. The macro is undefined after its last use.
		         *
		         * @{
		         */
		#define ALPAKA_VECTOR_ASSIGN_OP(op)                                                                                   \
		    template<typename T_OtherStorage>                                                                                 \
		    constexpr SimdMask& operator op(SimdMask<T_Type, T_width, T_OtherStorage> const& rhs)                             \
		    {                                                                                                                 \
		        this->asStorage() op rhs.asStorage();                                                                         \
		        return *this;                                                                                                 \
		    }                                                                                                                 \
		    constexpr SimdMask& operator op(concepts::LosslesslyConvertible<T_Type> auto const value)                         \
		    {                                                                                                                 \
		        this->asStorage() op static_cast<T_Type>(value);                                                              \
		        return *this;                                                                                                 \
		    }

		        ALPAKA_VECTOR_ASSIGN_OP(&=)
		        ALPAKA_VECTOR_ASSIGN_OP(|=)
		        ALPAKA_VECTOR_ASSIGN_OP(^=)
		        ALPAKA_VECTOR_ASSIGN_OP(=)

		#undef ALPAKA_VECTOR_ASSIGN_OP

		        /** @} */

		        /** access a lane by index
		         *
		         * @return The returned type is implementation specific, it can be a proxy reference.
		         */
		        constexpr reference operator[](std::integral auto const idx)
		        {
		            return asStorage()[idx];
		        }

		        /** access a lane by index
		         *
		         * @return The state of the lane as bool, by copy.
		         */
		        constexpr type operator[](std::integral auto const idx) const
		        {
		            return static_cast<type>(asStorage()[idx]);
		        }

		        /** @brief named lane access
		         *
		         * @attention The mapping from names x,y,z,w to memory indices differ from the mapping of an alpaka vector @c
		         * Vec. The availability of the naming methods depends on the SIMD width.
		         *
		         * You can have access to the same lane index via different nonspecific naming.
		         *
		         * @code
		         * lane index   :  0,  1,  2,  3, ...,  9, 10, ... , 15
		         * hexadecimal  : s0, s1, s2, s3, ..., s9, sA, ... , sF
		         * coordinate   :  x,  y,  z,  w
		         * color channel:  r,  g,  b,  a
		         * @endcode
		         *
		         * ALPAKA_NAMED_ARRAY_ACCESS(functionName, laneIdx) generates a mutable and a const accessor with the given
		         * name. Both forward to operator[] with laneIdx and are only available if T_width >= laneIdx + 1.
		         *
		         * @{
		         */
		#define ALPAKA_NAMED_ARRAY_ACCESS(functionName, laneIdx)                                                              \
		    constexpr reference functionName() requires(T_width >= laneIdx + 1)                                               \
		    {                                                                                                                 \
		        return (*this)[laneIdx];                                                                                      \
		    }                                                                                                                 \
		    constexpr type functionName() const requires(T_width >= laneIdx + 1)                                              \
		    {                                                                                                                 \
		        return (*this)[laneIdx];                                                                                      \
		    }

		        ALPAKA_NAMED_ARRAY_ACCESS(x, 0u)
		        ALPAKA_NAMED_ARRAY_ACCESS(y, 1u)
		        ALPAKA_NAMED_ARRAY_ACCESS(z, 2u)
		        ALPAKA_NAMED_ARRAY_ACCESS(w, 3u)

		        ALPAKA_NAMED_ARRAY_ACCESS(r, 0u)
		        ALPAKA_NAMED_ARRAY_ACCESS(g, 1u)
		        ALPAKA_NAMED_ARRAY_ACCESS(b, 2u)
		        ALPAKA_NAMED_ARRAY_ACCESS(a, 3u)

		        ALPAKA_NAMED_ARRAY_ACCESS(s0, 0u)
		        ALPAKA_NAMED_ARRAY_ACCESS(s1, 1u)
		        ALPAKA_NAMED_ARRAY_ACCESS(s2, 2u)
		        ALPAKA_NAMED_ARRAY_ACCESS(s3, 3u)
		        ALPAKA_NAMED_ARRAY_ACCESS(s4, 4u)
		        ALPAKA_NAMED_ARRAY_ACCESS(s5, 5u)
		        ALPAKA_NAMED_ARRAY_ACCESS(s6, 6u)
		        ALPAKA_NAMED_ARRAY_ACCESS(s7, 7u)
		        ALPAKA_NAMED_ARRAY_ACCESS(s8, 8u)
		        ALPAKA_NAMED_ARRAY_ACCESS(s9, 9u)
		        ALPAKA_NAMED_ARRAY_ACCESS(sA, 10u)
		        ALPAKA_NAMED_ARRAY_ACCESS(sB, 11u)
		        ALPAKA_NAMED_ARRAY_ACCESS(sC, 12u)
		        ALPAKA_NAMED_ARRAY_ACCESS(sD, 13u)
		        ALPAKA_NAMED_ARRAY_ACCESS(sE, 14u)
		        ALPAKA_NAMED_ARRAY_ACCESS(sF, 15u)

		#undef ALPAKA_NAMED_ARRAY_ACCESS

		        /** @} */

		        /** reduce all lanes to a single bool
		         *
		         * The work is delegated to reduce_range over all lanes of the mask.
		         *
		         * @param reduceFunc binary functor executed to reduce the lanes
		         *                   The binary operation must be associative.
		         * @return result of the reduction, converted to bool
		         */
		        [[nodiscard]] constexpr type reduce(auto&& reduceFunc) const
		        {
		            return reduce_range(ALPAKA_FORWARD(reduceFunc));
		        }

		        /** create string out of the SIMD mask
		         *
		         * @param separator string placed between two neighbouring lanes
		         * @param enclosings characters used to enclose the lanes
		         *                   size == 0 ? nothing is put around the lanes
		         *                   size == 1 ? the character is used as begin and as end symbol
		         *                   size >= 2 ? enclosings[0] = begin symbol
		         *                               enclosings[1] = end symbol
		         *
		         * example:
		         * .toString(";","|")     -> |x;...;z|
		         * .toString(",","[]")    -> [x,...,z]
		         */
		        std::string toString(std::string const separator = ",", std::string const enclosings = "{}") const
		        {
		            std::string opening;
		            std::string closing;

		            if(auto const numSymbols = enclosings.size(); numSymbols != 0u)
		            {
		                // with a single symbol the end index falls back to 0, this avoids an out of bounds access
		                opening = enclosings[0u];
		                closing = enclosings[numSymbols > 1u ? 1u : 0u];
		            }

		            std::stringstream out;
		            out << opening;
		            // the mask has at least one lane, the first one is written without a leading separator
		            out << Storage::operator[](0);
		            for(uint32_t lane = 1u; lane < T_width; ++lane)
		            {
		                out << separator;
		                out << Storage::operator[](lane);
		            }
		            out << closing;
		            return out.str();
		        }

		    private:
		        /** Implementation of the generator constructor
		         *
		         * Calls the generator once for each index in Is, the index is passed as std::integral_constant. The
		         * storage is initialized with the returned values.
		         *
		         * @param generator callable providing the value of a lane; it is called several times and therefore not
		         *                  forwarded
		         */
		        template<typename F, uint32_t... Is>
		        ALPAKA_FN_HOST_ACC explicit SimdMask(F&& generator, std::integer_sequence<uint32_t, Is...>)
		            : Storage{generator(std::integral_constant<uint32_t, Is>{})...}
		        {
		        }

		        /** reduce over a range of lanes
		         *
		         * A range with a single lane returns this lane. Larger ranges are split at the midpoint, both halves are
		         * reduced recursively and the two results are combined with reduceFunc. If ALPAKA_LANG_SYCL is set the
		         * lanes are combined in a loop from the lowest to the highest index instead.
		         *
		         * Everything except the single lane case is located in the else branch of the `if constexpr`. This way
		         * the code is discarded for a range with one lane and the reduction of an empty range, which would call
		         * itself without end, is never instantiated.
		         *
		         * @tparam T_start start index
		         * @tparam T_end end index (excluded)
		         * @param reduceFunc binary functor executed to reduce the range, the operation must be associative
		         * @return result of the reduction, converted to bool
		         */
		        template<uint32_t T_start = 0u, uint32_t T_end = width()>
		        [[nodiscard]] constexpr type reduce_range(auto&& reduceFunc) const
		        {
		            static_assert(T_start < T_end, "reduce_range requires a range with at least one lane");
		            // elements in the range
		            constexpr uint32_t size = T_end - T_start;
		            // single element termination
		            if constexpr(size == 1u)
		            {
		                return (*this)[T_start];
		            }
		            else
		            {
		#if ALPAKA_LANG_SYCL
		                // SYCL can not call recursive functions
		                auto result = (*this)[T_start];
		                for(uint32_t i = T_start + 1u; i < T_end; ++i)
		                {
		                    result = reduceFunc(result, (*this)[i]);
		                }
		                return result;
		#else
		                // split range at midpoint
		                constexpr uint32_t mid = T_start + size / 2u;

		                // recursively reduce both halves and combine
		                return reduceFunc(
		                    reduce_range<T_start, mid>(ALPAKA_FORWARD(reduceFunc)),
		                    reduce_range<mid, T_end>(ALPAKA_FORWARD(reduceFunc)));
		#endif
		            }
		        }

		        template<concepts::SimdMask Mask, concepts::Simd T_Simd>
		        friend struct SimdWhereExpr;
		    };

		    template<std::size_t I, typename T_Type, uint32_t T_width, typename T_Storage>
		    constexpr auto get(SimdMask<T_Type, T_width, T_Storage> const& v)
		    {
		        return v[I];
		    }

		    template<std::size_t I, typename T_Type, uint32_t T_width, typename T_Storage>
		    constexpr auto& get(SimdMask<T_Type, T_width, T_Storage>& v)
		    {
		        return v[I];
		    }

		    /** Write the default string representation of a SIMD mask (see SimdMask::toString) into a stream.
		     *
		     * @param s output stream
		     * @param vec mask to print
		     * @return the stream passed in as `s`
		     */
		    template<typename Type, uint32_t T_width, typename T_Storage>
		    std::ostream& operator<<(std::ostream& s, SimdMask<Type, T_width, T_Storage> const& vec)
		    {
		        s << vec.toString();
		        return s;
		    }

		    // type deduction guide: the type of the first argument becomes the value type,
		    // the number of arguments becomes the width; the storage is the default one
		    template<typename T_1, typename... T_Args>
		    ALPAKA_FN_HOST_ACC SimdMask(T_1, T_Args...) -> SimdMask<T_1, uint32_t(sizeof...(T_Args) + 1u)>;

		    /** Creates a mask for the given value type
		     *
		     * The width of the mask is the number of arguments. The storage type is looked up via
		     * trait::GetSimdMaskStorageType for the API returned by thisApi().
		     *
		     * @tparam T value type of SIMD object which should be masked
		     * @tparam T_Args types of the lane states, each must be bool
		     * @param args state of each lane, forwarded to the constructor of the mask storage
		     * @return mask with one lane per argument
		     */
		    template<typename T, typename... T_Args>
		    requires((std::same_as<std::remove_cvref_t<T_Args>, bool>) && ...)
		    constexpr auto makeSimdMask(T_Args... args)
		    {
		        constexpr auto numLanes = uint32_t(sizeof...(T_Args));
		        using MaskStorage = typename trait::GetSimdMaskStorageType<ALPAKA_TYPEOF(thisApi()), T, numLanes>::type;
		        using Mask = SimdMask<T, numLanes, MaskStorage>;
		        return Mask(MaskStorage(ALPAKA_FORWARD(args)...));
		    }

		    /** Binary operators for SIMD masks
		     *
		     * ALPAKA_VECTOR_BINARY_OP(typenameOrConcept, op) generates three overloads of `operator op`:
		     * mask op mask, mask op scalar and scalar op mask. `typenameOrConcept` is used to declare the template
		     * parameter T_Type and can therefore constrain the value type. Scalars must fulfill
		     * concepts::LosslesslyConvertible<T_Type> and are cast to T_Type. The operation itself is executed by the
		     * storage objects; the result is a SimdMask which uses the type produced by the operation as storage.
		     * The macro is undefined after its last use.
		     *
		     * @{
		     */
		#define ALPAKA_VECTOR_BINARY_OP(typenameOrConcept, op)                                                                \
		    template<typenameOrConcept T_Type, uint32_t T_width, typename T_Storage, typename T_OtherStorage>                 \
		    constexpr auto operator op(                                                                                       \
		        const SimdMask<T_Type, T_width, T_Storage>& lhs,                                                              \
		        const SimdMask<T_Type, T_width, T_OtherStorage>& rhs)                                                         \
		    {                                                                                                                 \
		        using StoreageType = ALPAKA_TYPEOF(lhs.asStorage() op rhs.asStorage());                                       \
		        return SimdMask<T_Type, T_width, StoreageType>(lhs.asStorage() op rhs.asStorage());                           \
		    }                                                                                                                 \
		    template<                                                                                                         \
		        typenameOrConcept T_Type,                                                                                     \
		        concepts::LosslesslyConvertible<T_Type> T_ValueType,                                                          \
		        uint32_t T_width,                                                                                             \
		        typename T_Storage>                                                                                           \
		    constexpr auto operator op(const SimdMask<T_Type, T_width, T_Storage>& lhs, T_ValueType rhs)                      \
		    {                                                                                                                 \
		        using StoreageType = ALPAKA_TYPEOF(lhs.asStorage() op static_cast<T_Type>(rhs));                              \
		        return SimdMask<T_Type, T_width, StoreageType>(lhs.asStorage() op static_cast<T_Type>(rhs));                  \
		    }                                                                                                                 \
		    template<                                                                                                         \
		        typenameOrConcept T_Type,                                                                                     \
		        concepts::LosslesslyConvertible<T_Type> T_ValueType,                                                          \
		        uint32_t T_width,                                                                                             \
		        typename T_Storage>                                                                                           \
		    constexpr auto operator op(T_ValueType lhs, const SimdMask<T_Type, T_width, T_Storage>& rhs)                      \
		    {                                                                                                                 \
		        using StoreageType = ALPAKA_TYPEOF(static_cast<T_Type>(lhs) op rhs.asStorage());                              \
		        return SimdMask<T_Type, T_width, StoreageType>(static_cast<T_Type>(lhs) op rhs.asStorage());                  \
		    }

		    // logical and comparison operators accept every value type, bitwise operators integral value types only
		    ALPAKA_VECTOR_BINARY_OP(typename, &&)
		    ALPAKA_VECTOR_BINARY_OP(typename, ||)
		    ALPAKA_VECTOR_BINARY_OP(std::integral, &)
		    ALPAKA_VECTOR_BINARY_OP(std::integral, |)
		    ALPAKA_VECTOR_BINARY_OP(std::integral, ^)
		    ALPAKA_VECTOR_BINARY_OP(typename, ==)
		    ALPAKA_VECTOR_BINARY_OP(typename, !=)

		#undef ALPAKA_VECTOR_BINARY_OP

		    /** @} */


		    namespace trait
		    {
		        /** The dimension reported for a SIMD mask is its number of lanes. */
		        template<typename T_Type, uint32_t T_width, typename T_Storage>
		        struct GetDim<alpaka::SimdMask<T_Type, T_width, T_Storage>>
		        {
		            static constexpr uint32_t value = T_width;
		        };

		        /** The value type reported for a SIMD mask is T_Type, not the lane type bool. */
		        template<typename T_Type, uint32_t T_width, typename T_Storage>
		        struct GetValueType<alpaka::SimdMask<T_Type, T_width, T_Storage>>
		        {
		            using type = T_Type;
		        };
		    } // namespace trait

		    namespace internal
		    {
		        /** Cast of a SIMD mask with value type T_Type to the value type T_To
		         *
		         * NOTE(review): PCast is declared outside of this section, presumably this specialization is selected by
		         * a generic cast helper -- confirm against alpaka/cast.hpp.
		         */
		        template<typename T_To, typename T_Type, uint32_t T_width, typename T_Storage>
		        struct PCast::Op<T_To, alpaka::SimdMask<T_Type, T_width, T_Storage>>
		        {
		            /** Value types differ and T_Type is convertible to T_To.
		             *
		             * Creates a SimdMask<T_To, T_width> with the default storage via the generator constructor, each
		             * lane of the input is cast to T_To.
		             */
		            constexpr auto operator()(auto&& input) const
		                requires std::convertible_to<T_Type, T_To> && (!std::same_as<T_To, T_Type>)
		            {
		                return typename alpaka::SimdMask<T_To, T_width, T_Storage>::UniSimdMask(
		                    [&](uint32_t idx) constexpr { return static_cast<T_To>(input[idx]); });
		            }

		            /** Value types are equal: the input is forwarded unchanged, the value category is preserved. */
		            constexpr decltype(auto) operator()(auto&& input) const requires std::same_as<T_To, T_Type>
		            {
		                return std::forward<decltype(input)>(input);
		            }
		        };
		    } // namespace internal
		}; // namespace alpaka

		namespace std
		{
		    /** Number of lanes of a SIMD mask for the tuple protocol (e.g. structured bindings)
		     *
		     * Derives from std::integral_constant, this is the base characteristic the standard expects from a
		     * specialization of std::tuple_size. The member `value` is still available.
		     */
		    template<typename T_Type, uint32_t T_width, typename T_Storage>
		    struct tuple_size<alpaka::SimdMask<T_Type, T_width, T_Storage>>
		        : std::integral_constant<std::size_t, T_width>
		    {
		    };

		    /** Element type of a SIMD mask for the tuple protocol
		     *
		     * NOTE(review): the reported type is T_Type although alpaka::get<I> on a mask yields the lane state
		     * (bool / SimdMask::reference) -- confirm that this is intended.
		     */
		    template<std::size_t I, typename T_Type, uint32_t T_width, typename T_Storage>
		    struct tuple_element<I, alpaka::SimdMask<T_Type, T_width, T_Storage>>
		    {
		        using type = T_Type;
		    };
		} // namespace std
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/SimdMask.hpp ==
		// ============================================================================

	// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
	// #include "alpaka/cast.hpp"    // amalgamate: file already inlined
	// #include "alpaka/core/util.hpp"    // amalgamate: file already inlined
	// #include "alpaka/mem/Alignment.hpp"    // amalgamate: file already inlined
	// #include "alpaka/simd/concepts.hpp"    // amalgamate: file already inlined
	// #include "alpaka/simd/trait.hpp"    // amalgamate: file already inlined
	// #include "alpaka/trait.hpp"    // amalgamate: file already inlined
	// #include "simd/internal/EmuSimd.hpp"    // amalgamate: file already inlined
	// #include "simd/internal/StdSimd.hpp"    // amalgamate: file already inlined

	// #include <array>    // amalgamate: file already included
	// #include <bit>    // amalgamate: file already included
	// #include <concepts>    // amalgamate: file already included
	// #include <cstddef>    // amalgamate: file already included
	// #include <cstdint>    // amalgamate: file already included
	// #include <functional>    // amalgamate: file already included
	// #include <iosfwd>    // amalgamate: file already included
	// #include <ranges>    // amalgamate: file already included
	// #include <sstream>    // amalgamate: file already included
	// #include <string>    // amalgamate: file already included
	#include <type_traits>

	namespace alpaka
	{
	    /** Simd vector
	     *
	     * @attention You should not use this type to create a buffer of SIMD vectors.
	     * The implementation is not ABI compatible between different APIs.
	     * Using Simd data created on the host in a compute kernel is undefined behaviour.
	     *
	     * This class is designed to be used via SimdPtr via reinterpretation of contiguous scalar data.
	     *
	     * @tparam T_Type data value type
	     * @tparam T_width number of lanes in the SIMD vector
	     * @tparam T_Storage wrapped native representation of the SIMD vector
	     */
	    template<
	        typename T_Type,
	        uint32_t T_width,
	        typename T_Storage =
	            /** do not use ALPAKA_TYPEOF(thisApi()) here else nvcc + gcc can trigger a compile error
	             * error: use of built-in trait '__remove_cv(alpaka::api::Host)' in function signature;
	             */
	        typename trait::GetSimdStorageType<decltype(thisApi()), T_Type, T_width>::type>
	    struct Simd;

	    namespace trait
	    {
	        /** Marks every Simd instantiation as a SIMD pack: the specialization derives from std::true_type. */
	        template<typename T_Type, uint32_t T_width, typename T_Storage>
	        struct IsSimd<Simd<T_Type, T_width, T_Storage>> : std::true_type
	        {
	        };
	    } // namespace trait

	    // Forward declaration of the where-expression type, so that it can be named in a friend declaration.
	    template<concepts::SimdMask T_Mask, concepts::Simd T_Simd>
	    struct SimdWhereExpr;

	    template<typename T_Type, uint32_t T_width, typename T_Storage>
	    struct Simd : private T_Storage
	    {
	        using Storage = T_Storage;
	        using type = typename T_Storage::value_type;
	        /** type is an implementation detail, can be a proxy type. */
	        using reference = typename T_Storage::reference;

	        using index_type = uint32_t;
	        using size_type = uint32_t;
	        using rank_type = uint32_t;

	        // universal vec used as fallback if T_Storage is holding the state in the template signature
	        using UniSimd = Simd<T_Type, T_width>;

	        /*Simds without elements are not allowed*/
	        static_assert(T_width > 0u);

	        constexpr Simd() = default;

	        using Storage::asNativeType;

	        /** static cast the instance to the storage type
	         *
	         * @attention: Do not use this method in user code, it is an implementation detail and can cause undefined
	         * behaviour if used wrong.
	         *
	         * @{
	         */
	        constexpr auto& asStorage()
	        {
	            return static_cast<Storage&>(*this);
	        }

	        constexpr auto const& asStorage() const
	        {
	            return static_cast<Storage const&>(*this);
	        }

	        /** @} */

	        /** Initialize via a generator expression
	         *
	         * The generator must return the value for the corresponding index of the component which is passed to the
	         * generator.
	         *
	         * This constructor is not constexpr because std::simd is using a reinterpret_cast during the initialization
	         * with a generator and complains that this is not allowed in constexpr functions.
	         */
	        template<
	            typename F,
	            std::enable_if_t<std::is_invocable_v<F, std::integral_constant<uint32_t, 0u>>, uint32_t> = 0u>
	        ALPAKA_FN_HOST_ACC explicit Simd(F&& generator)
	            : Simd(std::forward<F>(generator), std::make_integer_sequence<uint32_t, T_width>{})
	        {
	            /* Do not change the enable if to `requires(std::is_invocable_v<F, std::integral_constant<uint32_t, 0u>>)`
	             * nvcc 12.3.2 has a bug that creates compile issue when requires with std::is_invocable is used.
	             */
	        }

	        /** Constructor for SIMD pack
	         *
	         * @attention This constructor allows implicit casts.
	         *
	         * This constructor is not constexpr because std::simd is using a reinterpret_cast during the initialization
	         * with a generator and complains that this is not allowed in constexpr functions.
	         *
	         * @param args value of each lane index, x,y,z,...
	         *
	         */
	        template<typename... T_Args>
	        requires((std::is_convertible_v<T_Args, T_Type> && ...) && (sizeof...(T_Args) == T_width))
	        ALPAKA_FN_HOST_ACC Simd(T_Args const&... args) : Storage(static_cast<T_Type>(args)...)
	        {
	        }

	        constexpr Simd(Simd const& other) = default;

	        constexpr Simd(T_Storage const& other) : T_Storage{other}
	        {
	        }

	        /** constructor allows changing the storage policy
	         */
	        template<typename T_OtherStorage>
	        constexpr Simd(Simd<T_Type, T_width, T_OtherStorage> const& other)
	            : Simd([&](uint32_t const i) constexpr { return other[i]; })
	        {
	        }

	        /** Allow static_cast / explicit cast to member type
	         *
	         * The conversion is const qualified: it only reads lane 0, so it must be usable on const SIMD packs too.
	         *
	         * @attention only available for SIMD with a single lane.
	         * @return value of lane 0, by copy
	         */
	        constexpr explicit operator type() const requires(T_width == 1u)
	        {
	            return (*this)[0];
	        }

	        /** Number of components/lanes in the SIMD pack. */
	        static consteval uint32_t width()
	        {
	            return T_width;
	        }

	        constexpr void copyFrom(T_Type const* data, concepts::Alignment auto alignment)
	        {
	            Storage::copyFrom(data, alignment);
	        }

	        constexpr void copyTo(auto* data, concepts::Alignment auto alignment) const
	        {
	            Storage::copyTo(data, alignment);
	        }

	        /**
	         * Creates a Simd where all lanes are set to the same value
	         *
	         * @param value Value which is set for all lanes
	         * @return new Simd<...>
	         */
	        static constexpr auto fill(concepts::Convertible<T_Type> auto value)
	        {
	            /* Note the function is taking value as copy because it is typically a scalar value.
	             * If a const reference is used, it would not be possible to pass a host defined constexpr value into
	             * fill() when CUDA is used. Issue was seen with nvcc CUDA 13.0.2. see
	             * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#constexpr-variables
	             */
	            return Simd{Storage::fill(static_cast<T_Type>(value))};
	        }

	        constexpr Simd toRT() const
	        {
	            return *this;
	        }

	        /** Create a copy with reversed lane order
	         *
	         * @return SIMD pack where lane i of this pack is stored in lane T_width - 1 - i
	         */
	        constexpr Simd revert() const
	        {
	            Simd reversed{};
	            constexpr uint32_t lastLane = T_width - 1u;
	            for(uint32_t lane = 0u; lane < T_width; ++lane)
	            {
	                reversed[lastLane - lane] = (*this)[lane];
	            }
	            return reversed;
	        }

	        constexpr Simd& operator=(Simd const&) = default;
	        constexpr Simd& operator=(Simd&&) = default;

	        constexpr Simd operator-() const
	        {
	            return Simd([this](uint32_t const i) constexpr { return -(*this)[i]; });
	        }

	        /** assign operator
	         * @{
	         */
	#define ALPAKA_VECTOR_ASSIGN_OP(op)                                                                                   \
	    template<typename T_OtherStorage>                                                                                 \
	    constexpr Simd& operator op(Simd<T_Type, T_width, T_OtherStorage> const& rhs)                                     \
	    {                                                                                                                 \
	        this->asStorage() op rhs.asStorage();                                                                         \
	        return *this;                                                                                                 \
	    }                                                                                                                 \
	    constexpr Simd& operator op(concepts::LosslesslyConvertible<T_Type> auto const value)                             \
	    {                                                                                                                 \
	        this->asStorage() op static_cast<T_Type>(value);                                                              \
	        return *this;                                                                                                 \
	    }

	        ALPAKA_VECTOR_ASSIGN_OP(+=)
	        ALPAKA_VECTOR_ASSIGN_OP(-=)
	        ALPAKA_VECTOR_ASSIGN_OP(/=)
	        ALPAKA_VECTOR_ASSIGN_OP(*=)
	        ALPAKA_VECTOR_ASSIGN_OP(=)

	#undef ALPAKA_VECTOR_ASSIGN_OP

	        /** @} */

	        /** access a lane by index
	         *
	         * @return The returned type is implementation specific, therefore it can be a proxy reference.
	         *         You can not use the returned value to deduct the type and assume that it will be the value type of
	         * Simd.
	         */
	        constexpr reference operator[](std::integral auto const idx)
	        {
	            return asStorage()[idx];
	        }

	        /** access a lane by index
	         *
	         * @return The value type, by copy.
	         */
	        constexpr type operator[](std::integral auto const idx) const
	        {
	            return asStorage()[idx];
	        }

	#define ALPAKA_NAMED_ARRAY_ACCESS(functionName, laneIdx)                                                              \
	    constexpr reference functionName() requires(T_width >= laneIdx + 1)                                               \
	    {                                                                                                                 \
	        return (*this)[laneIdx];                                                                                      \
	    }                                                                                                                 \
	    constexpr type functionName() const requires(T_width >= laneIdx + 1)                                              \
	    {                                                                                                                 \
	        return (*this)[laneIdx];                                                                                      \
	    }

	        /** @brief named lane access
	         *
	         * @attention The mapping from names x,y,z,w to memory indices differ from the mapping of an alpaka vector @c
	         * Vec. The availability of the naming methods depends on the SIMD width.
	         *
	         * You can have access to the same lane index via different nonspecific naming.
	         *
	         * @code
	         * lane index   :  0,  1,  2,  3, ...,  9, 10, ... , 15
	         * hexadecimal  : s0, s1, s2, s3, ..., s9, sA, ... , sF
	         * coordinate   :  x,  y,  z,  w
	         * color channel:  r,  g,  b,  a
	         * @endcode
	         *
	         * @{
	         */
	        ALPAKA_NAMED_ARRAY_ACCESS(x, 0u)
	        ALPAKA_NAMED_ARRAY_ACCESS(y, 1u)
	        ALPAKA_NAMED_ARRAY_ACCESS(z, 2u)
	        ALPAKA_NAMED_ARRAY_ACCESS(w, 3u)

	        ALPAKA_NAMED_ARRAY_ACCESS(r, 0u)
	        ALPAKA_NAMED_ARRAY_ACCESS(g, 1u)
	        ALPAKA_NAMED_ARRAY_ACCESS(b, 2u)
	        ALPAKA_NAMED_ARRAY_ACCESS(a, 3u)

	        ALPAKA_NAMED_ARRAY_ACCESS(s0, 0u)
	        ALPAKA_NAMED_ARRAY_ACCESS(s1, 1u)
	        ALPAKA_NAMED_ARRAY_ACCESS(s2, 2u)
	        ALPAKA_NAMED_ARRAY_ACCESS(s3, 3u)
	        ALPAKA_NAMED_ARRAY_ACCESS(s4, 4u)
	        ALPAKA_NAMED_ARRAY_ACCESS(s5, 5u)
	        ALPAKA_NAMED_ARRAY_ACCESS(s6, 6u)
	        ALPAKA_NAMED_ARRAY_ACCESS(s7, 7u)
	        ALPAKA_NAMED_ARRAY_ACCESS(s8, 8u)
	        ALPAKA_NAMED_ARRAY_ACCESS(s9, 9u)
	        ALPAKA_NAMED_ARRAY_ACCESS(sA, 10u)
	        ALPAKA_NAMED_ARRAY_ACCESS(sB, 11u)
	        ALPAKA_NAMED_ARRAY_ACCESS(sC, 12u)
	        ALPAKA_NAMED_ARRAY_ACCESS(sD, 13u)
	        ALPAKA_NAMED_ARRAY_ACCESS(sE, 14u)
	        ALPAKA_NAMED_ARRAY_ACCESS(sF, 15u)
	        /** @} */

	#undef ALPAKA_NAMED_ARRAY_ACCESS

	        /** Shrink the SIMD pack to its trailing lanes.
	         *
	         * The lanes with the highest indices are kept, their relative order is preserved.
	         *
	         * @tparam T_numElements New width of the SIMD pack, must not exceed T_width.
	         * @return Pack holding the last T_numElements lanes of the origin vector.
	         */
	        template<uint32_t T_numElements>
	        constexpr Simd<T_Type, T_numElements> rshrink() const
	        {
	            static_assert(T_numElements <= T_width);
	            // number of leading lanes which are not part of the result
	            constexpr uint32_t numDropped = T_width - T_numElements;
	            Simd<T_Type, T_numElements> tail{};
	            for(uint32_t lane = 0u; lane < T_numElements; ++lane)
	                tail[lane] = (*this)[numDropped + lane];

	            return tail;
	        }

	        /** Drop the last lane of the SIMD pack.
	         *
	         * Only available for packs with more than one lane.
	         *
	         * @return pack with `T_width - 1` lanes holding the lanes [0; T_width - 2] of this pack
	         */
	        constexpr Simd<T_Type, T_width - 1u> eraseBack() const requires(T_width > 1u)
	        {
	            constexpr auto keptLanes = T_width - 1u;
	            Simd<T_Type, keptLanes> shortened{};
	            for(uint32_t lane = 0u; lane != keptLanes; ++lane)
	            {
	                shortened[lane] = (*this)[lane];
	            }

	            return shortened;
	        }

	        /** Shrink the number of elements of a vector, ending at a selectable lane.
	         *
	         * Walking backwards from startIdx, T_numElements lanes are copied such that lane startIdx of the origin
	         * vector becomes the last lane of the result.
	         *
	         * @tparam T_numElements New width of the SIMD pack, must not exceed T_width.
	         * @param startIdx Index within the origin vector which will be the last element in the result.
	         * @return T_numElements elements of the origin vector ending with the index startIdx.
	         *         Indexing will wrap around when the beginning of the origin vector is reached.
	         */
	        template<uint32_t T_numElements>
	        constexpr Simd<type, T_numElements> rshrink(std::integral auto const startIdx) const
	        {
	            static_assert(T_numElements <= T_width);
	            // value-initialize like the other shrink/remove helpers so no lane starts out indeterminate
	            Simd<type, T_numElements> result{};
	            for(uint32_t i = 0u; i < T_numElements; i++)
	            {
	                // adding T_width keeps the modulo operand positive for startIdx in [0; T_width - 1]
	                result[T_numElements - 1u - i] = (*this)[(T_width + startIdx - i) % T_width];
	            }
	            return result;
	        }

	        /** Create a copy of the pack without one lane.
	         *
	         * The method is only available for packs with at least two lanes.
	         *
	         * @tparam laneIdxToRemove index of the lane which is left out; range: [ 0; T_width - 1 ]
	         * @return vector with `T_width - 1` elements, the order of the remaining lanes is kept
	         */
	        template<std::integral auto laneIdxToRemove>
	        constexpr Simd<type, T_width - 1u> remove() const requires(T_width >= 2u)
	        {
	            constexpr int numKept = static_cast<int>(T_width - 1u);
	            constexpr int skippedLane = static_cast<int>(laneIdxToRemove);

	            Simd<type, T_width - 1u> reduced{};
	            for(int dst = 0; dst < numKept; ++dst)
	            {
	                // lanes at or behind the removed one are read from one position further back
	                int src = dst;
	                if(dst >= skippedLane)
	                    ++src;
	                reduced[dst] = (*this)[src];
	            }
	            return reduced;
	        }

	        /** Multiply all lanes with each other.
	         *
	         * Implemented via reduce() with std::multiplies.
	         *
	         * @return product of components
	         */
	        [[nodiscard]] constexpr type product() const
	        {
	            return this->reduce(std::multiplies<>{});
	        }

	        /** Add up all lanes.
	         *
	         * Implemented via reduce() with std::plus.
	         *
	         * @return sum of components
	         */
	        [[nodiscard]] constexpr type sum() const
	        {
	            return this->reduce(std::plus<>{});
	        }

	        /** reduce all elements to a single value
	         *
	         * The work is delegated to reduce_range() over all lanes. For better numerical stability a tree reduce
	         * algorithm is used there (a sequential loop when ALPAKA_LANG_SYCL is set).
	         *
	         * @param reduceFunc binary functor executed to reduce the lanes, it is invoked with two values of the
	         *                   value type. The binary operation must be associative.
	         * @return the type of the result depends on the binary functor
	         */
	        [[nodiscard]] constexpr auto reduce(auto&& reduceFunc) const
	            -> decltype(reduceFunc(std::declval<type>(), std::declval<type>()))
	        {
	            return reduce_range(ALPAKA_FORWARD(reduceFunc));
	        }

	        /** Lane-wise minimum of two SIMD packs.
	         *
	         * @param rhs pack with the same value type and width, its storage type may differ
	         * @return pack of this type where each lane holds std::min of the corresponding lanes
	         */
	        template<typename T_OtherStorage>
	        constexpr auto min(Simd<T_Type, T_width, T_OtherStorage> const& rhs) const
	        {
	            Simd smallest{};
	            for(uint32_t lane = 0u; lane != T_width; ++lane)
	            {
	                smallest[lane] = std::min((*this)[lane], rhs[lane]);
	            }
	            return smallest;
	        }

	        /** Convert the SIMD pack into a string.
	         *
	         * @param separator string placed between two neighboring components
	         * @param enclosings characters which enclose the pack
	         *                   size == 0 ? nothing is put around the components
	         *                   size == 1 ? the single character is used in front and behind
	         *                   size >= 2 ? character[0] is put in front, character[1] behind
	         *
	         * example:
	         * .toString(";","|")     -> |x;...;z|
	         * .toString(",","[]")    -> [x,...,z]
	         *
	         * @return string representation of all components
	         */
	        std::string toString(std::string const separator = ",", std::string const enclosings = "{}") const
	        {
	            std::string opening;
	            std::string closing;
	            size_t const numEnclosingChars = enclosings.size();

	            if(numEnclosingChars != 0)
	            {
	                // the modulo maps both positions onto the only character of a one letter string
	                opening = enclosings[0 % numEnclosingChars];
	                closing = enclosings[1 % numEnclosingChars];
	            }

	            std::stringstream out;
	            out << opening;
	            out << (*this)[0];

	            for(uint32_t lane = 1u; lane < T_width; ++lane)
	            {
	                out << separator;
	                out << (*this)[lane];
	            }
	            out << closing;
	            return out.str();
	        }

	    private:
	        /** Construct the storage from a generator functor.
	         *
	         * The unnamed integer sequence only transports the indices Is.
	         *
	         * @param generator callable invoked once per index with `std::integral_constant<uint32_t, Is>{}`; the
	         *                  results initialize the storage in the order of Is
	         */
	        template<typename F, uint32_t... Is>
	        constexpr explicit Simd(F&& generator, std::integer_sequence<uint32_t, Is...>)
	            : Storage{generator(std::integral_constant<uint32_t, Is>{})...}
	        {
	        }

	        /** reduce over a range of elements
	         *
	         * Without SYCL the range is split at its midpoint and both halves are reduced recursively (tree
	         * reduction). With SYCL the lanes are combined from left to right in a loop.
	         *
	         * @tparam T_start start index
	         * @tparam T_end end index (excluded), must be larger than T_start
	         * @param reduceFunc binary functor executed to reduce the range
	         * @return the type of the result depends on the binary functor
	         */
	        template<uint32_t T_start = 0u, uint32_t T_end = width()>
	        [[nodiscard]] constexpr auto reduce_range(auto&& reduceFunc) const
	            -> decltype(reduceFunc(std::declval<type>(), std::declval<type>()))
	        {
	            // an empty range has no value to return and would let the unsigned size computation wrap around
	            static_assert(T_start < T_end, "reduce_range requires a non-empty range of lanes");
	            // elements in the range
	            constexpr uint32_t size = T_end - T_start;
	            // single element termination
	            if constexpr(size == 1u)
	            {
	                return (*this)[T_start];
	            }
	            else
	            {
	                // The else branch is discarded for single element ranges, therefore no degenerate
	                // reduce_range<T_start, T_start> is instantiated for the leaves of the recursion.
	#if ALPAKA_LANG_SYCL
	                // SYCL can not call recursive functions
	                auto result = (*this)[T_start];
	                for(uint32_t i = T_start + 1u; i < T_end; ++i)
	                {
	                    result = reduceFunc(result, (*this)[i]);
	                }
	                return result;
	#else
	                // split range at midpoint
	                constexpr uint32_t mid = T_start + size / 2u;

	                // Recursively reduce both halves and combine. The functor is handed on as lvalue because it is
	                // used three times in this expression and must not be forwarded more than once.
	                return reduceFunc(reduce_range<T_start, mid>(reduceFunc), reduce_range<mid, T_end>(reduceFunc));
	#endif
	            }
	        }

	        // every specialization of SimdWhereExpr is a friend and may access the non-public members of this class
	        template<concepts::SimdMask Mask, concepts::Simd T_Simd>
	        friend struct SimdWhereExpr;
	    };

	    /** Compile-time indexed read access to a lane of a constant SIMD pack.
	     *
	     * @tparam I lane index
	     * @return result of `v[I]`, returned by value because the return type is a plain `auto`
	     */
	    template<std::size_t I, typename T_Type, uint32_t T_width, typename T_Storage>
	    constexpr auto get(Simd<T_Type, T_width, T_Storage> const& v)
	    {
	        return v[I];
	    }

	    /** Compile-time indexed access to a lane of a mutable SIMD pack.
	     *
	     * @tparam I lane index
	     * @return lvalue reference bound to the result of `v[I]`
	     */
	    template<std::size_t I, typename T_Type, uint32_t T_width, typename T_Storage>
	    constexpr auto& get(Simd<T_Type, T_width, T_Storage>& v)
	    {
	        return v[I];
	    }

	    /** Stream the SIMD pack using toString() with its default separator and enclosing characters.
	     *
	     * @param s output stream
	     * @param vec pack to print
	     * @return the stream @p s
	     */
	    template<typename Type, uint32_t T_width, typename T_Storage>
	    std::ostream& operator<<(std::ostream& s, Simd<Type, T_width, T_Storage> const& vec)
	    {
	        s << vec.toString();
	        return s;
	    }

	    // Type deduction guide: a pack created from a list of values uses the type of the first value as value type
	    // and the number of values as width.
	    template<typename T_1, typename... T_Args>
	    ALPAKA_FN_HOST_ACC Simd(T_1, T_Args...) -> Simd<T_1, uint32_t(sizeof...(T_Args) + 1u)>;

	    /** binary operators
	     * @{
	     */
	#define ALPAKA_VECTOR_BINARY_OP(typenameOrConcept, op)                                                                \
	    template<typenameOrConcept T_Type, uint32_t T_width, typename T_Storage, typename T_OtherStorage>                 \
	    constexpr auto operator op(                                                                                       \
	        const Simd<T_Type, T_width, T_Storage>& lhs,                                                                  \
	        const Simd<T_Type, T_width, T_OtherStorage>& rhs)                                                             \
	    {                                                                                                                 \
	        using StoreageType = ALPAKA_TYPEOF(lhs.asStorage() op rhs.asStorage());                                       \
	        return Simd<T_Type, T_width, StoreageType>(lhs.asStorage() op rhs.asStorage());                               \
	    }                                                                                                                 \
	    template<                                                                                                         \
	        typenameOrConcept T_Type,                                                                                     \
	        concepts::LosslesslyConvertible<T_Type> T_ValueType,                                                          \
	        uint32_t T_width,                                                                                             \
	        typename T_Storage>                                                                                           \
	    constexpr auto operator op(const Simd<T_Type, T_width, T_Storage>& lhs, T_ValueType rhs)                          \
	    {                                                                                                                 \
	        using StoreageType = ALPAKA_TYPEOF(lhs.asStorage() op static_cast<T_Type>(rhs));                              \
	        return Simd<T_Type, T_width, StoreageType>(lhs.asStorage() op static_cast<T_Type>(rhs));                      \
	    }                                                                                                                 \
	    template<                                                                                                         \
	        typenameOrConcept T_Type,                                                                                     \
	        concepts::LosslesslyConvertible<T_Type> T_ValueType,                                                          \
	        uint32_t T_width,                                                                                             \
	        typename T_Storage>                                                                                           \
	    constexpr auto operator op(T_ValueType lhs, const Simd<T_Type, T_width, T_Storage>& rhs)                          \
	    {                                                                                                                 \
	        using StoreageType = ALPAKA_TYPEOF(static_cast<T_Type>(lhs) op rhs.asStorage());                              \
	        return Simd<T_Type, T_width, StoreageType>(static_cast<T_Type>(lhs) op rhs.asStorage());                      \
	    }

	    ALPAKA_VECTOR_BINARY_OP(typename, +)
	    ALPAKA_VECTOR_BINARY_OP(typename, -)
	    ALPAKA_VECTOR_BINARY_OP(typename, *)
	    ALPAKA_VECTOR_BINARY_OP(typename, /)
	    ALPAKA_VECTOR_BINARY_OP(std::integral, %)
	    ALPAKA_VECTOR_BINARY_OP(std::integral, <<)
	    ALPAKA_VECTOR_BINARY_OP(std::integral, >>)
	    ALPAKA_VECTOR_BINARY_OP(std::integral, &)
	    ALPAKA_VECTOR_BINARY_OP(std::integral, |)
	    ALPAKA_VECTOR_BINARY_OP(std::integral, ^)

	#undef ALPAKA_VECTOR_BINARY_OP


	#define ALPAKA_VECTOR_BINARY_CMP_OP(typenameOrConcept, op)                                                            \
	    template<typenameOrConcept T_Type, uint32_t T_width, typename T_Storage, typename T_OtherStorage>                 \
	    constexpr auto operator op(                                                                                       \
	        const Simd<T_Type, T_width, T_Storage>& lhs,                                                                  \
	        const Simd<T_Type, T_width, T_OtherStorage>& rhs)                                                             \
	    {                                                                                                                 \
	        using StoreageType = ALPAKA_TYPEOF(lhs.asStorage() op rhs.asStorage());                                       \
	        return SimdMask<T_Type, T_width, StoreageType>(lhs.asStorage() op rhs.asStorage());                           \
	    }                                                                                                                 \
	    template<                                                                                                         \
	        typenameOrConcept T_Type,                                                                                     \
	        concepts::LosslesslyConvertible<T_Type> T_ValueType,                                                          \
	        uint32_t T_width,                                                                                             \
	        typename T_Storage>                                                                                           \
	    constexpr auto operator op(const Simd<T_Type, T_width, T_Storage>& lhs, T_ValueType rhs)                          \
	    {                                                                                                                 \
	        using StoreageType = ALPAKA_TYPEOF(lhs.asStorage() op static_cast<T_Type>(rhs));                              \
	        return SimdMask<T_Type, T_width, StoreageType>(lhs.asStorage() op static_cast<T_Type>(rhs));                  \
	    }                                                                                                                 \
	    template<                                                                                                         \
	        typenameOrConcept T_Type,                                                                                     \
	        concepts::LosslesslyConvertible<T_Type> T_ValueType,                                                          \
	        uint32_t T_width,                                                                                             \
	        typename T_Storage>                                                                                           \
	    constexpr auto operator op(T_ValueType lhs, const Simd<T_Type, T_width, T_Storage>& rhs)                          \
	    {                                                                                                                 \
	        using StoreageType = ALPAKA_TYPEOF(static_cast<T_Type>(lhs) op rhs.asStorage());                              \
	        return SimdMask<T_Type, T_width, StoreageType>(static_cast<T_Type>(lhs) op rhs.asStorage());                  \
	    }

	    ALPAKA_VECTOR_BINARY_CMP_OP(typename, >=)
	    ALPAKA_VECTOR_BINARY_CMP_OP(typename, >)
	    ALPAKA_VECTOR_BINARY_CMP_OP(typename, <=)
	    ALPAKA_VECTOR_BINARY_CMP_OP(typename, <)
	    ALPAKA_VECTOR_BINARY_CMP_OP(typename, ==)
	    ALPAKA_VECTOR_BINARY_CMP_OP(typename, !=)

	#undef ALPAKA_VECTOR_BINARY_CMP_OP

	    /** @} */


	    namespace trait
	    {
	        /** Dimension trait for alpaka::Simd: the reported value is the width of the pack. */
	        template<typename T_Type, uint32_t T_width, typename T_Storage>
	        struct GetDim<alpaka::Simd<T_Type, T_width, T_Storage>>
	        {
	            static constexpr uint32_t value = T_width;
	        };

	        /** Value type trait for alpaka::Simd: the reported type is the first template argument of the pack. */
	        template<typename T_Type, uint32_t T_width, typename T_Storage>
	        struct GetValueType<alpaka::Simd<T_Type, T_width, T_Storage>>
	        {
	            using type = T_Type;
	        };
	    } // namespace trait

	    namespace internal
	    {
	        /** Cast functor specialization for alpaka::Simd inputs.
	         *
	         * @tparam T_To value type the lanes are converted to
	         */
	        template<typename T_To, typename T_Type, uint32_t T_width, typename T_Storage>
	        struct PCast::Op<T_To, alpaka::Simd<T_Type, T_width, T_Storage>>
	        {
	            /** Conversion to a different value type: each lane is converted with static_cast<T_To>.
	             *
	             * NOTE(review): the result type is looked up as `UniSimd` in a Simd which reuses T_Storage of the
	             * source pack — confirm that this storage type is valid for the value type T_To.
	             */
	            constexpr auto operator()(auto&& input) const
	                requires std::convertible_to<T_Type, T_To> && (!std::same_as<T_To, T_Type>)
	            {
	                return typename alpaka::Simd<T_To, T_width, T_Storage>::UniSimd(
	                    [&](uint32_t idx) constexpr { return static_cast<T_To>(input[idx]); });
	            }

	            /** Source and destination value type are equal: the input is forwarded unchanged.
	             *
	             * NOTE(review): because of decltype(auto) a reference is returned; for a temporary input the result
	             * presumably must not outlive the full expression of the call — confirm against callers.
	             */
	            constexpr decltype(auto) operator()(auto&& input) const requires std::same_as<T_To, T_Type>
	            {
	                return std::forward<decltype(input)>(input);
	            }
	        };
	    } // namespace internal
	}; // namespace alpaka

	namespace std
	{
	    /** Tuple protocol: the number of elements of an alpaka::Simd is its width.
	     *
	     * The specialization derives from std::integral_constant, as the standard requires for tuple_size
	     * specializations. Besides `value` this also provides `value_type`, `type` and the conversion operators.
	     */
	    template<typename T_Type, uint32_t T_width, typename T_Storage>
	    struct tuple_size<alpaka::Simd<T_Type, T_width, T_Storage>> : std::integral_constant<std::size_t, T_width>
	    {
	    };

	    /** Tuple protocol: every element of an alpaka::Simd has the value type of the pack.
	     *
	     * @tparam I element index, must be smaller than the width of the pack
	     */
	    template<std::size_t I, typename T_Type, uint32_t T_width, typename T_Storage>
	    struct tuple_element<I, alpaka::Simd<T_Type, T_width, T_Storage>>
	    {
	        static_assert(I < T_width, "tuple_element index is out of range for alpaka::Simd");
	        using type = T_Type;
	    };
	} // namespace std
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/Simd.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/SimdWhereExpr.hpp ==
	// ==
	/* Copyright 2025 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	/** @file This file provides a basic implementation of a SIMD vector.
	 *
	 * The implementation is based on the class Vec:
	 *   - the storage policy should become the native SIMD implementation e.g. std::simd
	 *   - load/ store and simd specifics should be implemented in the storage policy
	 *   - the name of storage policy should be changed
	 *
	 *   The current operator operations rely on compilers auto vectorization.
	 */

	// #pragma once
	// #include "alpaka/Simd.hpp"    // amalgamate: file already inlined

	namespace alpaka
	{
	    /** Expression object which binds a mask to a SIMD pack for masked assignments.
	     *
	     * Only references to the mask and the pack are stored, therefore the expression must not outlive them.
	     * Each assignment operator uses `value.where(m_mask)` if the pack provides such a method and falls back to
	     * `value.update(m_mask, ...)` otherwise.
	     *
	     * @tparam Mask type of the mask
	     * @tparam T_Simd type of the SIMD pack which gets updated
	     */
	    template<concepts::SimdMask Mask, concepts::Simd T_Simd>
	    struct SimdWhereExpr
	    {
	        // non-owning references to the objects handed to the constructor
	        Mask const& m_mask;
	        T_Simd& value;

	        /** Bind the expression to a mask and a pack.
	         *
	         * @param m mask, stored by reference
	         * @param v pack which is modified by the assignment operators, stored by reference
	         */
	        constexpr SimdWhereExpr(Mask const& m, T_Simd& v) : m_mask(m), value(v)
	        {
	        }

	        // disable copy and move constructors/operators to avoid pointing to invalid references.
	        constexpr SimdWhereExpr(SimdWhereExpr const&) = delete;
	        constexpr SimdWhereExpr(SimdWhereExpr&&) = delete;
	        constexpr SimdWhereExpr& operator=(SimdWhereExpr const&) = delete;
	        constexpr SimdWhereExpr& operator=(SimdWhereExpr&&) = delete;

	        // value type of a single lane of the bound pack
	        using value_type = typename T_Simd::type;

	        /** Masked assignment of a SIMD pack.
	         *
	         * @param rhs pack with the same value type as the bound pack
	         */
	        constexpr void operator=(concepts::Simd auto const& rhs)
	            requires std::same_as<value_type, typename ALPAKA_TYPEOF(rhs)::type>
	        {
	            // prefer the where() method of the pack if there is one
	            if constexpr(requires { value.where(m_mask); })
	                value.where(m_mask) = rhs.asNativeType();
	            else
	                value.update(m_mask, rhs);
	        }

	        /** Masked assignment of a single value.
	         *
	         * @param rhs value which fulfills concepts::LosslesslyConvertible for value_type; the update() path
	         *            converts it with static_cast<value_type>
	         */
	        constexpr void operator=(concepts::LosslesslyConvertible<value_type> auto const& rhs)
	        {
	            if constexpr(requires { value.where(m_mask); })
	                value.where(m_mask) = rhs;
	            else
	                value.update(m_mask, static_cast<value_type>(rhs));
	        }

	        // Generates the compound assignment `op_name` for a pack and for a single value on the right hand side.
	        // The fallback path first evaluates `value op rhs` and then hands the result to value.update().
	#define ALPAKA_SIMD_EXPR_ASSIGN_OP(op_name, op)                                                                       \
	    constexpr void operator op_name(concepts::Simd auto const& rhs)                                                   \
	    {                                                                                                                 \
	        if constexpr(requires { value.where(m_mask); })                                                               \
	            value.where(m_mask) op_name rhs.asNativeType();                                                           \
	        else                                                                                                          \
	            value.update(m_mask, value op rhs);                                                                       \
	    }                                                                                                                 \
	    constexpr void operator op_name(concepts::LosslesslyConvertible<value_type> auto const& rhs)                      \
	    {                                                                                                                 \
	        if constexpr(requires { value.where(m_mask); })                                                               \
	            value.where(m_mask) op_name rhs;                                                                          \
	        else                                                                                                          \
	            value.update(m_mask, value op rhs);                                                                       \
	    }

	        ALPAKA_SIMD_EXPR_ASSIGN_OP(+=, +)
	        ALPAKA_SIMD_EXPR_ASSIGN_OP(-=, -)
	        ALPAKA_SIMD_EXPR_ASSIGN_OP(/=, /)
	        ALPAKA_SIMD_EXPR_ASSIGN_OP(*=, *)


	#undef ALPAKA_SIMD_EXPR_ASSIGN_OP

	    private:
	        /** create a SIMD vector where all bits are zero or one depending on the mask value
	         *
	         * Only available if the value type of the bound pack has a size of 4 or 8 bytes; the result uses
	         * uint32_t or uint64_t lanes accordingly.
	         *
	         * @param mask pack whose lanes are evaluated as boolean
	         * @return per lane: all bits one if mask is true, else all bits zero
	         */
	        static constexpr auto valueMask(concepts::Simd auto const& mask)
	            requires(sizeof(typename T_Simd::type) == 4u || sizeof(typename T_Simd::type) == 8u)
	        {
	            using ValueMaskType = std::conditional_t<sizeof(typename T_Simd::type) == 4u, uint32_t, uint64_t>;
	            Simd<ValueMaskType, T_Simd::width()> result(
	                [&](uint32_t const idx)
	                { return mask[idx] ? std::numeric_limits<ValueMaskType>::max() : ValueMaskType{0u}; });
	            return result;
	        }
	    };

	    /** Create an expression for the conditional update of the components of a SIMD pack.
	     *
	     * The returned expression only references @p mask and @p value, both must stay alive while it is used.
	     *
	     * @param mask SIMD pack of booleans, where each component is true for the element in @p value which should
	     * be overwritten with the value assigned to the returned expression
	     * @param value SIMD vector to which the mask is applied
	     * @return expression object bound to @p mask and @p value
	     */
	    template<concepts::SimdMask T_Mask, concepts::Simd T_Simd>
	    constexpr SimdWhereExpr<T_Mask, T_Simd> where(T_Mask const& mask, T_Simd& value)
	    {
	        using Expr = SimdWhereExpr<T_Mask, T_Simd>;
	        return Expr{mask, value};
	    }
	} // namespace alpaka
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/SimdWhereExpr.hpp ==
	// ============================================================================

// #include "alpaka/UniqueId.hpp"    // amalgamate: file already inlined
// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cpu.hpp ==
	// ==
	/* Copyright 2024 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/api/host/Api.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/Device.hpp ==
		// ==
		/* Copyright 2024 René Widera, Mehmet Yusufoglu
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/api/host/Api.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/Event.hpp ==
			// ==
			/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Bernhard Manfred Gruber
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once

			// #include "alpaka/api/host/Api.hpp"    // amalgamate: file already inlined
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/interface.hpp ==
				// ==
				/* Copyright 2024 René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined
				// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
				// #include "alpaka/tag.hpp"    // amalgamate: file already inlined
				// #include "alpaka/trait.hpp"    // amalgamate: file already inlined
				// #include "alpaka/unused.hpp"    // amalgamate: file already inlined

				namespace alpaka
				{

				    /** Get the API an object depends on
				     *
				     * The call is forwarded to alpaka::internal::getApi().
				     *
				     * @param any can be a platform, device, queue, view
				     * @return API tag
				     *
				     * @{
				     */
				    inline constexpr decltype(auto) getApi(auto&& any)
				    {
				        return alpaka::internal::getApi(ALPAKA_FORWARD(any));
				    }

				    // Overload for objects fulfilling alpaka::concepts::HasGet: the API is queried from `*any.get()`.
				    inline constexpr decltype(auto) getApi(alpaka::concepts::HasGet auto&& any)
				    {
				        return alpaka::internal::getApi(*any.get());
				    }

				    namespace concepts
				    {
				        /** Concept to check if the given type implements the `getApi(T x)` function returning an alpaka::concepts::Api
				         *
				         * The check is performed with an lvalue of the type.
				         */
				        template<typename T_Any>
				        concept HasApi = requires(T_Any&& any) {
				            { getApi(any) } -> alpaka::concepts::Api;
				        };
				    } // namespace concepts

				    /** @} */

				    /** Get the device type of an object
				     *
				     * The call is forwarded to alpaka::internal::getDeviceKind().
				     *
				     * @param any can be a platform, device, queue, view
				     * @return type from alpaka::deviceKind
				     *
				     * @{
				     */
				    inline constexpr decltype(auto) getDeviceKind(auto&& any)
				    {
				        return alpaka::internal::getDeviceKind(ALPAKA_FORWARD(any));
				    }

				    // Overload for objects fulfilling alpaka::concepts::HasGet: the device kind is queried from `*any.get()`.
				    inline constexpr decltype(auto) getDeviceKind(alpaka::concepts::HasGet auto&& any)
				    {
				        return alpaka::internal::getDeviceKind(*any.get());
				    }

				    /** @} */


				    /** Get the number of elements to compute per thread.
				     *
				     * This function considers the SIMD width for the corresponding data type and the potential for instruction
				     * parallelism.
				     *
				     * @tparam T_Type The data type used to determine the SIMD width.
				     * @return The minimum number of elements a thread should compute to achieve optimal utilization.
				     */
				    template<typename T_Type>
				    constexpr uint32_t getNumElemPerThread(auto&& any)
				    {
				        return alpaka::getNumElemPerThread<T_Type>(ALPAKA_TYPEOF(getApi(any)){}, ALPAKA_TYPEOF(getDeviceKind(any)){});
				    }

				    /** get SIMD with in bytes for the
				     *
				     * @tparam T_Type data type
				     * @return number of elements that can be processed in parallel in a vector register
				     */
				    template<typename T_Type>
				    constexpr uint32_t getArchSimdWidth(auto&& any)
				    {
				        return alpaka::getArchSimdWidth<T_Type>(ALPAKA_TYPEOF(getApi(any)){}, ALPAKA_TYPEOF(getDeviceKind(any)){});
				    }

				    /** get the number of instruction can be issued in parallel */
				    constexpr uint32_t getNumPipelines(auto&& any)
				    {
				        return alpaka::getNumPipelines(ALPAKA_TYPEOF(getApi(any)){}, ALPAKA_TYPEOF(getDeviceKind(any)){});
				    }

				    /** Get the value type alignment of an object
				     *
				     * The call is forwarded to internal::getAlignment().
				     *
				     * @param any object to derive the alignment from
				     * @return alignment in bytes, if not defined the alignment of the value_type will be returned
				     */
				    constexpr auto getAlignment(auto&& any)
				    {
				        return internal::getAlignment(ALPAKA_FORWARD(any));
				    }

				} // namespace alpaka
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/interface.hpp ==
				// ============================================================================

			// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onHost/Handle.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onHost/internal/interface.hpp"    // amalgamate: file already inlined
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/logger/logger.hpp ==
				// ==
				/* Copyright 2025 René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/internal/logger.hpp ==
					// ==
					/* Copyright 2025 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
					// #include "alpaka/onHost/demangledName.hpp"    // amalgamate: file already inlined
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/logger/lvl.hpp ==
						// ==
						/* Copyright 2025 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

						// #include <string>    // amalgamate: file already included

						namespace alpaka::onHost::logger
						{

						    namespace detail
						    {
						        /** Common base class of all log levels, used to detect log level types. */
						        struct LogLvlBase
						        {
						        };

						        /** Combination of two log levels.
						         *
						         * @tparam T_Logger0 first level, provides the name of the combination
						         * @tparam T_Logger1 second level
						         */
						        template<typename T_Logger0, typename T_Logger1>
						        struct AggregatedLogger : LogLvlBase
						        {
						            /** @return name of the first level only, the name of T_Logger1 is not part of the result */
						            static std::string getName()
						            {
						                return T_Logger0::getName();
						            }

						            /** Combined mask of both levels.
						             *
						             * The masks are merged with a bitwise OR. An arithmetic sum would change the value into the
						             * mask of an unrelated level if both operands share a bit, e.g. 1 + 1 == 2.
						             *
						             * @return bitwise OR of the masks of T_Logger0 and T_Logger1
						             */
						            static constexpr size_t mask()
						            {
						                return T_Logger0::mask() | T_Logger1::mask();
						            }
						        };
						    } // namespace detail

						    namespace trait
						    {
						        /** Type trait to detect log level types
						         *
						         * The trait is true if T_LogLvl is derived from detail::LogLvlBase.
						         *
						         * @tparam T_LogLvl type to check
						         */
						        template<typename T_LogLvl>
						        struct IsLogLvl : std::is_base_of<detail::LogLvlBase, T_LogLvl>
						        {
						        };
						    } // namespace trait

						    /** Shortcut for trait::IsLogLvl<T_LogLvl>::value
						     *
						     * @tparam T_LogLvl type to check
						     */
						    template<typename T_LogLvl>
						    constexpr bool isLogLvl_v = trait::IsLogLvl<T_LogLvl>::value;

						    namespace concepts
						    {
						        /** Concept for log level types
						         *
						         * Satisfied by every type for which isLogLvl_v is true.
						         *
						         * @tparam T_LogLvl type to check
						         */
						        template<typename T_LogLvl>
						        concept Level = isLogLvl_v<T_LogLvl>;
						    } // namespace concepts

						    /** Compare two log levels
						     *
						     * Only the types of the arguments are compared, the values are not inspected.
						     *
						     * @return true if both arguments have the same type, else false
						     */
						    constexpr bool operator==(concepts::Level auto lhs, concepts::Level auto rhs)
						    {
						        using LhsLvl = ALPAKA_TYPEOF(lhs);
						        using RhsLvl = ALPAKA_TYPEOF(rhs);
						        return std::is_same_v<LhsLvl, RhsLvl>;
						    }

						    /** Negation of the equality comparison of two log levels
						     *
						     * @return true if `lhs == rhs` evaluates to false
						     */
						    constexpr bool operator!=(concepts::Level auto lhs, concepts::Level auto rhs)
						    {
						        bool const isEqual = (lhs == rhs);
						        return !isEqual;
						    }

						    /** Combine two log levels
						     *
						     * @return a detail::AggregatedLogger built from the types of both arguments
						     */
						    constexpr auto operator+(concepts::Level auto lhs, concepts::Level auto rhs)
						    {
						        using Combined = detail::AggregatedLogger<ALPAKA_TYPEOF(lhs), ALPAKA_TYPEOF(rhs)>;
						        return Combined{};
						    }

						    struct Device : detail::LogLvlBase
						    {
						        static std::string getName()
						        {
						            return "Device";
						        }

						        static constexpr size_t mask()
						        {
						            return 1;
						        }
						    };

						    constexpr auto device = Device{};

						    struct Event : detail::LogLvlBase
						    {
						        static std::string getName()
						        {
						            return "Event";
						        }

						        static constexpr size_t mask()
						        {
						            return 2;
						        }
						    };

						    constexpr auto event = Event{};

						    struct Memory : detail::LogLvlBase
						    {
						        static std::string getName()
						        {
						            return "Memory";
						        }

						        static constexpr size_t mask()
						        {
						            return 4;
						        }
						    };

						    constexpr auto memory = Memory{};

						    struct Queue : detail::LogLvlBase
						    {
						        static std::string getName()
						        {
						            return "Queue";
						        }

						        static constexpr size_t mask()
						        {
						            return 8;
						        }
						    };

						    constexpr auto queue = Queue{};

						    struct Kernel : detail::LogLvlBase
						    {
						        static std::string getName()
						        {
						            return "Kernel";
						        }

						        static constexpr size_t mask()
						        {
						            return 16;
						        }
						    };

						    constexpr auto kernel = Kernel{};
						} // namespace alpaka::onHost::logger
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/logger/lvl.hpp ==
						// ============================================================================

					// #include "alpaka/unused.hpp"    // amalgamate: file already inlined

					#include <atomic>
					#include <chrono>
					// #include <functional>    // amalgamate: file already included
					#include <iostream>
					// #include <ostream>    // amalgamate: file already included
					#include <source_location>
					// #include <string>    // amalgamate: file already included
					#include <string_view>

					namespace alpaka::onHost::logger::internal
					{
					    /** Writer which forwards all output to std::cerr
					     *
					     * The writer does not buffer anything on its own, each value is handed to std::cerr immediately. It is
					     * **NOT** thread-safe.
					     *
					     * @todo separate the indentation level from the writer
					     * @todo write additional loggers, e.g. for std::cout and a thread-safe logger
					     */
					    struct StdErr
					    {
					        /** Access the writer
					         *
					         * @return reference to a function-local static instance
					         */
					        static StdErr& get()
					        {
					            static StdErr instance;
					            return instance;
					        }

					        /** Stream a value to std::cerr
					         *
					         * @return the result of streaming the value into std::cerr
					         */
					        std::ostream& operator<<(auto const& value) const
					        {
					            return std::cerr << value;
					        }

					        /** Increase the indentation level by one
					         *
					         * @return the indentation level before the increment, used for the message which opens a scope
					         */
					        int enter()
					        {
					            return indentLvl.fetch_add(1);
					        }

					        /** Decrease the indentation level by one
					         *
					         * @return the indentation level after the decrement, used for the message which closes a scope
					         */
					        int leave()
					        {
					            return indentLvl.fetch_sub(1) - 1;
					        }

					        /** Query the indentation level without changing it
					         *
					         * @return the current indentation level
					         */
					        int current()
					        {
					            return indentLvl.load();
					        }

					    private:
					        // the first message starts at level one
					        std::atomic<int> indentLvl{1};
					    };

					    /** Write the indentation which precedes a message to the output writer
					     *
					     * If the preprocessor define ALPAKA_LOG_INDENT is set, "|-" is written for the first level and "--" for each
					     * further level, followed by a single space if the level is not zero. Without the define a single space is
					     * written independent of the level.
					     *
					     * @param writer output the indentation is streamed to
					     * @param indentLvl indentation level of the message
					     */
					    inline void indent(auto& writer, [[maybe_unused]] int indentLvl)
					    {
					#if defined(ALPAKA_LOG_INDENT)
					        for(int lvl = 0; lvl < indentLvl; ++lvl)
					        {
					            if(lvl == 0)
					                writer << "|-";
					            else
					                writer << "--";
					        }
					        // the statement after the #endif is the body of this condition
					        if(indentLvl != 0)
					#endif
					            writer << " ";
					    }

					    /** Pad a string up to a minimum length
					     *
					     * Strings which already have n or more characters are returned unchanged, they are never shortened.
					     *
					     * @param str input string
					     * @param n minimum number of characters of the result
					     * @param paddingCharacter character appended until the minimum length is reached
					     * @return string with at least n characters
					     */
					    inline std::string adjStringLength(std::string str, size_t n, char const paddingCharacter = ' ')
					    {
					        bool const needsPadding = str.length() < n;
					        if(needsPadding)
					        {
					            str.resize(n, paddingCharacter);
					        }
					        return str;
					    }

					    /** Optionally shorten a function signature to make it human-readable
					     *
					     * The signature is only simplified if the preprocessor define ALPAKA_LOG_DETAIL_SHORT is set, otherwise the
					     * input is returned unchanged.
					     *
					     * @param str function signature
					     * @return the simplified or the unmodified signature
					     */
					    inline std::string adjDetails(std::string const& str)
					    {
					#if !defined(ALPAKA_LOG_DETAIL_SHORT)
					        return str;
					#else
					        return onHost::simplifyFunctionSignature(str);
					#endif
					    }

					    /** Log the entry and exit of a scope
					     *
					     * The constructor taking a source location writes a "[+]" line, the destructor writes the matching "[-]" line
					     * together with the time which passed in between, in milliseconds. An instance created with the constructor
					     * without source location writes nothing.
					     *
					     * @tparam T_LogLvl log level, its name is used as prefix of each line
					     * @tparam T_Writer writer the output is sent to, it is obtained via T_Writer::get()
					     */
					    template<logger::concepts::Level T_LogLvl, typename T_Writer = StdErr>
					    struct Scoped
					    {
					    public:
					        /** Write the scope entry message and start the time measurement
					         *
					         * @param logLvl log level which provides the name for the prefix
					         * @param location source location, its function name is written to the output
					         */
					        Scoped(T_LogLvl logLvl, std::source_location const& location)
					            : m_functionName{adjDetails(location.function_name())}
					            , m_prefix{std::string("[") + adjStringLength(logLvl.getName(), 6) + "]"}
					            , m_startTime{Clock::now()}
					            , m_writer{T_Writer::get()}
					        {
					            m_writer << m_prefix << "[+]";
					            indent(m_writer, m_writer.enter());
					            m_writer << m_functionName << std::endl;
					        }

					        /** Create a silent instance which produces no output, neither on construction nor on destruction */
					        Scoped(T_LogLvl logLvl) : m_writer{T_Writer::get()}, m_enableOutput{false}
					        {
					            alpaka::unused(logLvl);
					        }

					        Scoped(Scoped const&) = delete;
					        Scoped(Scoped&&) = delete;
					        Scoped& operator=(Scoped const&) = delete;
					        Scoped& operator=(Scoped&&) = delete;

					        /** Write the scope exit message including the elapsed time, if output is enabled */
					        ~Scoped()
					        {
					            if(m_enableOutput)
					            {
					                auto const endTime = Clock::now();
					                double const durationInMs
					                    = std::chrono::duration<double, std::milli>(endTime - m_startTime).count();

					                m_writer << m_prefix << "[-]";
					                indent(m_writer, m_writer.leave());
					                m_writer << m_functionName << " " << durationInMs << " ms" << std::endl;
					            }
					        }

					    private:
					        // A monotonic clock is required to measure an interval: std::chrono::high_resolution_clock may be an alias
					        // of the system clock, which can be adjusted while the scope is active.
					        using Clock = std::chrono::steady_clock;

					        std::string m_functionName;
					        std::string m_prefix;
					        Clock::time_point m_startTime;
					        T_Writer& m_writer;
					        bool m_enableOutput = true;
					    };

					    /** Write a meta data message to the output
					     *
					     * The whole message is written within the constructor, the object itself carries no state.
					     *
					     * @tparam T_LogLvl log level, its name is used as prefix of the message
					     * @tparam T_Callable callable without arguments which provides a string which should be written to the output
					     * @tparam T_Writer writer the output is sent to, it is obtained via T_Writer::get()
					     */
					    template<logger::concepts::Level T_LogLvl, typename T_Callable, typename T_Writer = StdErr>
					    requires(std::is_invocable_r_v<std::string, T_Callable>)
					    struct Info
					    {
					    public:
					        /** Write the message
					         *
					         * @param logLvl log level which provides the name for the prefix
					         * @param callable is invoked once to obtain the message text
					         * @param location source location, function name, file name and line are appended to the message
					         */
					        Info(T_LogLvl logLvl, T_Callable const& callable, std::source_location const& location)
					        {
					            std::string const levelTag = std::string("[") + adjStringLength(logLvl.getName(), 6) + "]";
					            // three blanks, the same width as the "[+]" and "[-]" markers written by Scoped
					            std::string const markerGap(3, ' ');

					            auto& out = T_Writer::get();
					            out << levelTag << markerGap;
					            indent(out, out.current());
					            out << callable() << " " << adjDetails(location.function_name()) << " " << location.file_name() << ":"
					                << location.line() << std::endl;
					        }

					        Info(Info const&) = delete;
					        Info(Info&&) = delete;
					        Info& operator=(Info const&) = delete;
					        Info& operator=(Info&&) = delete;

					        ~Info() = default;
					    };
					} // namespace alpaka::onHost::logger::internal
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/internal/logger.hpp ==
					// ============================================================================

				// #include "alpaka/onHost/logger/lvl.hpp"    // amalgamate: file already inlined

				// #include <mutex>    // amalgamate: file already included
				#include <source_location>

				namespace alpaka::onHost::logger
				{
				    /** Log the entry and exit of a scope
				     *
				     * @attention It is suggested to use the logger macro ALPAKA_LOG_FUNCTION to speedup the compile time.
				     * For cases where logging is disabled the compiler does not need to register the C++ function signature.
				     *
				     * The time spend within the scope is added to the output as additional information, in milliseconds.
				     *
				     * With ALPAKA_LOG_STATIC the level is checked at compile time against ALPAKA_LOG_STATIC_LVL_MASK. With
				     * ALPAKA_LOG_DYNAMIC the mask is read once from the environment variable ALPAKA_LOG_DYNAMIC_LVL (decimal
				     * number). The variable is parsed without throwing: if it does not start with a number the mask stays zero
				     * (nothing is logged), a value which is too large saturates to the largest representable mask. If none of
				     * both defines is set the function has no return statement and therefore returns void.
				     *
				     * @param logLvl log level or a sum of log levels
				     * @param location source location of the scope, by default the location of the caller
				     * @return scope guard which writes the exit message on destruction, or a silent guard if the level is disabled
				     */
				    inline auto scope(
				        concepts::Level auto logLvl,
				        std::source_location const& location = std::source_location::current())
				    {
				        alpaka::unused(logLvl, location);
				#if defined(ALPAKA_LOG_STATIC)
				        // compare explicitly with zero, the condition of `if constexpr` must not rely on a narrowing conversion
				        if constexpr((logLvl.mask() & ALPAKA_LOG_STATIC_LVL_MASK) != 0u)
				            return internal::Scoped{logLvl, location};
				        else
				            return internal::Scoped{logLvl};
				#elif defined(ALPAKA_LOG_DYNAMIC)
				        static std::once_flag flag;
				        static size_t envLogMask = 0;

				        std::call_once(
				            flag,
				            []()
				            {
				                if(char const* envStr = std::getenv("ALPAKA_LOG_DYNAMIC_LVL"))
				                {
				                    // std::strtoull reports errors without exceptions. Logging is also used inside destructors and
				                    // noexcept functions where an exception caused by a malformed variable would terminate the
				                    // program.
				                    char* parseEnd = nullptr;
				                    auto const parsedMask = std::strtoull(envStr, &parseEnd, 10);
				                    if(parseEnd != envStr)
				                        envLogMask = static_cast<size_t>(parsedMask);
				                }
				            });

				        if((logLvl.mask() & envLogMask) != 0u)
				            return internal::Scoped{logLvl, location};
				        else
				            return internal::Scoped{logLvl};
				#endif
				    }

				    /** Write a meta data message to the output
				     *
				     * @attention It is suggested to use the logger macro ALPAKA_LOG_INFO to speedup the compile time.
				     * For cases where logging is disabled the compiler does not need to register the C++ function signature.
				     *
				     * With ALPAKA_LOG_STATIC the level is checked at compile time against ALPAKA_LOG_STATIC_LVL_MASK. With
				     * ALPAKA_LOG_DYNAMIC the mask is read once from the environment variable ALPAKA_LOG_DYNAMIC_LVL (decimal
				     * number). The variable is parsed without throwing: if it does not start with a number the mask stays zero
				     * (nothing is logged), a value which is too large saturates to the largest representable mask. If none of
				     * both defines is set nothing is written.
				     *
				     * @param logLvl log level or a sum of log levels
				     * @param callable callable without arguments which provides a string which should be written to the output
				     * @param location source location of the message, by default the location of the caller
				     */
				    inline void info(
				        concepts::Level auto logLvl,
				        auto const& callable,
				        std::source_location const& location = std::source_location::current())
				    {
				        alpaka::unused(logLvl, callable, location);
				#if defined(ALPAKA_LOG_STATIC)
				        // compare explicitly with zero, the condition of `if constexpr` must not rely on a narrowing conversion
				        if constexpr((logLvl.mask() & ALPAKA_LOG_STATIC_LVL_MASK) != 0u)
				            internal::Info{logLvl, callable, location};
				#elif defined(ALPAKA_LOG_DYNAMIC)
				        static std::once_flag flag;
				        static size_t envLogMask = 0;

				        std::call_once(
				            flag,
				            []()
				            {
				                if(char const* envStr = std::getenv("ALPAKA_LOG_DYNAMIC_LVL"))
				                {
				                    // std::strtoull reports errors without exceptions. Logging is also used inside destructors and
				                    // noexcept functions where an exception caused by a malformed variable would terminate the
				                    // program.
				                    char* parseEnd = nullptr;
				                    auto const parsedMask = std::strtoull(envStr, &parseEnd, 10);
				                    if(parseEnd != envStr)
				                        envLogMask = static_cast<size_t>(parsedMask);
				                }
				            });
				        if((logLvl.mask() & envLogMask) != 0u)
				            internal::Info{logLvl, callable, location};
				#endif
				    }
				} // namespace alpaka::onHost::logger

				/** Log the entry and exit of a scope
				 *
				 * If ALPAKA_ENABLE_LOG_FUNCTIONS is defined, the macro declares a local variable which holds the result of
				 * alpaka::onHost::logger::scope(). The variable has a fixed name, therefore the macro can be used only once
				 * per scope. Without the define the macro expands to the expression `void()` which has no effect.
				 *
				 * @param logLvl log level or a sum of log levels
				 */
				#if defined(ALPAKA_ENABLE_LOG_FUNCTIONS)
				#    define ALPAKA_LOG_FUNCTION(logLvl)                                                                               \
				        [[maybe_unused]] auto const __alpaka_log_scope = ::alpaka::onHost::logger::scope(logLvl)
				#else
				#    define ALPAKA_LOG_FUNCTION(logLvl) void()
				#endif

				/** Write a meta data message to the output
				 *
				 * If ALPAKA_ENABLE_LOG_INFO is defined, the macro forwards its arguments to alpaka::onHost::logger::info().
				 * Without the define the macro expands to the expression `void()`, the arguments are dropped by the
				 * preprocessor and never evaluated.
				 *
				 * @param logLvl log level or a sum of log levels
				 * @param callable callable without arguments which provides a string which should be written to the output
				 */
				#if defined(ALPAKA_ENABLE_LOG_INFO)
				#    define ALPAKA_LOG_INFO(logLvl, callable) ::alpaka::onHost::logger::info(logLvl, callable)
				#else
				#    define ALPAKA_LOG_INFO(logLvl, callable) void()
				#endif
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/logger/logger.hpp ==
				// ============================================================================


			// #include <cstdint>    // amalgamate: file already included
			#include <cstring>
			#include <future>
			// #include <sstream>    // amalgamate: file already included

			namespace alpaka::onHost
			{
			    namespace cpu
			    {
			        /** Event of the host API
			         *
			         * The event stores a handle to its device, an index and the state required to wait for its completion
			         * (mutex, shared future and two enqueue counters). It can neither be copied nor moved. Most of the
			         * functionality is private and only reachable by the functors declared as friends.
			         *
			         * @tparam T_Device type of the device the event belongs to
			         */
			        template<typename T_Device>
			        struct Event : std::enable_shared_from_this<Event<T_Device>>
			        {
			        public:
			            /** Create an event
			             *
			             * @param device handle of the device the event belongs to
			             * @param idx index of the event, it is part of the name and of the equality comparison
			             */
			            Event(internal::concepts::DeviceHandle auto device, uint32_t const idx)
			                : m_device(std::move(device))
			                , m_idx(idx)
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::event);
			            }

			            /** Calls internal::wait() for this event before the members are destroyed.
			             *
			             * NOTE(review): internal::wait() presumably dispatches to the private wait() method through the
			             * friend internal::Wait -- confirm.
			             */
			            ~Event()
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::event);
			                internal::wait(*this);
			            }

			            Event(Event const&) = delete;
			            Event& operator=(Event const&) = delete;

			            Event(Event&&) = delete;
			            Event& operator=(Event&&) = delete;

			            /** Two events are equal if their index and their device handle compare equal. */
			            bool operator==(Event const& other) const
			            {
			                return m_idx == other.m_idx && m_device == other.m_device;
			            }

			            bool operator!=(Event const& other) const
			            {
			                return !(*this == other);
			            }

			        private:
			            //! Handle of the device the event belongs to.
			            Handle<T_Device> m_device;
			            //! Index of the event.
			            uint32_t m_idx = 0u;

			            //! The mutex used to synchronize access to the event.
			            std::mutex mutable m_mutex;
			            //! The future signaling the event completion.
			            std::shared_future<void> m_future;
			            //! The number of times this event has been enqueued.
			            std::size_t m_enqueueCount = 0u;
			            //! The time this event has been ready the last time.
			            //! Ready means that the event was not waiting within a queue
			            //! (not enqueued or already completed). If m_enqueueCount ==
			            //! m_LastReadyEnqueueCount, the event is currently not enqueued
			            std::size_t m_LastReadyEnqueueCount = 0u;

			            friend struct alpaka::internal::GetName;

			            /** @return the text "host::Event id=" followed by the index of the event */
			            std::string getName() const
			            {
			                return std::string("host::Event id=") + std::to_string(m_idx);
			            }

			            friend struct internal::GetNativeHandle;
			            friend struct internal::Enqueue;
			            friend struct alpaka::internal::GetDeviceType;

			            /** @return the device kind reported by alpaka::internal::getDeviceKind() for the stored device */
			            auto getDeviceKind() const
			            {
			                return alpaka::internal::getDeviceKind(*m_device.get());
			            }

			            /** @return a copy of the stored device handle */
			            auto getDevice() const
			            {
			                return m_device;
			            }

			            /** @return shared pointer to this event, obtained via shared_from_this() */
			            std::shared_ptr<Event> getSharedPtr()
			            {
			                return this->shared_from_this();
			            }

			            friend struct onHost::internal::GetDevice;

			            friend struct internal::IsEventComplete;

			            /** Check if the event is ready.
			             *
			             * @attention Do not call this method without holding the event lock.
			             *
			             * @return true if the event is ready, false otherwise
			             */
			            bool isReady() noexcept
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::event);
			                return (m_LastReadyEnqueueCount == m_enqueueCount);
			            }

			            /** Check if the event is complete.
			             *
			             * @attention Should not be called if the event lock is acquired, because it could lead to a deadlock.
			             *
			             * @return true if the event is complete, false otherwise
			             */
			            bool isEventComplete() noexcept
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::event);
			                // isReady() requires the lock to be held
			                std::lock_guard<std::mutex> lk(m_mutex);
			                return isReady();
			            }

			            friend struct internal::WaitFor;
			            friend struct internal::Wait;

			            /** Block until the enqueue which was the latest at the time of the call is marked as ready.
			             *
			             * The enqueue counter is read once under the lock. Enqueues which happen afterwards raise
			             * m_enqueueCount but do not extend the wait.
			             */
			            void wait()
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::event);
			                std::unique_lock<std::mutex> lk(m_mutex);
			                size_t enqueueCount = m_enqueueCount;

			                while(enqueueCount > m_LastReadyEnqueueCount)
			                {
			                    // copy the future while the lock is held, then block on the copy without holding the lock
			                    auto future = m_future;
			                    lk.unlock();
			                    future.get();
			                    // the loop condition reads a member, so the lock must be held again
			                    lk.lock();
			                }
			            }

			            friend struct alpaka::internal::GetApi;
			        };
			    } // namespace cpu
			} // namespace alpaka::onHost

			namespace alpaka::internal
			{
			    /** GetApi implementation for the host event
			     *
			     * The API is queried from the device handle stored in the event.
			     */
			    template<typename T_Device>
			    struct GetApi::Op<onHost::cpu::Event<T_Device>>
			    {
			        constexpr auto operator()(auto&& hostEvent) const
			        {
			            return alpaka::getApi(hostEvent.m_device);
			        }
			    };
			} // namespace alpaka::internal
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/Event.hpp ==
			// ============================================================================

			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/Queue.hpp ==
			// ==
			/* Copyright 2024 René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/generic.hpp ==
				// ==
				/* Copyright 2025 René Widera, Mehmet Yusufoglu
				 * SPDX-License-Identifier: MPL-2.0
				 */


				// #pragma once
				// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
				// #include "alpaka/math/internal/ieee754.hpp"    // amalgamate: file already inlined
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/SimdAlgo.hpp ==
					// ==
					/* Copyright 2024 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
					// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
					// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/IDataStorage.hpp ==
						// ==
						/* Copyright 2025 Simeon Ehrig
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/mem/concepts/IBuffer.hpp"    // amalgamate: file already inlined
						// #include "alpaka/mem/concepts/IDataSource.hpp"    // amalgamate: file already inlined
						// #include "alpaka/mem/concepts/IMdSpan.hpp"    // amalgamate: file already inlined
						// #include "alpaka/mem/concepts/IView.hpp"    // amalgamate: file already inlined
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/IDataStorage.hpp ==
						// ============================================================================

						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/SimdConcurrent.hpp ==
						// ==
						/* Copyright 2024 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/Simd.hpp"    // amalgamate: file already inlined
							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/SimdPtr.hpp ==
							// ==
							/* Copyright 2025 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							// #include "alpaka/Simd.hpp"    // amalgamate: file already inlined
							// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
							// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
							// #include "alpaka/mem/Alignment.hpp"    // amalgamate: file already inlined
							// #include "alpaka/mem/concepts/IMdSpan.hpp"    // amalgamate: file already inlined
								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/IndexVec.hpp ==
								// ==
								/* Copyright 2025 Simeon Ehrig
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
								// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined

								namespace alpaka::concepts
								{
								    /** Check whether the specified type is a multidimensional index.
								     *
								     * @details The type must fulfill alpaka::concepts::Vector, and its type must be convertible to an expected index
								     * type without loss of precision.
								     *
								     * If you observe that nvcc segfaults during compile, and you used this concept in the function signature, replace
								     * it with a static assert inside the function body. see SimdPtr::operator[]().
								     *
								     * @tparam T type to check
								     * @tparam T_IndexType expected index type
								     * @tparam T_dim expected dimension
								     */
								    template<typename T, typename T_IndexType, uint32_t T_dim>
								    concept IndexVec = requires {
								        // the value type of the vector is not restricted here, only the dimension must match
								        requires concepts::Vector<T, alpaka::NotRequired, T_dim>;
								        // the component type of T is checked against the expected index type
								        requires isLosslesslyConvertible_v<typename T::type, T_IndexType>;
								    };
								} // namespace alpaka::concepts
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/IndexVec.hpp ==
								// ============================================================================

							// #include "alpaka/trait.hpp"    // amalgamate: file already inlined

							// #include <concepts>    // amalgamate: file already included
							// #include <cstdint>    // amalgamate: file already included
							#include <type_traits>

							namespace alpaka
							{
							    namespace trait
							    {
							        /** Type trait to detect SIMD pointer types
							         *
							         * The primary template is false for every type.
							         *
							         * @tparam T type to check
							         */
							        template<typename T>
							        struct IsSimdPtr : std::bool_constant<false>
							        {
							        };
							    } // namespace trait

							    /** Shortcut for trait::IsSimdPtr<T>::value
							     *
							     * @tparam T type to check
							     */
							    template<typename T>
							    constexpr bool isSimdPtr_v = trait::IsSimdPtr<T>::value;

							    namespace concepts
							    {
							        /** Concept to check if a type is a SIMD pointer
							         *
							         * The three conditions are evaluated from left to right: T::width() is only evaluated if isSimdPtr_v<T> is
							         * true, the value type matches or is not required, and a width is requested.
							         *
							         * NOTE(review): the value type check uses std::decay_t<T> while the width check uses T directly --
							         * confirm whether reference types are expected to satisfy the concept when a width is requested.
							         *
							         * @tparam T Type to check
							         * @tparam T_ValueType enforce a value type of the SIMD pointer, if not provided the value type is not checked
							         * @tparam T_width enforce lane width of the SIMD pointer, if not provided the value is not checked
							         */
							        template<typename T, typename T_ValueType = alpaka::NotRequired, uint32_t T_width = alpaka::notRequiredWidth>
							        concept SimdPtr = isSimdPtr_v<T>
							                          && (std::same_as<T_ValueType, trait::GetValueType_t<std::decay_t<T>>>
							                              || std::same_as<T_ValueType, alpaka::NotRequired>)
							                          && ((T_width == alpaka::notRequiredWidth) || (T::width() == T_width));
							    } // namespace concepts

							    /** pointer to a SIMD pack with the width T_SimdWidth
							     *
							     * The pointer is used to load/store data from/to memory
							     *
							     * @tparam T_MdSpan type of the memory the pointer is pointing to
							     * @tparam T_IdxType type of the index
							     * @tparam T_MemAlignment alignment of the memory the pointer is pointing to
							     * @tparam T_SimdWidth width of the SIMD pack
							     */
							    template<
							        typename T_MdSpan,
							        alpaka::concepts::Vector T_IdxType,
							        alpaka::concepts::Alignment T_MemAlignment,
							        alpaka::concepts::CVector T_SimdWidth>
							    struct SimdPtr : private T_MdSpan
							    {
							        using value_type = typename T_MdSpan::value_type;
							        using IdxType = typename T_IdxType::UniVec;

							        static consteval uint32_t width()
							        {
							            return T_SimdWidth{}.back();
							        }

							        constexpr SimdPtr(T_MdSpan const& mdSpan, T_IdxType const& idx, T_MemAlignment, T_SimdWidth)
							            : T_MdSpan(mdSpan)
							            , m_idx(idx)
							        {
							        }

							        /** Shift the element the pointer is pointing to by idx
							         *
							         * @param idx number of elements to shift the pointer by
							         * @return a new simd pointer pointing to the shifted element
							         *
							         * @{
							         */
							        constexpr alpaka::concepts::SimdPtr auto operator[](auto const& idx) const
							        {
							            /* Do not use concepts::IndexVec as concept in the function signature else nvcc (tested 12.X -> 13.0)
							             * segfaults during compile.
							             */
							            static_assert(
							                alpaka::concepts::IndexVec<ALPAKA_TYPEOF(idx), typename IdxType::type, T_MdSpan::dim()>,
							                "The dimension of idx must match the encapsulated MdSpan dimension and the index type of idx must be "
							                "lossless castable to the MdSpan index type");
							            constexpr uint32_t valueAlignment = static_cast<uint32_t>(alignof(value_type));
							            constexpr auto align = Alignment<valueAlignment>{};
							            return SimdPtr<T_MdSpan, IdxType, ALPAKA_TYPEOF(align), T_SimdWidth>{
							                static_cast<T_MdSpan>(*this),
							                idx + m_idx,
							                align,
							                T_SimdWidth{}};
							        }

							        constexpr alpaka::concepts::SimdPtr auto operator[](auto const& idx)
							        {
							            /* Do not use concepts::IndexVec as concept in the function signature else nvcc (tested 12.X -> 13.0)
							             * segfaults during compile.
							             */
							            static_assert(
							                alpaka::concepts::IndexVec<ALPAKA_TYPEOF(idx), typename IdxType::type, T_MdSpan::dim()>,
							                "The dimension of idx must match the encapsulated MdSpan dimension and the index type of idx must be "
							                "lossless castable to the MdSpan index type");
							            constexpr uint32_t valueAlignment = static_cast<uint32_t>(alignof(value_type));
							            constexpr auto align = Alignment<valueAlignment>{};
							            return SimdPtr<T_MdSpan, IdxType, ALPAKA_TYPEOF(align), T_SimdWidth>{
							                static_cast<T_MdSpan>(*this),
							                idx + m_idx,
							                align,
							                T_SimdWidth{}};
							        }

							        /** @} */

							        constexpr decltype(auto) load() const
							        {
							            return internal::loadAsSimd<width()>(static_cast<T_MdSpan const&>(*this), getAlignment(), m_idx);
							        }

							        constexpr decltype(auto) load()
							        {
							            return internal::loadAsSimd<width()>(static_cast<T_MdSpan&>(*this), getAlignment(), m_idx);
							        }

							        /** get the alignment of the memory the pointer is pointing to
							         *
							         * @attention If the pointer is shifted by `operator[]` the alignment is equal to the data alignment of an
							         * single element
							         *
							         * @return the alignment of the memory (in byte) the pointer is pointing to
							         */
							        static constexpr auto getAlignment()
							        {
							            using SpanElemType = typename T_MdSpan::value_type;
							            constexpr uint32_t spanAlignment = T_MdSpan::getAlignment().template get<SpanElemType>();
							            using MemoryAlignment = std::conditional_t<
							                std::is_same_v<AutoAligned, T_MemAlignment>,
							                Alignment<spanAlignment>,
							                Alignment<std::min(T_MemAlignment::template get<SpanElemType>(), spanAlignment)>>;
							            return MemoryAlignment{};
							        }

							        /** store the simd pack to the memory the pointer is pointing to
							         *
							         * @param rhs simd pack to store
							         *
							         * @{
							         */
							        template<typename T_Storage>
							        constexpr void storeTo(Simd<value_type, SimdPtr::width(), T_Storage> const& rhs) const
							        {
							            auto* ptr = &T_MdSpan::operator[](m_idx);

							            rhs.copyTo(ptr, getAlignment());
							        }

							        template<typename T_Storage>
							        constexpr void storeTo(Simd<value_type, SimdPtr::width(), T_Storage> const& rhs)
							        {
							            auto* ptr = &T_MdSpan::operator[](m_idx);
							            rhs.copyTo(ptr, getAlignment());
							        }

							        template<typename T_Storage>
							        constexpr SimdPtr const& operator=(Simd<value_type, SimdPtr::width(), T_Storage> const& rhs) const
							        {
							            // Assignment is only syntactic sugar for storeTo(); the pointer itself is not modified.
							            this->storeTo(rhs);
							            return *this;
							        }

							        template<typename T_Storage>
							        constexpr SimdPtr& operator=(Simd<value_type, SimdPtr::width(), T_Storage> const& rhs)
							        {
							            // Forward to the non-const storeTo(); returns the pointer to allow chaining.
							            this->storeTo(rhs);
							            return *this;
							        }

							        /** @} */

							        /** offset in elements relative to the MdSpan given at construction
							         *
							         * The index points to the first element followed by T_SimdWidth elements.
							         *
							         * @return the index of the first element relative to the MdSpan given at construction
							         */
							        constexpr IdxType getIdx() const
							        {
							            // Returned by value; the stored index cannot be changed through this accessor.
							            return this->m_idx;
							        }

							    private:
							        IdxType m_idx;
							    };

							    namespace internal
							    {
							        /** LoadAsSimd implementation for data sources fulfilling the IMdSpan concept.
							         *
							         * load() creates a Simd pack with T_simdWidth lanes and fills it via `copyFrom()` starting at the
							         * address of the element `dataSource[idx]`.
							         */
							        template<
							            alpaka::concepts::IMdSpan T_MdSpan,
							            alpaka::concepts::Alignment T_MdSpanAlignment,
							            alpaka::concepts::Vector T_Idx>
							        struct LoadAsSimd::Op<T_MdSpan, T_MdSpanAlignment, T_Idx>
							        {
							            template<uint32_t T_simdWidth>
							            constexpr auto load(auto&& dataSource, T_MdSpanAlignment alignment, T_Idx const& idx) const
							            {
							                static_assert(
							                    std::is_same_v<T_MdSpan, ALPAKA_TYPEOF(dataSource)>,
							                    "Data source type must match the class template signature.");

							                // First element of the pack; its address is the source address handed to copyFrom().
							                auto&& firstElem = dataSource[idx];
							                using ElemType = std::remove_reference_t<decltype(firstElem)>;
							                using PackType = Simd<std::decay_t<ElemType>, T_simdWidth>;
							                using DstType = std::conditional_t<std::is_const_v<ElemType>, PackType const, PackType>;

							                alpaka::concepts::Simd auto pack = DstType{};
							                pack.copyFrom(&firstElem, alignment);
							                return pack;
							            }
							        };
							    } // namespace internal

							    namespace trait
							    {
							        /** Every specialization of the SimdPtr template is reported as a simd pointer. */
							        template<typename T>
							        requires(isSpecializationOf_v<T, SimdPtr>)
							        struct IsSimdPtr<T> : std::bool_constant<true>
							        {
							        };
							    } // namespace trait
							} // namespace alpaka
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/SimdPtr.hpp ==
							// ============================================================================

						// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
						// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
						// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
						// #include "alpaka/mem/concepts/IDataSource.hpp"    // amalgamate: file already inlined
						// #include "alpaka/mem/concepts/IDataStorage.hpp"    // amalgamate: file already inlined
							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/WorkerGroup.hpp ==
							// ==
							/* Copyright 2024 Andrea Bocci, René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/ThreadSpace.hpp ==
								// ==
								/* Copyright 2024 Andrea Bocci, René Widera
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
								// #include "alpaka/CVec.hpp"    // amalgamate: file already inlined
								// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
								// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

								// #include <cstdint>    // amalgamate: file already included

								namespace alpaka
								{
								    /** Pairs the index of a thread with the number of threads of the group the index is relative to.
								     *
								     * @tparam T_ThreadIdx vector type of the thread index
								     * @tparam T_ThreadCount vector type of the thread count
								     */
								    template<concepts::Vector T_ThreadIdx, concepts::Vector T_ThreadCount>
								    struct ThreadSpace
								    {
								        constexpr ThreadSpace(T_ThreadIdx const& threadIdx, T_ThreadCount const& threadCount)
								            : m_threadIdx(threadIdx)
								            , m_threadCount(threadCount)
								        {
								        }

								        /** Create a textual representation `<begin><threadIdx><separator><threadCount><end>`
								         *
								         * @param separator string inserted between the thread index and the thread count
								         * @param enclosings the first character is used as opening and the second as closing enclosing; if only
								         * one character is given it is used for both, an empty string disables the enclosing
								         * @return the formatted string
								         */
								        std::string toString(std::string const separator = ",", std::string const enclosings = "{}") const
								        {
								            std::string locale_enclosing_begin;
								            std::string locale_enclosing_end;
								            size_t enclosing_dim = enclosings.size();

								            if(enclosing_dim > 0)
								            {
								                /* % avoid out of memory access */
								                locale_enclosing_begin = enclosings[0 % enclosing_dim];
								                locale_enclosing_end = enclosings[1 % enclosing_dim];
								            }

								            std::stringstream stream;
								            stream << locale_enclosing_begin;
								            stream << m_threadIdx << separator << m_threadCount;
								            stream << locale_enclosing_end;
								            return stream.str();
								        }

								        /** @return the number of threads */
								        constexpr auto size() const
								        {
								            return m_threadCount;
								        }

								        /** @return the index of the thread */
								        constexpr auto idx() const
								        {
								            return m_threadIdx;
								        }

								        /** Map the thread space to the selected dimensions
								         *
								         * Overload for a selection with as many dimensions as the thread space: nothing needs to be folded
								         * and the thread space is returned unchanged.
								         */
								        template<concepts::CVector T_CSelect>
								        constexpr ThreadSpace mapTo(T_CSelect selection) const requires(T_ThreadIdx::dim() <= T_CSelect::dim())
								        {
								            alpaka::unused(selection);
								            static_assert(T_ThreadIdx::dim() == T_CSelect::dim(), "can not map to a larger dimension");
								            return *this;
								        }

								        /** Map the thread space to the selected dimensions
								         *
								         * Overload for a selection with fewer dimensions than the thread space. Every dimension that is not
								         * selected is folded into the first selected dimension (`T_CSelect{}[0]`): its index is set to zero
								         * and its thread count to one, while index and count of the first selected dimension grow
								         * accordingly. The dimensionality of the result is unchanged.
								         *
								         * @return thread space with the same total number of threads where each thread keeps a unique index
								         */
								        template<concepts::CVector T_CSelect>
								        constexpr auto mapTo(T_CSelect selection) const requires(T_ThreadIdx::dim() > T_CSelect::dim())
								        {
								            alpaka::unused(selection);

								            using IdxType = typename T_ThreadIdx::type;
								            constexpr uint32_t dim = T_ThreadIdx::dim();

								            auto allElements = iotaCVec<IdxType, dim>();
								            constexpr auto notSelectedDims = filter(allElements, T_CSelect{});

								            // Transform into a universal vector because the input could be a CVec which is read only.
								            auto threadIndex = typename ALPAKA_TYPEOF(m_threadIdx)::UniVec{m_threadIdx};
								            auto numThreads = typename ALPAKA_TYPEOF(m_threadCount)::UniVec{m_threadCount};

								            // map not selected dimensions to the slowest selected dimension
								            for(uint32_t x = 0u; x < notSelectedDims.dim(); ++x)
								            {
								                auto d = notSelectedDims[x];

								                // The stride of the folded index is the extent accumulated so far in the target dimension.
								                auto oldIdx = threadIndex[d];
								                threadIndex[d] = 0u;
								                threadIndex[T_CSelect{}[0]] += oldIdx * numThreads[T_CSelect{}[0]];

								                // The extent must grow before the next dimension is folded, otherwise two folded
								                // dimensions would share the same stride and different threads would get equal indices.
								                auto oldCount = numThreads[d];
								                numThreads[d] = 1u;
								                numThreads[T_CSelect{}[0]] *= oldCount;
								            }

								            return ThreadSpace<ALPAKA_TYPEOF(threadIndex), ALPAKA_TYPEOF(numThreads)>{threadIndex, numThreads};
								        }

								        T_ThreadIdx m_threadIdx;
								        T_ThreadCount m_threadCount;

								        using type = typename T_ThreadIdx::type;
								    };

								    namespace internal
								    {
								        /** Precision cast of a ThreadSpace: the cast is applied to the thread index and the thread count. */
								        template<typename T_To, typename T_ThreadIdx, typename T_ThreadCount>
								        struct PCast::Op<T_To, ThreadSpace<T_ThreadIdx, T_ThreadCount>>
								        {
								            /** The value type is already T_To: the input is forwarded untouched. */
								            constexpr decltype(auto) operator()(auto&& input) const
								                requires std::same_as<T_To, typename T_ThreadIdx::type>
								            {
								                return std::forward<decltype(input)>(input);
								            }

								            /** The value type differs from T_To but is convertible: build a new ThreadSpace from the casted
								             * members.
								             */
								            constexpr auto operator()(auto&& input) const
								                requires std::convertible_to<typename T_ThreadIdx::type, T_To>
								                         && (!std::same_as<T_To, typename T_ThreadIdx::type>)
								            {
								                auto castedIdx = pCast<T_To>(input.m_threadIdx);
								                auto castedCount = pCast<T_To>(input.m_threadCount);
								                return ThreadSpace{castedIdx, castedCount};
								            }
								        };
								    } // namespace internal

								    /** Tuple-like read access to a ThreadSpace
								     *
								     * @tparam I 0 selects the thread index, 1 selects the thread count
								     * @return copy of the selected member
								     */
								    template<std::size_t I, typename T_ThreadIdx, typename T_ThreadCount>
								    constexpr auto get(alpaka::ThreadSpace<T_ThreadIdx, T_ThreadCount> const& v) requires(I < 2u)
								    {
								        if constexpr(I == 0u)
								            return v.m_threadIdx;
								        else
								            return v.m_threadCount;
								    }

								    /** Tuple-like mutable access to a ThreadSpace
								     *
								     * @tparam I 0 selects the thread index, 1 selects the thread count
								     * @return reference to the selected member
								     */
								    template<std::size_t I, typename T_ThreadIdx, typename T_ThreadCount>
								    constexpr auto& get(alpaka::ThreadSpace<T_ThreadIdx, T_ThreadCount>& v) requires(I < 2u)
								    {
								        if constexpr(I == 0u)
								            return v.m_threadIdx;
								        else
								            return v.m_threadCount;
								    }

								} // namespace alpaka

								namespace std
								{
								    /** A ThreadSpace is tuple-like with two elements: thread index and thread count.
								     *
								     * Derived from integral_constant so that the specialization provides the complete unary type trait
								     * interface (`value`, `value_type`, `type`, conversion operator) and not only `value`.
								     */
								    template<typename T_ThreadIdx, typename T_ThreadCount>
								    struct tuple_size<alpaka::ThreadSpace<T_ThreadIdx, T_ThreadCount>> : std::integral_constant<std::size_t, 2u>
								    {
								    };

								    /** Element type lookup: index 0 is the thread index type, index 1 the thread count type. */
								    template<std::size_t I, typename T_ThreadIdx, typename T_ThreadCount>
								    struct tuple_element<I, alpaka::ThreadSpace<T_ThreadIdx, T_ThreadCount>>
								    {
								        // Reject out-of-range indices instead of silently reporting the thread count type for them.
								        static_assert(I < 2u, "ThreadSpace has only two tuple elements (index 0 and 1)");

								        using type = std::conditional_t<I == 0u, T_ThreadIdx, T_ThreadCount>;
								    };
								} // namespace std
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/ThreadSpace.hpp ==
								// ============================================================================

							// #include "alpaka/onAcc/internal/interface.hpp"    // amalgamate: file already inlined
							// #include "alpaka/onAcc/tag.hpp"    // amalgamate: file already inlined
							// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

							// #include <cstdint>    // amalgamate: file already included

							namespace alpaka::onAcc
							{
							    /** Compile-time flag type; `value` equals the template argument, which defaults to true. */
							    template<bool T_multiDimensional = true>
							    struct MultiDimensional : std::integral_constant<bool, T_multiDimensional>
							    {
							    };

							    constexpr auto linearized = MultiDimensional<false>{};

							    template<
							        typename T_ThreadIdxOrOrigin,
							        typename T_NumThreadsOrUnit,
							        typename T_MultiDimensional = MultiDimensional<true>>
							    struct WorkerGroup
							    {
							        /** WorkerGroup constructor
							         *
							         * @param threadIdxOrOrigin the index of the thread or onAcc::origin
							         * @param numThreadsOrUnit the number of threads or the onAcc::unit
							         * @param multiDimensional keep the dimensionality for both input parameters, if 'linearized' is used the
							         * workgroup will be reduced to a one dimensional group.
							         */
							        constexpr WorkerGroup(
							            T_ThreadIdxOrOrigin threadIdxOrOrigin,
							            T_NumThreadsOrUnit numThreadsOrUnit,
							            T_MultiDimensional = MultiDimensional<true>{})
							            : m_threadIdxOrOrigin{threadIdxOrOrigin}
							            , m_numThreadsOrUnit{numThreadsOrUnit}
							        {
							        }

							        constexpr auto size(auto const& acc) const
							        {
							            return getThreadSpace(acc).size();
							        }

							        constexpr auto idx(auto const& acc) const
							        {
							            return getThreadSpace(acc).idx();
							        }

							    private:
							        template<typename T_ThreadGroup, typename T_ThreadIdxOrOriginRange>
							        friend struct DomainSpec;

							        /** get the thread configuration
							         *
							         * Implementation specialization for vectors.
							         */
							        constexpr auto getThreadSpace([[maybe_unused]] auto const& acc) const
							            requires(isVector_v<T_ThreadIdxOrOrigin> && isVector_v<T_NumThreadsOrUnit>)
							        {
							            if constexpr(T_MultiDimensional::value == false)
							                return ThreadSpace{
							                    Vec{linearize(m_numThreadsOrUnit, m_threadIdxOrOrigin)},
							                    Vec{m_numThreadsOrUnit.product()}};
							            else
							                return ThreadSpace{m_threadIdxOrOrigin, m_numThreadsOrUnit};
							        }

							        /** get the thread configuration
							         *
							         * Implementation specialization for lazy evaluated acc properties based on an origin and unit.
							         */
							        constexpr auto getThreadSpace(auto const& acc) const
							            requires(isOrigin_v<T_ThreadIdxOrOrigin> && isUnit_v<T_NumThreadsOrUnit>)
							        {
							            auto const idx
							                = internalCompute::GetIdxWithin::Op<ALPAKA_TYPEOF(acc), T_ThreadIdxOrOrigin, T_NumThreadsOrUnit>{}(
							                    acc,
							                    m_threadIdxOrOrigin,
							                    m_numThreadsOrUnit);
							            auto const extent
							                = internalCompute::GetExtentsOf::Op<ALPAKA_TYPEOF(acc), T_ThreadIdxOrOrigin, T_NumThreadsOrUnit>{}(
							                    acc,
							                    m_threadIdxOrOrigin,
							                    m_numThreadsOrUnit);

							            if constexpr(T_MultiDimensional::value == false)
							                return ThreadSpace{Vec{linearize(extent, idx)}, Vec{extent.product()}};
							            else
							                return ThreadSpace{idx, extent};
							        }

							    private:
							        T_ThreadIdxOrOrigin m_threadIdxOrOrigin;
							        T_NumThreadsOrUnit m_numThreadsOrUnit;
							    };

							    /** Predefined worker groups built from an origin and a unit.
							     *
							     * Groups created without the `linearized` tag use the default MultiDimensional<true> mode of WorkerGroup.
							     */
							    namespace worker
							    {
							        /** Worker group created from origin::grid and unit::threads (default multi-dimensional mode) */
							        constexpr auto threadsInGrid = WorkerGroup{origin::grid, unit::threads};
							        /** Worker group created from origin::grid and unit::blocks (default multi-dimensional mode) */
							        constexpr auto blocksInGrid = WorkerGroup{origin::grid, unit::blocks};
							        /** Worker group created from origin::block and unit::threads (default multi-dimensional mode) */
							        constexpr auto threadsInBlock = WorkerGroup{origin::block, unit::threads};

							        /** Representation of all threads in a warp as a linearized worker group
							         *
							         * NOTE(review): no `linearized` tag is passed here, unlike linearThreadsInBlock; presumably the warp
							         * index space is already one dimensional -- confirm.
							         */
							        constexpr auto linearThreadsInWarp = WorkerGroup{origin::warp, unit::threads};
							        /** Same origin and unit as threadsInBlock, but created with the `linearized` tag */
							        constexpr auto linearThreadsInBlock = WorkerGroup{origin::block, unit::threads, linearized};
							        /** Same origin and unit as threadsInGrid, but created with the `linearized` tag */
							        constexpr auto linearThreadsInGrid = WorkerGroup{origin::grid, unit::threads, linearized};

							        /** Representation of all warps in a thread block as a linearized worker group
							         *
							         * @attention If the number of threads in a block is not a multiple of the warp size you can have partial
							         * warps.
							         */
							        constexpr auto linearWarpsInBlock = WorkerGroup{origin::block, unit::warps};
							        /** Representation of all warps in the grid as a linearized worker group
							         *
							         * @attention Since a thread block is not required to have as many threads as a warp has, you can not
							         * assume that number of warps * warp size is the total number of threads.
							         */
							        constexpr auto linearWarpsInGrid = WorkerGroup{origin::grid, unit::warps};

							        /** Same origin and unit as blocksInGrid, but created with the `linearized` tag */
							        constexpr auto linearBlocksInGrid = WorkerGroup{origin::grid, unit::blocks, linearized};

							        /** Represent the identity of the executor thread.
							         *
							         * All threads are in the same worker group.
							         * If used with onAcc::makeIdxMap(), any thread is getting all indices of the range.
							         */
							        constexpr auto allThreads = WorkerGroup{origin::thread, unit::threads};
							    } // namespace worker

							} // namespace alpaka::onAcc
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/WorkerGroup.hpp ==
							// ============================================================================

							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/interface.hpp ==
							// ==
							/* Copyright 2024 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							/** @file
							 *
							 * On some constexpr function signatures `ALPAKA_FN_HOST_ACC` is required for CUDA;
							 * otherwise a `__host__` function called from a `__host__ __device__` context
							 * triggers a warning and the generated code is wrong.
							 */

							// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
							// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined
								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/BoundaryIter.hpp ==
								// ==
								/* Copyright 2025 Anton Reinhard
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
								// #include "alpaka/CVec.hpp"    // amalgamate: file already inlined
								// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
								// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
								// #include "alpaka/api/host/Api.hpp"    // amalgamate: file already inlined
								// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined
								// #include "alpaka/core/Assert.hpp"    // amalgamate: file already inlined
								// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

								// #include <ostream>    // amalgamate: file already included

								namespace alpaka
								{

								    /**
								     * @brief An enum representing the different types of boundary, with LOWER, MIDDLE, and UPPER being valid states,
								     * and OOB being invalid (out-of-bounds).
								     */
								    enum class BoundaryType : uint32_t
								    {
								        LOWER = 0u, ///< lower end of a dimension
								        MIDDLE = 1u, ///< between lower and upper end of a dimension
								        UPPER = 2u, ///< upper end of a dimension
								        OOB = 3u ///< invalid state (out-of-bounds)
								    };

								    /**
								     * @brief An n-dimensional boundary direction. Encodes a single unique boundary of an nD volume, e.g., a specific
								     * corner of a 2D plane or a side of a 3D cube.
								     *
								     * See also: @ref BoundaryDirectionsContainer
								     *
								     * @tparam T_dim The dimensionality of the volume that this is a boundary direction for.
								     * @tparam T_LowHaloVec The vector type used for the lower halo sizes.
								     * @tparam T_UpHaloVec The vector type used for the upper halo sizes.
								     */
								    template<uint32_t T_dim, concepts::Vector T_LowHaloVec, concepts::Vector T_UpHaloVec>
								    struct BoundaryDirection
								    {
								        using T_BoundaryVec = Vec<BoundaryType, T_dim>;

								        /// boundary type per dimension
								        T_BoundaryVec data;
								        /// halo sizes at the lower end, as passed to the constructor
								        T_LowHaloVec lowerHaloSize;
								        /// halo sizes at the upper end, as passed to the constructor
								        T_UpHaloVec upperHaloSize;

								        /** Store the per-dimension boundary types together with the lower and upper halo sizes. */
								        constexpr BoundaryDirection(
								            concepts::Vector auto const& boundaries,
								            T_LowHaloVec const& lower_halo_sizes,
								            T_UpHaloVec const& upper_halo_sizes)
								            : data(boundaries)
								            , lowerHaloSize(lower_halo_sizes)
								            , upperHaloSize(upper_halo_sizes)
								        {
								        }

								        /** @brief Dimensionality of the complete volume this boundary direction belongs to. Must not be mixed up
								         * with boundaryDimensionality().
								         */
								        [[nodiscard]] static constexpr uint32_t dim()
								        {
								            return T_dim;
								        }

								        /** @brief Dimensionality of the boundary itself, i.e. the number of dimensions whose boundary type is
								         * MIDDLE. A vertex (corner) of a 3D-volume (cube) is 0-dimensional. See also isVertex(), isEdge(), etc.
								         */
								        [[nodiscard]] constexpr uint32_t boundaryDimensionality() const
								        {
								            uint32_t middleCount = 0u;
								            for(uint32_t d = 0u; d != T_dim; ++d)
								                middleCount += (data[d] == BoundaryType::MIDDLE) ? 1u : 0u;
								            return middleCount;
								        }

								        /** @brief True if the boundary is 0-dimensional (a vertex), for example the corner of a plane.
								         */
								        [[nodiscard]] constexpr bool isVertex() const
								        {
								            return boundaryDimensionality() == 0u;
								        }

								        /** @brief True if the boundary is 1-dimensional (an edge), for example any of the 12 edges of a cube.
								         */
								        [[nodiscard]] constexpr bool isEdge() const
								        {
								            return boundaryDimensionality() == 1u;
								        }

								        /** @brief True if the boundary is 2-dimensional (a face), for example any of the 6 sides of a cube.
								         */
								        [[nodiscard]] constexpr bool isFace() const
								        {
								            return boundaryDimensionality() == 2u;
								        }

								        /** @brief True if the boundary is 3-dimensional (a cell), for example the interior of a cube or one of
								         * the 8 cells in a tesseract.
								         */
								        [[nodiscard]] constexpr bool isCell() const
								        {
								            return boundaryDimensionality() == 3u;
								        }

								        /** @brief True if every dimension is MIDDLE, i.e. the direction describes the interior of the volume,
								         * like the 2D interior of a plane or the 3D interior of a cube.
								         */
								        [[nodiscard]] constexpr bool isInterior() const
								        {
								            return boundaryDimensionality() == T_dim;
								        }

								        /** Member-wise comparison in declaration order (data, lowerHaloSize, upperHaloSize). */
								        [[nodiscard]] constexpr auto operator<=>(BoundaryDirection const&) const = default;
								    };

								    /**
								     * @brief The iterator type for @ref BoundaryDirectionsContainer.
								     *
								     * @tparam T_dim The dimensionality of the volume that this is a boundary direction iterator for.
								     * @tparam T_LowHaloVec The vector type used for the lower halo sizes.
								     * @tparam T_UpHaloVec The vector type used for the upper halo sizes.
								     */
								    template<uint32_t T_dim, concepts::Vector T_LowHaloVec, concepts::Vector T_UpHaloVec>
								    struct BoundaryDirectionIter
								    {
								        using T_BoundaryVec = Vec<BoundaryType, T_dim>;

								        using difference_type = std::ptrdiff_t;
								        using value_type = BoundaryDirection<T_dim, T_LowHaloVec, T_UpHaloVec>;
								        using reference = value_type&;
								        using const_reference = value_type const&;
								        using pointer = value_type*;
								        using const_pointer = value_type const*;

								        /** Create an iterator pointing to the boundary direction described by `boundaries`.
								         *
								         * @param boundaries boundary type per dimension of the direction the iterator starts at
								         * @param lower_halo_sizes lower halo sizes, handed to every boundary direction the iterator produces
								         * @param upper_halo_sizes upper halo sizes, handed to every boundary direction the iterator produces
								         */
								        constexpr BoundaryDirectionIter(
								            T_BoundaryVec const& boundaries,
								            T_LowHaloVec const& lower_halo_sizes,
								            T_UpHaloVec const& upper_halo_sizes)
								            : boundaries(boundaries, lower_halo_sizes, upper_halo_sizes)
								            , lowerHaloSizes(lower_halo_sizes)
								            , upperHaloSizes(upper_halo_sizes)
								        {
								        }

								        /** @return the boundary direction the iterator currently points to */
								        [[nodiscard]] constexpr const_reference& operator*() const
								        {
								            return boundaries;
								        }

								        /** @return the boundary direction the iterator currently points to */
								        [[nodiscard]] constexpr reference& operator*()
								        {
								            return boundaries;
								        }

								        /** Advance to the next boundary direction.
								         *
								         * The boundary vector is treated as a counter with the digits LOWER -> MIDDLE -> UPPER where the last
								         * dimension is the fastest running digit. An UPPER digit wraps around to LOWER and carries into the
								         * previous dimension. If the carry leaves the first dimension, all entries are set to OOB, which is the
								         * state of the end iterator.
								         *
								         * Incrementing an iterator that already contains an OOB (or otherwise invalid) entry triggers an
								         * assertion; if assertions are disabled the iterator ends up in the all-OOB end state.
								         */
								        constexpr auto& operator++()
								        {
								            // sentinel: loop finished, also reached by decrementing index 0
								            constexpr uint32_t done = static_cast<uint32_t>(-1);

								            uint32_t i = T_dim - 1;
								            bool oob = true;
								            while(i != done)
								            {
								                switch(boundaries.data[i])
								                {
								                case BoundaryType::LOWER:
								                    boundaries.data[i] = BoundaryType::MIDDLE;
								                    i = done;
								                    oob = false;
								                    break;
								                case BoundaryType::MIDDLE:
								                    boundaries.data[i] = BoundaryType::UPPER;
								                    i = done;
								                    oob = false;
								                    break;
								                case BoundaryType::UPPER:
								                    // wrap around and carry into the next slower dimension
								                    boundaries.data[i] = BoundaryType::LOWER;
								                    --i;
								                    break;
								                case BoundaryType::OOB:
								                    [[fallthrough]];
								                default:
								                {
								                    constexpr bool onHost = std::is_same_v<api::Host, ALPAKA_TYPEOF(thisApi())>;
								                    if constexpr(onHost)
								                        assert(false);
								                    else
								                        ALPAKA_ASSERT_ACC(false);
								                    // The assertion can be compiled out. Without leaving the loop here `i` would never
								                    // change and the loop would spin forever; `oob` is still true, therefore the
								                    // iterator is set to the end state below.
								                    i = done;
								                    break;
								                }
								                }
								            }
								            if(oob)
								            {
								                boundaries
								                    = {Vec<BoundaryType, T_dim>([](int) { return BoundaryType::OOB; }),
								                       lowerHaloSizes,
								                       upperHaloSizes};
								            }
								            return *this;
								        }

								        /** @return the dimensionality of the volume the iterator walks over */
								        [[nodiscard]] static consteval auto dim()
								        {
								            return T_dim;
								        }

								        /** Member-wise comparison in declaration order (current direction, lower halos, upper halos). */
								        [[nodiscard]] constexpr auto operator<=>(BoundaryDirectionIter const&) const = default;

								    private:
								        BoundaryDirection<T_dim, T_LowHaloVec, T_UpHaloVec> boundaries;

								        T_LowHaloVec lowerHaloSizes;
								        T_UpHaloVec upperHaloSizes;
								    };

								    /**
								     * @brief A container for boundary directions of an n-dimensional volume.
								     *
								     * This class implements `begin()`, `end()`, and `length()`, and can be iterated over. This is useful for stencil
								     * codes, where boundary conditions exist, that need to only be applied to elements on the borders of memory. To
								     * create a BoundaryDirectionsContainer for a memory object, see @ref makeBoundaryDirIterator.
								     *
								     * A 0D boundary direction is a single value, a 1D boundary direction is a "line", for example edges of a cube, a
								     * 2D boundary direction is a "plane", for example the sides of a cube.
								     *
								     * For example, a 1-dimensional (1D) volume has two 0D ends and a 1D center. A 2D volume has 4 0D corners, 4 1D
								     * edges, and one 2D center. In general, there are 3^n boundaries for an nD volume.
								     *
								     * @tparam T_dim The dimensionality of the volume that this contains boundaries for.
								     * @tparam T_LowHaloVec The vector type used for the lower halo sizes.
								     * @tparam T_UpHaloVec The vector type used for the upper halo sizes.
								     */
								    template<uint32_t T_dim, concepts::Vector T_LowHaloVec, concepts::Vector T_UpHaloVec>
								    struct BoundaryDirectionsContainer
								    {
								        static_assert(T_dim > 0, "0 Dimension Boundary Direction Container is not defined");

								        /** Keep a copy of the halo sizes; they are handed to every iterator created by begin() and end(). */
								        constexpr BoundaryDirectionsContainer(T_LowHaloVec const& lowerHaloSizes, T_UpHaloVec const& upperHaloSizes)
								            : m_lowerHaloSizes(lowerHaloSizes)
								            , m_upperHaloSizes(upperHaloSizes)
								        {
								        }

								        /** @return iterator to the first boundary direction, where every dimension is LOWER */
								        [[nodiscard]] constexpr BoundaryDirectionIter<T_dim, T_LowHaloVec, T_UpHaloVec> begin() const
								        {
								            using Iter = BoundaryDirectionIter<T_dim, T_LowHaloVec, T_UpHaloVec>;
								            auto const allLower = Vec<BoundaryType, T_dim>([](int) { return BoundaryType::LOWER; });
								            return Iter{allLower, m_lowerHaloSizes, m_upperHaloSizes};
								        }

								        /** @return past-the-end iterator, where every dimension is OOB */
								        [[nodiscard]] constexpr BoundaryDirectionIter<T_dim, T_LowHaloVec, T_UpHaloVec> end() const
								        {
								            using Iter = BoundaryDirectionIter<T_dim, T_LowHaloVec, T_UpHaloVec>;
								            auto const allOob = Vec<BoundaryType, T_dim>([](int) { return BoundaryType::OOB; });
								            return Iter{allOob, m_lowerHaloSizes, m_upperHaloSizes};
								        }

								        /** @return number of boundary directions, 3 to the power of T_dim */
								        [[nodiscard]] static consteval uint32_t length()
								        {
								            return ipow(3u, T_dim);
								        }

								        /** @return dimensionality of the volume */
								        [[nodiscard]] static consteval auto dim()
								        {
								            return T_dim;
								        }

								    private:
								        T_LowHaloVec const m_lowerHaloSizes;
								        T_UpHaloVec const m_upperHaloSizes;
								    };

								    template<concepts::Vector LowHaloVecType, concepts::Vector UpHaloVecType>
								    BoundaryDirectionsContainer(LowHaloVecType const& lowerHalos, UpHaloVecType const& upperHalos)
								        -> BoundaryDirectionsContainer<LowHaloVecType::dim(), LowHaloVecType, UpHaloVecType>;

								    /** @brief Construct and return a single @ref BoundaryDirection specifying the middle of a volume.
								     */
								    template<uint32_t T_dim>
								    [[nodiscard]] constexpr auto makeCoreBoundaryDirection(
								        concepts::Vector auto const& lowerHalos,
								        concepts::Vector auto const& upperHalos)
								    {
								        using Direction = BoundaryDirection<T_dim, ALPAKA_TYPEOF(lowerHalos), ALPAKA_TYPEOF(upperHalos)>;

								        // every dimension is MIDDLE, i.e. the interior of the volume
								        return Direction{fillCVec<BoundaryType, T_dim, BoundaryType::MIDDLE>(), lowerHalos, upperHalos};
								    }

								    /** @brief Construct and return a single @ref BoundaryDirection specifying the middle of a volume with symmetric
								     * halos.
								     */
								    template<uint32_t T_dim>
								    [[nodiscard]] constexpr auto makeCoreBoundaryDirection(concepts::Vector auto const& halos)
								    {
								        // symmetric halos: reuse the two-vector overload with the same vector for both ends
								        return makeCoreBoundaryDirection<T_dim>(halos, halos);
								    }

								    /**
								     * @brief Construct and return a single @ref BoundaryDirection specifying the middle of a volume with all halo
								     * sizes set to 1.
								     */
								    template<uint32_t T_dim>
								    [[nodiscard]] consteval auto makeCoreBoundaryDirection()
								    {
								        // halo size 1 in every dimension, used for the lower and the upper end
								        return makeCoreBoundaryDirection<T_dim>(fillCVec<uint32_t, T_dim, 1u>());
								    }

								    /** @brief Construct and return a @ref BoundaryDirectionsContainer. This container can be iterated over.
								     *
								     * This constructor uses a default halo size of 1 everywhere.
								     *
								     * @tparam T_dim The dimensionality of the container.
								     */
								    template<uint32_t T_dim>
								    [[nodiscard]] constexpr auto makeBoundaryDirIterator()
								    {
								        auto lowerHalos = fillCVec<uint32_t, T_dim, static_cast<uint32_t>(1)>();
								        auto upperHalos = fillCVec<uint32_t, T_dim, static_cast<uint32_t>(1)>();
								        return BoundaryDirectionsContainer{lowerHalos, upperHalos};
								    }

								    /** @brief Construct and return a @ref BoundaryDirectionsContainer with symmetric halo sizes.
								     * This container can be iterated over.
								     * The template arguments of the container (including its dimensionality) are deduced from the type of
								     * @p haloSizes.
								     *
								     * @param haloSizes The halo sizes per dimension. The same vector is passed as lower and as upper halo sizes,
								     * i.e. both "ends" of each dimension get the same halo.
								     * @return The container built from `{haloSizes, haloSizes}`.
								     */
								    [[nodiscard]] constexpr auto makeBoundaryDirIterator(concepts::Vector auto const& haloSizes)
								    {
								        return BoundaryDirectionsContainer{haloSizes, haloSizes};
								    }

								    /** @brief Create a @ref BoundaryDirectionsContainer from separate lower and upper halo sizes.
								     * The returned container can be iterated over.
								     *
								     * The container's template arguments are deduced from the two vector types. Both vectors must have the
								     * same dimensionality; this is enforced at compile time (the halo values themselves may differ).
								     *
								     * @param lowerHaloSizes The lower end halo sizes per dimension. These are the halos from 0 in each dimension.
								     * @param upperHaloSizes The upper end halo sizes per dimension. These are the halos to `size()` in each dimension.
								     * @return The container built from `{lowerHaloSizes, upperHaloSizes}`.
								     */
								    [[nodiscard]] constexpr auto makeBoundaryDirIterator(
								        concepts::Vector auto const& lowerHaloSizes,
								        concepts::Vector auto const& upperHaloSizes)
								    {
								        using LowerHaloVec = ALPAKA_TYPEOF(lowerHaloSizes);
								        using UpperHaloVec = ALPAKA_TYPEOF(upperHaloSizes);
								        static_assert(LowerHaloVec::dim() == UpperHaloVec::dim(), "dimension mismatch");

								        return BoundaryDirectionsContainer{lowerHaloSizes, upperHaloSizes};
								    }

								    /** @brief Create a @ref BoundaryDirectionsContainer matching the dimensionality of a view, using the
								     * default halo size of 1. The returned container can be iterated over.
								     * For custom halo sizes, use one of the other overloads.
								     *
								     * @param view Only the static dimension of the view's type is used; the view's contents are not read.
								     * @return Result of `makeBoundaryDirIterator<dim>()` for the view's dimension.
								     */
								    [[nodiscard]] constexpr auto makeBoundaryDirIterator(concepts::IView auto const& view)
								    {
								        constexpr auto viewDim = static_cast<uint32_t>(ALPAKA_TYPEOF(view)::dim());
								        return makeBoundaryDirIterator<viewDim>();
								    }

								    namespace trait
								    {
								        /** Type trait telling whether T is a @ref BoundaryDirection instantiation.
								         *
								         * Primary template: any type that does not match the specialization below yields `false`.
								         */
								        template<typename T>
								        struct IsBoundaryDirection : std::bool_constant<false>
								        {
								        };

								        /** Matches BoundaryDirection instantiations whose halo vectors both have dimension T_dim. */
								        template<uint32_t T_dim, concepts::Vector T_LowHaloVec, concepts::Vector T_UpHaloVec>
								        requires((T_dim == T_LowHaloVec::dim()) && (T_dim == T_UpHaloVec::dim()))
								        struct IsBoundaryDirection<BoundaryDirection<T_dim, T_LowHaloVec, T_UpHaloVec>> : std::bool_constant<true>
								        {
								        };
								    } // namespace trait

								    /** Shorthand for `trait::IsBoundaryDirection<T>::value`. */
								    template<typename T>
								    constexpr bool isBoundaryDirection_v = trait::IsBoundaryDirection<T>::value;

								    namespace concepts
								    {
								        /** @brief Concept checking whether T is a boundary direction.
								         *
								         * Satisfied exactly when `isBoundaryDirection_v<T>` is true. T is used as given, so a cv-qualified or
								         * reference type is only accepted if the trait is specialized for it.
								         */
								        template<typename T>
								        concept BoundaryDirection = isBoundaryDirection_v<T>;
								    } // namespace concepts

								    std::ostream& operator<<(std::ostream& os, concepts::BoundaryDirection auto const& bd)
								    {
								        for(uint32_t i = 0; i < bd.dim(); ++i)
								        {
								            switch(bd.data[i])
								            {
								            case BoundaryType::LOWER:
								                os << 'v';
								                break;
								            case BoundaryType::MIDDLE:
								                os << '-';
								                break;
								            case BoundaryType::UPPER:
								                os << '^';
								                break;
								            case BoundaryType::OOB:
								                [[fallthrough]];
								            default:
								                os << 'x';
								                break;
								            }
								        }

								        if(bd.isVertex())
								            os << " (vertex) ";
								        if(bd.isEdge())
								            os << " (edge)   ";
								        if(bd.isFace())
								            os << " (face)   ";
								        if(bd.isCell())
								            os << " (cell)   ";
								        if(bd.boundaryDimensionality() >= 4)
								            os << " (" << bd.boundaryDimensionality() << "D volume)";

								        return os;
								    }
								} // namespace alpaka
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/BoundaryIter.hpp ==
								// ============================================================================

								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/MdSpan.hpp ==
								// ==
								/* Copyright 2025 René Widera, Simeon Ehrig
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
								// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
								// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
								// #include "alpaka/interface.hpp"    // amalgamate: file already inlined
								// #include "alpaka/mem/Alignment.hpp"    // amalgamate: file already inlined
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/DataPitches.hpp ==
									// ==
									/* Copyright 2024 René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									// #include "alpaka/CVec.hpp"    // amalgamate: file already inlined
									// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
									// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
									// #include "alpaka/mem/Alignment.hpp"    // amalgamate: file already inlined

									#include <type_traits>

									namespace alpaka
									{
									    //! Calculate the pitches (in bytes) of a densely packed layout purely from the extents.
									    //!
									    //! The last pitch is `sizeof(T_Elem)`; every other pitch is the next-faster pitch multiplied by the extent
									    //! of that next-faster dimension. For a zero-dimensional vector the value-initialized result is returned.
									    //!
									    //! @tparam T_Elem element type stored in memory
									    //! @param extent number of elements per dimension
									    //! @return vector of type `T_Vec::UniVec` holding one pitch per dimension
									    template<typename T_Elem, alpaka::concepts::Vector T_Vec>
									    constexpr auto calculatePitchesFromExtents(T_Vec const& extent)
									    {
									        using IndexType = typename T_Vec::type;
									        constexpr auto numDims = T_Vec::dim();

									        typename T_Vec::UniVec pitches{};
									        if constexpr(numDims > 0)
									        {
									            pitches.back() = static_cast<IndexType>(sizeof(T_Elem));
									            if constexpr(numDims > 1)
									            {
									                // Walk from the fastest towards the slowest dimension, accumulating the byte strides.
									                for(IndexType d = numDims - 1; d > 0; --d)
									                {
									                    pitches[d - 1] = extent[d] * pitches[d];
									                }
									            }
									        }
									        return pitches;
									    }

									    //! Calculate the pitches (in bytes) from the extents and an explicitly given row pitch.
									    //!
									    //! Unlike calculatePitchesFromExtents(), the pitch of the second-fastest dimension is not derived from the
									    //! extents but taken from @p rowPitchBytes. The last pitch is `sizeof(T_Elem)`; all slower pitches are the
									    //! next-faster pitch multiplied by the extent of that next-faster dimension.
									    //!
									    //! Only available for vectors with at least two dimensions.
									    //!
									    //! @tparam T_Elem element type stored in memory
									    //! @param extent number of elements per dimension
									    //! @param rowPitchBytes pitch in bytes stored at index `dim - 2`
									    //! @return vector of type `T_Vec::UniVec` holding one pitch per dimension
									    template<typename T_Elem, alpaka::concepts::Vector T_Vec>
									    requires(T_Vec::dim() >= 2)
									    constexpr auto calculatePitches(T_Vec const& extent, typename T_Vec::type const& rowPitchBytes)
									    {
									        constexpr auto dim = T_Vec::dim();
									        using type = typename T_Vec::type;
									        auto pitchBytes = typename T_Vec::UniVec{};
									        pitchBytes.back() = static_cast<type>(sizeof(T_Elem));
									        // The requires-clause guarantees dim >= 2, so the row pitch slot always exists.
									        pitchBytes[dim - 2u] = rowPitchBytes;
									        if constexpr(dim > 2)
									            for(type i = dim - 2; i > 0; i--)
									                pitchBytes[i - 1] = extent[i] * pitchBytes[i];
									        return pitchBytes;
									    }

									    /** Pitches (in bytes) of a multi-dimensional memory layout holding elements of type T_Type.
									     *
									     * Only the pitches of all but the last dimension are stored. The last pitch is required to be
									     * `sizeof(T_Type)` (asserted on construction) and is re-created whenever the full vector is requested.
									     *
									     * @tparam T_Type element type the pitches refer to
									     * @tparam T_Pitches vector type providing one pitch per dimension
									     */
									    template<typename T_Type, concepts::Vector T_Pitches>
									    struct DataPitches
									    {
									        using value_type = T_Type;
									        using index_type = typename T_Pitches::type;

									        /** Number of dimensions, identical to `T_Pitches::dim()`. */
									        static consteval uint32_t dim()
									        {
									            return T_Pitches::dim();
									        }

									        /** Store all pitches except the last one.
									         *
									         * @param pitchBytes pitches per dimension; the last entry must equal `sizeof(value_type)`
									         */
									        constexpr DataPitches(T_Pitches const& pitchBytes) : m_pitch(pitchBytes.eraseBack())
									        {
									            assert(pitchBytes.back() == sizeof(value_type));
									        }

									        /* A default-constructed object is only a placeholder; it has to be initialized by copying a valid
									         * instance before it is used. */
									        constexpr DataPitches() = default;

									        /** Rebuild the complete pitch vector: the stored pitches followed by `sizeof(value_type)`. */
									        constexpr auto getPitches() const
									        {
									            Vec<index_type, dim()> allPitches;
									            constexpr uint32_t numStored = dim() - 1u;
									            for(uint32_t d = 0u; d < numStored; ++d)
									            {
									                allPitches[d] = m_pitch[d];
									            }
									            allPitches.back() = static_cast<index_type>(sizeof(value_type));
									            return allPitches;
									        }

									        /** Pitch in bytes of dimension @p idx, taken from the vector returned by getPitches(). */
									        constexpr index_type operator[](std::integral auto idx) const
									        {
									            auto const allPitches = getPitches();
									            return allPitches[idx];
									        }

									    private:
									        //! Pitches of every dimension but the last one.
									        decltype(std::declval<T_Pitches>().eraseBack()) m_pitch;
									    };

									    /** Specialization of DataPitches for one-dimensional layouts.
									     *
									     * The only pitch of a 1-D layout is `sizeof(T_Type)`, therefore nothing is stored.
									     *
									     * @tparam T_Type element type the pitch refers to
									     * @tparam T_IndexType value type of the pitch vector
									     * @tparam T_Storage storage policy of the matched Vec type
									     */
									    template<typename T_Type, typename T_IndexType, typename T_Storage>
									    struct DataPitches<T_Type, Vec<T_IndexType, 1u, T_Storage>>
									    {
									        using value_type = T_Type;
									        using index_type = T_IndexType;

									        /** Number of dimensions; always 1 for this specialization. */
									        static consteval uint32_t dim()
									        {
									            return 1u;
									        }

									        /** Validate the given pitch; no state is kept.
									         *
									         * @param pitchBytes single pitch, must equal `sizeof(value_type)`. It is only read inside the
									         * assertion, hence `[[maybe_unused]]` for builds where assertions are compiled out.
									         */
									        constexpr DataPitches([[maybe_unused]] Vec<T_IndexType, 1u> const& pitchBytes)
									        {
									            assert(pitchBytes.back() == sizeof(value_type));
									        }

									        /* A default-constructed object is only a placeholder; it has to be initialized by copying a valid
									         * instance before it is used. */
									        constexpr DataPitches() = default;

									        /** Return the one-element pitch vector `{sizeof(value_type)}`. */
									        constexpr auto getPitches() const
									        {
									            return Vec{static_cast<index_type>(sizeof(value_type))};
									        }

									        /** Pitch in bytes of dimension @p idx, taken from the vector returned by getPitches().
									         *
									         * Provided so that the 1-D specialization offers the same indexing interface as the primary template.
									         */
									        constexpr index_type operator[](std::integral auto idx) const
									        {
									            return getPitches()[idx];
									        }
									    };
									} // namespace alpaka
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/DataPitches.hpp ==
									// ============================================================================

									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/MdForwardIter.hpp ==
									// ==
									/* Copyright 2025 René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
									// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
									// #include "alpaka/mem/concepts/IMdSpan.hpp"    // amalgamate: file already inlined

									// #include <cstdint>    // amalgamate: file already included
									#include <iterator>

									namespace alpaka
									{

									    /** Lightweight end marker for MdForwardIter.
									     *
									     * Only a single scalar, the extent of the slowest moving dimension (index 0), is stored; this is meant
									     * to keep the register footprint small.
									     * An iterator counts as "at the end" once its index in the slowest moving dimension is equal to or
									     * larger than that extent.
									     *
									     * @tparam T_idxType index type of the md-span the end marker belongs to
									     */
									    template<typename T_idxType>
									    class MdForwardIterEnd
									    {
									        using index_type = T_idxType;

									        // NOTE(review): this member is never called, so the assertion is only evaluated if `_` is ever
									        // instantiated. The class as written declares no operator++ — confirm the check is intended.
									        void _()
									        {
									            static_assert(std::forward_iterator<MdForwardIterEnd>);
									        }

									    public:
									        /** Capture the extent of dimension 0 of @p mdSpan. */
									        constexpr MdForwardIterEnd(alpaka::concepts::IMdSpan auto const& mdSpan)
									            : m_extentSlowDim{mdSpan.getExtents()[0]}
									        {
									        }

									        /** Extent of the slowest moving dimension. */
									        constexpr auto operator*() const
									        {
									            return m_extentSlowDim;
									        }

									        /** Two end markers are equal if they hold the same extent. */
									        constexpr bool operator==(MdForwardIterEnd const& other) const
									        {
									            return m_extentSlowDim == other.m_extentSlowDim;
									        }

									        constexpr bool operator!=(MdForwardIterEnd const& other) const
									        {
									            return !operator==(other);
									        }

									    private:
									        index_type m_extentSlowDim;
									    };

									    /** Deduction guide: an end marker constructed from an md-span uses that span's `index_type`. */
									    template<alpaka::concepts::IMdSpan T_MdSpan>
									    ALPAKA_FN_HOST_ACC MdForwardIterEnd(T_MdSpan const&) -> MdForwardIterEnd<typename T_MdSpan::index_type>;

									    /** Iterator visiting the elements of a multi-dimensional span one after another.
									     *
									     * The current position is kept as a multi-dimensional index. Incrementing advances the last dimension
									     * first and carries into the lower-numbered dimensions; dimension 0 is never wrapped, so it can be
									     * compared against MdForwardIterEnd to detect the end.
									     *
									     * @tparam T_MdSpan type of the span that is traversed; the iterator keeps its own copy of it
									     */
									    template<alpaka::concepts::IMdSpan T_MdSpan>
									    class MdForwardIter
									    {
									        using index_type = typename T_MdSpan::index_type;

									        friend class MdForwardIterEnd<index_type>;

									        static constexpr uint32_t iterDim = T_MdSpan::dim();
									        using IterIdxVecType = Vec<index_type, iterDim>;

									        // NOTE(review): never called, so these assertions are only evaluated if `_` is instantiated.
									        void _()
									        {
									            static_assert(std::forward_iterator<MdForwardIter>);
									            static_assert(std::input_or_output_iterator<MdForwardIter>);
									        }

									    public:
									        /** Create an iterator positioned at index zero of @p mdSpan, or at the end if the span is empty. */
									        constexpr MdForwardIter(T_MdSpan const& mdSpan) : m_mdSpan(mdSpan), m_current{IterIdxVecType::fill(0u)}
									        {
									            // A span with a zero extent in any dimension has no elements: jump straight to the end
									            // position by moving the slow index to its extent.
									            for(uint32_t dimIdx = 0; dimIdx < iterDim; ++dimIdx)
									            {
									                bool const isEmptyDim = m_mdSpan.getExtents()[dimIdx] == index_type{0u};
									                if(!isEmptyDim)
									                    continue;

									                m_current[0] = m_mdSpan.getExtents()[0];
									                break;
									            }
									        }

									        /** Current index in the slowest moving dimension (dimension 0). */
									        ALPAKA_FN_ACC constexpr index_type slowCurrent() const
									        {
									            return m_current[0];
									        }

									        /** Access the element of the span at the current position. */
									        constexpr decltype(auto) operator*() const
									        {
									            return m_mdSpan[m_current];
									        }

									        /** Access the element of the span at the current position. */
									        constexpr decltype(auto) operator*()
									        {
									            return m_mdSpan[m_current];
									        }

									        /** Pre-increment: move to the next position. */
									        ALPAKA_FN_ACC inline MdForwardIter& operator++()
									        {
									            // Count down from the last dimension to dimension 0.
									            for(uint32_t idx = iterDim; idx-- > 0u;)
									            {
									                m_current[idx] += index_type{1u};
									                if constexpr(iterDim != 1u)
									                {
									                    // Only dimensions other than 0 wrap around and carry into the next slower one.
									                    bool const carries = idx >= 1u && m_current[idx] >= m_mdSpan.getExtents()[idx];
									                    if(!carries)
									                        break;

									                    m_current[idx] = index_type{0u};
									                }
									            }
									            return *this;
									        }

									        /** Post-increment: move to the next position and return the previous state. */
									        ALPAKA_FN_ACC inline MdForwardIter operator++(int)
									        {
									            auto previous{*this};
									            operator++();
									            return previous;
									        }

									        /** Iterators compare equal if their current indices are equal; the spans are not compared. */
									        constexpr bool operator==(MdForwardIter const& other) const
									        {
									            return m_current == other.m_current;
									        }

									        constexpr bool operator!=(MdForwardIter const& other) const
									        {
									            return !operator==(other);
									        }

									    private:
									        T_MdSpan m_mdSpan;
									        IterIdxVecType m_current;
									    };

									    /** An iterator equals the end marker once its slowest-dimension index has reached or passed the extent
									     * stored in the marker.
									     */
									    template<typename T_MdSpan>
									    constexpr bool operator==(
									        MdForwardIter<T_MdSpan> const& mdIter,
									        MdForwardIterEnd<typename T_MdSpan::index_type> const& mdIteratorEnd)
									    {
									        auto const slowDimExtent = *mdIteratorEnd;
									        return slowDimExtent <= mdIter.slowCurrent();
									    }

									    /** True while the iterator has not yet reached the end marker. */
									    template<typename T_MdSpan>
									    constexpr bool operator!=(
									        MdForwardIter<T_MdSpan> const& mdIter,
									        MdForwardIterEnd<typename T_MdSpan::index_type> const& mdIteratorEnd)
									    {
									        bool const reachedEnd = (mdIter == mdIteratorEnd);
									        return !reachedEnd;
									    }

									    /** Same comparison as `mdIter == mdIteratorEnd`, with the operands swapped. */
									    template<typename T_MdSpan>
									    constexpr bool operator==(
									        MdForwardIterEnd<typename T_MdSpan::index_type> const& mdIteratorEnd,
									        MdForwardIter<T_MdSpan> const& mdIter)
									    {
									        return mdIter == mdIteratorEnd;
									    }

									    /** True while the iterator has not yet reached the end marker (operands swapped). */
									    template<typename T_MdSpan>
									    constexpr bool operator!=(
									        MdForwardIterEnd<typename T_MdSpan::index_type> const& mdIteratorEnd,
									        MdForwardIter<T_MdSpan> const& mdIter)
									    {
									        bool const reachedEnd = (mdIteratorEnd == mdIter);
									        return !reachedEnd;
									    }
									} // namespace alpaka
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/MdForwardIter.hpp ==
									// ============================================================================

									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/detail/InnerTypeAllowedCast.hpp ==
									// ==
									/* Copyright 2025 Simeon Ehrig
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once

										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/concepts/types.hpp ==
										// ==
										/* Copyright 2025 Simeon Ehrig, René Widera
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										#include <type_traits>

										namespace alpaka::concepts
										{
										    /** Satisfied if T is a built-in (C-style) array type, as reported by std::is_array.
										     *
										     * References to arrays are not arrays themselves and therefore do not satisfy the concept.
										     */
										    template<typename T>
										    concept CStaticArray = std::is_array<T>::value;

										    /** Satisfied if T is an lvalue or rvalue reference type, as reported by std::is_reference.
										     */
										    template<typename T>
										    concept Reference = std::is_reference<T>::value;
										} // namespace alpaka::concepts
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/concepts/types.hpp ==
										// ============================================================================


									// #include <concepts>    // amalgamate: file already included

									namespace alpaka::internal
									{
									    /** Determine the plain element type behind a value or reference type T.
									     *
									     * References, cv-qualifiers and, for C arrays, all array extents are stripped.
									     *
									     * @example
									     * int const -> int
									     * int const & -> int
									     * int -> int
									     * int const (&)[2][2] -> int
									     *
									     * In addition, `is_const` reports whether T is const-qualified after removing the reference.
									     */
									    template<typename T>
									    struct GetElementType
									    {
									        /** Helper operating on a type whose reference has already been removed.
									         *
									         * A second, nested trait is needed so that C static arrays can get their own specialization.
									         * The generic case simply decays the type, which drops the cv-qualifiers.
									         */
									        template<typename U>
									        struct NonRefType
									        {
									            using type = std::decay_t<U>;
									        };

									        /** C static arrays: drop every extent and the cv-qualifiers of the element type. */
									        template<alpaka::concepts::CStaticArray U>
									        struct NonRefType<U>
									        {
									            using type = std::remove_cv_t<std::remove_all_extents_t<U>>;
									        };

									        static constexpr bool is_const = std::is_const_v<std::remove_reference_t<T>>;
									        using type = typename NonRefType<std::remove_reference_t<T>>::type;
									    };

									    /** Shorthand for `typename GetElementType<T>::type`. */
									    template<typename T>
									    using GetElementType_t = typename GetElementType<T>::type;

									    namespace concepts
									    {
									        /** Concept guarding converting copy/move constructors of a DataSource, i.e. constructors that create an
									         * object with a different inner type.
									         *
									         * @tparam T_Type element type of the new object
									         * @tparam T_Type_Other element type of the object which is copied or moved
									         *
									         * @details
									         * Both of the following must hold:
									         *  - the element types are identical once cv-qualifiers are ignored
									         *  - constness is never dropped, which leaves these conversions:
									         *      - mutable -> mutable
									         *      - const -> const
									         *      - mutable -> const
									         */
									        template<typename T_Type, typename T_Type_Other>
									        concept InnerTypeAllowedCast = requires {
									            /// same value type after removing cv-qualifiers
									            requires std::same_as<GetElementType_t<T_Type>, GetElementType_t<T_Type_Other>>;
									            /// either the source is mutable or the target is const as well (const -> mutable is rejected)
									            requires(!GetElementType<T_Type_Other>::is_const || GetElementType<T_Type>::is_const);
									        };
									    } // namespace concepts
									} // namespace alpaka::internal
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/concepts/detail/InnerTypeAllowedCast.hpp ==
									// ============================================================================

								// #include "alpaka/mem/trait.hpp"    // amalgamate: file already inlined
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/interface.hpp ==
									// ==
									/* Copyright 2024 René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
									// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined
										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/DeviceSelector.hpp ==
										// ==
										/* Copyright 2024 René Widera
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
											// ============================================================================
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/Device.hpp ==
											// ==
											/* Copyright 2024 René Widera
											 * SPDX-License-Identifier: MPL-2.0
											 */

											// #pragma once
											// #include "Handle.hpp"    // amalgamate: file already inlined
											// #include "alpaka/interface.hpp"    // amalgamate: file already inlined
												// ============================================================================
												// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/Event.hpp ==
												// ==
												/* Copyright 2024 René Widera
												 * SPDX-License-Identifier: MPL-2.0
												 */

												// #pragma once
												// #include "Handle.hpp"    // amalgamate: file already inlined
												// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
												// #include "alpaka/onHost/internal/interface.hpp"    // amalgamate: file already inlined

												// #include <memory>    // amalgamate: file already included

												namespace alpaka::onHost
												{
												    //! Forward declaration; needed here to name the device type in the Event specialization below.
												    template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
												    struct Device;

												    //! Primary template of the event wrapper; only declared here, the specialization for Device follows.
												    template<typename T_Device>
												    struct Event;

												    /** Host-side wrapper around the event object belonging to a `Device<T_Api, T_DeviceKind>`.
												     *
												     * The wrapper owns a handle to the underlying event implementation and forwards all queries to the
												     * internal interface functions.
												     *
												     * @tparam T_Api api of the device the event belongs to
												     * @tparam T_DeviceKind device kind of the device the event belongs to
												     */
												    template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
												    struct Event<Device<T_Api, T_DeviceKind>>
												    {
												    private:
												        using DeviceInterface = Device<T_Api, T_DeviceKind>;
												        //! Handle type that internal::MakeEvent yields for the device's underlying implementation object.
												        using EventHandle = ALPAKA_TYPEOF(
												            internal::MakeEvent::Op<ALPAKA_TYPEOF(*std::declval<DeviceInterface>().get())>{}(
												                *std::declval<DeviceInterface>().get()));

												        EventHandle m_event;

												    public:
												        using element_type = typename EventHandle::element_type;

												        /** Take over an event handle.
												         *
												         * @param event handle that is moved into the wrapper. The parameter is a plain rvalue reference
												         * (not a forwarding reference), so it is moved unconditionally.
												         */
												        template<typename T_Event>
												        Event(Handle<T_Event>&& event) : m_event{std::move(event)}
												        {
												        }

												        /** Pointer to the underlying event implementation, as returned by the handle's `get()`. */
												        auto* get() const
												        {
												            return m_event.get();
												        }

												        /** Api of the underlying event. */
												        constexpr auto getApi() const
												        {
												            return alpaka::internal::getApi(*m_event.get());
												        }

												        /** Name of the underlying event as reported by `alpaka::internal::GetName`. */
												        std::string getName() const
												        {
												            return alpaka::internal::GetName::Op<std::decay_t<decltype(*m_event.get())>>{}(*m_event.get());
												        }

												        /** Native handle of the underlying event. */
												        [[nodiscard]] auto getNativeHandle() const
												        {
												            return internal::getNativeHandle(*m_event.get());
												        }

												        /** Two wrappers are equal if they refer to the same underlying event object (pointer identity). */
												        bool operator==(Event const& other) const
												        {
												            return this->get() == other.get();
												        }

												        bool operator!=(Event const& other) const
												        {
												            return this->get() != other.get();
												        }

												        /** Get the device of this event
												         *
												         * @return the device of this event
												         */
												        auto getDevice() const
												        {
												            return Device<T_Api, T_DeviceKind>{internal::getDevice(*m_event.get())};
												        }

												        /** Completion state as reported by `internal::isEventComplete` for the underlying event. */
												        bool isComplete() const
												        {
												            return alpaka::onHost::internal::isEventComplete(*m_event.get());
												        }
												    };

												    /** Deduction guide: an Event constructed from a handle gets the Device type formed from the api and
												     * device kind that `alpaka::internal::getApi` / `getDeviceKind` report for the handled event type.
												     */
												    template<typename T_Event>
												    Event(Handle<T_Event>&&) -> Event<Device<
												        ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval<T_Event>())),
												        ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval<T_Event>()))>>;

												} // namespace alpaka::onHost
												// ==
												// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/Event.hpp ==
												// ============================================================================

												// ============================================================================
												// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/Queue.hpp ==
												// ==
												/* Copyright 2024 René Widera
												 * SPDX-License-Identifier: MPL-2.0
												 */

												// #pragma once
												// #include "Handle.hpp"    // amalgamate: file already inlined
												// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
												// #include "alpaka/executor.hpp"    // amalgamate: file already inlined
												// #include "alpaka/onHost/Event.hpp"    // amalgamate: file already inlined
												// #include "alpaka/onHost/FrameSpec.hpp"    // amalgamate: file already inlined
												// #include "alpaka/onHost/concepts.hpp"    // amalgamate: file already inlined
												// #include "alpaka/onHost/internal/interface.hpp"    // amalgamate: file already inlined
												// #include "alpaka/onHost/trait.hpp"    // amalgamate: file already inlined

												// #include <memory>    // amalgamate: file already included

												namespace alpaka::onHost
												{
												    //! Forward declaration; needed here to name the device type in the Queue specialization below.
												    template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
												    struct Device;

												    //! Primary template of the queue wrapper; only declared here, the specialization for Device follows.
												    template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
												    struct Queue;

												    template<
												        alpaka::concepts::Api T_Api,
												        alpaka::concepts::DeviceKind T_DeviceKind,
												        alpaka::concepts::QueueKind T_QueueKind>
												    struct Queue<Device<T_Api, T_DeviceKind>, T_QueueKind>
												    {
												    private:
												        using DeviceInterface = Device<T_Api, T_DeviceKind>;
												        using QueueHandle = ALPAKA_TYPEOF(
												            internal::MakeQueue::Op<ALPAKA_TYPEOF(*std::declval<DeviceInterface>().get()), T_QueueKind>{}(
												                *std::declval<DeviceInterface>().get(),
												                T_QueueKind{}));

												        QueueHandle m_queue;

												    public:
												        using element_type = typename QueueHandle::element_type;

												        template<typename T_Queue>
												        Queue(Handle<T_Queue>&& queue, T_QueueKind) : m_queue{std::forward<Handle<T_Queue>>(queue)}
												        {
												        }

												        auto* get() const
												        {
												            return m_queue.get();
												        }

												        constexpr alpaka::concepts::Api auto getApi() const
												        {
												            return alpaka::internal::getApi(*m_queue.get());
												        }

												        constexpr alpaka::concepts::QueueKind auto getQueueKind() const
												        {
												            return T_QueueKind{};
												        }

												        constexpr alpaka::concepts::DeviceKind auto getDeviceKind() const
												        {
												            return alpaka::internal::getDeviceKind(this->getDevice());
												        }

												        void _()
												        {
												            static_assert(internal::concepts::Queue<element_type>);
												        }

												        std::string getName() const
												        {
												            return alpaka::internal::GetName::Op<std::decay_t<decltype(*m_queue.get())>>{}(*m_queue.get());
												        }

												        [[nodiscard]] auto getNativeHandle() const
												        {
												            return internal::getNativeHandle(*m_queue.get());
												        }

												        /** Compare two queues for identity.
												         *
												         * @param other queue to compare with
												         * @return true if both objects refer to the same backend queue, i.e. the pointers returned by get()
												         * are equal
												         */
												        bool operator==(Queue const& other) const
												        {
												            return this->get() == other.get();
												        }

												        /** Compare two queues for non-identity.
												         *
												         * @param other queue to compare with
												         * @return true if the pointers returned by get() differ
												         */
												        bool operator!=(Queue const& other) const
												        {
												            return this->get() != other.get();
												        }

												        /** Get the device of this queue.
												         *
												         * @return The device of this queue.
												         */
												        auto getDevice() const
												        {
												            return Device<T_Api, T_DeviceKind>{internal::getDevice(*m_queue.get())};
												        }

												        /** Enqueue a kernel bundle into this queue.
												         *
												         * If the executor of the launch configuration compares equal to alpaka::exec::anyExecutor, a new frame
												         * or thread specification is built from the same extents together with the default executor of the
												         * queue's device, and that specification is enqueued. Otherwise the configuration is passed on as is.
												         *
												         * @param launchCfg Thread or frame specification which provides a chunked description of the thread or
												         * frame index domain.
												         * @param kernelBundle The compute kernel and its arguments.
												         */
												        void enqueue(
												            onHost::concepts::ThreadOrFrameSpec auto const& launchCfg,
												            alpaka::concepts::KernelBundle auto const& kernelBundle) const
												        {
												            using LaunchCfg = ALPAKA_TYPEOF(launchCfg);
												            auto& queueImpl = *m_queue.get();

												            if constexpr(isFrameSpec_v<LaunchCfg> && LaunchCfg::getExecutor() == alpaka::exec::anyExecutor)
												            {
												                // frame specification without a concrete executor: use the default of the device
												                auto frameSpecWithExecutor = FrameSpec{
												                    launchCfg.getNumFrames(),
												                    launchCfg.getFrameExtents(),
												                    alpaka::onHost::defaultExecutor(internal::getDevice(queueImpl))};
												                internal::enqueue(queueImpl, frameSpecWithExecutor, kernelBundle);
												            }
												            else if constexpr(isThreadSpec_v<LaunchCfg> && LaunchCfg::getExecutor() == alpaka::exec::anyExecutor)
												            {
												                // thread specification without a concrete executor: use the default of the device
												                auto threadSpecWithExecutor = ThreadSpec{
												                    launchCfg.getNumBlocks(),
												                    launchCfg.getNumThreads(),
												                    alpaka::onHost::defaultExecutor(internal::getDevice(queueImpl))};
												                internal::enqueue(queueImpl, threadSpecWithExecutor, kernelBundle);
												            }
												            else
												            {
												                // the executor is already fixed by the caller
												                internal::enqueue(queueImpl, launchCfg, kernelBundle);
												            }
												        }

												        /** Enqueue a kernel functor to a queue.
												         *
												         * The functor and its arguments are packed into a KernelBundle which is handed to the overload taking
												         * a kernel bundle. Each argument is passed through onHost::makeAccessibleOnAcc before it is stored.
												         *
												         * @param launchCfg Thread or frame specification which provides a chunked description of the thread or
												         * frame index domain.
												         * @param f The compute kernel functor.
												         * @param args Arguments passed to the kernel functor.
												         */
												        void enqueue(onHost::concepts::ThreadOrFrameSpec auto const& launchCfg, auto const& f, auto&&... args) const
												        {
												            enqueue(launchCfg, KernelBundle{f, onHost::makeAccessibleOnAcc(ALPAKA_FORWARD(args))...});
												        }

												        /** Enqueue an operation which is executed on the host side.
												         *
												         * @attention Do NOT enqueue a task which captures the queue internally to keep the queue alive, this could
												         * lead into deadlocks. Do NOT capture @see MangedView because view actions could perform blocking operations
												         * e.g. onHost::wait() in the destructor which could lead to deadlocks too.
												         *
												         * @param task Task to be executed on the host side.
												         */
												        void enqueueHostFn(auto const& task) const
												        {
												            internal::Enqueue::HostTask<ALPAKA_TYPEOF(*m_queue.get()), ALPAKA_TYPEOF(task)>{}(*m_queue.get(), task);
												        }

												        /** Enqueue an operation which is executed asynchronously on the host side
												         *
												         * The enqueued operation will be started after all preceding tasks in the queue, but it may run after
												         * subsequent tasks in the queue. Because this task is asynchronous, it may contain vendor library functions,
												         * which may not be valid in an `enqueueHostFn` task.
												         *
												         * @param task Task to be executed asynchronously on the host side.
												         */
												        void enqueueHostFnDeferred(auto const& task) const
												        {
												            internal::Enqueue::HostTaskDeferred<ALPAKA_TYPEOF(*m_queue.get()), ALPAKA_TYPEOF(task)>{}(
												                *m_queue.get(),
												                task);
												        }

												        /** Enqueue an event.
												         *
												         * The event will be signaled after all preceding operations in the queue are finished.
												         *
												         * @param event Event that is to be enqueued into the queue of operations.
												         */
												        void enqueue(Event<Device<T_Api, T_DeviceKind>> const& event) const
												        {
												            auto& queueImpl = *m_queue.get();
												            auto& eventImpl = *event.get();
												            using EnqueueEventOp
												                = internal::Enqueue::Event<ALPAKA_TYPEOF(queueImpl), ALPAKA_TYPEOF(eventImpl)>;
												            EnqueueEventOp{}(queueImpl, eventImpl);
												        }

												        /** Let this queue wait for an event.
												         *
												         * Forwards the wrapped backend queue and the backend event to internal::waitFor.
												         * NOTE(review): the previous description ("wait until all operations in this queue are finished")
												         * did not mention the event and looks copied from a queue-wide wait. Presumably operations enqueued
												         * after this call are not started before the event is signaled - confirm against the backend
												         * implementation, including whether the caller is blocked.
												         *
												         * @param event Event this queue should wait for.
												         */
												        void waitFor(Event<Device<T_Api, T_DeviceKind>> const& event) const
												        {
												            internal::waitFor(*m_queue.get(), *event.get());
												        }

												        /** Check whether the queue has no enqueued work left to process.
												         *
												         * @attention If you enqueue work outside of alpaka by using the native handle of the queue, this
												         * function may not see these tasks and can return true even if there are unfinished tasks.
												         * If you need the guarantee that all tasks, including tasks enqueued outside of alpaka, are finished
												         * you should use onHost::wait(alpaka::concepts::HasGet auto&).
												         *
												         * @return true if there are no unfinished tasks in the queue, else false.
												         */
												        bool isEmpty() const
												        {
												            auto& queueImpl = *m_queue.get();
												            return internal::isQueueEmpty(queueImpl);
												        }
												    };

												    /** Deduction guide for constructing a Queue from a backend queue handle and a queue kind tag.
												     *
												     * The Device template argument is built from the types returned by alpaka::internal::getApi and
												     * alpaka::internal::getDeviceKind for the backend queue type T_Queue.
												     */
												    template<typename T_Queue, alpaka::concepts::QueueKind T_QueueKind>
												    Queue(Handle<T_Queue>&&, T_QueueKind) -> Queue<
												        Device<
												            ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval<T_Queue>())),
												            ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval<T_Queue>()))>,
												        T_QueueKind>;

												    /** @{
												     * @name Memory modifiers
												     *
												     * @attention For input/output memory the caller should ensure that the memory is valid until the operation is
												     * completed not until the execution handle is given back because alpaka is not extending the life-time until
												     * the operation is finished.
												     */
												    /** copy data byte wise from one to another container
												     *
												     * The extents of the copy are taken from @p dest via internal::getExtents(dest) and the call is forwarded
												     * to the overload with explicit extents.
												     *
												     * @param queue the copy will be executed after all previous work in this queue is finished
												     * @param[in,out] dest can be a container/view where the data should be written to
												     * @param[in] source can be a container/view from which the data will be copied
												     */
												    template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
												    inline void memcpy(Queue<T_Device, T_QueueKind> const& queue, auto&& dest, auto const& source)
												    {
												        memcpy(queue, ALPAKA_FORWARD(dest), source, internal::getExtents(dest));
												    }

												    /** Copy data byte wise between two containers, limited to the given extents.
												     *
												     * @param queue the copy will be executed after all previous work in this queue is finished
												     * @param[in,out] dest container/view the data is written to
												     * @param[in] source container/view the data is copied from
												     * @param extents M-dimensional data extents in elements, can be smaller than the container capacity
												     */
												    template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
												    inline void memcpy(
												        Queue<T_Device, T_QueueKind> const& queue,
												        auto&& dest,
												        auto const& source,
												        alpaka::concepts::VectorOrScalar auto const& extents)
												    {
												        // the backend operation is always called with a Vec, also if a scalar was passed
												        Vec const extentsVec = extents;
												        auto& queueImpl = *queue.get();
												        using CopyOp = internal::Memcpy::Op<
												            std::decay_t<decltype(queueImpl)>,
												            std::decay_t<decltype(dest)>,
												            std::decay_t<decltype(source)>,
												            std::decay_t<decltype(extentsVec)>>;
												        CopyOp{}(queueImpl, ALPAKA_FORWARD(dest), source, extentsVec);
												    }

												    /** Copy data byte wise from a container or host pointer into global device memory.
												     *
												     * @param queue the copy will be executed after all previous work in this queue is finished
												     * @param[in,out] dest must be device global memory on the device of the queue, the data is written to it
												     * @param[in] source container/view or host accessible pointer the data is copied from
												     */
												    template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind, typename T_Storage, typename T>
												    inline void memcpy(
												        Queue<T_Device, T_QueueKind> const& queue,
												        onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> dest,
												        auto&& source)
												    {
												        using GlobalMem = onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>;
												        auto& queueImpl = *queue.get();
												        using CopyOp = internal::MemcpyDeviceGlobal::
												            Op<std::decay_t<decltype(queueImpl)>, GlobalMem, std::decay_t<decltype(source)>>;
												        CopyOp{}(queueImpl, dest, ALPAKA_FORWARD(source));
												    }

												    /** Copy data byte wise from global device memory into a container or host pointer.
												     *
												     * @param queue the copy will be executed after all previous work in this queue is finished
												     * @param[in,out] dest container/view or host accessible pointer the data is written to
												     * @param[in] source must be device global memory on the device of the queue, the data is copied from it
												     */
												    template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind, typename T_Storage, typename T>
												    inline void memcpy(
												        Queue<T_Device, T_QueueKind> const& queue,
												        auto&& dest,
												        onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> source)
												    {
												        using GlobalMem = onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>;
												        auto& queueImpl = *queue.get();
												        using CopyOp = internal::MemcpyDeviceGlobal::
												            Op<std::decay_t<decltype(queueImpl)>, std::decay_t<decltype(dest)>, GlobalMem>;
												        CopyOp{}(queueImpl, ALPAKA_FORWARD(dest), source);
												    }

												    /** fill memory byte wise
												     *
												     * The extents are taken from @p dest via internal::getExtents(dest) and the call is forwarded to the
												     * overload with explicit extents.
												     *
												     * @param queue queue used to execute the operation
												     * @param[in,out] dest can be a container/view where the data should be written to
												     * The caller should ensure that the memory is valid until the operation is completed not until the
												     * execution handle is given back because alpaka is not extending the life-time until the operation
												     * is finished.
												     * @param byteValue value to be written to each byte
												     */
												    template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
												    inline void memset(Queue<T_Device, T_QueueKind> const& queue, auto&& dest, uint8_t byteValue)
												    {
												        memset(queue, ALPAKA_FORWARD(dest), byteValue, internal::getExtents(dest));
												    }

												    /** Fill memory byte wise, limited to the given extents.
												     *
												     * @param queue queue used to execute the operation
												     * @param[in,out] dest container/view the data is written to.
												     * The caller has to ensure that the memory stays valid until the operation is completed, not only until
												     * this function returns, because alpaka does not extend the life-time of the memory.
												     * @param byteValue value to be written to each byte
												     * @param extents M-dimensional data extents in elements, can be smaller than the container capacity
												     */
												    template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
												    inline void memset(
												        Queue<T_Device, T_QueueKind> const& queue,
												        auto&& dest,
												        uint8_t byteValue,
												        alpaka::concepts::VectorOrScalar auto const& extents)
												    {
												        // the backend operation is always called with a Vec, also if a scalar was passed
												        Vec const extentsVec = extents;
												        auto& queueImpl = *queue.get();
												        using SetOp = internal::Memset::Op<
												            std::decay_t<decltype(queueImpl)>,
												            std::decay_t<decltype(dest)>,
												            std::decay_t<decltype(extentsVec)>>;
												        SetOp{}(queueImpl, ALPAKA_FORWARD(dest), byteValue, extentsVec);
												    }

												    /** fill memory element wise
												     *
												     * The extents are taken from @p dest via internal::getExtents(dest) and the call is forwarded to the
												     * overload with explicit extents. The overload only participates if the value type of @p dest is exactly
												     * T_Value and if queue and destination report the same API type.
												     *
												     * @param queue queue used to execute the operation
												     * @param[in,out] dest can be a container/view where the data should be written to
												     * The caller should ensure that the memory is valid until the operation is completed not until the
												     * execution handle is given back because alpaka is not extending the life-time until the operation
												     * is finished.
												     * @param elementValue value to be written to each element
												     */
												    template<typename T_Value, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
												    inline void fill(Queue<T_Device, T_QueueKind> const& queue, auto&& dest, T_Value elementValue) requires(
												        std::same_as<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(dest)>, T_Value>
												        && std::same_as<ALPAKA_TYPEOF(alpaka::internal::getApi(queue)), ALPAKA_TYPEOF(alpaka::internal::getApi(dest))>)
												    {
												        fill(queue, ALPAKA_FORWARD(dest), elementValue, internal::getExtents(dest));
												    }

												    /** Fill memory element wise, limited to the given extents.
												     *
												     * The overload only participates if the value type of @p dest is exactly T_Value and if queue and
												     * destination report the same API type.
												     *
												     * @param queue queue used to execute the operation
												     * @param[in,out] dest container/view the data is written to.
												     * The caller has to ensure that the memory stays valid until the operation is completed, not only until
												     * this function returns, because alpaka does not extend the life-time of the memory.
												     * @param elementValue value to be written to each element
												     * @param extents M-dimensional data extents in elements, can be smaller than the container capacity
												     */
												    template<typename T_Value, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
												    inline void fill(
												        Queue<T_Device, T_QueueKind> const& queue,
												        auto&& dest,
												        T_Value elementValue,
												        alpaka::concepts::VectorOrScalar auto const& extents)
												        requires(
												            std::same_as<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(dest)>, T_Value>
												            && std::
												                same_as<ALPAKA_TYPEOF(alpaka::internal::getApi(queue)), ALPAKA_TYPEOF(alpaka::internal::getApi(dest))>)
												    {
												        // the backend operation is always called with a Vec, also if a scalar was passed
												        Vec const extentsVec = extents;
												        auto& queueImpl = *queue.get();
												        using FillOp = internal::Fill::Op<
												            ALPAKA_TYPEOF(queueImpl),
												            ALPAKA_TYPEOF(dest),
												            ALPAKA_TYPEOF(elementValue),
												            ALPAKA_TYPEOF(extentsVec)>;
												        FillOp{}(queueImpl, ALPAKA_FORWARD(dest), elementValue, extentsVec);
												    }

												    /** @} */

												    /** @{
												     * @name Deferred device allocations
												     */
												    /** Allocate memory that becomes accessible once the allocation is processed in the queue.
												     *
												     * Deferred allocation means that the pointer in the returned buffer is valid as soon as this function
												     * returns. It is allowed to slice the buffer or to use the encapsulated pointer for address calculations.
												     * In any case the pointer must not be dereferenced before the memory allocation is processed in the queue.
												     * All tasks performing any memory access must be executed after the memory allocation is processed in the
												     * queue. This can be achieved by waiting on the queue or by describing queue dependencies via
												     * @c waitFor(). The memory is allowed to be used in other queues too. To avoid that a view to the memory
												     * is still in use during the deallocation you can use @see addDestructorAction() and wait for a queue if
												     * it **differs** from the queue used for the allocation.
												     * The first access could have a higher latency compared to alpaka::onHost::alloc() due to the initial
												     * setup of the caching allocator used by some APIs. Subsequent accesses should have a lower latency.
												     *
												     * @attention The function is allowed to block the caller until the memory is created.
												     *
												     * @tparam T_Type type of the data elements
												     * @param queue queue handle
												     * @param extents number of elements for each dimension
												     * @return Shared buffer to the allocated memory. The buffer will be freed after the last instance to the
												     * memory is destroyed. The deallocation is performed asynchronously in the queue which is used for the
												     * allocation.
												     */
												    template<typename T_Type, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
												    inline auto allocDeferred(
												        Queue<T_Device, T_QueueKind> const& queue,
												        alpaka::concepts::VectorOrScalar auto const& extents)
												    {
												        // the backend operation is always called with a Vec, also if a scalar was passed
												        Vec const extentsVec = extents;
												        auto& queueImpl = *queue.get();
												        using AllocOp
												            = internal::AllocDeferred::Op<T_Type, std::decay_t<decltype(queueImpl)>, ALPAKA_TYPEOF(extentsVec)>;
												        return AllocOp{}(queueImpl, extentsVec);
												    }

												    /** Allocate memory with the value type and extents of another view, accessible once the allocation is
												     * processed in the queue.
												     *
												     * In any case the pointer must not be dereferenced before the memory allocation is processed in the queue.
												     * All tasks performing any memory access must be executed after the memory allocation is processed in the
												     * queue. This can be achieved by waiting on the queue or by describing queue dependencies via
												     * @c waitFor(). The memory is allowed to be used in other queues too. To avoid that a view to the memory
												     * is still in use during the deallocation you can use @see addDestructorAction() and wait for a queue if
												     * it **differs** from the queue used for the allocation.
												     * The first access could have a higher latency compared to alpaka::onHost::alloc() due to the initial
												     * setup of the caching allocator used by some APIs. Subsequent accesses should have a lower latency.
												     *
												     * @attention The function is allowed to block the caller until the memory is created.
												     *
												     * @param queue queue handle
												     * @param[in] view other memory where the value type and the extents will be derived from
												     * @return Shared buffer to the allocated memory. The buffer will be freed after the last instance to the
												     * memory is destroyed. The deallocation is performed asynchronously in the queue which is used for the
												     * allocation.
												     */
												    template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
												    inline auto allocLikeDeferred(Queue<T_Device, T_QueueKind> const& queue, auto const& view)
												    {
												        using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(view)>;
												        return allocDeferred<ValueType>(queue, internal::getExtents(view));
												    }

												    /** @} */
												} // namespace alpaka::onHost
												// ==
												// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/Queue.hpp ==
												// ============================================================================

											// #include "alpaka/onHost/concepts.hpp"    // amalgamate: file already inlined
											// #include "alpaka/onHost/internal/interface.hpp"    // amalgamate: file already inlined
											// #include "alpaka/tag.hpp"    // amalgamate: file already inlined
											// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

											// #include <bit>    // amalgamate: file already included
											// #include <climits>    // amalgamate: file already included

											namespace alpaka::onHost
											{
											    /** @brief Description of a specific device that one can schedule kernels on.
											     *
											     * @details
											     * A device is the combination of an alpaka::deviceKind::onHost::DeviceKind and an alpaka::concepts::Api,
											     * representing an entity that one can schedule work on.
											     * The object only stores a handle to the backend device. No member function modifies that handle, therefore
											     * all member functions are const-qualified and can be used on const devices.
											     *
											     * @tparam T_Api The Api powering this device.
											     * @tparam T_DeviceKind The kind of device it is.
											     */
											    template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
											    struct Device
											    {
											    private:
											        /** Handle type returned by internal::makePlatform for T_Api and T_DeviceKind. */
											        using PlatformHandle = ALPAKA_TYPEOF(internal::makePlatform(T_Api{}, T_DeviceKind{}));
											        /** Handle type returned by internal::MakeDevice::Op for the platform, derived via std::declval. */
											        using DeviceHandle = ALPAKA_TYPEOF(
											            internal::MakeDevice::Op<typename PlatformHandle::element_type>{}(
											                *std::declval<PlatformHandle>().get(),
											                0u));
											        /** Handle to the backend device wrapped by this class. */
											        DeviceHandle m_device;

											    public:
											        friend struct alpaka::internal::GetName;
											        friend struct internal::GetNativeHandle;

											        /** Alias for DeviceHandle::element_type. */
											        using element_type = typename DeviceHandle::element_type;

											        /** Access the wrapped backend device.
											         *
											         * @return result of `m_device.get()`
											         */
											        auto get() const
											        {
											            return m_device.get();
											        }

											        /** Wrap a backend device handle.
											         *
											         * `Handle<T_Device>&&` is a plain rvalue reference and not a forwarding reference, therefore the handle
											         * is moved into the member with std::move.
											         *
											         * @param internalDeviceHandle handle of the backend device, it is moved into this object
											         */
											        template<typename T_Device>
											        Device(Handle<T_Device>&& internalDeviceHandle) : m_device{std::move(internalDeviceHandle)}
											        {
											        }

											        /** Compile-time check that element_type satisfies internal::concepts::Device.
											         *
											         * NOTE(review): the body of a member function of a class template is only instantiated when the
											         * function is used, so this assertion is presumably not evaluated unless `_()` is referenced
											         * somewhere - confirm that this is intended.
											         */
											        void _()
											        {
											            static_assert(internal::concepts::Device<element_type>);
											        }

											        /** Get the name of this device.
											         *
											         * @return string produced by alpaka::internal::GetName::Op for the wrapped backend device
											         */
											        std::string getName() const
											        {
											            return alpaka::internal::GetName::Op<std::decay_t<decltype(*m_device.get())>>{}(*m_device.get());
											        }

											        /** Get the native handle of the wrapped backend device.
											         *
											         * @return result of internal::getNativeHandle, the type is deduced from that call
											         */
											        [[nodiscard]] auto getNativeHandle() const
											        {
											            return internal::getNativeHandle(*m_device.get());
											        }

											        /** @return true if both objects refer to the same backend device (the results of get() are equal) */
											        bool operator==(Device const& other) const
											        {
											            return this->get() == other.get();
											        }

											        /** @return true if the results of get() differ */
											        bool operator!=(Device const& other) const
											        {
											            return this->get() != other.get();
											        }

											        /** Create a queue for this device.
											         *
											         * @attention If you call this method multiple times it is allowed that you always get the same handle
											         * back. There is no guarantee that you will get independent queues.
											         *
											         * Enqueuing tasks into two different queues does not guarantee that these tasks run in parallel.
											         * Running tasks from different queues sequentially is valid behavior. Enqueuing into two individual
											         * queues only signifies that the tasks are independent of each other and their order of execution is
											         * independent.
											         *
											         * @param kind
											         *   Blocking behaviour:
											         *    - queueKind::nonBlocking (default): enqueue returns immediately; completion of the enqueued operation
											         * must be ensured via onHost::wait(queue) or by enqueuing dependent operations onto the same queue.
											         *    - queueKind::blocking: each enqueue only returns after the operation is complete and its effects are
											         * host-visible.
											         *
											         * @return A onHost::Queue that tasks and memory operations can be enqueued on.
											         */
											        auto makeQueue(alpaka::concepts::QueueKind auto kind) const
											        {
											            return Queue{
											                internal::MakeQueue::Op<ALPAKA_TYPEOF(*m_device.get()), ALPAKA_TYPEOF(kind)>{}(*m_device.get(), kind),
											                kind};
											        }

											        /** Create a queue of kind queueKind::nonBlocking for this device.
											         *
											         * @return result of makeQueue(queueKind::nonBlocking)
											         */
											        auto makeQueue() const
											        {
											            return makeQueue(queueKind::nonBlocking);
											        }

											        /** Create an event for this device.
											         *
											         * @return Event wrapping the result of internal::MakeEvent::Op for the wrapped backend device
											         */
											        auto makeEvent() const
											        {
											            return Event{internal::MakeEvent::Op<std::decay_t<decltype(*m_device.get())>>{}(*m_device.get())};
											        }

											        /** Blocks the caller until the device has executed all work
											         *
											         * Forwards to internal::wait for the wrapped backend device.
											         */
											        void wait() const
											        {
											            internal::wait(*m_device.get());
											        }

											        /** Properties of a given device
											         *
											         * @attention Currently only a handful of entries is available. The object will be refactored soon and will
											         * become most likely a compile time dictionary to support optional entries.
											         *
											         * @return properties reported by internal::GetDeviceProperties::Op for the wrapped backend device
											         */
											        inline DeviceProperties getDeviceProperties() const
											        {
											            return internal::GetDeviceProperties::Op<ALPAKA_TYPEOF(*m_device.get())>{}(*m_device.get());
											        }

											        /** Query the free global memory of the device.
											         *
											         * @return value reported by internal::GetFreeGlobalMemBytes::Op, going by the name a size in bytes
											         */
											        size_t getFreeGlobalMemBytes() const
											        {
											            return internal::GetFreeGlobalMemBytes::Op<ALPAKA_TYPEOF(*m_device.get())>{}(*m_device.get());
											        }

											        /** @return default constructed instance of the tag type T_DeviceKind */
											        constexpr alpaka::concepts::DeviceKind auto getDeviceKind() const
											        {
											            return T_DeviceKind{};
											        }

											        /** @return default constructed instance of the tag type T_Api */
											        constexpr alpaka::concepts::Api auto getApi() const
											        {
											            return T_Api{};
											        }
											    };

											    namespace concepts
											    {
											        /** @brief Concept to check if something is a device.
											         *
											         * @details
											         * This concept checks for specializations of alpaka::onHost::Device. For more information on devices in
											         * alpaka, refer to the class documentation.
											         * It is implemented with alpaka::concepts::SpecializationOf, so the check is purely based on the type
											         * being an instance of the onHost::Device template.
											         *
											         * @tparam T_Device type to check
											         */
											        template<typename T_Device>
											        concept Device = alpaka::concepts::SpecializationOf<T_Device, onHost::Device>;
											    } // namespace concepts

											    /** Deduction guide for constructing a Device from a backend device handle.
											     *
											     * The template arguments are the types returned by alpaka::internal::getApi and
											     * alpaka::internal::getDeviceKind for the backend device type T_Device.
											     */
											    template<typename T_Device>
											    Device(Handle<T_Device>&&) -> Device<
											        ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval<T_Device>())),
											        ALPAKA_TYPEOF(alpaka::internal::getDeviceKind(std::declval<T_Device>()))>;

											    /** @{
											     * @name Device allocations
											     */
											    /** Allocate memory on the given device.
											     *
											     * @tparam T_Type type of the data elements
											     * @param device device handle
											     * @param extents number of elements for each dimension
											     * @return memory owning view to the allocated memory
											     */
											    template<typename T_Type>
											    inline auto alloc(concepts::Device auto const& device, alpaka::concepts::VectorOrScalar auto const& extents)
											    {
											        // the backend operation is always called with a Vec, also if a scalar was passed
											        Vec const extentsVec = extents;
											        auto& deviceImpl = *device.get();
											        using AllocOp = internal::Alloc::Op<T_Type, std::decay_t<decltype(deviceImpl)>, ALPAKA_TYPEOF(extentsVec)>;
											        return AllocOp{}(deviceImpl, extentsVec);
											    }

											    /** Allocate unified virtual memory on the given device.
											     *
											     * This memory can be accessed from all devices with the same Api and device kind. Depending on the backend,
											     * e.g. OneApi, the memory can be accessed by devices of another device kind if they are using the same
											     * native context. It is not allowed to access the data on two devices at the same time, this must be
											     * avoided by explicit synchronizations.
											     * Unified memory follows the rules of UVM memory of the device backend e.g. CUDA, HIP, ...
											     *
											     * @tparam T_Type type of the data elements
											     * @param device device handle
											     * @param extents number of elements for each dimension
											     * @return Managed view to the allocated memory
											     */
											    template<typename T_Type>
											    inline auto allocUnified(concepts::Device auto const& device, alpaka::concepts::VectorOrScalar auto const& extents)
											    {
											        // the backend operation is always called with a Vec, also if a scalar was passed
											        Vec const extentsVec = extents;
											        auto& deviceImpl = *device.get();
											        using AllocOp
											            = internal::AllocUnified::Op<T_Type, std::decay_t<decltype(deviceImpl)>, ALPAKA_TYPEOF(extentsVec)>;
											        return AllocOp{}(deviceImpl, extentsVec);
											    }

											    /** Allocate unified memory on the device associated with the given queue.
											     *
											     * This memory can be accessed from all devices with the same Api and device kind. Depending on the backend,
											     * e.g. OneApi, the memory can be accessed by devices of another device kind if they are using the same
											     * native context. It is not allowed to access the data on two devices at the same time, this must be
											     * avoided by explicit synchronizations.
											     * Unified memory follows the rules of UVM memory of the device backend e.g. CUDA, HIP, ...
											     *
											     * @ingroup foo
											     *
											     * @tparam T_Type type of the data elements
											     * @param queue queue handle
											     * @param extents number of elements for each dimension
											     * @return result of internal::AllocUnified::Op for the device of the queue
											     */
											    template<typename T_Type, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
											    inline auto allocUnified(
											        Queue<T_Device, T_QueueKind> const& queue,
											        alpaka::concepts::VectorOrScalar auto const& extents)
											    {
											        // the backend operation is always called with a Vec, also if a scalar was passed
											        Vec const extentsVec = extents;
											        // hold the device wrapper in a named object so that it stays alive during the allocation call
											        auto const device = queue.getDevice();
											        using AllocOp
											            = internal::AllocUnified::Op<T_Type, std::decay_t<decltype(*device.get())>, ALPAKA_TYPEOF(extentsVec)>;
											        return AllocOp{}(*device.get(), extentsVec);
											    }

											    /** Allocate pinned host memory which is mapped into the address space of the device
											     *
											     * Mapped memory is located on the host and is transferred for each access via the PCIe/Nvlink bus. The
											     * performance on the device is mostly poor. Mapped memory should be used for host memory if you transfer
											     * memory between host and device via `onHost::memcpy()` because the transfer will be optimized for latency
											     * and performance.
											     *
											     * @tparam T_Type type of the data elements
											     * @param device device handle
											     * @param extents number of elements for each dimension
											     * @return result of internal::AllocMapped::Op for the device and the extents
											     */
											    template<typename T_Type>
											    inline auto allocMapped(concepts::Device auto const& device, alpaka::concepts::VectorOrScalar auto const& extents)
											    {
											        // scalar or vector extents are turned into a Vec before dispatching
											        Vec const extentsVec = extents;
											        using DeviceImpl = std::decay_t<decltype(*device.get())>;
											        using AllocOp = internal::AllocMapped::Op<T_Type, DeviceImpl, ALPAKA_TYPEOF(extentsVec)>;
											        return AllocOp{}(*device.get(), extentsVec);
											    }

											    /** Allocate pinned host memory which is mapped into the address space of the queue's device
											     *
											     * Mapped memory is located on the host and is transferred for each access via the PCIe/Nvlink bus. The
											     * performance on the device is mostly poor. Mapped memory should be used for host memory if you transfer
											     * memory between host and device via `onHost::memcpy()` because the transfer will be optimized for latency
											     * and performance.
											     *
											     * @tparam T_Type type of the data elements
											     * @param queue queue handle, the device is taken from `queue.getDevice()`
											     * @param extents number of elements for each dimension
											     * @return the result of the device overload of allocMapped()
											     */
											    template<typename T_Type, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
											    inline auto allocMapped(
											        Queue<T_Device, T_QueueKind> const& queue,
											        alpaka::concepts::VectorOrScalar auto const& extents)
											    {
											        auto&& queueDevice = queue.getDevice();
											        return allocMapped<T_Type>(queueDevice, extents);
											    }

											    /** Allocate memory on the given device with the value type and extents of a view
											     *
											     * Value type and extents of the new memory are derived from the view.
											     * The content of the view is NOT copied into the newly allocated memory.
											     *
											     * @param device device handle
											     * @param[in] view memory where the properties will be derived from
											     *
											     * @return memory owning view to the allocated memory
											     */
											    inline auto allocLike(concepts::Device auto const& device, auto const& view)
											    {
											        using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(view)>;
											        return alloc<ValueType>(device, internal::getExtents(view));
											    }

											    ///@}

											    /** Check if the given view is accessible on the given device
											     *
											     * Two checks are combined with a short-circuit OR: internal::IsDataAccessible::FirstPath on the device and
											     * the view, and internal::IsDataAccessible::SecondPath on the api of the view, the device kind of the device
											     * and the view.
											     *
											     * @param device device handle
											     * @param view memory where properties will be derived from
											     * @return true if the view is accessible on the device, false otherwise.
											     * alpaka can not detect all memory access types therefore the result can be false even if the memory is
											     * accessible because the view was allocated with a UVM allocator.
											     */
											    inline bool isDataAccessible(concepts::Device auto const& device, alpaka::concepts::IView auto const& view)
											    {
											        using ViewType = ALPAKA_TYPEOF(view);
											        using FirstPath = internal::IsDataAccessible::FirstPath<ALPAKA_TYPEOF(*device.get()), ViewType>;
											        using SecondPath = internal::IsDataAccessible::
											            SecondPath<ALPAKA_TYPEOF(getApi(view)), ALPAKA_TYPEOF(getDeviceKind(device)), ViewType>;

											        // the second path is only evaluated if the first path reports "not accessible"
											        return FirstPath{}(*device.get(), view) || SecondPath{}(getApi(view), getDeviceKind(device), view);
											    }

											    /** Check if the given view is accessible on the device of the given queue
											     *
											     * @param queue queue handle, the device is taken from `queue.getDevice()`
											     * @param view memory where properties will be derived from
											     * @return the result of the device overload of isDataAccessible() for the device of the queue
											     */
											    template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
											    inline bool isDataAccessible(Queue<T_Device, T_QueueKind> const& queue, alpaka::concepts::IView auto const& view)
											    {
											        // forward to the device overload so that the two-path check is implemented only once
											        return isDataAccessible(queue.getDevice(), view);
											    }

											    /** Provides a frame specification to operate on a given index range
											     *
											     * @param device device handle
											     * @param executor executor the frame specification is queried for; if it compares equal to
											     *        exec::anyExecutor the result of defaultExecutor(device) is used instead
											     * @param extents size of the index range
											     * @return frame specification
											     */
											    template<typename T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
											    inline constexpr concepts::FrameSpec auto getFrameSpec(
											        Device<T_Api, T_DeviceKind> const& device,
											        alpaka::concepts::Executor auto executor,
											        alpaka::concepts::VectorOrScalar auto const& extents)
											    {
											        if constexpr(!(executor == exec::anyExecutor))
											        {
											            // a concrete executor was passed, use it as is
											            return internal::getFrameSpec(*device.get(), executor, extents);
											        }
											        else
											        {
											            // resolve exec::anyExecutor to the default executor of the device
											            auto deviceDefaultExecutor = defaultExecutor(device);
											            return internal::getFrameSpec(*device.get(), deviceDefaultExecutor, extents);
											        }
											    }

											    /** Provides a SIMD oriented frame specification to operate on a given index range
											     *
											     * The frame specification will be optimized for SIMD executions in the highest dimension
											     * for a flat non-hierarchical execution via onAcc::worker::threadsInGrid.
											     * Do not use this function for kernels using hierarchical thread parallelism, in many cases the frame
											     * specification depends on the outer parallelism in the kernel.
											     *
											     * @tparam T_DataType the data type for which you would like to SIMD optimize
											     * @param device device handle
											     * @param executor executor the frame specification is queried for; if it compares equal to
											     *        exec::anyExecutor the result of defaultExecutor(device) is used instead
											     * @param extents number of elements for each dimension of the type T_DataType
											     * @return frame specification
											     */
											    template<typename T_DataType, typename T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
											    inline constexpr concepts::FrameSpec auto getSimdFrameSpec(
											        Device<T_Api, T_DeviceKind> const& device,
											        alpaka::concepts::Executor auto executor,
											        alpaka::concepts::VectorOrScalar auto const& extents)
											    {
											        if constexpr(!(executor == exec::anyExecutor))
											        {
											            // a concrete executor was passed, use it as is
											            return internal::getSimdFrameSpec<T_DataType>(*device.get(), executor, extents);
											        }
											        else
											        {
											            // resolve exec::anyExecutor to the default executor of the device
											            auto deviceDefaultExecutor = defaultExecutor(device);
											            return internal::getSimdFrameSpec<T_DataType>(*device.get(), deviceDefaultExecutor, extents);
											        }
											    }
											} // namespace alpaka::onHost
											// ==
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/Device.hpp ==
											// ============================================================================

											// ============================================================================
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/DeviceSpec.hpp ==
											// ==
											/* Copyright 2024 René Widera
											 * SPDX-License-Identifier: MPL-2.0
											 */

											// #pragma once
											// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
												// ============================================================================
												// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/hwloc/hwlocConfig.hpp ==
												// ==
												/* Copyright 2026 René Widera
												 * SPDX-License-Identifier: MPL-2.0
												 */

												// #pragma once
												// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined


												// hwloc detection: unless ALPAKA_DISABLE_HWLOC is defined, <hwloc.h> is included if it can be found and
												// ALPAKA_HAS_HWLOC is defined to 1 (a value already defined by the user is kept).
												#if !defined(ALPAKA_DISABLE_HWLOC)
												#    if __has_include(<hwloc.h>)
												#        include <hwloc.h>
												#        if !defined(ALPAKA_HAS_HWLOC)
												#            define ALPAKA_HAS_HWLOC 1
												#        endif
												#    endif
												#endif

												// In case it is not already set, set it to disabled, to ensure that this header is included wherever the macro
												// is used. If this header is not included, the compiler flag `-Wundef` will show an error.
												#if !defined(ALPAKA_HAS_HWLOC)
												#    define ALPAKA_HAS_HWLOC 0
												#endif
												// ==
												// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/hwloc/hwlocConfig.hpp ==
												// ============================================================================

											// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
											// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

											// #include <tuple>    // amalgamate: file already included

											namespace alpaka::onHost
											{
											    /** @brief Combination of an API and a device kind
											     *
											     * @details
											     * A device specification pairs an API with a device kind. Multiple instances of alpaka::onHost::Device can
											     * exist for the same device specification, for example in the form of multiple GPUs of the same type in one
											     * system.
											     *
											     * To check whether a specific combination is valid, i.e., whether an API can target a device kind, the static
											     * isValid() method can be used.
											     *
											     * @tparam T_Api type of the API
											     * @tparam T_DeviceKind type of the device kind
											     */
											    template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
											    struct DeviceSpec
											    {
											    public:
											        constexpr DeviceSpec() = default;

											        /** Create a device specification from an API object and a device kind object
											         *
											         * @param api API part of the specification
											         * @param deviceType device kind part of the specification
											         */
											        constexpr DeviceSpec(T_Api api, T_DeviceKind deviceType) : m_api(api), m_deviceType(deviceType)
											        {
											        }

											        /** @return copy of the stored API object */
											        constexpr T_Api getApi() const
											        {
											            return m_api;
											        }

											        /** @return copy of the stored device kind object */
											        constexpr T_DeviceKind getDeviceKind() const
											        {
											            return m_deviceType;
											        }

											        /** @return name of the API and name of the device kind, separated by a single space */
											        std::string getName() const
											        {
											            return m_api.getName() + " " + m_deviceType.getName();
											        }

											        /** Checks if the device kind and api combination is valid
											         *
											         * Reasons why a combination is invalid can be that the api does not know how to talk to a device or that
											         * the required dependencies e.g. CUDA, HIP, or OneApi are not fulfilled.
											         *
											         * @return the value of trait::IsDeviceSupportedBy::Op<T_DeviceKind, T_Api>::value if that expression is
											         *         well-formed, else false
											         */
											        static constexpr bool isValid()
											        {
											            constexpr bool hasSupportTrait
											                = requires { trait::IsDeviceSupportedBy::Op<T_DeviceKind, T_Api>::value; };
											            if constexpr(hasSupportTrait)
											            {
											                return trait::IsDeviceSupportedBy::Op<T_DeviceKind, T_Api>::value;
											            }
											            else
											            {
											                return false;
											            }
											        }

											    private:
											        T_Api m_api;
											        T_DeviceKind m_deviceType;
											    };

											    /** Tuple of all enabled device specifications
											     *
											     * The tuple is assembled with std::tuple_cat from one single-element tuple per enabled combination.
											     *
											     * - device specifications can be dis-/enabled by the CMake options alpaka_<API>_<DeviceKindType>
											     * - the second way to disable a device specification is to define the preprocessor define
											     *   ALPAKA_DISABLE_<ApiType>_<DeviceKindType>, else the device specification is enabled
											     * - besides the Host/Cpu entry, each entry additionally requires its feature macro to be non-zero
											     *   (ALPAKA_HAS_HWLOC, ALPAKA_LANG_ONEAPI, ALPAKA_LANG_CUDA or ALPAKA_LANG_HIP)
											     */
											    constexpr auto enabledDeviceSpecs = std::tuple_cat(
											        std::tuple<>{}
											#if !defined(ALPAKA_DISABLE_Host_Cpu)
											        ,
											        std::tuple{DeviceSpec{api::host, deviceKind::cpu}}
											#endif
											#if !defined(ALPAKA_DISABLE_Host_NumaCpu) && ALPAKA_HAS_HWLOC
											        ,
											        std::tuple{DeviceSpec{api::host, deviceKind::numaCpu}}
											#endif
											#if !defined(ALPAKA_DISABLE_OneApi_IntelGpu) && ALPAKA_LANG_ONEAPI
											        ,
											        std::tuple{DeviceSpec{api::oneApi, deviceKind::intelGpu}}
											#endif
											#if !defined(ALPAKA_DISABLE_OneApi_NvidiaGpu) && ALPAKA_LANG_ONEAPI
											        ,
											        std::tuple{DeviceSpec{api::oneApi, deviceKind::nvidiaGpu}}
											#endif
											#if !defined(ALPAKA_DISABLE_OneApi_AmdGpu) && ALPAKA_LANG_ONEAPI
											        ,
											        std::tuple{DeviceSpec{api::oneApi, deviceKind::amdGpu}}
											#endif
											#if !defined(ALPAKA_DISABLE_OneApi_Cpu) && ALPAKA_LANG_ONEAPI
											        ,
											        std::tuple{DeviceSpec{api::oneApi, deviceKind::cpu}}
											#endif
											#if !defined(ALPAKA_DISABLE_Cuda_NvidiaGpu) && ALPAKA_LANG_CUDA
											        ,
											        std::tuple{DeviceSpec{api::cuda, deviceKind::nvidiaGpu}}
											#endif
											#if !defined(ALPAKA_DISABLE_Hip_AmdGpu) && ALPAKA_LANG_HIP
											        ,
											        std::tuple{DeviceSpec{api::hip, deviceKind::amdGpu}}
											#endif
											    );

											    namespace concepts
											    {
											        /** Satisfied if T_Candidate is a specialization of alpaka::onHost::DeviceSpec
											         *
											         * The check is delegated to alpaka::concepts::SpecializationOf.
											         */
											        template<typename T_Candidate>
											        concept DeviceSpec = alpaka::concepts::SpecializationOf<T_Candidate, onHost::DeviceSpec>;
											    } // namespace concepts

											} // namespace alpaka::onHost
											// ==
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/DeviceSpec.hpp ==
											// ============================================================================

										// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

										namespace alpaka::onHost
										{
										    /** Gives access to the devices of one API and device kind combination
										     *
										     * The selector stores the platform created by internal::makePlatform() and the device specification it was
										     * created from. Instantiating the class for a combination where DeviceSpec<T_Api, T_DeviceKind>::isValid()
										     * is false triggers a static assertion.
										     *
										     * @tparam T_Api type of the API
										     * @tparam T_DeviceKind type of the device kind
										     */
										    template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
										    struct DeviceSelector
										    {
										    public:
										        static_assert(
										            DeviceSpec<T_Api, T_DeviceKind>::isValid(),
										            "Invalid combination of device kind and api. The api does not know how to talk to the device or the "
										            "required dependencies to enable the api are not fulfilled.");

										        /** Create a selector from a device specification
										         *
										         * @param deviceSpec API and device kind the platform is created for
										         */
										        constexpr DeviceSelector(DeviceSpec<T_Api, T_DeviceKind> deviceSpec)
										            : m_platform(internal::makePlatform(deviceSpec.getApi(), deviceSpec.getDeviceKind()))
										            , m_deviceSpec(deviceSpec)
										        {
										        }

										        /** Create a selector from an API object and a device kind object
										         *
										         * Delegates to the constructor taking a device specification.
										         */
										        constexpr DeviceSelector(T_Api api, T_DeviceKind devType) : DeviceSelector(DeviceSpec{api, devType})
										        {
										        }

										        /** Get the number of available devices for the given api and device kind.
										         *
										         * @attention In case the compiler flags you used to build your application were wrong, kernels for the
										         * given deviceKind cannot be built and the number of available devices will be zero. This can happen,
										         * e.g., if you compile for OneAPI SYCL: you can compile the application, but whether you can run on a
										         * device is evaluated at runtime.
										         *
										         * @return number of devices
										         */
										        uint32_t getDeviceCount() const
										        {
										            using PlatformImpl = ALPAKA_TYPEOF(*m_platform.get());
										            return internal::GetDeviceCount::Op<PlatformImpl>{}(*m_platform.get());
										        }

										        /** Check whether at least one device is available
										         *
										         * @return true if getDeviceCount() is not zero
										         */
										        bool isAvailable() const
										        {
										            return getDeviceCount() > 0u;
										        }

										        /** Query the properties of a device
										         *
										         * @param idx device index
										         * @return properties reported by internal::GetDeviceProperties::Op for the platform and index
										         */
										        DeviceProperties getDeviceProperties(uint32_t idx) const
										        {
										            using PlatformImpl = ALPAKA_TYPEOF(*m_platform.get());
										            return internal::GetDeviceProperties::Op<PlatformImpl>{}(*m_platform.get(), idx);
										        }

										        /** Get a device
										         *
										         * @param idx device index (range [0;number of devices), invalid index will throw an exception
										         * @return @see onHost::Device
										         */
										        auto makeDevice(uint32_t idx)
										        {
										            using PlatformImpl = ALPAKA_TYPEOF(*m_platform.get());
										            return Device{internal::MakeDevice::Op<PlatformImpl>{}(*m_platform.get(), idx)};
										        }

										    private:
										        // platform handle type as returned by internal::makePlatform() for this api and device kind
										        ALPAKA_TYPEOF(internal::makePlatform(T_Api{}, T_DeviceKind{})) m_platform;
										        DeviceSpec<T_Api, T_DeviceKind> m_deviceSpec;
										    };

										    /** Create an object to get access to devices
										     *
										     * @param deviceSpec API and device kind combination the selector is created for
										     * @return DeviceSelector for the given device specification
										     */
										    template<typename T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
										    inline auto makeDeviceSelector(DeviceSpec<T_Api, T_DeviceKind> deviceSpec)
										    {
										        return DeviceSelector<T_Api, T_DeviceKind>{deviceSpec};
										    }

										    /** Create an object to get access to devices
										     *
										     * @param api API used to talk to the devices
										     * @param deviceTag device kind of the devices
										     * @return DeviceSelector for the given api and device kind
										     */
										    inline auto makeDeviceSelector(alpaka::concepts::Api auto api, alpaka::concepts::DeviceKind auto deviceTag)
										    {
										        using Selector = DeviceSelector<decltype(api), decltype(deviceTag)>;
										        return Selector{api, deviceTag};
										    }

										    /** Create the device with index 0 for api::Host and deviceKind::cpu
										     *
										     * @tparam deferEvaluation only used to make the API type dependent on a template parameter; both branches of
										     *         the conditional select api::Host, therefore the argument has no influence on the result
										     * @return device created by DeviceSelector::makeDevice(0)
										     */
										    template<typename deferEvaluation = void>
										    inline auto makeHostDevice()
										    {
										        using HostApi = std::conditional_t<std::is_same_v<deferEvaluation, bool>, api::Host, api::Host>;
										        auto hostSelector = DeviceSelector{HostApi{}, deviceKind::cpu};
										        return hostSelector.makeDevice(0);
										    }
										} // namespace alpaka::onHost
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/DeviceSelector.hpp ==
										// ============================================================================

									// #include "alpaka/onHost/concepts.hpp"    // amalgamate: file already inlined
									// #include "alpaka/tag.hpp"    // amalgamate: file already inlined
									// #include "alpaka/trait.hpp"    // amalgamate: file already inlined

									/** Functionality which is usable on the host CPU controller thread. */
									namespace alpaka::onHost
									{
									    /** @{
									     * @name Query extents
									     */
									    /** Object extents
									     *
									     * Passes the argument via ALPAKA_FORWARD to internal::getExtents() and returns its result as is.
									     *
									     * @param any can be a std::vector, std::array, ...
									     * @return the extents of the object
									     */
									    inline decltype(auto) getExtents(auto&& any)
									    {
									        return internal::getExtents(ALPAKA_FORWARD(any));
									    }

									    /** Handle extents
									     *
									     * The handle is dereferenced via `*handle.get()` and the referenced object is passed to
									     * internal::getExtents().
									     *
									     * @param handle can be a view, a data
									     * @return the extents of the object the handle refers to
									     */
									    inline decltype(auto) getExtents(alpaka::concepts::HasGet auto&& handle)
									    {
									        return internal::getExtents(*handle.get());
									    }

									    /** @} */

									    /** @{
									     * @name Query multi-dimensional pitches
									     */
									    /** Object pitches
									     *
									     * Passes the argument via ALPAKA_FORWARD to internal::getPitches() and returns its result as is.
									     *
									     * @param any can be a std::vector, std::array, ...
									     * @return Multidimensional value with number of bytes to jump to the next value within the corresponding
									     *         dimension.
									     *         The inner-most dimension (x) is sizeof(value_type), the next dimension (y) is the byte-stride of a
									     *         full row including padding, and so on for higher dimensions.
									     *         Given an ND index, the element-wise product with the pitches summed (dot product) yields the byte
									     *         offset from the start of the buffer to that data element.
									     */
									    inline decltype(auto) getPitches(auto&& any)
									    {
									        return internal::getPitches(ALPAKA_FORWARD(any));
									    }

									    /** Handle pitches
									     *
									     * The handle is dereferenced via `*handle.get()` and the referenced object is passed to
									     * internal::getPitches().
									     *
									     * @param handle can be a view, a data
									     * @return Multidimensional value with number of bytes to jump to the next value within the corresponding
									     *         dimension.
									     *         The inner-most dimension (x) is sizeof(value_type), the next dimension (y) is the byte-stride of a
									     *         full row including padding, and so on for higher dimensions.
									     *         Given an ND index, the element-wise product with the pitches summed (dot product) yields the byte
									     *         offset from the start of the buffer to that data element.
									     */
									    inline decltype(auto) getPitches(alpaka::concepts::HasGet auto&& handle)
									    {
									        return internal::getPitches(*handle.get());
									    }

									    /** @} */

									    /** @{
									     * @name Query the name
									     */

									    /** Compile-time available name for a given object.
									     *
									     * The query is dispatched to alpaka::internal::GetStaticName::Op for the type of the object.
									     *
									     * @param any object whose name shall be queried
									     * @return a `std::string`-compatible value holding the static name
									     */
									    inline std::convertible_to<std::string> auto getStaticName(auto const& any)
									    {
									        using StaticNameOp = alpaka::internal::GetStaticName::Op<ALPAKA_TYPEOF(any)>;
									        return StaticNameOp{}(any);
									    }

									    /** Compile-time available name of a handle
									     *
									     * The handle is dereferenced via `*handle.get()` and the query is dispatched to
									     * alpaka::internal::GetStaticName::Op for the type of the referenced object.
									     *
									     * @param handle object whose name shall be queried
									     * @return a `std::string`-compatible value holding the static name
									     */
									    inline std::convertible_to<std::string> auto getStaticName(concepts::StaticNameHandle auto const& handle)
									    {
									        using Referenced = std::decay_t<decltype(*handle.get())>;
									        using StaticNameOp = alpaka::internal::GetStaticName::Op<Referenced>;
									        return StaticNameOp{}(*handle.get());
									    }

									    /** Runtime name for a given object.
									     *
									     * The query is dispatched to alpaka::internal::GetName::Op for the type of the object.
									     *
									     * @param any object whose name shall be queried
									     * @return a `std::string`-compatible value holding the name
									     */
									    inline std::convertible_to<std::string> auto getName(auto&& any)
									    {
									        using NameOp = alpaka::internal::GetName::Op<ALPAKA_TYPEOF(any)>;
									        return NameOp{}(ALPAKA_FORWARD(any));
									    }

									    /** Runtime name for a given handle.
									     *
									     * The handle is dereferenced via `*handle.get()` and the query is dispatched to
									     * alpaka::internal::GetName::Op for the type of the referenced object.
									     *
									     * @param handle object whose name shall be queried
									     * @return a `std::string`-compatible value holding the name
									     */
									    inline std::convertible_to<std::string> auto getName(concepts::NameHandle auto const& handle)
									    {
									        using Referenced = std::decay_t<decltype(*handle.get())>;
									        using NameOp = alpaka::internal::GetName::Op<Referenced>;
									        return NameOp{}(*handle.get());
									    }

									    /** @} */

									    /** Get the native handle of a handle.
									     *
									     * The native handle can be passed to the underlying backend API
									     * (e.g. CUDA, HIP, OpenMP) for low-level operations.
									     * The handle is dereferenced via `*handle.get()` and the referenced object is passed to
									     * internal::getNativeHandle().
									     *
									     * @param handle object exposing a native handle
									     * @return the native handle returned by the backend-specific implementation
									     */
									    inline auto getNativeHandle(auto const& handle)
									    {
									        return internal::getNativeHandle(*handle.get());
									    }

									    /** Wait for all work to be finished
									     *
									     * Waits until all work submitted to the handle before this call has finished.
									     * The handle is dereferenced via `*handle.get()` and the referenced object is passed to internal::wait().
									     *
									     * @param handle queue/device/event
									     */
									    inline void wait(alpaka::concepts::HasGet auto& handle)
									    {
									        internal::wait(*handle.get());
									    }

									    /** @{
									     * @name Query raw pointer
									     */
									    /** Pointer to the data of an object
									     *
									     * For multi-dimensional data the data is not required to be contiguous.
									     * Passes the argument via ALPAKA_FORWARD to internal::Data::data() and returns its result as is.
									     *
									     * @param any object providing data access (e.g. std::vector)
									     * @return raw pointer to the underlying data (equivalent to `std::data`)
									     */
									    inline decltype(auto) data(auto&& any)
									    {
									        return internal::Data::data(ALPAKA_FORWARD(any));
									    }

									    /** Pointer to the data of a handle
									     *
									     * For multi-dimensional data the data is not required to be contiguous.
									     * The handle is dereferenced via `*handle.get()` and the referenced object is passed to
									     * internal::Data::data().
									     *
									     * @param handle handle providing data access (e.g. view)
									     * @return raw pointer to the underlying data
									     */
									    inline decltype(auto) data(alpaka::concepts::HasGet auto&& handle)
									    {
									        return internal::Data::data(*handle.get());
									    }

									    /** @} */

									    /** @{
									     * @name Host allocations
									     */
									    /** Allocate host memory for a given element type and extents.
									     *
									     * The allocation is performed on the host controller device
									     * (`api::host` and `deviceKind::cpu`) obtained from makeHostDevice().
									     * The returned view owns the allocated memory.
									     *
									     * @tparam T_ValueType type of the data elements
									     * @param extents number of elements per dimension (vector or scalar)
									     * @return a view owning the newly allocated memory
									     */
									    template<typename T_ValueType>
									    inline auto allocHost(alpaka::concepts::VectorOrScalar auto const& extents)
									    {
									        auto device = makeHostDevice<T_ValueType>();
									        // scalar or vector extents are turned into a Vec before dispatching
									        Vec const extentsVec = extents;
									        using DeviceImpl = std::decay_t<decltype(*device.get())>;
									        using AllocOp = internal::Alloc::Op<T_ValueType, DeviceImpl, ALPAKA_TYPEOF(extentsVec)>;
									        return AllocOp{}(*device.get(), extentsVec);
									    }

									    /** Allocate host memory with the same value type and extents as an existing view.
									     *
									     * The content of the source view is **not** copied. The function deduces the
									     * element type and extents from `view` and creates a new shared buffer on the
									     * host controller device.
									     *
									     * @param view a view (e.g. `std::vector`, `std::array`, or any compatible type)
									     * @return a view owning the newly allocated memory
									     */
									    inline auto allocHostLike(auto const& view)
									    {
									        auto device = makeHostDevice<ALPAKA_TYPEOF(view)>();
									        return alloc<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(view)>>(device, internal::getExtents(view));
									    }

									    /** @} */

									    /** @{
									     * @name Device selection utilities
									     */
									    /** Resolve the list of executors supported for a device specification.
									     *
									     * This helper is used internally to build backend dictionaries.
									     * This overload is selected if `isValid()` of the device specification type is true. The result type is
									     * derived from supportedExecutors() for the device type of the specification; the arguments are only used
									     * in unevaluated contexts and the result is a default-constructed object of that type.
									     *
									     * @param deviceSpec      device specification to be used
									     * @param listOfExecutors tuple of executor types to be filtered
									     * @return a tuple containing the supported executor types
									     *
									     * @{
									     */
									    constexpr auto getExecutorsList(auto const deviceSpec, auto const listOfExecutors)
									        requires(ALPAKA_TYPEOF(deviceSpec)::isValid())
									    {
									        using Selector = decltype(makeDeviceSelector(deviceSpec));
									        using SelectedDevice = decltype(std::declval<Selector>().makeDevice(0));
									        using SupportedExecutors = decltype(supportedExecutors(std::declval<SelectedDevice>(), listOfExecutors));
									        return SupportedExecutors{};
									    }

									    /** Unconstrained overload of getExecutorsList(): ignores both arguments and returns an empty tuple. */
									    constexpr auto getExecutorsList(auto const deviceSpec, auto const listOfExecutors)
									    {
									        alpaka::unused(deviceSpec, listOfExecutors);
									        return std::make_tuple();
									    }

									    /**@} */

									    /** Create a tuple of device specifications for a single API.
									     *
									     * Each device specification combines the supplied API with one of the device kinds returned by
									     * supportedDevices() for that API.
									     *
									     * @param api a single alpaka API (e.g. `api::cuda`, `api::hip`)
									     * @return a tuple containing all device specifications for the given API
									     */
									    constexpr auto getDeviceSpecsFor(auto const api)
									    {
									        // pairs the API with every device kind of the expanded tuple
									        auto const pairWithApi = [api](auto... deviceKinds) constexpr
									        { return std::make_tuple(DeviceSpec{api, deviceKinds}...); };
									        return std::apply(pairWithApi, supportedDevices(api));
									    }

									    /** Create a flattened tuple of device specification objects for a list of APIs.
									     *
									     * The per-API tuples produced by getDeviceSpecsFor() are concatenated with std::tuple_cat.
									     *
									     * @param apiList a `std::tuple` containing the APIs
									     * @return a tuple containing all device specifications for the given APIs
									     */
									    template<alpaka::concepts::Api... T_Apis>
									    constexpr auto getDeviceSpecsFor(std::tuple<T_Apis...> const apiList)
									    {
									        auto const concatPerApi = [](auto... apis) constexpr
									        { return std::tuple_cat(getDeviceSpecsFor(apis)...); };
									        return std::apply(concatPerApi, apiList);
									    }

									    /** Build a tuple of backends for a single device specification.
									     *
									     * A backend is the combination of a device specification and an executor.
									     * Each backend is a Dict with an entry for the key `object::deviceSpec` (query: foo[object::deviceSpec])
									     * and an entry for the key `object::exec` (query: foo[object::exec]) holding the corresponding executor.
									     *
									     * @param deviceSpec the device specification to associate with the executors
									     * @param listOfExecutors tuple of executor types
									     * @return a tuple of backend objects, one per executor
									     */
									    constexpr auto createBackendsFor(auto const deviceSpec, auto const listOfExecutors)
									    {
									        // creates one dictionary per executor of the expanded tuple
									        auto const makeBackends = [deviceSpec](auto... executors) constexpr
									        {
									            return std::make_tuple(
									                Dict{DictEntry{object::deviceSpec, deviceSpec}, DictEntry{object::exec, executors}}...);
									        };
									        return std::apply(makeBackends, listOfExecutors);
									    }

									    /** Build the flat backend list for a tuple of device specifications.
									     *
									     * For every device specification the executors are selected with
									     * `getExecutorsList(devSpec, listOfExecutors)` and turned into backends by `createBackendsFor`.
									     * The per-specification results are concatenated.
									     *
									     * @param devSpecList tuple of device specifications
									     * @param listOfExecutors tuple of executors handed to `getExecutorsList` for each specification
									     * @return a tuple of backend objects covering all device specifications
									     */
									    constexpr auto createBackendList(auto const devSpecList, auto const listOfExecutors)
									    {
									        auto const expandSpecs = [listOfExecutors](auto... singleSpec) constexpr
									        {
									            return std::tuple_cat(
									                createBackendsFor(singleSpec, getExecutorsList(singleSpec, listOfExecutors))...);
									        };
									        return std::apply(expandSpecs, devSpecList);
									    }

									    /** Generate the backend dictionaries for a set of APIs.
									     *
									     * Every API is expanded into its device specifications via `getDeviceSpecsFor`; each of them is
									     * combined with the executors by `createBackendList`. All results are concatenated into one
									     * tuple.
									     *
									     * @param usedApis tuple of alpaka APIs to consider
									     * @param listOfExecutors tuple of executors
									     * @return a tuple of backend dictionaries covering all APIs and executors
									     */
									    template<alpaka::concepts::Api... T_Apis>
									    consteval auto allBackends(std::tuple<T_Apis...> const& usedApis, auto const listOfExecutors)
									    {
									        auto const expandApis = [listOfExecutors](auto... singleApi) constexpr
									        { return std::tuple_cat(createBackendList(getDeviceSpecsFor(singleApi), listOfExecutors)...); };
									        return std::apply(expandApis, usedApis);
									    }

									    /** Generate the backend dictionaries for a set of device specifications.
									     *
									     * The device specifications are forwarded unchanged to `createBackendList`, which combines each
									     * of them with the executors.
									     *
									     * @param usedDeviceSpecs tuple of alpaka device specifications
									     * @param listOfExecutors tuple of executors
									     * @return a tuple of backend dictionaries covering all device specifications and executors
									     */
									    template<concepts::DeviceSpec... T_DevicesSpecs>
									    consteval auto allBackends(std::tuple<T_DevicesSpecs...> const& usedDeviceSpecs, auto const& listOfExecutors)
									    {
									        // createBackendList() already yields one flat tuple, no further concatenation is required
									        return createBackendList(usedDeviceSpecs, listOfExecutors);
									    }
									} // namespace alpaka::onHost
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/interface.hpp ==
									// ============================================================================

								// #include "alpaka/trait.hpp"    // amalgamate: file already inlined
								// #include "concepts/IndexVec.hpp"    // amalgamate: file already inlined

								#include <type_traits>

								namespace alpaka
								{
								    /** Lightweight view to data in an n-dimensional array.
								     *
								     * Const-ness of the MdSpan instance is propagated to the data region: the accessors of a constant
								     * MdSpan only provide read access, even if T_Type itself is not const.
								     *
								     * @tparam T_Type if the type is const the data is only readable
								     */
								    template<
								        typename T_Type,
								        concepts::Vector T_Extents,
								        concepts::Vector T_Pitches,
								        concepts::Alignment T_MemAlignment = Alignment<>>
								    struct MdSpan;

								    /** Create an MdSpan from a pointer, extents and explicitly given pitches.
								     *
								     * The template arguments of the MdSpan are deduced from the arguments.
								     *
								     * @param pointer origin of the viewed data
								     * @param extents number of elements per dimension
								     * @param pitchBytes pitches forwarded to the MdSpan constructor
								     * @param memAlignment alignment descriptor, a value-initialized T_MemAlignment by default
								     * @return the created MdSpan
								     */
								    template<concepts::Alignment T_MemAlignment = Alignment<>>
								    inline constexpr auto makeMdSpan(
								        auto* pointer,
								        concepts::Vector auto const& extents,
								        concepts::Vector auto const& pitchBytes,
								        T_MemAlignment const memAlignment = T_MemAlignment{})
								    {
								        auto view = MdSpan{pointer, extents, pitchBytes, memAlignment};
								        return view;
								    }

								    /** Create an MdSpan from a pointer and extents; the pitches are derived from the extents.
								     *
								     * The pitches are obtained from `alpaka::calculatePitchesFromExtents<T_ValueType>(extents)`.
								     *
								     * @param pointer origin of the viewed data
								     * @param extents number of elements per dimension
								     * @param memAlignment alignment descriptor, a value-initialized T_MemAlignment by default
								     * @return the created MdSpan
								     */
								    template<typename T_ValueType, concepts::Alignment T_MemAlignment = Alignment<>>
								    inline constexpr auto makeMdSpan(
								        T_ValueType* pointer,
								        concepts::Vector auto const& extents,
								        T_MemAlignment const memAlignment = T_MemAlignment{})
								    {
								        return MdSpan{pointer, extents, alpaka::calculatePitchesFromExtents<T_ValueType>(extents), memAlignment};
								    }

								    /** Create an MdSpan viewing the data of an arbitrary object.
								     *
								     * Pointer, extents, pitches and alignment are queried from the object via `onHost::data`,
								     * `onHost::getExtents`, `onHost::getPitches` and `alpaka::getAlignment`.
								     *
								     * @param any object the four queries above can be applied to
								     * @return MdSpan built from the queried properties
								     */
								    inline constexpr auto makeMdSpan(auto&& any)
								    {
								        // queried in the same order as the MdSpan constructor arguments
								        auto origin = onHost::data(any);
								        auto extents = onHost::getExtents(any);
								        auto pitches = onHost::getPitches(any);
								        auto alignment = alpaka::getAlignment(any);
								        return MdSpan{origin, extents, pitches, alignment};
								    }

								    /** Non-owning view to an n-dimensional data region, described by a pointer, extents and pitches.
								     *
								     * The view stores the origin pointer, the extents and the pitches (wrapped into `DataPitches`).
								     * The alignment is only part of the type (`T_MemAlignment`); the alignment argument of the constructor is not
								     * stored.
								     *
								     * @tparam T_Type element type, if the type is const the data is only readable
								     * @tparam T_Extents vector type holding the number of elements per dimension
								     * @tparam T_Pitches vector type holding the pitches, its member `type` is used as index type
								     * @tparam T_MemAlignment alignment descriptor, see getAlignment()
								     */
								    template<
								        typename T_Type,
								        concepts::Vector T_Extents,
								        concepts::Vector T_Pitches,
								        concepts::Alignment T_MemAlignment>
								    struct MdSpan
								    {
								        using value_type = T_Type;
								        using reference = value_type&;
								        using const_reference = std::add_const_t<value_type>&;
								        using pointer = value_type*;
								        using const_pointer = std::add_const_t<value_type>*;
								        using index_type = typename T_Pitches::type;

								        // the same view type with a const qualified element type
								        using ConstThis = MdSpan<std::add_const_t<value_type>, T_Extents, T_Pitches, T_MemAlignment>;

								        // the index type is taken from the pitches and must be convertible to the component type of the extents
								        static_assert(std::is_convertible_v<index_type, typename T_Extents::type>);
								        // extents and pitches must describe the same number of dimensions
								        static_assert(T_Extents::dim() == T_Pitches::dim());

								        /** number of dimensions, equal to the dimension of T_Extents */
								        static consteval uint32_t dim()
								        {
								            return T_Extents::dim();
								        }

								        /** return value the origin pointer is pointing to
								         *
								         * @return read-only reference to the value at the origin
								         */
								        constexpr const_reference operator*() const
								        {
								            return *this->m_ptr;
								        }

								        /** return value the origin pointer is pointing to
								         *
								         * @return reference to the value at the origin, writable if T_Type is not const
								         */
								        constexpr reference operator*()
								        {
								            return *this->m_ptr;
								        }

								        /** get origin pointer
								         *
								         * The const overload always returns a pointer to const elements. For the non-const overload the
								         * const-ness of the returned pointer depends on T_Type only.
								         */
								        constexpr const_pointer data() const
								        {
								            return this->m_ptr;
								        }

								        constexpr pointer data()
								        {
								            return this->m_ptr;
								        }

								        /** begin iterator, created from a read-only copy of this view (see getConstMdSpan()) */
								        constexpr auto begin() const
								        {
								            return MdForwardIter{this->getConstMdSpan()};
								        }

								        /** begin iterator, created from this view */
								        constexpr auto begin()
								        {
								            return MdForwardIter{*this};
								        }

								        /** end marker, created from a read-only copy of this view (see getConstMdSpan()) */
								        constexpr auto end() const
								        {
								            return MdForwardIterEnd{this->getConstMdSpan()};
								        }

								        /** end marker, created from this view */
								        constexpr auto end()
								        {
								            return MdForwardIterEnd{*this};
								        }

								        /** same as begin() const */
								        constexpr auto cbegin() const
								        {
								            return MdForwardIter{this->getConstMdSpan()};
								        }

								        /** same as end() const */
								        constexpr auto cend() const
								        {
								            return MdForwardIterEnd{this->getConstMdSpan()};
								        }

								        /** Default constructor
								         *
								         * No member is explicitly initialized here. A usable object must be obtained by copying a valid instance.
								         */
								        constexpr MdSpan() = default;

								        /** Constructor
								         *
								         * @param pointer pointer to the memory
								         * @param extents number of elements
								         * @param pitchBytes pitch in bytes per dimension
								         * @param memAlignmentInByte alignment in bytes (zero will set alignment to element alignment); the value is
								         *                           not stored, the alignment is carried by the type T_MemAlignment only
								         */
								        constexpr MdSpan(
								            T_Type* pointer,
								            T_Extents extents,
								            T_Pitches const& pitchBytes,
								            [[maybe_unused]] T_MemAlignment const& memAlignmentInByte = T_MemAlignment{})
								            : m_ptr(pointer)
								            , m_extent(extents)
								            , m_pitch(pitchBytes)
								        {
								        }

								        /** Converting copy constructor
								         *
								         * Creates a view from an MdSpan with a different element type but identical extents, pitches and alignment
								         * types. Only available if `InnerTypeAllowedCast<T_Type, T_Type_Other>` is fulfilled.
								         */
								        template<typename T_Type_Other>
								        requires internal::concepts::InnerTypeAllowedCast<T_Type, T_Type_Other>
								        constexpr MdSpan(MdSpan<T_Type_Other, T_Extents, T_Pitches, T_MemAlignment> const& other)
								            : m_ptr(other.data())
								            , m_extent(other.getExtents())
								            , m_pitch(other.getPitches())
								        {
								        }

								        /** Converting move constructor, same constraint as the converting copy constructor
								         *
								         * NOTE(review): data(), getExtents() and getPitches() return by value, the std::move calls are therefore
								         * applied to temporaries and `other` is left unchanged.
								         */
								        template<typename T_Type_Other>
								        requires alpaka::internal::concepts::InnerTypeAllowedCast<T_Type, T_Type_Other>
								        constexpr MdSpan(MdSpan<T_Type_Other, T_Extents, T_Pitches, T_MemAlignment>&& other)
								            : m_ptr(std::move(other.data()))
								            , m_extent(std::move(other.getExtents()))
								            , m_pitch(std::move(other.getPitches()))
								        {
								        }

								        constexpr MdSpan(MdSpan const&) = default;
								        constexpr MdSpan(MdSpan&&) = default;

								        /** Assignment operator keeping const-ness
								         *
								         * @attention the assign operator is not removing inner const-ness because the type signature is not changed.
								         */
								        constexpr MdSpan& operator=(MdSpan const&) = default;

								        constexpr MdSpan& operator=(MdSpan&&) = default;

								        /** get the alignment descriptor, a value-initialized T_MemAlignment */
								        static constexpr auto getAlignment()
								        {
								            return T_MemAlignment{};
								        }

								        /** get value at the given index
								         *
								         * @param idx n-dimensional offset, relative to the origin pointer
								         * @return read-only reference to the value
								         */
								        constexpr const_reference operator[](
								            // cannot use dim() or alpaka::trait::GetDim_v<T_Extents> because they cause a segmentation fault in nvcc
								            concepts::IndexVec<index_type, alpaka::trait::GetDim<T_Extents>::value> auto const& idx) const
								        {
								            return *ptr(idx);
								        }

								        /** get value at the given index
								         *
								         * @param idx n-dimensional offset, relative to the origin pointer
								         * @return reference to the value
								         */
								        constexpr reference operator[](
								            // cannot use dim() or alpaka::trait::GetDim_v<T_Extents> because they cause a segmentation fault in nvcc
								            concepts::IndexVec<index_type, alpaka::trait::GetDim<T_Extents>::value> auto const& idx)
								        {
								            return *ptr(idx);
								        }

								        /** get value at the given scalar index, only available for one-dimensional views
								         *
								         * @param idx offset in elements, relative to the origin pointer
								         * @return read-only reference to the value
								         */
								        constexpr const_reference operator[](std::integral auto const& idx) const requires(dim() == 1u)
								        {
								            return *ptr(Vec{idx});
								        }

								        /** get value at the given scalar index, only available for one-dimensional views
								         *
								         * @param idx offset in elements, relative to the origin pointer
								         * @return reference to the value
								         */
								        constexpr reference operator[](std::integral auto const& idx) requires(dim() == 1u)
								        {
								            return *ptr(Vec{idx});
								        }

								        /** get a copy of the stored extents */
								        constexpr auto getExtents() const
								        {
								            return m_extent;
								        }

								        /** get the pitches as T_Pitches, obtained from the stored DataPitches object */
								        constexpr T_Pitches getPitches() const
								        {
								            return m_pitch.getPitches();
								        }

								        /** get a view to the same data with a const qualified element type
								         *
								         * Extents, pitches and alignment type are taken over unchanged.
								         */
								        constexpr auto getConstMdSpan() const
								        {
								            using ConstValueType = std::add_const_t<value_type>;
								            return makeMdSpan(
								                static_cast<ConstValueType*>(m_ptr),
								                this->getExtents(),
								                this->getPitches(),
								                T_MemAlignment{});
								        }

								        /** True if MdSpan is pointing to valid memory.
								         *
								         * @attention This implementation returns true unconditionally, the pointer is not inspected.
								         *
								         * @details
								         * An MdSpan remains valid even after being moved. The reason for this is that the MdSpan is simply copied.
								         * This is more efficient than a real move (e.g., setting the data pointer to nullptr). Implementing a real
								         * move is also not possible because MdSpan must be trivially copyable, which requires a default move
								         * constructor.
								         */
								        [[nodiscard]] constexpr explicit operator bool() const noexcept
								        {
								            return true;
								        }

								    protected:
								        /** get the pointer of the value relative to the origin pointer m_ptr
								         *
								         * Used for views with two or more dimensions. The last index component is scaled with
								         * sizeof(value_type), each other component d is scaled with the pitch m_pitch[d].
								         *
								         * @param idx n-dimensional offset
								         * @return pointer to value
								         */
								        constexpr auto ptr(concepts::Vector auto const& idx) const requires(dim() >= 2u)
								        {
								            /** offset in bytes
								             *
								             * We calculate the complete offset in bytes even if it would be possible to change the x-dimension
								             * with the native value_types pointer, this is reducing the register footprint.
								             */
								            index_type offset = sizeof(value_type) * idx.back();
								            for(uint32_t d = 0u; d < dim() - 1u; ++d)
								            {
								                offset += m_pitch[d] * idx[d];
								            }
								            // the byte offset is applied on a char pointer which keeps the const-ness of value_type
								            using CharPtrType = std::conditional_t<std::is_const_v<value_type>, char const*, char*>;
								            using ResultPtrType = std::conditional_t<std::is_const_v<value_type>, const_pointer, pointer>;
								            return reinterpret_cast<ResultPtrType>(reinterpret_cast<CharPtrType>(this->m_ptr) + offset);
								        }

								        /** get the pointer of the value for one-dimensional views
								         *
								         * The x component of the index is added to the origin pointer with element granularity, pitches are not
								         * used.
								         */
								        constexpr const_pointer ptr(concepts::Vector auto const& idx) const requires(dim() == 1u)
								        {
								            return this->m_ptr + idx.x();
								        }

								        constexpr pointer ptr(concepts::Vector auto const& idx) requires(dim() == 1u)
								        {
								            return this->m_ptr + idx.x();
								        }

								    private:
								        // origin of the viewed data
								        pointer m_ptr;
								        // number of elements per dimension
								        T_Extents m_extent;
								        // pitches used to calculate byte offsets in ptr()
								        DataPitches<value_type, T_Pitches> m_pitch;
								    };

								    /** Write a textual description of an MdSpan to a stream.
								     *
								     * The output contains the dimension, the extents, the pitches and the alignment value obtained from
								     * `T_MemAlignment::get<T_Type>()`. The viewed data is not printed.
								     *
								     * @param s output stream
								     * @param mdSpan view to describe
								     * @return the stream `s`
								     */
								    template<
								        typename T_Type,
								        concepts::Vector T_Extents,
								        concepts::Vector T_Pitches,
								        concepts::Alignment T_MemAlignment>
								    std::ostream& operator<<(std::ostream& s, MdSpan<T_Type, T_Extents, T_Pitches, T_MemAlignment> const& mdSpan)
								    {
								        s << "MdSpan{ dim=" << ALPAKA_TYPEOF(mdSpan)::dim();
								        s << ", extents=" << mdSpan.getExtents().toString();
								        s << ", pitches=" << mdSpan.getPitches().toString();
								        s << " , alignment=" << T_MemAlignment::template get<T_Type>() << " }";
								        return s;
								    }

								    /** Specialization of `internal::CopyConstructableDataSource` for MdSpan.
								     *
								     * The trait derives from `std::true_type` and provides the MdSpan types with const and with mutable element
								     * type; extents, pitches and alignment types are kept.
								     */
								    template<
								        typename T_Type,
								        alpaka::concepts::Vector T_Extents,
								        alpaka::concepts::Vector T_Pitches,
								        alpaka::concepts::Alignment T_MemAlignment>
								    struct internal::CopyConstructableDataSource<MdSpan<T_Type, T_Extents, T_Pitches, T_MemAlignment>> : std::true_type
								    {
								        // view type with const qualified elements
								        using InnerConst = MdSpan<std::add_const_t<T_Type>, T_Extents, T_Pitches, T_MemAlignment>;
								        // view type with the const qualifier removed from the element type
								        using InnerMutable = MdSpan<std::remove_const_t<T_Type>, T_Extents, T_Pitches, T_MemAlignment>;
								    };
								} // namespace alpaka
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/MdSpan.hpp ==
								// ============================================================================

								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/DomainSpec.hpp ==
								// ==
								/* Copyright 2024 René Widera
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/MakeIter.hpp ==
									// ==
									/* Copyright 2024 Andrea Bocci, René Widera
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
									// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
									// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
									// #include "alpaka/mem/ThreadSpace.hpp"    // amalgamate: file already inlined
									// #include "alpaka/mem/trait.hpp"    // amalgamate: file already inlined
									// #include "alpaka/onAcc/layout.hpp"    // amalgamate: file already inlined
										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/traverse.hpp ==
										// ==
										/* Copyright 2024 Andrea Bocci, René Widera
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
											// ============================================================================
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/FlatIdxContainer.hpp ==
											// ==
											/* Copyright 2024 Andrea Bocci, René Widera
											 * SPDX-License-Identifier: MPL-2.0
											 */

											// #pragma once
											// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
											// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
											// #include "alpaka/core/Dict.hpp"    // amalgamate: file already inlined
											// #include "alpaka/core/PP.hpp"    // amalgamate: file already inlined
											// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
											// #include "alpaka/mem/ThreadSpace.hpp"    // amalgamate: file already inlined
											// #include "alpaka/onAcc/layout.hpp"    // amalgamate: file already inlined
											// #include "alpaka/tag.hpp"    // amalgamate: file already inlined
											// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

											// #include <cstdint>    // amalgamate: file already included
											// #include <functional>    // amalgamate: file already included
											// #include <memory>    // amalgamate: file already included
											// #include <ranges>    // amalgamate: file already included
											// #include <sstream>    // amalgamate: file already included

											namespace alpaka::onAcc
											{

											    /** Index container exposing an index range as a flat (linearized) sequence for the calling worker.
											     *
											     * begin() and end() derive from the stored index range and thread space the linear position where the worker
											     * starts, the linear step per increment and the linear position where the worker stops. Dereferencing an
											     * iterator converts the linear position back into a multi-dimensional index.
											     *
											     * @tparam T_IdxRange index range type; `IdxType`, `dim()`, `m_begin`, `m_stride` and `distance()` are used
											     * @tparam T_ThreadSpace thread space type; `mapTo()` and `m_threadCount` are used
											     * @tparam T_IdxMapperFn index mapping policy; begin() and end() support `layout::Strided` and
											     *                       `layout::Contiguous` only, any other type is rejected at compile time
											     * @tparam T_CSelect compile time vector selecting the dimensions which are traversed
											     */
											    template<typename T_IdxRange, typename T_ThreadSpace, typename T_IdxMapperFn, alpaka::concepts::CVector T_CSelect>
											    class FlatIdxContainer : private T_IdxMapperFn
											    {
											        // Holds compile time checks only; the function is not called within this class.
											        void _()
											        {
											            static_assert(std::ranges::forward_range<FlatIdxContainer>);
											            static_assert(std::ranges::borrowed_range<FlatIdxContainer>);
											            static_assert(std::ranges::range<FlatIdxContainer>);
											            static_assert(std::ranges::input_range<FlatIdxContainer>);
											        }

											    public:
											        using IdxType = typename T_IdxRange::IdxType;
											        static constexpr uint32_t dim = T_IdxRange::dim();
											        using IdxVecType = Vec<IdxType, dim>;

											        /** Constructor
											         *
											         * @param idxRange index range to traverse, stored as copy
											         * @param threadSpace thread space describing the workers, stored as copy
											         * @param idxMapping mapping policy instance, moved into the private base class
											         * @param unnamed instance of T_CSelect, only used to select the type
											         */
											        ALPAKA_FN_ACC inline FlatIdxContainer(
											            T_IdxRange const& idxRange,
											            T_ThreadSpace const& threadSpace,
											            T_IdxMapperFn idxMapping,
											            T_CSelect const& = T_CSelect{})
											            : T_IdxMapperFn{std::move(idxMapping)}
											            , m_idxRange(idxRange)
											            , m_threadSpace{threadSpace}
											        {
											            //  std::cout << "iter:" << m_idxRange.toString() << " " << m_threadSpace.toString() << std::endl;
											        }

											        constexpr FlatIdxContainer(FlatIdxContainer const&) = default;
											        constexpr FlatIdxContainer(FlatIdxContainer&&) = default;

											        class const_iterator;

											        /** special implementation to define the end
											         *
											         * Only a scalar value must be stored which reduce the register footprint.
											         * An iterator compares equal to the end as soon as its linear position is equal to or behind the stored
											         * value.
											         */
											        class const_iterator_end
											        {
											            friend class FlatIdxContainer;

											            // Holds compile time checks only; the function is not called within this class.
											            void _()
											            {
											                static_assert(std::forward_iterator<const_iterator_end>);
											                static_assert(std::input_iterator<const_iterator_end>);
											            }

											            /** @param end linear position at which the traversal stops */
											            ALPAKA_FN_ACC inline const_iterator_end(IdxType const& end) : m_extentSlowDim{end}
											            {
											            }

											            /** @return the stored linear end position */
											            constexpr IdxType operator*() const
											            {
											                return m_extentSlowDim;
											            }

											        public:
											            constexpr bool operator==(const_iterator_end const& other) const
											            {
											                return (m_extentSlowDim == other.m_extentSlowDim);
											            }

											            constexpr bool operator!=(const_iterator_end const& other) const
											            {
											                return !(*this == other);
											            }

											            /** true if the linear position of the iterator is equal to or behind the end position */
											            constexpr bool operator==(const_iterator const& other) const
											            {
											                return (m_extentSlowDim <= other.slowCurrent());
											            }

											            constexpr bool operator!=(const_iterator const& other) const
											            {
											                return !(*this == other);
											            }

											        private:
											            IdxType m_extentSlowDim;
											        };

											        /** Iterator holding a linear position which is converted into a multi-dimensional index on dereference. */
											        class const_iterator
											        {
											            friend class FlatIdxContainer;
											            friend class const_iterator_end;

											            static constexpr uint32_t iterDim = T_CSelect::dim();
											            using IterIdxVecType = Vec<IdxType, iterDim>;

											            // Holds compile time checks only; the function is not called within this class.
											            void _()
											            {
											                static_assert(std::forward_iterator<const_iterator>);
											                static_assert(std::input_iterator<const_iterator>);
											            }

											            /** Constructor, only usable by the friend classes
											             *
											             * @param offsetMD multi-dimensional index the traversal is relative to
											             * @param current linear start position
											             * @param stride linear step added by each increment
											             * @param end linear end position
											             * @param extentMD extents used to convert the linear position into a multi-dimensional index
											             * @param strideMD factor applied to the converted multi-dimensional index
											             */
											            constexpr const_iterator(
											                alpaka::concepts::Vector auto offsetMD,
											                IdxType const current,
											                IdxType const stride,
											                IdxType const end,
											                alpaka::concepts::Vector auto const extentMD,
											                alpaka::concepts::Vector auto const strideMD)
											                : m_offsetMD{offsetMD}
											                , m_current{current}
											                , m_end{end}
											                , m_stride{stride}
											                , m_extentMD{extentMD}
											                , m_strideMD{strideMD}
											            {
											            }

											            /** @return the current linear position */
											            ALPAKA_FN_ACC constexpr IdxType slowCurrent() const
											            {
											                return m_current;
											            }

											        public:
											            /** @return m_offsetMD where the components selected by T_CSelect are increased by
											             *          `mapToND(m_extentMD, m_current) * m_strideMD`
											             */
											            constexpr IdxVecType operator*() const
											            {
											                auto result = m_offsetMD;
											                result.ref(T_CSelect{}) += mapToND(m_extentMD, m_current) * m_strideMD;
											                return result;
											            }

											            // pre-increment the iterator
											            ALPAKA_FN_ACC inline const_iterator& operator++()
											            {
											                m_current += m_stride;
											                return *this;
											            }

											            // post-increment the iterator
											            ALPAKA_FN_ACC inline const_iterator operator++(int)
											            {
											                const_iterator old = *this;
											                ++(*this);
											                return old;
											            }

											            /** iterators are equal if they dereference to the same multi-dimensional index */
											            constexpr bool operator==(const_iterator const& other) const
											            {
											                return ((**this) == *other);
											            }

											            constexpr bool operator!=(const_iterator const& other) const
											            {
											                return !(*this == other);
											            }

											            /** true if the linear position is equal to or behind the end position */
											            constexpr bool operator==(const_iterator_end const& other) const
											            {
											                return (slowCurrent() >= *other);
											            }

											            constexpr bool operator!=(const_iterator_end const& other) const
											            {
											                return !(*this == other);
											            }

											        private:
											            IdxVecType m_offsetMD;
											            // modified by the pre/post-increment operator
											            IdxType m_current;
											            // non-const to support iterator copy and assignment
											            IdxType m_end;
											            IdxType m_stride;
											            IterIdxVecType m_extentMD;
											            IterIdxVecType m_strideMD;
											        };

											        /** Create the iterator pointing to the first index of the calling worker.
											         *
											         * - `layout::Strided`: the worker starts at its linearized thread index and advances by the number of
											         *   threads in the selected dimensions.
											         * - `layout::Contiguous`: the linearized extent is split into one chunk per worker slot, the worker
											         *   starts at the begin of its chunk and advances by one.
											         *
											         * Any other mapping policy is rejected at compile time.
											         */
											        ALPAKA_FN_ACC inline const_iterator begin() const
											        {
											            constexpr auto selectedDims = T_CSelect{};
											            auto [threadIdx, numThreads] = m_threadSpace.mapTo(selectedDims);

											            if constexpr(std::is_same_v<T_IdxMapperFn, layout::Strided>)
											            {
											                // offset of the worker where the components of the selected dimensions are reset to zero
											                auto groupOffset = threadIdx * m_idxRange.m_stride;
											                groupOffset.ref(selectedDims) -= groupOffset[selectedDims];

											                auto begin = m_idxRange.m_begin + groupOffset;

											                auto linearCurrent = linearize(numThreads[selectedDims], threadIdx[selectedDims]);
											                auto linearStride = numThreads[selectedDims].product();
											                auto strideMD = m_idxRange.m_stride[selectedDims];
											                // number of stride steps per selected dimension, rounded up
											                auto extentMD = divCeil(m_idxRange.distance()[selectedDims], strideMD);

											                return const_iterator(begin, linearCurrent, linearStride, extentMD.product(), extentMD, strideMD);
											            }
											            else if constexpr(std::is_same_v<T_IdxMapperFn, layout::Contiguous>)
											            {
											                // offset of the worker where the components of the selected dimensions are reset to zero
											                auto groupOffset = threadIdx * m_idxRange.m_stride;
											                groupOffset.ref(selectedDims) -= groupOffset[selectedDims];

											                auto begin = m_idxRange.m_begin + groupOffset;

											                auto strideMD = m_idxRange.m_stride[selectedDims];
											                auto extentMD = divCeil(m_idxRange.distance()[selectedDims], strideMD);

											                // NOTE(review): end() derives the slot count from the thread count returned by mapTo(), here
											                // m_threadCount is used directly - confirm that both are always identical.
											                auto threadCountMD = m_threadSpace.m_threadCount[selectedDims];

											                auto numWorkerSlots = threadCountMD.product();
											                auto linearSlotIdx = linearize(threadCountMD, threadIdx[selectedDims]);

											                auto logicalExtent = extentMD.product();

											                // elements per slot
											                auto base = logicalExtent / numWorkerSlots;
											                // remainder elements will be given to the slots with id lower than rem
											                auto rem = logicalExtent % numWorkerSlots;

											                auto nextLinearSlotIdx = linearSlotIdx + IdxType{1};

											                auto linearCurrent = linearSlotIdx * base + std::min(linearSlotIdx, rem);
											                auto linearEnd = nextLinearSlotIdx * base + std::min(nextLinearSlotIdx, rem);

											                return const_iterator(
											                    begin,
											                    linearCurrent,
											                    IdxType{1u},
											                    std::min(linearEnd, logicalExtent),
											                    extentMD,
											                    strideMD);
											            }
											            else
											            {
											                // The condition depends on a template parameter and is therefore only evaluated if this
											                // branch is instantiated. Without this branch the function would end without a return.
											                static_assert(
											                    sizeof(T_IdxMapperFn) == 0,
											                    "FlatIdxContainer::begin() supports layout::Strided and layout::Contiguous only");
											            }
											        }

											        /** Create the end marker for the calling worker.
											         *
											         * - `layout::Strided`: the end is the number of elements of the linearized extent.
											         * - `layout::Contiguous`: the end is the end of the chunk assigned to the worker, limited to the
											         *   linearized extent.
											         *
											         * Any other mapping policy is rejected at compile time.
											         */
											        ALPAKA_FN_ACC inline const_iterator_end end() const
											        {
											            constexpr auto selectedDims = T_CSelect{};
											            auto [threadIdx, numThreads] = m_threadSpace.mapTo(selectedDims);

											            if constexpr(std::is_same_v<T_IdxMapperFn, layout::Strided>)
											            {
											                auto extentMD = divCeil(m_idxRange.distance()[selectedDims], m_idxRange.m_stride[selectedDims]);
											                return const_iterator_end(extentMD.product());
											            }
											            else if constexpr(std::is_same_v<T_IdxMapperFn, layout::Contiguous>)
											            {
											                auto strideMD = m_idxRange.m_stride[selectedDims];
											                auto extentMD = divCeil(m_idxRange.distance()[selectedDims], strideMD);

											                auto numWorkerSlots = numThreads[selectedDims].product();
											                auto linearSlotIdx = linearize(numThreads[selectedDims], threadIdx[selectedDims]);

											                auto logicalExtent = extentMD.product();

											                // elements per slot
											                auto base = logicalExtent / numWorkerSlots;
											                // remainder elements will be given to the slots with id lower than rem
											                auto rem = logicalExtent % numWorkerSlots;

											                auto nextLinearSlotIdx = linearSlotIdx + IdxType{1};
											                auto linearEnd = nextLinearSlotIdx * base + std::min(nextLinearSlotIdx, rem);

											                return const_iterator_end(std::min(linearEnd, logicalExtent));
											            }
											            else
											            {
											                // The condition depends on a template parameter and is therefore only evaluated if this
											                // branch is instantiated. Without this branch the function would end without a return.
											                static_assert(
											                    sizeof(T_IdxMapperFn) == 0,
											                    "FlatIdxContainer::end() supports layout::Strided and layout::Contiguous only");
											            }
											        }

											        /** Create a container for the same index range and thread space with a different dimension selection.
											         *
											         * The mapping policy of the new container is default constructed.
											         *
											         * @param iterDir compile time vector, its type becomes the dimension selection of the result
											         */
											        ALPAKA_FN_HOST_ACC constexpr auto operator[](alpaka::concepts::CVector auto const iterDir) const
											        {
											            return FlatIdxContainer<T_IdxRange, T_ThreadSpace, T_IdxMapperFn, ALPAKA_TYPEOF(iterDir)>(
											                m_idxRange,
											                m_threadSpace,
											                T_IdxMapperFn{});
											        }

											    private:
											        T_IdxRange m_idxRange;
											        T_ThreadSpace m_threadSpace;
											    };
											} // namespace alpaka::onAcc
											// ==
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/FlatIdxContainer.hpp ==
											// ============================================================================

											// ============================================================================
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/TiledIdxContainer.hpp ==
											// ==
											/* Copyright 2024 Andrea Bocci, René Widera
											 * SPDX-License-Identifier: MPL-2.0
											 */

											// #pragma once
											// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
											// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
											// #include "alpaka/core/Dict.hpp"    // amalgamate: file already inlined
											// #include "alpaka/core/PP.hpp"    // amalgamate: file already inlined
											// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
												// ============================================================================
												// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/IdxRange.hpp ==
												// ==
												/* Copyright 2024 René Widera
												 * SPDX-License-Identifier: MPL-2.0
												 */

												// #pragma once
												// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
												// #include "alpaka/core/PP.hpp"    // amalgamate: file already inlined
												// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
												// #include "alpaka/mem/BoundaryIter.hpp"    // amalgamate: file already inlined
												// #include "alpaka/mem/FlatIdxContainer.hpp"    // amalgamate: file already inlined

												// #include <cstdint>    // amalgamate: file already included

												namespace alpaka
												{

												    /** Index domain described by a begin position, an end position and a stride.
												     *
												     * @tparam T_End type of the end position, it also defines IdxType and IdxVecType
												     * @tparam T_Begin type of the begin position
												     * @tparam T_Stride type of the stride
												     */
												    template<
												        concepts::VectorOrScalar T_End,
												        concepts::Vector T_Begin = typename T_End::UniVec,
												        concepts::Vector T_Stride = typename T_End::UniVec>
												    struct IdxRange
												    {
												        using IdxType = typename T_End::type;
												        using IdxVecType = typename T_End::UniVec;
												        using type = typename T_Begin::type;

												        /** Range with explicit begin, end and stride. */
												        constexpr IdxRange(T_Begin const& begin, T_End const& end, T_Stride const& stride)
												            : m_begin{begin}
												            , m_end{end}
												            , m_stride{stride}
												        {
												        }

												        /** Range with explicit begin and end, the stride is filled with one. */
												        constexpr IdxRange(T_Begin const& begin, T_End const& end)
												            : m_begin{begin}
												            , m_end{end}
												            , m_stride{T_End::fill(1u)}
												        {
												        }

												        /** Range that begins at zero, ends at extent and has a stride filled with one. */
												        constexpr IdxRange(T_End const& extent) : m_begin{T_End::fill(0u)}, m_end{extent}, m_stride{T_End::fill(1u)}
												        {
												        }

												        /** @return number of dimensions of IdxVecType */
												        static consteval uint32_t dim()
												        {
												            return IdxVecType::dim();
												        }

												        /** Multiply the stride with rhs.
												         *
												         * @return new range with unchanged begin and end and the stride m_stride * rhs
												         */
												        template<concepts::TypeOrVector<typename T_End::type> T_OpType>
												        ALPAKA_FN_HOST_ACC constexpr auto operator%(T_OpType const& rhs) const
												        {
												            auto const scaledStride = m_stride * rhs;
												            return IdxRange<T_End, T_Begin, ALPAKA_TYPEOF(scaledStride)>{m_begin, m_end, scaledStride};
												        }

												        /** Shift the range by adding rhs to begin and end.
												         *
												         * @return new range with shifted begin and end and unchanged stride
												         */
												        template<concepts::TypeOrVector<typename T_End::type> T_OpType>
												        ALPAKA_FN_HOST_ACC constexpr auto operator>>(T_OpType const& rhs) const
												        {
												            auto const shiftedBegin = m_begin + rhs;
												            auto const shiftedEnd = m_end + rhs;
												            using Shifted
												                = IdxRange<ALPAKA_TYPEOF(shiftedEnd), ALPAKA_TYPEOF(shiftedBegin), ALPAKA_TYPEOF(m_stride)>;
												            return Shifted{shiftedBegin, shiftedEnd, m_stride};
												        }

												        /** Shift the range by subtracting rhs from begin and end.
												         *
												         * @return new range with shifted begin and end and unchanged stride
												         */
												        template<concepts::TypeOrVector<typename T_End::type> T_OpType>
												        ALPAKA_FN_HOST_ACC constexpr auto operator<<(T_OpType const& rhs) const
												        {
												            auto const shiftedBegin = m_begin - rhs;
												            auto const shiftedEnd = m_end - rhs;
												            using Shifted = IdxRange<ALPAKA_TYPEOF(shiftedEnd), ALPAKA_TYPEOF(shiftedBegin), T_Stride>;
												            return Shifted{shiftedBegin, shiftedEnd, m_stride};
												        }

												        /** @return m_end - m_begin */
												        constexpr auto distance() const
												        {
												            return m_end - m_begin;
												        }

												        /** Begin iterator to iterate all positions in the range. It first iterates the fastest index (the one on the
												         * far right -> x-dimension) and then moves sequentially to the slowest index (the one on the far left) until
												         * it reaches the end.
												         *
												         * If you want to iterate the index in parallel with many threads, use the function
												         * alpaka::onAcc::makeIdxMap().
												         *
												         * @return Begin iterator
												         */
												        [[nodiscard]] constexpr auto begin() const
												        {
												            using DistanceVec = ALPAKA_TYPEOF(distance());
												            return alpaka::onAcc::FlatIdxContainer{
												                *this,
												                alpaka::ThreadSpace{T_End::fill(0), T_End::fill(1)},
												                alpaka::onAcc::layout::contiguous,
												                alpaka::iotaCVec<typename DistanceVec::type, DistanceVec::dim()>()}
												                .begin();
												        }

												        /** End iterator matching begin().
												         *
												         * @return End iterator of a flat container that is built with the same arguments as in begin()
												         */
												        [[nodiscard]] constexpr auto end() const
												        {
												            using DistanceVec = ALPAKA_TYPEOF(distance());
												            return alpaka::onAcc::FlatIdxContainer{
												                *this,
												                alpaka::ThreadSpace{T_End::fill(0), T_End::fill(1)},
												                alpaka::onAcc::layout::contiguous,
												                alpaka::iotaCVec<typename DistanceVec::type, DistanceVec::dim()>()}
												                .end();
												        }

												        /** Textual representation: begin, end and stride joined by the separator.
												         *
												         * @param separator text written between begin, end and stride
												         * @param enclosings first character is written in front of the output, second character behind it; if only
												         *                   one character is given it is used for both, if empty nothing is written
												         * @return formatted string
												         */
												        std::string toString(std::string const separator = ",", std::string const enclosings = "{}") const
												        {
												            std::string opening;
												            std::string closing;
												            size_t const numEnclosings = enclosings.size();

												            if(numEnclosings > 0)
												            {
												                /* the modulo keeps the access inside the string if fewer than two characters are given */
												                opening = enclosings[0 % numEnclosings];
												                closing = enclosings[1 % numEnclosings];
												            }

												            std::stringstream out;
												            out << opening;
												            out << m_begin << separator << m_end << separator << m_stride;
												            out << closing;
												            return out.str();
												        }

												        T_Begin m_begin;
												        T_End m_end;
												        T_Stride m_stride;
												    };

												    template<uint32_t T_dim, alpaka::concepts::Vector T_LowHaloVec, alpaka::concepts::Vector T_UpHaloVec>
												    constexpr auto makeDirectionSubRange(
												        auto const range,
												        alpaka::BoundaryDirection<T_dim, T_LowHaloVec, T_UpHaloVec> const& boundaryDir)
												    {
												        auto m_begin = Vec<uint32_t, T_dim>::fill(0u);
												        auto m_end = Vec<uint32_t, T_dim>::fill(0u);
												        for(uint32_t i = 0; i < T_dim; ++i)
												        {
												            switch(boundaryDir.data[i])
												            {
												            case BoundaryType::LOWER:
												                m_begin[i] = range.m_begin[i];
												                m_end[i] = range.m_begin[i] + boundaryDir.lowerHaloSize[i];
												                break;
												            case BoundaryType::UPPER:
												                m_begin[i] = range.m_end[i] - boundaryDir.upperHaloSize[i];
												                m_end[i] = range.m_end[i];
												                break;
												            case BoundaryType::MIDDLE:
												                m_begin[i] = range.m_begin[i] + boundaryDir.lowerHaloSize[i];
												                m_end[i] = range.m_end[i] - boundaryDir.upperHaloSize[i];
												                break;
												            case BoundaryType::OOB:
												                [[fallthrough]];
												            default:
												                ALPAKA_ASSERT_ACC(false);
												            }
												        }
												        return IdxRange{m_begin, m_end, range.m_stride};
												    }

												    namespace internal
												    {
												        /** pCast implementation for IdxRange.
												         *
												         * If T_To differs from the value type of T_End (and that value type is convertible to T_To) a new
												         * IdxRange is built from the casted begin, end and stride. If T_To is equal to the value type of T_End
												         * the input is forwarded unchanged.
												         */
												        template<
												            typename T_To,
												            alpaka::concepts::Vector T_End,
												            alpaka::concepts::Vector T_Begin,
												            alpaka::concepts::Vector T_Stride>
												        struct PCast::Op<T_To, IdxRange<T_End, T_Begin, T_Stride>>
												        {
												            /** Cast each component of the range to T_To and assemble a new range. */
												            constexpr auto operator()(auto&& input) const
												                requires std::convertible_to<typename T_End::type, T_To> && (!std::same_as<T_To, typename T_End::type>)
												            {
												                auto castedBegin = pCast<T_To>(input.m_begin);
												                auto castedEnd = pCast<T_To>(input.m_end);
												                auto castedStride = pCast<T_To>(input.m_stride);
												                return IdxRange{castedBegin, castedEnd, castedStride};
												            }

												            /** Nothing to cast: hand the input back with its value category preserved. */
												            constexpr decltype(auto) operator()(auto&& input) const requires std::same_as<T_To, typename T_End::type>
												            {
												                using Input = decltype(input);
												                return std::forward<Input>(input);
												            }
												        };

												    } // namespace internal

												    /** Deduction guide for the construction from a single extent.
												     *
												     * The UniVec of the vector type of the argument is used as first template argument of IdxRange, the
												     * remaining template arguments fall back to their defaults.
												     */
												    template<concepts::VectorOrScalar T_Extents>
												    ALPAKA_FN_HOST_ACC IdxRange(T_Extents const&) -> IdxRange<typename trait::getVec_t<T_Extents>::UniVec>;

												    /** Deduction guide for the construction from begin and end.
												     *
												     * NOTE(review): the template parameters of IdxRange are ordered <T_End, T_Begin, T_Stride> but this guide
												     * passes the UniVec of the begin argument first and the UniVec of the end argument second. As long as both
												     * arguments have the same UniVec type this makes no difference; for mixed types the roles are swapped.
												     * Confirm whether this is intended before relying on it.
												     */
												    template<concepts::VectorOrScalar T_Begin, concepts::VectorOrScalar T_End>
												    ALPAKA_FN_HOST_ACC IdxRange(T_Begin const&, T_End const&) -> IdxRange<
												        typename trait::getVec_t<T_Begin>::UniVec,
												        typename trait::getVec_t<T_End>::UniVec,
												        typename trait::getVec_t<T_End>::UniVec>;

												    /** Deduction guide for the construction from begin, end and stride.
												     *
												     * NOTE(review): same ordering remark as for the two argument guide: the UniVec of the begin argument is
												     * passed as first template argument (T_End of IdxRange) and the UniVec of the end argument as second
												     * (T_Begin of IdxRange).
												     */
												    template<concepts::VectorOrScalar T_Begin, concepts::VectorOrScalar T_End, concepts::VectorOrScalar T_Stride>
												    ALPAKA_FN_HOST_ACC IdxRange(T_Begin const&, T_End const&, T_Stride const&) -> IdxRange<
												        typename trait::getVec_t<T_Begin>::UniVec,
												        typename trait::getVec_t<T_End>::UniVec,
												        typename trait::getVec_t<T_Stride>::UniVec>;

												    namespace trait
												    {
												        /** Type trait to detect index ranges, false for every type that is not matched by a specialization. */
												        template<typename T>
												        struct IsIndexRange : std::false_type
												        {
												        };

												        /** Every specialization of the class template IdxRange is an index range. */
												        template<concepts::SpecializationOf<IdxRange> T>
												        struct IsIndexRange<T> : std::true_type
												        {
												        };

												        /** Type trait to detect lazy-evaluated index ranges.
												         *
												         * Only the primary template (false) is defined here; presumably it is specialized next to the lazy
												         * range types -- TODO confirm.
												         */
												        template<typename T>
												        struct IsLazyIndexRange : std::false_type
												        {
												        };

												    } // namespace trait

												    /** Shortcut for trait::IsIndexRange<T>::value */
												    template<typename T>
												    constexpr bool isIndexRange_v = trait::IsIndexRange<T>::value;

												    /** Shortcut for trait::IsLazyIndexRange<T>::value */
												    template<typename T>
												    constexpr bool isLazyIndexRange_v = trait::IsLazyIndexRange<T>::value;

												    namespace concepts
												    {
												        /** Concept to check if a type is an index range
												         *
												         * @tparam T Type to check
												         * @tparam T_ValueType enforce a value type of the index range, if not provided the type is not checked
												         * @tparam T_dim enforce a dimensionality of the index range, if not provided the value is not checked
												         */
												        template<typename T, typename T_ValueType = alpaka::NotRequired, uint32_t T_dim = alpaka::notRequiredDim>
												        concept IdxRange
												            = alpaka::isIndexRange_v<T>
												              && (std::same_as<T_ValueType, typename T::IdxType> || std::same_as<T_ValueType, alpaka::NotRequired>)
												              && ((T_dim == alpaka::notRequiredDim) || (T::dim() == T_dim));

												        /** Concept to check if a type is a lazy-evaluated index range
												         *
												         * @attention the value type and dimension can not be evaluated for lazy index ranges.
												         *
												         * @tparam T Type to check
												         */
												        template<typename T>
												        concept LazyIdxRange = alpaka::isLazyIndexRange_v<T>;

												        /** Concept to check if a type is either an index range or a lazy-evaluated index range
												         *
												         * @tparam T Type to check
												         */
												        template<typename T>
												        concept IdxRangeDescription = alpaka::isIndexRange_v<T> || isLazyIndexRange_v<T>;

												    } // namespace concepts
												} // namespace alpaka
												// ==
												// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/IdxRange.hpp ==
												// ============================================================================

											// #include "alpaka/mem/ThreadSpace.hpp"    // amalgamate: file already inlined
											// #include "alpaka/onAcc/layout.hpp"    // amalgamate: file already inlined
											// #include "alpaka/tag.hpp"    // amalgamate: file already inlined
											// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

											// #include <cstdint>    // amalgamate: file already included
											// #include <functional>    // amalgamate: file already included
											// #include <memory>    // amalgamate: file already included
											// #include <ranges>    // amalgamate: file already included
											// #include <sstream>    // amalgamate: file already included

											namespace alpaka::onAcc
											{
											    namespace detail
											    {
											        /** Store a vector without its first component.
											         *
											         * The first index can be reduced by one dimension because the slowest dimension must never be reset
											         * after the initialization. The access operator takes the index of the full vector and maps it to the
											         * stored component idx - 1, therefore index zero must not be accessed.
											         */
											        template<typename T_Type, uint32_t T_dim>
											        struct ReducedVector : private Vec<T_Type, T_dim - 1u>
											        {
											        private:
											            using ReducedBase = Vec<T_Type, T_dim - 1u>;

											        public:
											            /** Keep the components that remain after shrinking first to T_dim - 1 dimensions. */
											            constexpr ReducedVector(Vec<T_Type, T_dim> const& first)
											                : ReducedBase{first.template rshrink<T_dim - 1u>()}
											            {
											            }

											            /** Read access, idx refers to the dimension of the full vector. */
											            constexpr decltype(auto) operator[](T_Type idx) const
											            {
											                return ReducedBase::operator[](idx - 1u);
											            }

											            /** Write access, idx refers to the dimension of the full vector. */
											            constexpr decltype(auto) operator[](T_Type idx)
											            {
											                return ReducedBase::operator[](idx - 1u);
											            }
											        };

											        /** One dimensional case: nothing is left to store, the type offers no element access. */
											        template<typename T_Type>
											        struct ReducedVector<T_Type, 1u>
											        {
											            constexpr ReducedVector(Vec<T_Type, 1u> const&)
											            {
											            }
											        };
											    } // namespace detail

											    /** Range of multi-dimensional indices which a single worker visits when traversing an index range tile-wise.
											     *
											     * begin() and end() are implemented for the index mappings layout::Strided and layout::Contiguous.
											     * Only the dimensions listed in T_CSelect are advanced by the iterator, all other components of the index
											     * keep the value they had at the start.
											     *
											     * @tparam T_IdxRange index range to traverse
											     * @tparam T_ThreadSpace type providing mapTo(), which is used to query the worker index and the number of
											     *                       workers
											     * @tparam T_IdxMapperFn index mapping tag, selects how the indices are distributed over the workers
											     * @tparam T_CSelect compile-time vector with the dimensions to iterate, the first entry is handled as the
											     *                   slowest moving dimension
											     */
											    template<
											        alpaka::concepts::IdxRange T_IdxRange,
											        typename T_ThreadSpace,
											        typename T_IdxMapperFn,
											        alpaka::concepts::CVector T_CSelect>
											    class TiledIdxContainer
											    {
											        // collection of concept checks for this container
											        void _()
											        {
											            static_assert(std::ranges::forward_range<TiledIdxContainer>);
											            static_assert(std::ranges::borrowed_range<TiledIdxContainer>);
											            static_assert(std::ranges::range<TiledIdxContainer>);
											            static_assert(std::ranges::input_range<TiledIdxContainer>);
											        }

											    public:
											        using IdxType = typename T_IdxRange::IdxType;
											        static constexpr uint32_t dim = T_IdxRange::dim();
											        using IdxVecType = Vec<IdxType, dim>;

											        /** Create the container.
											         *
											         * @param idxRange index range to traverse
											         * @param threadSpace description of the workers
											         * @param idxMapping index mapping tag, only the type is relevant, the value is not used
											         */
											        ALPAKA_FN_ACC inline TiledIdxContainer(
											            T_IdxRange const& idxRange,
											            T_ThreadSpace const& threadSpace,
											            T_IdxMapperFn idxMapping,
											            T_CSelect const& = T_CSelect{})
											            : m_idxRange(idxRange)
											            , m_threadSpace{threadSpace}
											        {
											            alpaka::unused(idxMapping);
											        }

											        constexpr TiledIdxContainer(TiledIdxContainer const&) = default;
											        constexpr TiledIdxContainer(TiledIdxContainer&&) = default;

											        class const_iterator;

											        /** special implementation to define the end
											         *
											         * Only a scalar value must be stored which reduce the register footprint.
											         * The definition of end is that the index is behind or equal to the extent of the slowest moving dimension.
											         */
											        class const_iterator_end
											        {
											            friend class TiledIdxContainer;

											            void _()
											            {
											                static_assert(std::forward_iterator<const_iterator_end>);
											            }

											            /** @param extent end position, only the component of the slowest selected dimension is stored */
											            ALPAKA_FN_ACC inline const_iterator_end(alpaka::concepts::Vector auto const& extent)
											                : m_extentSlowDim{extent[T_CSelect{}][0]}
											            {
											            }

											            /** @return end value of the slowest moving dimension */
											            constexpr IdxType operator*() const
											            {
											                return m_extentSlowDim;
											            }

											        public:
											            constexpr bool operator==(const_iterator_end const& other) const
											            {
											                return (m_extentSlowDim == other.m_extentSlowDim);
											            }

											            constexpr bool operator!=(const_iterator_end const& other) const
											            {
											                return !(*this == other);
											            }

											            /** An iterator is at the end if its slowest moving index reached or passed the stored end value.
											             *
											             * Mirrors const_iterator::operator==(const_iterator_end const&).
											             */
											            constexpr bool operator==(const_iterator const& other) const
											            {
											                // slowCurrent is a member function and must be called to obtain the index
											                return (m_extentSlowDim <= other.slowCurrent());
											            }

											            constexpr bool operator!=(const_iterator const& other) const
											            {
											                return !(*this == other);
											            }

											        private:
											            IdxType m_extentSlowDim;
											        };

											        /** Forward iterator over the indices of one worker.
											         *
											         * The last selected dimension is advanced first. If a dimension other than the first selected one
											         * reaches its end it is reset to its start value and the next slower dimension is advanced. The
											         * first selected dimension is never reset, it is used to detect the end.
											         */
											        class const_iterator
											        {
											            friend class TiledIdxContainer;
											            friend class const_iterator_end;

											            static constexpr uint32_t iterDim = T_CSelect::dim();
											            using IterIdxVecType = Vec<IdxType, iterDim>;

											            void _()
											            {
											                static_assert(std::forward_iterator<const_iterator>);
											                static_assert(std::input_iterator<const_iterator>);
											            }

											            /** Create the iterator.
											             *
											             * @param offset offset added to first and extent
											             * @param first first index of the worker relative to offset
											             * @param extent end of the iteration relative to offset
											             * @param stride distance between two consecutive indices of the worker
											             */
											            constexpr const_iterator(
											                alpaka::concepts::Vector auto const offset,
											                alpaka::concepts::Vector auto const first,
											                alpaka::concepts::Vector auto const extent,
											                alpaka::concepts::Vector auto const stride)
											                : m_current{first + offset}
											                , m_stride{stride[T_CSelect{}]}
											                , m_extent{(extent + offset)[T_CSelect{}]}
											                , m_first((m_current)[T_CSelect{}])
											            {
											                // The range check is only required for multi-dimensional iterators: the end test looks at the
											                // slowest dimension only, so a start index that is out of range in any faster dimension
											                // would not be detected.
											                if constexpr(iterDim > 1u)
											                {
											                    // invalidate current if one dimension is out of range.
											                    bool isIndexValid = true;
											                    for(uint32_t d = 1u; d < iterDim; ++d)
											                        isIndexValid = isIndexValid && (m_first[d] < m_extent[d]);
											                    if(!isIndexValid)
											                        m_current[T_CSelect{}[0]] = m_extent[0];
											                }
											            }

											            /** @return current index of the slowest moving selected dimension */
											            ALPAKA_FN_ACC constexpr IdxType slowCurrent() const
											            {
											                return m_current[T_CSelect{}[0]];
											            }

											        public:
											            /** @return current multi-dimensional index */
											            constexpr IdxVecType operator*() const
											            {
											                return m_current;
											            }

											            // pre-increment the iterator
											            ALPAKA_FN_ACC inline const_iterator& operator++()
											            {
											                // walk from the fastest (last) to the slowest (first) selected dimension
											                for(uint32_t d = 0; d < iterDim; ++d)
											                {
											                    uint32_t const idx = iterDim - 1u - d;
											                    m_current[T_CSelect{}[idx]] += m_stride[idx];
											                    if constexpr(iterDim != 1u)
											                    {
											                        // wrap around and carry over to the next slower dimension; the slowest dimension
											                        // (idx == 0) is never reset
											                        if(idx >= 1u && m_current[T_CSelect{}[idx]] >= m_extent[idx])
											                        {
											                            m_current[T_CSelect{}[idx]] = m_first[idx];
											                        }
											                        else
											                            break;
											                    }
											                }
											                return *this;
											            }

											            // post-increment the iterator
											            ALPAKA_FN_ACC inline const_iterator operator++(int)
											            {
											                const_iterator old = *this;
											                ++(*this);
											                return old;
											            }

											            constexpr bool operator==(const_iterator const& other) const
											            {
											                return (m_current == other.m_current);
											            }

											            constexpr bool operator!=(const_iterator const& other) const
											            {
											                return !(*this == other);
											            }

											            /** The end is reached if the slowest moving index is behind or equal to the end value. */
											            constexpr bool operator==(const_iterator_end const& other) const
											            {
											                return (slowCurrent() >= *other);
											            }

											            constexpr bool operator!=(const_iterator_end const& other) const
											            {
											                return !(*this == other);
											            }

											        private:
											            // modified by the pre/post-increment operator
											            IdxVecType m_current;
											            // non-const to support iterator copy and assignment
											            IterIdxVecType m_stride;
											            IterIdxVecType m_extent;
											            detail::ReducedVector<IdxType, iterDim> m_first;
											        };

											        /** Iterator to the first index of the calling worker.
											         *
											         * layout::Strided: the worker starts at threadIdx * stride and advances by numThreads * stride.
											         * layout::Contiguous: the strided elements of the range are split per dimension into one chunk per
											         * worker, the workers with an index lower than the remainder get one element more.
											         */
											        ALPAKA_FN_ACC inline const_iterator begin() const
											        {
											            constexpr auto selectedDims = T_CSelect{};
											            auto [threadIdx, numThreads] = m_threadSpace.mapTo(selectedDims);

											            if constexpr(std::is_same_v<T_IdxMapperFn, layout::Strided>)
											            {
											                return const_iterator(
											                    m_idxRange.m_begin,
											                    threadIdx * m_idxRange.m_stride,
											                    m_idxRange.distance(),
											                    numThreads * m_idxRange.m_stride);
											            }
											            else if constexpr(std::is_same_v<T_IdxMapperFn, layout::Contiguous>)
											            {
											                IdxVecType extent = m_idxRange.distance();
											                IdxVecType logicalExtent = divCeil(extent, m_idxRange.m_stride);

											                // elements per slot
											                IdxVecType base = logicalExtent / numThreads;
											                // remainder elements will be given to the slots with id lower than rem
											                IdxVecType rem = logicalExtent % numThreads;

											                IdxVecType firstLogical = threadIdx * base + threadIdx.min(rem);
											                IdxVecType first = firstLogical * m_idxRange.m_stride;

											                IdxVecType nextThreadIdx = threadIdx + IdxType{1};
											                IdxVecType endLogical = nextThreadIdx * base + nextThreadIdx.min(rem);
											                // crop to the end of the index range
											                IdxVecType end = extent.min(endLogical * m_idxRange.m_stride);

											                return const_iterator(m_idxRange.m_begin, first, end, m_idxRange.m_stride);
											            }
											        }

											        /** End marker for the indices of the calling worker.
											         *
											         * layout::Strided: the end of the index range.
											         * layout::Contiguous: the end of the chunk of the worker, cropped to the end of the index range.
											         */
											        ALPAKA_FN_ACC inline const_iterator_end end() const
											        {
											            constexpr auto selectedDims = T_CSelect{};
											            auto [threadIdx, numThreads] = m_threadSpace.mapTo(selectedDims);

											            if constexpr(std::is_same_v<T_IdxMapperFn, layout::Strided>)
											            {
											                return const_iterator_end(m_idxRange.m_begin + m_idxRange.distance());
											            }
											            else if constexpr(std::is_same_v<T_IdxMapperFn, layout::Contiguous>)
											            {
											                IdxVecType extent = m_idxRange.distance();
											                IdxVecType logicalExtent = divCeil(extent, m_idxRange.m_stride);

											                // elements per slot
											                IdxVecType base = logicalExtent / numThreads;
											                // remainder elements will be given to the slots with id lower than rem
											                IdxVecType rem = logicalExtent % numThreads;

											                IdxVecType nextSlotIdx = threadIdx + IdxType{1};
											                IdxVecType endLogical = nextSlotIdx * base + nextSlotIdx.min(rem);
											                // crop to the end of the index range
											                IdxVecType end = extent.min(endLogical * m_idxRange.m_stride);

											                return const_iterator_end(m_idxRange.m_begin + end);
											            }
											        }

											        /** Create a container over the same data that iterates a different selection of dimensions.
											         *
											         * @param iterDir compile-time vector, only its type is used; it becomes the selection template argument of
											         *                the returned container
											         */
											        ALPAKA_FN_HOST_ACC constexpr auto operator[](alpaka::concepts::CVector auto const iterDir) const
											        {
											            return TiledIdxContainer<T_IdxRange, T_ThreadSpace, T_IdxMapperFn, ALPAKA_TYPEOF(iterDir)>(
											                m_idxRange,
											                m_threadSpace,
											                T_IdxMapperFn{});
											        }

											    private:
											        T_IdxRange m_idxRange;
											        T_ThreadSpace m_threadSpace;
											    };
											} // namespace alpaka::onAcc
											// ==
											// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/TiledIdxContainer.hpp ==
											// ============================================================================


										namespace alpaka::onAcc
										{
										    namespace traverse
										    {
										        /** Linearize the index domain for traversing.
										         *
										         * Maps each linear index into the M-dimensional index space.
										         * Mapping the linear index to an MD-index increases the number of computations (multiplications and
										         * additions) and can therefore slow down the performance.
										         */
										        struct Flat
										        {
										            /** Create the index container for this traversal.
										             *
										             * @param idxRange index range to traverse
										             * @param threadSpace description of the workers
										             * @param idxLayout index mapping tag
										             * @param cSelect compile-time vector with the dimensions to iterate
										             * @return FlatIdxContainer built from the four arguments
										             */
										            ALPAKA_FN_HOST_ACC static constexpr auto make(
										                auto const& idxRange,
										                auto const& threadSpace,
										                auto const& idxLayout,
										                alpaka::concepts::CVector auto const& cSelect)
										            {
										                return FlatIdxContainer{idxRange, threadSpace, idxLayout, cSelect};
										            }
										        };

										        // tag instance to select the flat traversal
										        constexpr auto flat = Flat{};

										        /** Traversing the index domain with MD-tiles
										         *
										         * The worker specification is seen as an MD-tile and iterating over the index space is done in a tiled,
										         * strided way. No multiplications are required (only additions), therefore fewer computations are
										         * required compared to @see Flat.
										         */
										        struct Tiled
										        {
										            /** Create the index container for this traversal.
										             *
										             * @param idxRange index range to traverse
										             * @param threadSpace description of the workers
										             * @param idxLayout index mapping tag
										             * @param cSelect compile-time vector with the dimensions to iterate
										             * @return TiledIdxContainer built from the four arguments
										             */
										            ALPAKA_FN_HOST_ACC static constexpr auto make(
										                auto const& idxRange,
										                auto const& threadSpace,
										                auto const& idxLayout,
										                alpaka::concepts::CVector auto const& cSelect)
										            {
										                return TiledIdxContainer{idxRange, threadSpace, idxLayout, cSelect};
										            }
										        };

										        // tag instance to select the tiled traversal
										        constexpr auto tiled = Tiled{};
										    } // namespace traverse
										} // namespace alpaka::onAcc
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/traverse.hpp ==
										// ============================================================================


									namespace alpaka::onAcc::internal
									{
									    namespace trait
									    {
									        /** Type trait to detect index mapping tags, false for every type without a specialization.
									         *
									         * The specializations below mark layout::Strided, layout::Optimized and layout::Contiguous as index
									         * mappings.
									         */
									        template<typename T>
									        struct IsIdxMapping : std::false_type
									        {
									        };

									        template<>
									        struct IsIdxMapping<layout::Strided> : std::true_type
									        {
									        };

									        template<>
									        struct IsIdxMapping<layout::Optimized> : std::true_type
									        {
									        };

									        template<>
									        struct IsIdxMapping<layout::Contiguous> : std::true_type
									        {
									        };

									        /** Shortcut for IsIdxMapping<T>::value */
									        template<typename T>
									        constexpr bool isIdxMapping_v = IsIdxMapping<T>::value;

									        /** Type trait to detect traversal tags, false for every type without a specialization.
									         *
									         * The specializations below mark traverse::Flat and traverse::Tiled as traversals.
									         */
									        template<typename T>
									        struct IsIdxTraversing : std::false_type
									        {
									        };

									        template<>
									        struct IsIdxTraversing<traverse::Flat> : std::true_type
									        {
									        };

									        template<>
									        struct IsIdxTraversing<traverse::Tiled> : std::true_type
									        {
									        };

									        /** Shortcut for IsIdxTraversing<T>::value */
									        template<typename T>
									        constexpr bool isIdxTraversing_v = IsIdxTraversing<T>::value;

									    } // namespace trait

									    struct MakeIter
									    {
									        /** Functor creating the index container for a domain specification.
									         *
									         * ALPAKA_FN_HOST_ACC is required for cuda else __host__ function called from __host__ __device__
									         * warning is popping up and generated code is wrong.
									         *
									         * @tparam T_ScalarIdxType scalar type the index range and thread space are cast to; `void` keeps
									         *                         the `IdxType` of the index range delivered by the domain specification
									         * @tparam T_Acc accelerator type
									         * @tparam T_DomainSpec type providing `getIdxRange(acc)` and `getThreadSpace(acc)`
									         * @tparam T_Traverse traversal policy; its static `make()` builds the returned container
									         * @tparam T_IdxMapping index mapping (layout) policy
									         */
									        template<
									            typename T_ScalarIdxType,
									            typename T_Acc,
									            typename T_DomainSpec,
									            typename T_Traverse,
									            typename T_IdxMapping>
									        struct Op
									        {
									        private:
									            /** Shared tail of both call operators.
									             *
									             * Queries index range and thread space from the domain specification, casts both to the
									             * selected index type and forwards them together with `mapping` to `T_Traverse::make()`.
									             * Declared ahead of the call operators because its return type is deduced.
									             */
									            ALPAKA_FN_HOST_ACC static constexpr auto build(
									                T_Acc const& acc,
									                T_DomainSpec const& domainSpec,
									                auto mapping)
									            {
									                auto const idxRange = domainSpec.getIdxRange(acc);
									                auto const threadSpace = domainSpec.getThreadSpace(acc);

									                using RangeType = ALPAKA_TYPEOF(idxRange);
									                using DistanceType = ALPAKA_TYPEOF(idxRange.distance());
									                // void means: keep the index type the range already uses
									                using IdxType = std::conditional_t<
									                    std::is_same_v<void, T_ScalarIdxType>,
									                    typename RangeType::IdxType,
									                    T_ScalarIdxType>;

									                return T_Traverse::make(
									                    pCast<IdxType>(idxRange),
									                    pCast<IdxType>(threadSpace),
									                    mapping,
									                    iotaCVec<typename DistanceType::type, DistanceType::dim()>());
									            }

									        public:
									            /** Overload taken for `layout::Optimized`: the passed policy object is not used, the
									             * mapping handed to the traversal is the result of `adjustMapping(acc)`.
									             */
									            ALPAKA_FN_HOST_ACC constexpr auto operator()(
									                T_Acc const& acc,
									                T_DomainSpec const& domainSpec,
									                [[maybe_unused]] T_Traverse traverse,
									                T_IdxMapping idxMapping) const requires std::is_same_v<ALPAKA_TYPEOF(idxMapping), layout::Optimized>
									            {
									                return build(acc, domainSpec, adjustMapping(acc));
									            }

									            /** Overload for every other mapping policy: the policy is passed on unchanged. */
									            ALPAKA_FN_HOST_ACC constexpr auto operator()(
									                T_Acc const& acc,
									                T_DomainSpec const& domainSpec,
									                [[maybe_unused]] T_Traverse traverse,
									                T_IdxMapping idxMapping) const
									            {
									                return build(acc, domainSpec, idxMapping);
									            }
									        };
									    };
									} // namespace alpaka::onAcc::internal
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/MakeIter.hpp ==
									// ============================================================================


								namespace alpaka::onAcc
								{
								    /** Couples a worker (thread) group description with an index range description.
								     *
								     * Both parts are stored by value. The accessors are private and only reachable by
								     * internal::MakeIter, which is declared as friend.
								     *
								     * @tparam T_WorkGroup type of the thread group description
								     * @tparam T_IdxRange type of the index range description
								     */
								    template<typename T_WorkGroup, typename T_IdxRange>
								    struct DomainSpec
								    {
								        constexpr DomainSpec(T_WorkGroup const threadGroup, T_IdxRange const idxRange)
								            : m_threadGroup{threadGroup}
								            , m_idxRange{idxRange}
								        {
								        }

								    private:
								        friend internal::MakeIter;

								        /** Index range of this domain.
								         *
								         * If the stored range object offers `getIdxRange(acc)` the result of that call is returned,
								         * otherwise a copy of the stored object itself.
								         */
								        constexpr auto getIdxRange([[maybe_unused]] auto const& acc) const
								        {
								            if constexpr(requires { std::declval<T_IdxRange>().getIdxRange(acc); })
								            {
								                return m_idxRange.getIdxRange(acc);
								            }
								            else
								            {
								                return m_idxRange;
								            }
								        }

								        /** Thread space of this domain.
								         *
								         * If the stored group object offers `getThreadSpace(acc)` the result of that call is returned,
								         * otherwise a copy of the stored object itself.
								         */
								        constexpr auto getThreadSpace([[maybe_unused]] auto const& acc) const
								        {
								            if constexpr(requires { std::declval<T_WorkGroup>().getThreadSpace(acc); })
								            {
								                return m_threadGroup.getThreadSpace(acc);
								            }
								            else
								            {
								                return m_threadGroup;
								            }
								        }

								        T_WorkGroup m_threadGroup;
								        T_IdxRange m_idxRange;
								    };
								} // namespace alpaka::onAcc
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/DomainSpec.hpp ==
								// ============================================================================

							// #include "alpaka/onAcc/internal/MakeIter.hpp"    // amalgamate: file already inlined
							// #include "alpaka/onAcc/internal/interface.hpp"    // amalgamate: file already inlined
							// #include "alpaka/onAcc/layout.hpp"    // amalgamate: file already inlined
							// #include "alpaka/onAcc/traverse.hpp"    // amalgamate: file already inlined

							/** functionality which is usable on the accelerator compute device from within a kernel. */
							namespace alpaka::onAcc
							{
							    namespace concepts
							    {
							        /** Satisfied by the types flagged as index mapping policy by internal::trait::IsIdxMapping. */
							        template<typename T>
							        concept IdxMapping = internal::trait::IsIdxMapping<T>::value;

							        /** Satisfied by the types flagged as index traversal policy by internal::trait::IsIdxTraversing. */
							        template<typename T>
							        concept IdxTraversing = internal::trait::IsIdxTraversing<T>::value;
							    } // namespace concepts

							    /**@{
							     * @name range‑based loop indexable index container
							     */

							    /** Creates an index container
							     *
							     * The scalar index type is not chosen by the caller; it is taken over from the supplied range.
							     * `T_Traverse` selects how a worker finds its next valid index (default: @c traverse::Flat),
							     * `T_IdxLayout` selects how indices are mapped to worker threads (default: @c layout::Optimized).
							     *
							     * @param acc       Accelerator the container is created for.
							     * @param workGroup Description of the participating thread group.  More than one
							     *                  thread can have the same index within the group; all workers
							     *                  with the same id obtain the same index as result.
							     * @param range     Index range description.
							     * @param traverse  Policy describing how the next value can be found.
							     * @param idxLayout Policy describing how real worker threads will be mapped to the range.
							     * @return An index container that can be used in a range-based for loop.
							     */
							    template<concepts::IdxTraversing T_Traverse = traverse::Flat, concepts::IdxMapping T_IdxLayout = layout::Optimized>
							    ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(
							        auto const& acc,
							        auto const workGroup,
							        auto const range,
							        T_Traverse traverse = T_Traverse{},
							        T_IdxLayout idxLayout = T_IdxLayout{})
							    {
							        using Domain = ALPAKA_TYPEOF(DomainSpec{workGroup, range});
							        // void: keep the index type of the range
							        using MakeOp = internal::MakeIter::Op<void, ALPAKA_TYPEOF(acc), Domain, T_Traverse, T_IdxLayout>;

							        return MakeOp{}(acc, Domain{workGroup, range}, traverse, idxLayout);
							    }

							    /** Creates an index container restricted to a boundary direction of the range.
							     *
							     * The range handed to the container is the result of `makeDirectionSubRange(range, bd)`.
							     * The dimensionality of `bd` must equal the dimensionality of `range` (checked at compile time).
							     * The scalar index type is taken over from the sub-range.
							     */
							    template<concepts::IdxTraversing T_Traverse = traverse::Flat, concepts::IdxMapping T_IdxLayout = layout::Optimized>
							    ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(
							        auto const& acc,
							        auto const workGroup,
							        auto const range,
							        alpaka::concepts::BoundaryDirection auto const& bd,
							        T_Traverse traverse = T_Traverse{},
							        T_IdxLayout idxLayout = T_IdxLayout{})
							    {
							        static_assert(ALPAKA_TYPEOF(bd)::dim() == ALPAKA_TYPEOF(range)::dim());

							        auto const subRange = makeDirectionSubRange(range, bd);

							        using Domain = ALPAKA_TYPEOF(DomainSpec{workGroup, subRange});
							        using MakeOp = internal::MakeIter::Op<void, ALPAKA_TYPEOF(acc), Domain, T_Traverse, T_IdxLayout>;

							        return MakeOp{}(acc, Domain{workGroup, subRange}, traverse, idxLayout);
							    }

							    /** Creates an index container with an explicitly selected scalar index type
							     *
							     * `T_Traverse` selects how a worker finds its next valid index (default: @c traverse::Flat),
							     * `T_IdxLayout` selects how indices are mapped to worker threads (default: @c layout::Optimized).
							     *
							     * @tparam T_ScalarIdxType scalar index type used for the indices inside the iterator
							     * @param acc       Accelerator the container is created for.
							     * @param workGroup Description of the participating thread group.  More than one
							     *                  thread can have the same index within the group; all workers
							     *                  with the same id obtain the same index as result.
							     * @param range     Index range description.
							     * @param traverse  Policy describing how the next value can be found.
							     * @param idxLayout Policy describing how real worker threads will be mapped to the range.
							     * @return An index container that can be used in a range-based for loop.
							     */
							    template<
							        typename T_ScalarIdxType,
							        concepts::IdxTraversing T_Traverse = traverse::Flat,
							        concepts::IdxMapping T_IdxLayout = layout::Optimized>
							    ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(
							        auto const& acc,
							        auto const workGroup,
							        auto const range,
							        T_Traverse traverse = T_Traverse{},
							        T_IdxLayout idxLayout = T_IdxLayout{})
							    {
							        using Domain = ALPAKA_TYPEOF(DomainSpec{workGroup, range});
							        using MakeOp
							            = internal::MakeIter::Op<T_ScalarIdxType, ALPAKA_TYPEOF(acc), Domain, T_Traverse, T_IdxLayout>;

							        return MakeOp{}(acc, Domain{workGroup, range}, traverse, idxLayout);
							    }

							    ///@cond NO_HTML
							    /** Overload for one-dimensional index ranges.
							     *
							     * Differs from the generic overload only in the default traversal policy, which is
							     * @c traverse::Tiled here. The scalar index type is taken over from the range.
							     */
							    template<
							        concepts::IdxTraversing T_Traverse = traverse::Tiled,
							        concepts::IdxMapping T_IdxLayout = layout::Optimized>
							    ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(
							        auto const& acc,
							        auto const workGroup,
							        alpaka::concepts::IdxRange auto const range,
							        T_Traverse traverse = T_Traverse{},
							        T_IdxLayout idxLayout = T_IdxLayout{}) requires(ALPAKA_TYPEOF(range)::dim() == 1u)
							    {
							        using Domain = ALPAKA_TYPEOF(DomainSpec{workGroup, range});
							        using MakeOp = internal::MakeIter::Op<void, ALPAKA_TYPEOF(acc), Domain, T_Traverse, T_IdxLayout>;

							        return MakeOp{}(acc, Domain{workGroup, range}, traverse, idxLayout);
							    }

							    /** Overload for one-dimensional index ranges with an explicitly selected scalar index type.
							     *
							     * Differs from the generic `T_ScalarIdxType` overload only in the default traversal policy,
							     * which is @c traverse::Tiled here.
							     *
							     * @tparam T_ScalarIdxType scalar index type used for the indices inside the iterator
							     */
							    template<
							        typename T_ScalarIdxType,
							        concepts::IdxTraversing T_Traverse = traverse::Tiled,
							        concepts::IdxMapping T_IdxLayout = layout::Optimized>
							    ALPAKA_FN_HOST_ACC constexpr auto makeIdxMap(
							        auto const& acc,
							        auto const workGroup,
							        alpaka::concepts::IdxRange auto const range,
							        T_Traverse traverse = T_Traverse{},
							        T_IdxLayout idxLayout = T_IdxLayout{}) requires(ALPAKA_TYPEOF(range)::dim() == 1u)
							    {
							        using Domain = ALPAKA_TYPEOF(DomainSpec{workGroup, range});
							        using MakeOp
							            = internal::MakeIter::Op<T_ScalarIdxType, ALPAKA_TYPEOF(acc), Domain, T_Traverse, T_IdxLayout>;

							        return MakeOp{}(acc, Domain{workGroup, range}, traverse, idxLayout);
							    }

							    ///@endcond NO_HTML
							    /** @} */
							} // namespace alpaka::onAcc
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/interface.hpp ==
							// ============================================================================


						// #include <cstdint>    // amalgamate: file already included
						#include <new>

						namespace alpaka::onAcc::internal
						{
						    /** concurrent foreach implementation */
						    template<typename T_Parent>
						    struct SimdConcurrent
						    {
						        constexpr SimdConcurrent() = default;

						    protected:
						        /** Calls `func` for every index of the domain spanned by `extents`.
						         *
						         * The SIMD pack configuration is queried at compile time from
						         * `T_Parent::calcSimdPackConfig<ValueType>()`, where the value type is the one of `data0`.
						         * A SIMD width of one is executed directly with SimdPtr objects of width one; every other
						         * width is delegated to concurrentSimdPackExecution().
						         *
						         * @tparam T_maxConcurrencyInByte passed to `calcSimdPackConfig()` as third argument
						         * @tparam T_MemAlignment alignment tag passed to every created SimdPtr
						         * @param acc accelerator
						         * @param extents extents of the index domain
						         * @param func functor called as `func(acc, SimdPtr...)`
						         * @param data0 first data source, defines the value type used for the SIMD configuration
						         * @param dataN further data sources
						         */
						        template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment>
						        ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(
						            auto const& acc,
						            alpaka::concepts::Vector auto extents,
						            auto&& func,
						            alpaka::concepts::IDataSource auto&& data0,
						            alpaka::concepts::IDataSource auto&&... dataN) const
						        {
						            using ExtentsVec = typename ALPAKA_TYPEOF(extents)::UniVec;
						            using ElemType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;

						            auto elementCount = ExtentsVec{extents};

						            constexpr auto packCfg = T_Parent::template calcSimdPackConfig<ElemType>(
						                ALPAKA_TYPEOF(acc.getApi()){},
						                ALPAKA_TYPEOF(acc.getDeviceKind()){},
						                T_maxConcurrencyInByte);
						            constexpr uint32_t packWidth = packCfg.simdWidth;

						            if constexpr(packWidth == 1u)
						            {
						                // scalar path: one functor call per index, SimdPtr width is one
						                auto const scalarIdxMap = onAcc::makeIdxMap(
						                    acc,
						                    asParent().getWorkGroup(),
						                    IdxRange{elementCount},
						                    asParent().getTraversePolicy(),
						                    asParent().getIdxLayoutPolicy());

						                for(auto idx : scalarIdxMap)
						                {
						                    func(
						                        acc,
						                        SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
						                        SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...);
						                }
						            }
						            else
						            {
						                // SIMD path: packs first, remainder handled inside the callee
						                constexpr uint32_t packsPerCall = packCfg.numSimdPacksPerFnCall;
						                concurrentSimdPackExecution<packWidth, packsPerCall, T_MemAlignment>(
						                    acc,
						                    elementCount,
						                    ALPAKA_FORWARD(func),
						                    ALPAKA_FORWARD(data0),
						                    ALPAKA_FORWARD(dataN)...);
						            }
						        }

						    private:
						        /** Access this object through the type `T_Parent`.
						         *
						         * `T_Parent` is expected to be the class deriving from this one.
						         */
						        constexpr auto const& asParent() const
						        {
						            return *static_cast<T_Parent const*>(this);
						        }

						        /** Single functor call for one data index.
						         *
						         * Every data source is wrapped into a SimdPtr located at `dataIdx` with the alignment tag
						         * `T_MemAlignment` and the width `T_width`; the functor receives `acc` followed by these
						         * SimdPtr objects.
						         */
						        template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width>
						        ALPAKA_FN_INLINE static constexpr void executeDo(
						            auto const& acc,
						            auto const& dataIdx,
						            auto&& func,
						            alpaka::concepts::IDataSource auto&&... data)
						        {
						            using PackWidth = CVec<uint32_t, T_width>;

						            func(acc, SimdPtr{ALPAKA_FORWARD(data), dataIdx, T_MemAlignment{}, PackWidth{}}...);
						        }

						        /** Calls the functor once per element of the `T_repeat` pack, each time with the next index of `iter`.
						         *
						         * The calls to the functor are independent and compile time unrolled to support instruction parallelism.
						         *
						         * @tparam T_MemAlignment alignment tag handed to executeDo()
						         * @tparam T_width SIMD width handed to executeDo()
						         * @tparam T_repeat values of the integer sequence; only their count matters
						         * @param acc accelerator, passed through to the functor
						         * @param iter the caller must ensure that the iterator can be increased sizeof...(T_repeat) times without
						         * jumping over iter.end(); on return the iterator has been advanced by that many steps
						         * @param func functor to call
						         * @param data data sources handed to executeDo() for every index
						         */
						        template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width, uint32_t... T_repeat>
						        ALPAKA_FN_INLINE static constexpr void execute(
						            auto const& acc,
						            auto& iter,
						            std::integer_sequence<uint32_t, T_repeat...>,
						            auto&& func,
						            alpaka::concepts::IDataSource auto&&... data)
						        {
						            /* We do not check if the iterator points to a valid element, the caller must ensure that we can safely
						             * increase the iterator without jumping over iter.end().
						             *
						             * Both branches of the ternary operator are identical; it only exists to make the expression depend
						             * on T_repeat so that the pack expansion can be applied to iter.
						             *
						             * NOTE(review): the post-increments are arguments of a single function call, the language does not
						             * fix the order in which they are evaluated. All indices are collected either way, but their position
						             * inside the tuple is presumably not guaranteed to be ascending - confirm this is acceptable.
						             */
						            auto ids = std::make_tuple(*(T_repeat + 1 != 0u ? iter++ : iter++)...);
						            // one executeDo() call per collected index
						            std::apply(
						                [&](auto const&... dataIdx) constexpr
						                {
						                    (executeDo<T_MemAlignment, T_width>(acc, dataIdx, ALPAKA_FORWARD(func), ALPAKA_FORWARD(data)...),
						                     ...);
						                },
						                ids);
						        }

						        /** Executes the functor with SIMD packs and handles the remaining elements with width one.
						         *
						         * Only the last component (`back()`) of `numElements` is processed in packs. The method runs two
						         * phases:
						         *  - pack phase: indices below `remainderBegin` in the last dimension are visited through execute(),
						         *    which consumes `T_numSimdPerFnCall` iterator positions per call and creates SimdPtr objects of
						         *    width `T_simdWidth`
						         *  - remainder phase: the indices from `remainderBegin` up to `numElements` are visited one by one with
						         *    SimdPtr objects of width one
						         *
						         * @tparam T_simdWidth SIMD width used in the pack phase
						         * @tparam T_numSimdPerFnCall number of indices processed per execute() call
						         * @tparam T_MemAlignment alignment tag passed to every created SimdPtr
						         * @param acc accelerator
						         * @param numElements number of elements per dimension
						         * @param func functor called as `func(acc, SimdPtr...)`
						         * @param data0 first data source
						         * @param dataN further data sources
						         */
						        template<uint32_t T_simdWidth, uint32_t T_numSimdPerFnCall, alpaka::concepts::Alignment T_MemAlignment>
						        ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto concurrentSimdPackExecution(
						            auto const& acc,
						            alpaka::concepts::Vector auto numElements,
						            auto&& func,
						            alpaka::concepts::IDataSource auto&& data0,
						            alpaka::concepts::IDataSource auto&&... dataN) const
						        {
						            auto const workGroup = asParent().getWorkGroup();

						            // we SIMDfy only over the fast moving dimension (columns of memory)
						            auto const wSize = workGroup.size(acc).back();

						            /* Number of data elements processed per functor call. */
						            auto const numElementsPerFnCall = T_simdWidth * T_numSimdPerFnCall;
						            /** To avoid an overflow in the index range we divide first by the number of elements per
						             * function call and then by the number of workers.
						             */
						            auto const numSimdPackLoops = numElements.back() / numElementsPerFnCall / wSize;

						            // number of elements to jump over to start the remainder loop
						            auto const remainderBegin = numSimdPackLoops * numElementsPerFnCall * wSize;

						            // we SIMDfy only over the fast moving dimension (columns of memory)
						            // domainSize: numElements with the last component replaced by remainderBegin
						            auto domainSize = numElements.rAssign(remainderBegin);
						            // stride: one in all dimensions, T_simdWidth in the last dimension
						            auto stride = ALPAKA_TYPEOF(numElements)::fill(1).rAssign(T_simdWidth);
						            using IdxType = ALPAKA_TYPEOF(numElements);

						            if constexpr(
						                domainSize.dim() > 1u && std::is_same_v<ALPAKA_TYPEOF(asParent().getTraversePolicy()), traverse::Flat>)
						            {
						                /* For cases where we traverse with the flat policy, we cannot assume that we can blindly increase the
						                 * iterator later N times. This could happen in cases where we have enough concurrency. We evaluate for
						                 * SIMD operations only the fast moving dimension but with the flat policy flattening the worker group
						                 * and use all workers on a linear domain. The loop must therefore be split into iterating over all
						                 * slow dimensions and an inner loop iterating over the fast moving dimension. For this we need to
						                 * build our own groups out of the user-provided workgroup.
						                 */
						                // build a worker group with slow-moving dimension threads for the outer loop
						                using index_type = typename IdxType::type;
						                auto wIdx = workGroup.idx(acc).rAssign(index_type{0});
						                // note: this vector shadows the scalar wSize declared at function scope
						                auto wSize = workGroup.size(acc).rAssign(index_type{1});
						                auto domSize = domainSize.rAssign(index_type{1});

						                auto wOuter = WorkerGroup{wIdx, wSize};

						                for(auto rowIdx : onAcc::makeIdxMap(
						                        acc,
						                        wOuter,
						                        IdxRange{domSize},
						                        asParent().getTraversePolicy(),
						                        asParent().getIdxLayoutPolicy()))
						                {
						                    // build a worker group with fast-moving dimension threads for the inner loop
						                    auto wIdxInner = ALPAKA_TYPEOF(domainSize)::fill(0).rAssign(workGroup.idx(acc).back());
						                    auto wSizeInner = ALPAKA_TYPEOF(domainSize)::fill(1).rAssign(workGroup.size(acc).back());
						                    auto wInner = WorkerGroup{wIdxInner, wSizeInner};

						                    // iterate over the fast-moving dimension
						                    auto simdIdxContainer = onAcc::makeIdxMap(
						                        acc,
						                        wInner,
						                        IdxRange{rowIdx, domainSize, stride},
						                        asParent().getTraversePolicy(),
						                        asParent().getIdxLayoutPolicy())[CVec<uint32_t, ALPAKA_TYPEOF(domainSize)::dim() - 1u>{}];

						                    // no increment in the loop header: execute() advances iter
						                    for(auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
						                    {
						                        execute<T_MemAlignment, T_simdWidth>(
						                            acc,
						                            iter,
						                            std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
						                            ALPAKA_FORWARD(func),
						                            ALPAKA_FORWARD(data0),
						                            ALPAKA_FORWARD(dataN)...);
						                    }
						                }
						            }
						            else
						            {
						                auto simdIdxContainer = onAcc::makeIdxMap(
						                    acc,
						                    workGroup,
						                    IdxRange{IdxType::fill(0), domainSize, stride},
						                    asParent().getTraversePolicy(),
						                    asParent().getIdxLayoutPolicy());

						                // no increment in the loop header: execute() advances iter
						                for(auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
						                {
						                    execute<T_MemAlignment, T_simdWidth>(
						                        acc,
						                        iter,
						                        std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>{},
						                        ALPAKA_FORWARD(func),
						                        ALPAKA_FORWARD(data0),
						                        ALPAKA_FORWARD(dataN)...);
						                }
						            }

						            // begin of the remainder: zero in all dimensions, remainderBegin in the last dimension
						            ALPAKA_TYPEOF(numElements) remainderDomainSize = numElements.fill(0).rAssign(remainderBegin);

						            // remainder phase: element wise execution with SIMD width one
						            for(auto idx : onAcc::makeIdxMap(
						                    acc,
						                    workGroup,
						                    IdxRange{remainderDomainSize, numElements},
						                    asParent().getTraversePolicy(),
						                    asParent().getIdxLayoutPolicy()))
						            {
						                func(
						                    acc,
						                    SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
						                    SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...);
						            }
						        }
						    };
						} // namespace alpaka::onAcc::internal
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/SimdConcurrent.hpp ==
						// ============================================================================

						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/SimdTransformReduce.hpp ==
						// ==
						/* Copyright 2024 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/Simd.hpp"    // amalgamate: file already inlined
						// #include "alpaka/SimdPtr.hpp"    // amalgamate: file already inlined
						// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
						// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
						// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/functor.hpp ==
							// ==
							/* Copyright 2025 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

							#include <type_traits>
							// #include <utility>    // amalgamate: file already included

							namespace alpaka
							{
							    /** Marks a functor which supports SimdPtr as arguments
							     *
							     * Wrapping a functor or lambda with this class to signal support for SimdPtr.
							     * A stencil functor can be used to write stencil operations within a transform call.
							     *
							     * The wrapper derives from the functor, so the call operator(s) of `T_Func` stay available.
							     *
							     * @tparam T_Func functor type; must be a class type because it is used as base class
							     */
							    template<typename T_Func>
							    struct StencilFunc : T_Func
							    {
							        using Functor = T_Func;

							        /** Initialises the functor base from the forwarded argument. */
							        constexpr StencilFunc(auto&& func) : T_Func{ALPAKA_FORWARD(func)}
							        {
							        }
							    };

							    /** Deduction guide: wrap the decayed functor type.
							     *
							     * `T_Func&&` is a forwarding reference, for an lvalue argument `T_Func` is deduced as reference type.
							     * A reference cannot be used as base class, therefore references and cv-qualifiers are stripped;
							     * an lvalue functor is copied into the wrapper.
							     */
							    template<typename T_Func>
							    ALPAKA_FN_HOST_ACC StencilFunc(T_Func&&) -> StencilFunc<std::decay_t<T_Func>>;

							    /** Marks a functor that can only be executed with scalar types and not SIMD packages.
							     *
							     * The functor will be executed element wise for SIMD packages due to methods used which prevent using SIMD
							     * packages directly.
							     *
							     * The wrapper derives from the functor, so the call operator(s) of `T_Func` stay available.
							     *
							     * @tparam T_Func functor type; must be a class type because it is used as base class
							     */
							    template<typename T_Func>
							    struct ScalarFunc : T_Func
							    {
							        using Functor = T_Func;

							        /** Initialises the functor base from the forwarded argument. */
							        constexpr ScalarFunc(auto&& func) : T_Func{ALPAKA_FORWARD(func)}
							        {
							        }
							    };

							    /** Deduction guide: wrap the decayed functor type.
							     *
							     * `T_Func&&` is a forwarding reference, for an lvalue argument `T_Func` is deduced as reference type.
							     * A reference cannot be used as base class, therefore references and cv-qualifiers are stripped;
							     * an lvalue functor is copied into the wrapper.
							     */
							    template<typename T_Func>
							    ALPAKA_FN_HOST_ACC ScalarFunc(T_Func&&) -> ScalarFunc<std::decay_t<T_Func>>;

							    /** Execute the functor with or without an accelerator as first argument
							     *
							     * The functor is not allowed to have both possible signatures.
							     *
							     * @{
							     */
							    template<typename T_Acc, typename T_Functor, typename... T_Args>
							    requires std::invocable<T_Functor, T_Acc, T_Args...>
							    inline constexpr auto callFunctor(T_Acc const& acc, T_Functor&& functor, T_Args&&... args)
							    {
							        return functor(acc, std::forward<T_Args>(args)...);
							    }

							    template<typename T_Acc, typename T_Functor, typename... T_Args>
							    requires std::invocable<T_Functor, T_Args...>
							    inline constexpr auto callFunctor(T_Acc const&, T_Functor&& functor, T_Args&&... args)
							    {
							        return functor(std::forward<T_Args>(args)...);
							    }

							    /** @} */
							} // namespace alpaka
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/functor.hpp ==
							// ============================================================================

						// #include "alpaka/mem/concepts/IDataSource.hpp"    // amalgamate: file already inlined
						// #include "alpaka/mem/concepts/IDataStorage.hpp"    // amalgamate: file already inlined
							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/Acc.hpp ==
							// ==
							/* Copyright 2024 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
							// #include "alpaka/core/Dict.hpp"    // amalgamate: file already inlined
							// #include "alpaka/core/Tag.hpp"    // amalgamate: file already inlined
							// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/MdSpanArray.hpp ==
								// ==
								/* Copyright 2025 René Widera, Simeon Ehrig
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
								// #include "alpaka/CVec.hpp"    // amalgamate: file already inlined
								// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
								// #include "alpaka/concepts/types.hpp"    // amalgamate: file already inlined
								// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
								// #include "alpaka/mem/Alignment.hpp"    // amalgamate: file already inlined
								// #include "alpaka/mem/DataPitches.hpp"    // amalgamate: file already inlined
								// #include "alpaka/mem/MdForwardIter.hpp"    // amalgamate: file already inlined
								// #include "alpaka/mem/concepts/detail/InnerTypeAllowedCast.hpp"    // amalgamate: file already inlined
								// #include "alpaka/mem/trait.hpp"    // amalgamate: file already inlined
								// #include "alpaka/trait.hpp"    // amalgamate: file already inlined
								// #include "concepts/IndexVec.hpp"    // amalgamate: file already inlined

								// #include <concepts>    // amalgamate: file already included
								#include <type_traits>

								namespace alpaka
								{
								    /** access a C array with compile time extents via a runtime md index.
								     *
								     * Each recursion step applies `idx[T_dim]` to the array and continues with the next component of
								     * the index on the selected sub-array until one dimension is left.
								     *
								     * The recursion end is detected by value, independent of the integral type of `T_numDims`.
								     *
								     * @tparam T_numDims number of dimensions which still must be resolved
								     * @tparam T_dim component of the index applied in this step
								     */
								    template<std::integral auto T_numDims, uint32_t T_dim = 0u>
								    struct ResolveArrayAccess
								    {
								        /** @param arrayPtr array (decayed to a pointer) to index into
								         *  @param idx multidimensional index
								         *  @return result of the last subscript operation; the value category is kept (decltype(auto))
								         */
								        constexpr decltype(auto) operator()(auto arrayPtr, concepts::Vector auto const& idx) const
								        {
								            if constexpr(T_numDims == decltype(T_numDims){1})
								            {
								                // last dimension: the subscript yields the element
								                return arrayPtr[idx[T_dim]];
								            }
								            else
								            {
								                return ResolveArrayAccess<T_numDims - 1u, T_dim + 1u>{}(arrayPtr[idx[T_dim]], idx);
								            }
								        }
								    };

								    /** build C array type with compile time extents from a scalar value based on the compile time extents vector
								     *
								     * For the extents (e0, e1, ..., eN) the resulting type is `T[e0][e1]...[eN]`.
								     * The type is built from the inside to the outside: each step wraps the type created so far into an
								     * array with the extent of the last dimension which is not handled yet. `T_numDims` is reduced by one
								     * in every step, therefore this extent is always located at `T_numDims - 1`.
								     *
								     * @tparam T element type in the first step, later the array type built so far
								     * @tparam T_Extents compile time extents vector
								     * @tparam T_numDims number of dimensions which are not handled yet
								     * @tparam T_dim number of already performed steps; not required to select the extent
								     */
								    template<typename T, concepts::CVector T_Extents, uint32_t T_numDims = T_Extents::dim(), uint32_t T_dim = 0u>
								    struct CArrayType
								    {
								        using type = typename CArrayType<T[T_Extents{}[T_numDims - 1u]], T_Extents, T_numDims - 1u, T_dim + 1u>::type;
								    };

								    /** Recursion end: the outermost array dimension uses the first extent. */
								    template<typename T, concepts::CVector T_Extents, uint32_t T_dim>
								    struct CArrayType<T, T_Extents, 1u, T_dim>
								    {
								        using type = T[T_Extents{}[0u]];
								    };

								    /** Primary template, selected for every type not handled by a specialization.
								     *
								     * Instantiating it always triggers the static assertion below.
								     *
								     * @tparam T_ArrayType array type
								     * @tparam T_IndexType integral index type
								     * @tparam T_MemAlignment alignment tag, defaults to `Alignment<>`
								     */
								    template<typename T_ArrayType, std::integral T_IndexType, concepts::Alignment T_MemAlignment = Alignment<>>
								    struct MdSpanArray
								    {
								        // sizeof(T_ArrayType) makes the condition depend on the template parameter, the assertion is therefore
								        // evaluated on instantiation only
								        static_assert(
								            sizeof(T_ArrayType) && false,
								            "MdSpanArray can only be used if std::is_array_v<T> is true for the given type.");
								    };

								    /** Multi-dimensional view onto a C array with compile time extents.
								     *
								     * The view only stores a pointer to the array, it does not own or copy the elements.
								     *
								     * @tparam T_ArrayType C static array type, e.g. `float[5][8]` or `float const[5][8]`
								     * @tparam T_IndexType integral type used for indices and extents
								     * @tparam T_MemAlignment alignment type reported by getAlignment()
								     */
								    template<alpaka::concepts::CStaticArray T_ArrayType, std::integral T_IndexType, concepts::Alignment T_MemAlignment>
								    struct MdSpanArray<T_ArrayType, T_IndexType, T_MemAlignment>
								    {
								    private:
								        // array type without cv qualifiers, this is the type the stored pointer refers to
								        using MutArrayType = std::remove_cv_t<T_ArrayType>;
								        // const qualified counterpart, used to create read-only views
								        using ConstArrayType = std::add_const_t<MutArrayType>;

								    public:
								        // element type of the array, all array extents removed
								        using value_type = std::remove_all_extents_t<T_ArrayType>;
								        using reference = value_type&;
								        using const_reference = value_type const&;
								        using pointer = value_type*;
								        using const_pointer = value_type const*;
								        using index_type = T_IndexType;

								        /** number of dimensions of the array type */
								        static consteval uint32_t dim()
								        {
								            return std::rank_v<T_ArrayType>;
								        }

								        /** return value the origin pointer is pointing to
								         *
								         * @return reference to the first element of the array
								         * @{
								         */
								        constexpr const_reference operator*() const
								        {
								            return *this->data();
								        }

								        constexpr reference operator*()
								        {
								            return *this->data();
								        }

								        /** @} */

								        /** get origin pointer
								         *
								         * The pointer to the array is reinterpreted as pointer to the element type.
								         * NOTE(review): reinterpret_cast is not permitted in constant expressions, these functions can
								         * therefore not be evaluated at compile time although they are declared constexpr.
								         *
								         * @{
								         */
								        constexpr const_pointer data() const
								        {
								            return reinterpret_cast<const_pointer>(this->m_ptr);
								        }

								        constexpr pointer data()
								        {
								            return reinterpret_cast<pointer>(this->m_ptr);
								        }

								        /** @} */

								        /** begin iterator; the const overload iterates over a read-only view of the array */
								        constexpr auto begin() const
								        {
								            return MdForwardIter{this->getConstMdSpan()};
								        }

								        constexpr auto begin()
								        {
								            return MdForwardIter{*this};
								        }

								        /** end iterator; the const overload is created from a read-only view of the array */
								        constexpr auto end() const
								        {
								            return MdForwardIterEnd{this->getConstMdSpan()};
								        }

								        constexpr auto end()
								        {
								            return MdForwardIterEnd{*this};
								        }

								        /** create a view with const array type which refers to the same array */
								        constexpr auto getConstMdSpan() const
								        {
								            return MdSpanArray<ConstArrayType, T_IndexType, T_MemAlignment>(*m_ptr);
								        }

								        /** begin iterator over a read-only view of the array */
								        constexpr auto cbegin() const
								        {
								            return MdForwardIter{this->getConstMdSpan()};
								        }

								        /** end iterator of a read-only view of the array */
								        constexpr auto cend() const
								        {
								            return MdForwardIterEnd{this->getConstMdSpan()};
								        }

								        /* The defaulted constructor does not set m_ptr. A default constructed object must be initialized by
								         * copying/assigning a valid instance before it is used.
								         */
								        constexpr MdSpanArray() = default;

								        /** Constructor
								         *
								         * @param staticSizedArray array the view refers to; only its address is stored, the elements are not
								         *                         copied
								         */
								        constexpr MdSpanArray(T_ArrayType& staticSizedArray) : m_ptr(const_cast<MutArrayType*>(&staticSizedArray))
								        {
								        }

								        /** Converting copy constructor
								         *
								         * Creates a view from a view with another array type if
								         * `internal::concepts::InnerTypeAllowedCast` permits the conversion. Both views refer to the same
								         * array afterwards.
								         */
								        template<alpaka::concepts::CStaticArray T_OtherArrayType>
								        requires internal::concepts::InnerTypeAllowedCast<T_ArrayType, T_OtherArrayType>
								        constexpr MdSpanArray(MdSpanArray<T_OtherArrayType, T_IndexType, T_MemAlignment> const& other)
								            : m_ptr(other.m_ptr)
								        {
								        }

								        constexpr MdSpanArray(MdSpanArray const&) = default;

								        /** Assignment operator keeping const-ness
								         *
								         * alpaka keeps track that const references to a span can not be cast to non const.
								         * The assignment from a const reference is only available if the inner value type is const.
								         * If the inner value type is not const the second assignment operator taking a non const reference
								         * will be used.
								         */
								        constexpr MdSpanArray& operator=(MdSpanArray const& other)
								            requires(std::is_const_v<typename ALPAKA_TYPEOF(other)::value_type>)
								            = default;
								        constexpr MdSpanArray& operator=(MdSpanArray&) = default;

								        constexpr MdSpanArray(MdSpanArray&&) = default;

								        /** Converting move constructor
								         *
								         * Same conversion rules as the converting copy constructor. Only the pointer is copied, `other` is
								         * not modified.
								         */
								        template<alpaka::concepts::CStaticArray T_OtherArrayType>
								        requires internal::concepts::InnerTypeAllowedCast<T_ArrayType, T_OtherArrayType>
								        constexpr MdSpanArray(MdSpanArray<T_OtherArrayType, T_IndexType, T_MemAlignment>&& other) : m_ptr(other.m_ptr)
								        {
								        }

								        constexpr MdSpanArray& operator=(MdSpanArray&&) = default;

								        /** get the alignment object, a default constructed instance of T_MemAlignment */
								        static constexpr auto getAlignment()
								        {
								            return T_MemAlignment{};
								        }

								        /** get value at the given index
								         *
								         * @param idx multi-dimensional index with one component per array dimension; component zero selects
								         *            the outermost array dimension
								         * @return reference to the value
								         * @{
								         */
								        constexpr const_reference operator[](
								            // cannot use dim() or std::rank_v<T_ArrayType> because they cause a segmentation fault in nvcc
								            concepts::IndexVec<index_type, std::rank<T_ArrayType>::value> auto const& idx) const
								        {
								            return ResolveArrayAccess<dim()>{}(*m_ptr, idx);
								        }

								        constexpr reference operator[](
								            // cannot use dim() or std::rank_v<T_ArrayType> because they cause a segmentation fault in nvcc
								            concepts::IndexVec<index_type, std::rank<T_ArrayType>::value> auto const& idx)
								        {
								            return ResolveArrayAccess<dim()>{}(*m_ptr, idx);
								        }

								        // Scalar index access: indexes the outermost array dimension only.
								        // NOTE(review): for arrays with more than one dimension `(*m_ptr)[idx]` is a sub-array which is not
								        // convertible to a reference to value_type, so these overloads look usable for 1D arrays only.
								        constexpr const_reference operator[](index_type const& idx) const
								        {
								            return (*m_ptr)[idx];
								        }

								        constexpr reference operator[](index_type const& idx)
								        {
								            return (*m_ptr)[idx];
								        }

								        /** @} */

								        /** Two views are equal if they refer to the same array; the elements are not compared. */
								        constexpr bool operator==(MdSpanArray const other) const
								        {
								            return m_ptr == other.m_ptr;
								        }

								        /** get the extents of the array as compile time vector
								         *
								         * @return CVec with one component per dimension, component `d` is `std::extent_v<T_ArrayType, d>`
								         */
								        constexpr auto getExtents() const
								        {
								            // uint32_t is the data type of dim()
								            auto const createExtents = []<uint32_t... T_extent>(std::integer_sequence<uint32_t, T_extent...>)
								            { return CVec<index_type, std::extent_v<T_ArrayType, T_extent>...>{}; };
								            return createExtents(std::make_integer_sequence<uint32_t, dim()>{});
								        }

								        /** get the pitches, derived from the extents via `alpaka::calculatePitchesFromExtents<value_type>` */
								        constexpr auto getPitches() const
								        {
								            return alpaka::calculatePitchesFromExtents<value_type>(getExtents());
								        }

								        /** True if MdSpanArray is pointing to valid memory.
								         *
								         * @details
								         * This implementation always returns true, m_ptr is not inspected.
								         * An MdSpanArray remains valid even after being moved. The reason is that it uses stack memory which
								         * cannot be freed.
								         */
								        [[nodiscard]] constexpr explicit operator bool() const noexcept
								        {
								            return true;
								        }

								        // Needs to be a friend of itself so that the converting copy and move constructors can access the m_ptr
								        // of other, if the const modifier of the C static array type of the other type is different.
								        friend MdSpanArray<MutArrayType, T_IndexType, T_MemAlignment>;
								        friend MdSpanArray<ConstArrayType, T_IndexType, T_MemAlignment>;

								    protected:
								        // We store the pointer to the C static array as mutable type, so that it can be assigned from another
								        // MdSpanArray with const or non-const inner type.
								        // Depending on the value_type, the const is added at memory access.
								        // There is no default member initializer: after default construction the pointer is not set.
								        MutArrayType* m_ptr;
								    };

								    /** Trait specialization for MdSpanArray, derived from the true type.
								     *
								     * `InnerMutable` is the view type with the const qualifier removed from the array type, `InnerConst` the
								     * view type with a const qualified array type.
								     */
								    template<
								        alpaka::concepts::CStaticArray T_ArrayType,
								        std::integral T_IndexType,
								        alpaka::concepts::Alignment T_MemAlignment>
								    struct internal::CopyConstructableDataSource<MdSpanArray<T_ArrayType, T_IndexType, T_MemAlignment>>
								        : std::bool_constant<true>
								    {
								        using InnerMutable = MdSpanArray<std::remove_const_t<T_ArrayType>, T_IndexType, T_MemAlignment>;
								        using InnerConst = MdSpanArray<T_ArrayType const, T_IndexType, T_MemAlignment>;
								    };
								} // namespace alpaka
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/MdSpanArray.hpp ==
								// ============================================================================

								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/meta/NdLoop.hpp ==
								// ==
								/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Jan Stephan, Bernhard Manfred Gruber
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
								// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
								// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
									// ============================================================================
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/meta/IntegerSequence.hpp ==
									// ==
									/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Bernhard Manfred Gruber
									 * SPDX-License-Identifier: MPL-2.0
									 */

									// #pragma once
										// ============================================================================
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/meta/Set.hpp ==
										// ==
										/* Copyright 2022 Benjamin Worpitz, Bernhard Manfred Gruber
										 * SPDX-License-Identifier: MPL-2.0
										 */

										// #pragma once
										// #include <utility>    // amalgamate: file already included

										namespace alpaka::meta
										{
										    namespace detail
										    {
										        //! Empty tag type, a distinct type for each T.
										        template<typename TTag>
										        struct Empty
										        {
										        };

										        //! Uniqueness check of a parameter pack, see the specializations.
										        template<typename... TTypes>
										        struct IsParameterPackSetImpl;

										        //! An empty parameter pack contains no duplicates.
										        template<>
										        struct IsParameterPackSetImpl<>
										        {
										            static constexpr bool value = true;
										        };

										        // Based on code by Roland Bock: https://gist.github.com/rbock/ad8eedde80c060132a18
										        // The implementation inherits linearly from Empty<T> for each type of the pack. The head type is a
										        // duplicate if the implementation of the tail is already derived from Empty<head>.
										        template<typename THead, typename... TTail>
										        struct IsParameterPackSetImpl<THead, TTail...>
										            : public IsParameterPackSetImpl<TTail...>
										            , public virtual Empty<THead>
										        {
										            using Base = IsParameterPackSetImpl<TTail...>;

										            static constexpr bool value = !std::is_base_of<Empty<THead>, Base>::value && Base::value;
										        };
										    } // namespace detail

										    //! Trait that tells if the parameter pack contains only unique (no equal) types.
										    //! The result is available via the static member `value`.
										    template<typename... Ts>
										    using IsParameterPackSet = detail::IsParameterPackSetImpl<Ts...>;

										    namespace detail
										    {
										        //! Only specialized for instances of variadic type lists.
										        template<typename TTypeList>
										        struct IsSetImpl;

										        //! Unpacks the template arguments of the list and forwards them to IsParameterPackSet.
										        template<template<typename...> class TTypeList, typename... TEntries>
										        struct IsSetImpl<TTypeList<TEntries...>>
										        {
										            static constexpr bool value = IsParameterPackSet<TEntries...>::value;
										        };
										    } // namespace detail

										    //! Trait that tells if the template contains only unique (no equal) types.
										    //! The result is available via the static member `value`.
										    template<typename TList>
										    using IsSet = detail::IsSetImpl<TList>;
										} // namespace alpaka::meta
										// ==
										// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/meta/Set.hpp ==
										// ============================================================================


									// #include <cstddef>    // amalgamate: file already included
									#include <type_traits>
									// #include <utility>    // amalgamate: file already included

									namespace alpaka::meta
									{
									    namespace detail
									    {
									        //! Only specialized for std::integer_sequence.
									        template<typename TTarget, typename TSequence>
									        struct ConvertIntegerSequence;

									        //! Casts each value of the source sequence to the target value type.
									        template<typename TTarget, typename TSource, TSource... Tvalues>
									        struct ConvertIntegerSequence<TTarget, std::integer_sequence<TSource, Tvalues...>>
									        {
									            using type = std::integer_sequence<TTarget, static_cast<TTarget>(Tvalues)...>;
									        };
									    } // namespace detail

									    //! Integer sequence holding the values of TIntegerSequence converted to the value type TDstType.
									    template<typename TDstType, typename TIntegerSequence>
									    using ConvertIntegerSequence = typename detail::ConvertIntegerSequence<TDstType, TIntegerSequence>::type;

									    namespace detail
									    {
									        //! Builds an integer sequence back to front.
									        //!
									        //! The primary template is only selected for combinations which are not covered by the
									        //! specializations below; it rejects negative sizes.
									        template<bool TnegativeSize, bool TreachedBegin, typename T, T Tfirst, typename TCursor, typename TResult>
									        struct MakeIntegerSequenceHelper
									        {
									            static_assert(!TnegativeSize, "MakeIntegerSequence<T, N> requires N to be non-negative.");
									        };

									        //! Recursion end: the cursor arrived at the first value, the collected values are the result.
									        template<typename T, T Tfirst, T... Tcollected>
									        struct MakeIntegerSequenceHelper<
									            false,
									            true,
									            T,
									            Tfirst,
									            std::integral_constant<T, Tfirst>,
									            std::integer_sequence<T, Tcollected...>>
									        {
									            using type = std::integer_sequence<T, Tcollected...>;
									        };

									        //! Recursion step: prepend `Tcursor - 1` to the collected values and move the cursor one down.
									        template<typename T, T Tfirst, T Tcursor, T... Tcollected>
									        struct MakeIntegerSequenceHelper<
									            false,
									            false,
									            T,
									            Tfirst,
									            std::integral_constant<T, Tcursor>,
									            std::integer_sequence<T, Tcollected...>>
									        {
									            using type = typename MakeIntegerSequenceHelper<
									                false,
									                // the next cursor position is the first value, so the next step ends the recursion
									                (Tcursor - 1) == Tfirst,
									                T,
									                Tfirst,
									                std::integral_constant<T, Tcursor - 1>,
									                std::integer_sequence<T, Tcursor - 1, Tcollected...>>::type;
									        };
									    } // namespace detail

									    //! Integer sequence with the Tsize consecutive values Tbegin, Tbegin + 1, ..., Tbegin + Tsize - 1.
									    template<typename T, T Tbegin, T Tsize>
									    using MakeIntegerSequenceOffset = typename detail::MakeIntegerSequenceHelper<
									        (Tsize < 0),
									        (Tsize == 0),
									        T,
									        Tbegin,
									        std::integral_constant<T, Tbegin + Tsize>,
									        std::integer_sequence<T>>::type;

									    //! Checks if the integral values are unique.
									    template<typename T, T... Tvals>
									    struct IntegralValuesUnique
									    {
									        static constexpr bool value = meta::IsParameterPackSet<std::integral_constant<T, Tvals>...>::value;
									    };

									    //! Checks if the values in the index sequence are unique.
									    template<typename TIntegerSequence>
									    struct IntegerSequenceValuesUnique;

									    //! Checks if the values in the index sequence are unique.
									    template<typename T, T... Tvals>
									    struct IntegerSequenceValuesUnique<std::integer_sequence<T, Tvals...>>
									    {
									        static constexpr bool value = IntegralValuesUnique<T, Tvals...>::value;
									    };

									    //! Checks if the integral values are within the given range.
									    //!
									    //! Both bounds are inclusive, i.e. each value must fulfill Tmin <= value <= Tmax.
									    //! An empty list of values is always in range.
									    template<typename T, T Tmin, T Tmax, T... Tvals>
									    struct IntegralValuesInRange
									    {
									        static constexpr bool value = (((Tvals >= Tmin) && (Tvals <= Tmax)) && ...);
									    };

									    //! Checks if the values in the index sequence are within the given range.
									    //! Only specialized for std::integer_sequence.
									    template<typename TIntegerSequence, typename T, T Tmin, T Tmax>
									    struct IntegerSequenceValuesInRange;

									    //! Checks if the values in the index sequence are within the given range [Tmin, Tmax].
									    template<typename T, T... Tvals, T Tmin, T Tmax>
									    struct IntegerSequenceValuesInRange<std::integer_sequence<T, Tvals...>, T, Tmin, Tmax>
									    {
									        static constexpr bool value = IntegralValuesInRange<T, Tmin, Tmax, Tvals...>::value;
									    };
									} // namespace alpaka::meta
									// ==
									// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/meta/IntegerSequence.hpp ==
									// ============================================================================


								// #include <utility>    // amalgamate: file already included

								namespace alpaka::meta
								{
								    namespace detail
								    {
								        //! Recursion end: all dimensions are fixed, the function object is called with the current index.
								        template<typename TIndex, typename TExtentVec, typename TFnObj>
								        constexpr void ndLoopImpl(std::index_sequence<>, TIndex& idx, TExtentVec const&, TFnObj const& f)
								        {
								            f(idx);
								        }

								        //! Loops over dimension Tdim0 of the index and recurses into the remaining dimensions Tdims.
								        //!
								        //! The component idx[Tdim0] runs from zero up to (excluding) extent[Tdim0] and is equal to
								        //! extent[Tdim0] after the loop.
								        template<std::size_t Tdim0, std::size_t... Tdims, typename TIndex, typename TExtentVec, typename TFnObj>
								        constexpr void ndLoopImpl(
								            std::index_sequence<Tdim0, Tdims...>,
								            TIndex& idx,
								            TExtentVec const& extent,
								            TFnObj const& f)
								        {
								            static_assert(TIndex::dim() > 0u, "The dimension given to ndLoop has to be larger than zero!");
								            static_assert(
								                TIndex::dim() == TExtentVec::dim(),
								                "The dimensions of the iteration vector and the extent vector have to be identical!");
								            static_assert(TIndex::dim() > Tdim0, "The current dimension has to be in the range [0,dim-1]!");

								            using InnerDims = std::index_sequence<Tdims...>;

								            idx[Tdim0] = 0u;
								            while(idx[Tdim0] < extent[Tdim0])
								            {
								                ndLoopImpl(InnerDims{}, idx, extent, f);
								                ++idx[Tdim0];
								            }
								        }
								    } // namespace detail

								    //! Loops over an n-dimensional iteration index variable calling f(idx) for each iteration.
								    //! The loops are nested in the order given by the index_sequence with the first element being the outermost
								    //! and the last index the innermost loop.
								    //!
								    //! \param indexSequence A sequence of unique indices, each within [0, dim-1]; only its type is used.
								    //! \param idx The iteration index, each looped component is reset to zero and incremented in place.
								    //! \param extent N-dimensional loop extent.
								    //! \param f The function called at each iteration with the current index as only argument.
								    template<typename TExtentVec, typename TFnObj, std::size_t... Tdims>
								    auto ndLoop(
								        [[maybe_unused]] std::index_sequence<Tdims...> indexSequence,
								        TExtentVec& idx,
								        TExtentVec const& extent,
								        TFnObj const& f) -> void
								    {
								        // The upper bound is exclusive: dim itself is not a valid dimension index. An inclusive range check
								        // against dim() accepted the value dim, contradicting the message below.
								        static_assert(
								            ((Tdims < static_cast<std::size_t>(TExtentVec::dim())) && ...),
								            "The values in the index_sequence have to be in the range [0,dim-1]!");
								        static_assert(
								            IntegerSequenceValuesUnique<std::index_sequence<Tdims...>>::value,
								            "The values in the index_sequence have to be unique!");

								        detail::ndLoopImpl(std::index_sequence<Tdims...>{}, idx, extent, f);
								    }

								    //! Loops over an n-dimensional iteration index variable calling f(idx) for each iteration.
								    //! The loops are nested from index zero outmost to index (dim-1) innermost.
								    //!
								    //! \param idx The iteration index, it is reset to zero before the loop starts.
								    //! \param extent N-dimensional loop extent.
								    //! \param f The function called at each iteration.
								    template<typename TExtentVec, typename TFnObj>
								    auto ndLoopIncIdx(TExtentVec& idx, TExtentVec const& extent, TFnObj const& f) -> void
								    {
								        using AllDims = std::make_index_sequence<TExtentVec::dim()>;

								        idx = TExtentVec::fill(0);
								        ndLoop(AllDims{}, idx, extent, f);
								    }

								    //! Same as the overload above, but the iteration index is created locally.
								    //!
								    //! \param extent N-dimensional loop extent.
								    //! \param f The function called at each iteration.
								    template<typename TExtentVec, typename TFnObj>
								    auto ndLoopIncIdx(TExtentVec const& extent, TFnObj const& f) -> void
								    {
								        // TExtentVec could be a CVec therefore index and extent are converted to the writable UniVec type
								        using IndexVector = typename TExtentVec::UniVec;
								        using AllDims = std::make_index_sequence<TExtentVec::dim()>;

								        IndexVector const writableExtent{extent};
								        auto idx = IndexVector::fill(0);

								        ndLoop(AllDims{}, idx, writableExtent, f);
								    }
								} // namespace alpaka::meta
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/meta/NdLoop.hpp ==
								// ============================================================================

							// #include "alpaka/onAcc/interface.hpp"    // amalgamate: file already inlined
							// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

							// #include <cassert>    // amalgamate: file already included
							// #include <tuple>    // amalgamate: file already included

							namespace alpaka::onAcc
							{
							    template<typename T_Storage>
							    struct Acc : T_Storage
							    {
							        constexpr Acc(T_Storage const& storage) : T_Storage{storage}
							        {
							        }

							        constexpr Acc(Acc const&) = delete;
							        constexpr Acc(Acc&&) = delete;
							        constexpr Acc& operator=(Acc const&) = delete;
							        constexpr Acc& operator=(Acc&&) = delete;

							        /** Get the n-dimensional indices within the origin in the quantity of the selected unit
							         *
							         * @attention if origin or unit is warp/s a one dimensional vector is returned
							         */
							        constexpr alpaka::concepts::Vector auto getIdxWithin(concepts::Origin auto origin, concepts::Unit auto unit)
							            const
							        {
							            return internalCompute::GetIdxWithin::Op<Acc, ALPAKA_TYPEOF(origin), ALPAKA_TYPEOF(unit)>{}(
							                *this,
							                origin,
							                unit);
							        }

							        /** Get the n-dimensional extents of an origin in the quantity of the selected unit
							         *
							         * @attention if origin or unit is warp/s a one dimensional vector is returned
							         */
							        constexpr alpaka::concepts::Vector auto getExtentsOf(concepts::Origin auto origin, concepts::Unit auto unit)
							            const
							        {
							            return internalCompute::GetExtentsOf::Op<Acc, ALPAKA_TYPEOF(origin), ALPAKA_TYPEOF(unit)>{}(
							                *this,
							                origin,
							                unit);
							        }

							        static constexpr bool hasKey(auto key)
							        {
							            constexpr auto idx = alpaka::internal::KeyIdx<ALPAKA_TYPEOF(key), std::decay_t<T_Storage>>::value;
							            return idx != -1;
							        }

							        constexpr auto getApi() const
							        {
							            return T_Storage::operator[](object::api);
							        }

							        constexpr auto getDeviceKind() const
							        {
							            return T_Storage::operator[](object::deviceKind);
							        }

							        /** Check if a frame spec was used to enqueue the kernel
							         *
							         * To be able to use the result as constexpr value you must call the static function via the type
							         * `Acc_t::launchedWithFrameSpec()` else the warp size can only be used as runtime value.
							         *
							         * If the kernel was enqueued via a frame specification the thread specification to within the kernel can
							         * differ, the number of thread block is not necessary equal to the number of frames and the thread block
							         * extent is not necessary equal to the frame extents.
							         *
							         * @return true if the kernel was launched based on a frame specification, false if a thread specification was
							         * used.
							         */
							        static constexpr bool launchedWithFrameSpec()
							        {
							            // is std::true_type or std::false_type
							            return ALPAKA_TYPEOF(std::declval<T_Storage>()[object::launchedWidthFrameSpec])::value;
							        }

							        /** Get the warp size
							         *
							         * To be able to use the warp size as constexpr value you must call the static function via the type
							         * `Acc_t::getWarpSize()` else the warp size can only be used as runtime value.
							         * Alternative you can use the free function onAcc::warp::getSize<Acc_t>();
							         */
							        static constexpr uint32_t getWarpSize()
							        {
							            constexpr uint32_t w = ALPAKA_TYPEOF(std::declval<T_Storage>()[object::warpSize])::value;
							            return w;
							        }
							    };

							    namespace concepts
							    {
							        /** Concept to check if a type is an accelerator
							         *
							         * The concept is fulfilled if T_Acc is a specialization of `onAcc::Acc` and, in case an API type is
							         * given, the type returned by `T_Acc::getApi()` is equal to T_Api.
							         *
							         * @tparam T_Acc Type to check
							         * @tparam T_Api Enforce an API type; if not provided (`alpaka::NotRequired`) the API type is not
							         *               checked
							         */
							        template<typename T_Acc, typename T_Api = alpaka::NotRequired>
							        concept Acc = alpaka::concepts::SpecializationOf<T_Acc, onAcc::Acc>
							                      && (std::same_as<T_Api, ALPAKA_TYPEOF(std::declval<T_Acc>().getApi())>
							                          || std::same_as<T_Api, alpaka::NotRequired>);
							    } // namespace concepts

							    /** Synchronize all threads within a given scope
							     *
							     * The call is forwarded to `internalCompute::sync`.
							     *
							     * @param acc accelerator the kernel is executed with
							     * @param scope layer selecting which threads are synchronized
							     */
							    template<alpaka::concepts::Layer T_Scope>
							    constexpr void sync(concepts::Acc auto const& acc, T_Scope scope)
							    {
							        internalCompute::sync(acc, scope);
							    }

							    /** Synchronize all threads within a thread block
							     *
							     * Shortcut for a synchronization with the scope `alpaka::layer::block`.
							     *
							     * @param acc accelerator the kernel is executed with
							     */
							    constexpr void syncBlockThreads(concepts::Acc auto const& acc)
							    {
							        internalCompute::sync(acc, alpaka::layer::block);
							    }

							    /** Create a variable located in the thread blocks shared memory
							     *
							     * The call is forwarded to `internalCompute::declareSharedVar`.
							     *
							     * @code{.cpp}
							     * // creates a reference to a float value
							     * auto& foo = declareSharedVar<float, uniqueId()>(acc);
							     * @endcode
							     *
							     * @attention The data is not initialized; it can contain garbage.
							     *
							     * @tparam T The type that should be created; the constructor is not called
							     * @tparam T_uniqueId ID that is unique inside a kernel.
							     *                  Reusing the id will return the same memory declared before with the same id.
							     * @param acc accelerator the kernel is executed with
							     * @return Result should be taken by reference
							     */
							    template<typename T, size_t T_uniqueId>
							    constexpr decltype(auto) declareSharedVar(concepts::Acc auto const& acc)
							    {
							        return internalCompute::declareSharedVar<T, T_uniqueId>(acc);
							    }

							    /** Create an M-dimensional array located in the thread blocks shared memory
							     *
							     * @code{.cpp}
							     * // creates a MdSpan view to a float array, do NOT use a reference here
							     * auto fooArrayMd = declareSharedMdArray<float, uniqueId()>(acc, CVec<uint32_t, 5, 8>{});
							     * @endcode
							     *
							     * @attention The data is not initialized; it can contain garbage.
							     *
							     * @tparam T element type which should be created, the constructor is not called
							     * @tparam T_uniqueId id which is unique inside a kernel.
							     *                  Reusing the id will return the same memory declared before with the same id.
							     * @param acc accelerator the kernel is executed with
							     * @param extent M-dimensional extent in elements for each dimension, only the type of the compile time
							     *               vector is used
							     * @return MdSpan non owning view to the corresponding data, you should NOT store a reference to the handle
							     */
							    template<typename T, size_t T_uniqueId>
							    constexpr decltype(auto) declareSharedMdArray(
							        concepts::Acc auto const& acc,
							        alpaka::concepts::CVector auto const& extent)
							    {
							        using ExtentsType = ALPAKA_TYPEOF(extent);
							        using StaticArray = typename CArrayType<T, ExtentsType>::type;
							        using IndexType = typename ExtentsType::type;

							        // The id is XORed with a hash to avoid issues in case the user is using the same id to create an array
							        // and normal shared variables.
							        constexpr size_t arrayId = T_uniqueId ^ 0x9e37'79b9'7f4a'7c15;
							        // the view reports the alignment of the element type
							        constexpr auto alignment = Alignment<alignof(T)>{};

							        using View = MdSpanArray<StaticArray, IndexType, ALPAKA_TYPEOF(alignment)>;
							        return View{declareSharedVar<StaticArray, arrayId>(acc)};
							    }

							    /** Get block shared dynamic memory.
							     *
							     * The available size of the memory can be defined by specializing 'onHost::trait:GetDynSharedMemBytes' or adding a
							     * public member variable 'uint32_t dynSharedMemBytes' for a kernel. The memory can be accessed by all threads
							     * within a block. Access to the memory is not thread safe.
							     *
							     * The call is forwarded to `internalCompute::declareDynamicSharedMem`.
							     *
							     * \tparam T The element type.
							     * \param acc Accelerator the kernel is executed with.
							     * \return Pointer to pre-allocated contiguous memory.
							     */
							    template<typename T>
							    constexpr auto getDynSharedMem(concepts::Acc auto const& acc) -> T*
							    {
							        return internalCompute::declareDynamicSharedMem<T>(acc);
							    }

							} // namespace alpaka::onAcc

							namespace alpaka::onAcc::internalCompute
							{
							    /** Synchronize all threads within a thread block
							     *
							     * Specialization of the synchronization operation for accelerators and the layer
							     * `alpaka::layer::Block`.
							     */
							    template<concepts::Acc T_Acc>
							    struct Sync::Op<T_Acc, alpaka::layer::Block>
							    {
							        /** @param acc accelerator providing the entry `action::threadBlockSync`
							         *  @param scope only used for dispatching, the value is not evaluated
							         */
							        constexpr auto operator()(T_Acc const& acc, alpaka::layer::Block const scope) const
							        {
							            alpaka::unused(scope);
							            // call the synchronization action stored in the accelerator
							            acc[action::threadBlockSync]();
							        }
							    };
							} // namespace alpaka::onAcc::internalCompute
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/Acc.hpp ==
							// ============================================================================

						// #include "alpaka/onAcc/WorkerGroup.hpp"    // amalgamate: file already inlined
						// #include "alpaka/onAcc/interface.hpp"    // amalgamate: file already inlined
							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/simdized.hpp ==
							// ==
							/* Copyright 2025 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							/** @file This file provides functionality to transform a type into a SIMD-optimized data structure.
							 *
							 * The implementation is motivated by https://ieeexplore.ieee.org/document/11207437
							 */

							// #pragma once
							// #include "alpaka/Simd.hpp"    // amalgamate: file already inlined
							// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

							namespace alpaka
							{

							    /** Transform a type into a SIMD-optimized data structure.
							     *
							     * A simdized value is not necessarily a data type wrapped by alpaka::Simd, it can be a structured hierarchical
							     * type where each component is a SIMD pack. This function can be specialized within the namespace of the input
							     * type and will be found via ADL.
							     * @see simdizedInvoke should be specialized together with this function.
							     *
							     * Simdizing of structured types as shown in the code example often improves the performance compared to wrapping
							     * into a SIMD pack.
							     * @code{.cpp}
							     * // input type
							     * template<typename T>
							     * struct Pos
							     * {
							     *   T x = 0;
							     *   T y = 1;
							     * };
							     *
							     * // output could be
							     * template<typename T>
							     * struct Pos
							     * {
							     *   alpaka::Simd<T, width> x;
							     *   alpaka::Simd<T, width> y;
							     * };
							     * @endcode
							     *
							     * @tparam T_width the width of the used SIMD type
							     * @return A simdized data type where each lane replicates the given value. If `makeSimdized` is not specialized
							     * for the given type a SIMD pack wrapping the input value will be returned.
							     */
							    template<uint32_t T_width>
							    constexpr auto makeSimdized(auto&& value)
							    {
							        return Simd<ALPAKA_TYPEOF(value), T_width>::fill(value);
							    }

							    /** Invokes the callable object fn with the parameters args.
							     *
							     * For structured data where each component is a SIMD pack, the functor should be forwarded to the members while
							     * recursively calling simdizedInvoke.
							     * As soon as there is no user specialization available, the recursion is terminated by the invocation of the
							     * functor with the forwarded arguments. This function can be specialized within the namespace of the argument
							     * types and will be found via ADL.
							     * @see makeSimdized should be specialized together with this function.
							     *
							     * As shown in the code snippet, alpaka assumes at least a specialization where each argument can perform the same
							     * access used within the function. It is allowed to specialize more function signatures that do not follow the
							     * rule but are useful within the user code.
							     * @code{.cpp}
							     * // A typical case of how this specialization is called is
							     * // `simdizedInvoke(f, Pos<int>{}, Pos<alpaka::Simd<int,4>>{})`.
							     * constexpr void simdizedInvoke(auto&& f, alpaka::concepts::SpecializationOf<Pos> auto&&... args)
							     * {
							     *    // Accessing .x and .y must be supported by all arguments.
							     *    simdizedInvoke(ALPAKA_FORWARD(f), ALPAKA_FORWARD(args).x...);
							     *    simdizedInvoke(ALPAKA_FORWARD(f), ALPAKA_FORWARD(args).y...);
							     * }
							     * @endcode
							     *
							     * @param fn Callable object to which the arguments will be forwarded.
							     * @param args Arguments forwarded to fn.
							     */
							    constexpr void simdizedInvoke(auto&& fn, auto&&... args)
							    {
							        // generic fallback which terminates the recursion: call fn once, a possible return value is discarded
							        ALPAKA_FORWARD(fn)(ALPAKA_FORWARD(args)...);
							    }
							} // namespace alpaka
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/simdized.hpp ==
							// ============================================================================


						// #include <cstdint>    // amalgamate: file already included
						// #include <new>    // amalgamate: file already included

						namespace alpaka::onAcc::internal
						{

						    /** concurrent reduce implementation */
						    template<typename T_Parent>
						    struct SimdTransformReduce
						    {
						        // The class declares no data members of its own; it reaches the derived class via asParent().
						        constexpr SimdTransformReduce() = default;

						    protected:
						        /** Transform the elements of the given data sources and reduce the transformed values to a single value.
						         *
						         * The SIMD pack configuration is queried from `T_Parent::calcSimdPackConfig` for the value type of `data0`.
						         * If the selected SIMD width is not one the work is delegated to reduceSimdPackExecution(), otherwise each
						         * index is processed with SIMD width one.
						         *
						         * NOTE(review): the width-one path is not guarded by an `else`, therefore it is also instantiated if the
						         * SIMD branch is taken and both `return` statements must deduce the same type.
						         * NOTE(review): in contrast to reduceSimdPackExecution() the width-one path calls `reduceFunc` directly
						         * instead of the result of getReducer() -- confirm that a `ScalarFunc` reducer accepts the width-one
						         * simdized values.
						         *
						         * @tparam T_maxConcurrencyInByte passed to `calcSimdPackConfig` to select the SIMD configuration
						         * @tparam T_MemAlignment alignment tag handed to every created SimdPtr
						         * @param acc accelerator the algorithm is executed with
						         * @param extents number of elements to process, converted to the `UniVec` type of the vector
						         * @param neutralElement start value of the reduction, the result is a copy of the same type
						         * @param reduceFunc functor combining two (simdized) values
						         * @param func transform functor, wrapped by wrapTransformFunc() before usage
						         * @param data0 first data source, its value type selects the SIMD configuration
						         * @param dataN additional data sources which are handed to the transform functor too
						         * @return the value accumulated over all indices produced by the index map
						         */
						        template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
						        ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(
						            concepts::Acc auto const& acc,
						            alpaka::concepts::Vector auto extents,
						            auto const& neutralElement,
						            auto&& reduceFunc,
						            auto&& func,
						            alpaka::concepts::IDataSource auto&& data0,
						            alpaka::concepts::IDataSource auto&&... dataN) const
						        {
						            auto numElements = typename ALPAKA_TYPEOF(extents)::UniVec{extents};
						            using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;
						            // adapt the user functor so that it can always be called with (acc, SimdPtr...)
						            decltype(auto) transformFunc = wrapTransformFunc(ALPAKA_FORWARD(func));

						            constexpr auto simdCfg = T_Parent::template calcSimdPackConfig<ValueType>(
						                ALPAKA_TYPEOF(acc.getApi()){},
						                ALPAKA_TYPEOF(acc.getDeviceKind()){},
						                T_maxConcurrencyInByte);

						            constexpr uint32_t simdWidth = simdCfg.simdWidth;

						            if constexpr(simdWidth != 1u)
						            {
						                constexpr uint32_t numSimdPerFnCall = simdCfg.numSimdPacksPerFnCall;
						                return reduceSimdPackExecution<simdWidth, numSimdPerFnCall, T_MemAlignment>(
						                    acc,
						                    numElements,
						                    neutralElement,
						                    ALPAKA_FORWARD(reduceFunc),
						                    transformFunc,
						                    ALPAKA_FORWARD(data0),
						                    ALPAKA_FORWARD(dataN)...);
						            }

						            auto const workGroup = asParent().getWorkGroup();
						            // execute the algorithm with SIMD width one
						            auto traverse = onAcc::makeIdxMap(
						                acc,
						                workGroup,
						                IdxRange{numElements},
						                asParent().getTraversePolicy(),
						                asParent().getIdxLayoutPolicy());

						            // accumulator: the neutral element simdized with width one
						            using SimdOneReturnType = ALPAKA_TYPEOF(makeSimdized<1u>(neutralElement));
						            SimdOneReturnType simdizedReducedValue = makeSimdized<1u>(neutralElement);

						            for(auto idx : traverse)
						            {
						                simdizedReducedValue = reduceFunc(
						                    simdizedReducedValue,
						                    callFunctor(
						                        acc,
						                        transformFunc,
						                        SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
						                        SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...));
						            }

						            // copy lane zero of each SIMD component of the accumulator into the non-simdized result
						            auto result = neutralElement;
						            simdizedInvoke(
						                [](auto& lhs, alpaka::concepts::Simd auto const& rhs) { lhs = rhs[0]; },
						                result,
						                simdizedReducedValue);
						            return result;
						        }

						    private:
						        /** Call `op` once per lane index and pack the returned values into a `Simd`.
						         *
						         * @tparam T_idx lane indices, each one is handed to `op` as `CVec<uint32_t, T_idx>`
						         * @param op callable invoked as `op(CVec<uint32_t, T_idx>{}, acc, func, data...)`
						         * @param acc accelerator handed to `op`
						         * @param func functor handed to `op`
						         * @param data values handed to `op`
						         * @return `Simd` constructed from the results of all `op` calls
						         */
						        template<uint32_t... T_idx>
						        ALPAKA_FN_INLINE ALPAKA_FN_ACC static constexpr auto loadAncExecuteScalarOp(
						            std::integer_sequence<uint32_t, T_idx...>,
						            auto&& op,
						            auto const& acc,
						            auto&& func,
						            auto&&... data)
						        {
						            /* `func` and `data` are passed as lvalues on purpose: the call is expanded once per lane, forwarding
						             * would hand the very same rvalue to several `op` invocations (use after move hazard).
						             */
						            return Simd{op(CVec<uint32_t, T_idx>{}, acc, func, data...)...};
						        }

						        /** Adapt the user provided transform functor to the call signature used by the reduce implementation.
						         *
						         * Three cases are distinguished by the type of the functor:
						         *   - `StencilFunc`: the functor is returned unchanged (forwarded, `decltype(auto)` keeps the reference).
						         *   - `ScalarFunc`: a lambda is returned which loads all SimdPtr arguments and calls the functor lane by
						         *     lane; the per lane results are packed by loadAncExecuteScalarOp().
						         *   - anything else: a lambda is returned which loads all SimdPtr arguments and calls the functor once
						         *     with the loaded values.
						         *
						         * @param transformFunc functor to wrap, the returned lambdas store it via an init-capture
						         * @return the functor itself or a wrapping lambda callable with `(acc, SimdPtr...)`
						         */
						        ALPAKA_FN_INLINE ALPAKA_FN_ACC static constexpr decltype(auto) wrapTransformFunc(auto&& transformFunc)
						        {
						            if constexpr(isSpecializationOf_v<ALPAKA_TYPEOF(transformFunc), StencilFunc>)
						            {
						                return ALPAKA_FORWARD(transformFunc);
						            }
						            else if constexpr(isSpecializationOf_v<ALPAKA_TYPEOF(transformFunc), ScalarFunc>)
						            {
						                return [transformFunc = ALPAKA_FORWARD(transformFunc)](
						                           onAcc::concepts::Acc auto const& acc,
						                           alpaka::concepts::SimdPtr auto&& inPtr0,
						                           alpaka::concepts::SimdPtr auto const&... inPtr) constexpr
						                {
						                    // the width of the first pointer defines the number of lanes which are processed
						                    return loadAncExecuteScalarOp(
						                        std::make_integer_sequence<uint32_t, ALPAKA_TYPEOF(inPtr0)::width()>{},
						                        [](alpaka::concepts::CVector auto idx,
						                           auto const& acc,
						                           auto&& func,
						                           alpaka::concepts::Simd auto const&... data) constexpr
						                        { return callFunctor(acc, func, data[idx.x()]...); },
						                        acc,
						                        transformFunc,
						                        inPtr0.load(),
						                        inPtr.load()...);
						                };
						            }
						            else
						            {
						                return [transformFunc = ALPAKA_FORWARD(transformFunc)](
						                           onAcc::concepts::Acc auto const& acc,
						                           alpaka::concepts::SimdPtr auto&&... inPtr) constexpr
						                { return callFunctor(acc, transformFunc, inPtr.load()...); };
						            }
						        }

						        /** Call the transform functor for a single index.
						         *
						         * Each data source is wrapped into a SimdPtr built from the data source, the index, the alignment tag and
						         * the SIMD width before it is handed to the functor via callFunctor().
						         *
						         * @tparam T_MemAlignment alignment tag handed to every created SimdPtr
						         * @tparam T_width SIMD width handed to every created SimdPtr as `CVec<uint32_t, T_width>`
						         * @param acc accelerator handed to the functor
						         * @param dataIdx index handed to every created SimdPtr
						         * @param func transform functor, called as lvalue
						         * @param data data sources
						         * @return result of callFunctor()
						         */
						        template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width>
						        ALPAKA_FN_INLINE static constexpr auto executeDoTransform(
						            concepts::Acc auto const& acc,
						            auto const& dataIdx,
						            auto&& func,
						            alpaka::concepts::IDataSource auto&&... data)
						        {
						            return callFunctor(acc, func, SimdPtr{data, dataIdx, T_MemAlignment{}, CVec<uint32_t, T_width>{}}...);
						        }

						        /** Dereference and advance the iterator once per entry of T_repeat.
						         *
						         * We do not check if the iterator points to a valid element, the caller must ensure that we can safely
						         * advance the iterator sizeof...(T_repeat) times without jumping over iter.end().
						         *
						         * @tparam T_repeat Index pack, the iterator is advanced once per entry. The values are not used.
						         * @param iter Iterator which is advanced in place.
						         * @return Tuple with the values the iterator pointed to, in the order the iterator visited them.
						         */
						        template<uint32_t... T_repeat>
						        ALPAKA_FN_INLINE static constexpr auto makeAdvanceIterators(
						            auto& iter,
						            std::integer_sequence<uint32_t, T_repeat...>)
						        {
						            // decayed type of a dereferenced iterator, this is what the tuple stores per entry
						            using ValueType = std::decay_t<decltype(*(iter++))>;
						            /* The tuple is created with a braced-init-list because its elements are evaluated strictly from left
						             * to right. The evaluation order of function arguments, e.g. of std::make_tuple(), is unspecified and
						             * would make the order of the values within the tuple compiler dependent.
						             * std::conditional_t and the ternary operator are only used to allow the pack expansion over T_repeat.
						             */
						            return std::tuple<std::conditional_t<(T_repeat + 1 != 0u), ValueType, ValueType>...>{
						                *(T_repeat + 1 != 0u ? iter++ : iter++)...};
						        }

						        /** Calls the transform functor T_repeat times and reduces the results with the given reduce function.
						         *
						         * The calls to the functor are independent and compile time unrolled to support instruction parallelism.
						         * In contrast to executeReduceInto() the register footprint is larger because T_repeat temporary results will
						         * be held. This allows the compiler to use instruction level parallelism. Call this function if the result
						         * of reduceFunc is a SIMD pack.
						         *
						         * @param iter the caller must ensure that the iterator can be increased T_repeat times without jumping over
						         * iter.end()
						         * @return a single simdized pack
						         */
						        template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width, uint32_t... T_repeat>
						        ALPAKA_FN_INLINE static constexpr auto executeReduce(
						            concepts::Acc auto const& acc,
						            auto& iter,
						            std::integer_sequence<uint32_t, T_repeat...>,
						            auto&& reduceFunc,
						            auto&& func,
						            alpaka::concepts::IDataSource auto&&... data)
						        {
						            // collect the indices first, the iterator is advanced as a side effect
						            auto ids = makeAdvanceIterators(iter, std::integer_sequence<uint32_t, T_repeat...>{});
						            return std::apply(
						                [&](auto const&... dataIdx) constexpr
						                {
						                    /* It is not possible to create a Simd{Simd} due to constructor issues. Therefore we need to define
						                     * the type for the result explicit.
						                     */
						                    using ComponentType = ALPAKA_TYPEOF(
						                        executeDoTransform<T_MemAlignment, T_width>(
						                            acc,
						                            std::get<0>(std::make_tuple(dataIdx...)),
						                            func,
						                            data...));
						                    // one component per collected index, each component is the result of one transform call
						                    auto results = Simd<ComponentType, std::tuple_size_v<ALPAKA_TYPEOF(ids)>>{
						                        executeDoTransform<T_MemAlignment, T_width>(acc, dataIdx, func, data...)...};

						                    return results.reduce(reduceFunc);
						                },
						                ids);
						        }

						        /** Reduce the transform results of consecutive indices directly into an existing simdized value.
						         *
						         * In contrast to executeReduce() the register footprint is lower because each intermediate result is
						         * reduced into the result variable immediately. Call this function if `result` is a simdized value which
						         * is not a SIMD pack.
						         *
						         * @param iter iterator which is advanced once per entry of T_repeat, the caller must ensure that this does
						         * not jump over iter.end()
						         * @param result The results of reduceFn with the result of transformFn will be reduced into this simdized
						         * pack.
						         */
						        template<alpaka::concepts::Alignment T_MemAlignment, uint32_t T_width, uint32_t... T_repeat>
						        ALPAKA_FN_INLINE static constexpr void executeReduceInto(
						            concepts::Acc auto const& acc,
						            auto& iter,
						            std::integer_sequence<uint32_t, T_repeat...>,
						            auto& result,
						            auto&& reduceFn,
						            auto&& transformFn,
						            alpaka::concepts::IDataSource auto&&... data)
						        {
						            // a single reduction step: transform the data at one index and merge it into the accumulator
						            auto const reduceOneIdx = [&](auto const& dataIdx) constexpr
						            {
						                result
						                    = reduceFn(result, executeDoTransform<T_MemAlignment, T_width>(acc, dataIdx, transformFn, data...));
						            };

						            auto indices = makeAdvanceIterators(iter, std::integer_sequence<uint32_t, T_repeat...>{});
						            // the comma fold executes the steps one after another in the order of the tuple elements
						            std::apply([&](auto const&... dataIdx) constexpr { (reduceOneIdx(dataIdx), ...); }, indices);
						        }

						        /** Reduce the next T_numSimdPerFnCall simdized packs into tmpReturn.
						         *
						         * If tmpReturn is a SIMD pack the packs are reduced with executeReduce() and the outcome is merged into
						         * tmpReturn, otherwise executeReduceInto() accumulates into tmpReturn directly.
						         *
						         * @param iter iterator which is advanced T_numSimdPerFnCall times
						         * @param tmpReturn accumulator which is updated in place
						         */
						        template<uint32_t T_simdWidth, uint32_t T_numSimdPerFnCall, alpaka::concepts::Alignment T_MemAlignment>
						        ALPAKA_FN_INLINE static constexpr void reduceNextSimdized(
						            auto const& acc,
						            auto& iter,
						            auto& tmpReturn,
						            auto&& reduceFn,
						            auto&& transformFn,
						            alpaka::concepts::IDataSource auto&& data0,
						            alpaka::concepts::IDataSource auto&&... dataN)
						        {
						            using AccumulatorType = std::remove_cvref_t<decltype(tmpReturn)>;
						            using RepeatSequence = std::make_integer_sequence<uint32_t, T_numSimdPerFnCall>;

						            if constexpr(!alpaka::concepts::Simd<AccumulatorType>)
						            {
						                // structured accumulator: reduce every transform result directly into it
						                executeReduceInto<T_MemAlignment, T_simdWidth>(
						                    acc,
						                    iter,
						                    RepeatSequence{},
						                    tmpReturn,
						                    reduceFn,
						                    transformFn,
						                    data0,
						                    dataN...);
						            }
						            else
						            {
						                // SIMD pack accumulator: reduce the independent packs first, merge the outcome afterwards
						                tmpReturn = reduceFn(
						                    tmpReturn,
						                    executeReduce<T_MemAlignment, T_simdWidth>(
						                        acc,
						                        iter,
						                        RepeatSequence{},
						                        reduceFn,
						                        transformFn,
						                        data0,
						                        dataN...));
						            }
						        }

						        /** Adapter which applies a reduce operation lane by lane to SIMD packs.
						         *
						         * Called with two SIMD packs the reduce operation is executed for each lane and the results are packed
						         * into a new `Simd`; nested packs are resolved recursively. Called with two non SIMD values the wrapped
						         * reduce operation is invoked directly.
						         *
						         * @attention The accelerator and the reduce operation are stored as references. The creator of the
						         * adapter must keep both alive for as long as the adapter (or a copy of it) is used.
						         */
						        template<onAcc::concepts::Acc T_Acc, typename T_ReduceOp>
						        struct ScalarReducer
						        {
						            // using a const reference here is fine because we control the lifetime
						            T_Acc const& m_acc;
						            T_ReduceOp const& m_reduceOp;

						            /** @param acc accelerator, stored by reference
						             *  @param func reduce operation, stored by reference
						             */
						            constexpr ScalarReducer(T_Acc const& acc, auto&& func) : m_acc(acc), m_reduceOp{ALPAKA_FORWARD(func)}
						            {
						            }

						            /** Reduce two SIMD packs lane by lane.
						             *
						             * The number of processed lanes is the width of `a`.
						             *
						             * @return `Simd` holding the reduced value of each lane
						             */
						            constexpr auto operator()(auto&& a, auto&& b) const
						                requires(alpaka::concepts::Simd<ALPAKA_TYPEOF(a)> && alpaka::concepts::Simd<ALPAKA_TYPEOF(b)>)
						            {
						                return loadAncExecuteScalarOp(
						                    std::make_integer_sequence<uint32_t, ALPAKA_TYPEOF(a)::width()>{},
						                    [this](
						                        alpaka::concepts::CVector auto idx,
						                        concepts::Acc auto const& acc,
						                        auto&& func,
						                        auto const&... data) constexpr
						                    {
						                        /* const& for data is used instead of && to enforce const evaluation of the operator[]
						                         * std simd operator[] is returning a smart reference which is avoided if data is const
						                         */
						                        alpaka::unused(acc, func);
						                        // recursively call until no Simd type is the result
						                        return this->operator()(data[idx.x()]...);
						                    },
						                    m_acc,
						                    ALPAKA_FORWARD(m_reduceOp),
						                    ALPAKA_FORWARD(a),
						                    ALPAKA_FORWARD(b));
						            }

						            /** Reduce two non SIMD values with the wrapped reduce operation. */
						            constexpr auto operator()(auto&& a, auto&& b) const
						                requires(!alpaka::concepts::Simd<ALPAKA_TYPEOF(a)> && !alpaka::concepts::Simd<ALPAKA_TYPEOF(b)>)
						            {
						                return m_reduceOp(ALPAKA_FORWARD(a), ALPAKA_FORWARD(b));
						            }

						        private:
						            /** Call `op` once per lane index and pack the returned values into a `Simd`. */
						            template<uint32_t... T_idx>
						            ALPAKA_FN_INLINE ALPAKA_FN_ACC static constexpr auto loadAncExecuteScalarOp(
						                std::integer_sequence<uint32_t, T_idx...>,
						                auto&& op,
						                auto const& acc,
						                auto&& func,
						                auto&&... data)
						            {
						                /* `func` and `data` are passed as lvalues on purpose: the call is expanded once per lane,
						                 * forwarding would hand the very same rvalue to several `op` invocations (use after move hazard).
						                 */
						                return Simd{op(CVec<uint32_t, T_idx>{}, acc, func, data...)...};
						            }
						        };

						        /** Select the functor which is used to reduce simdized values.
						         *
						         * Overload for reduce operations which are not a @see ScalarFunc.
						         *
						         * @param reduceOp user provided reduce operation
						         * @return copy of the reduce operation
						         */
						        ALPAKA_FN_INLINE constexpr auto getReducer(onAcc::concepts::Acc auto const&, auto&& reduceOp) const
						            requires(!isSpecializationOf_v<ALPAKA_TYPEOF(reduceOp), ScalarFunc>)
						        {
						            return reduceOp;
						        }

						        /** Overload for reduce operations which are a @see ScalarFunc.
						         *
						         * @param acc accelerator the wrapper refers to
						         * @param reduceOp user provided reduce operation the wrapper refers to
						         * @return ScalarReducer wrapping the reduce operation
						         */
						        ALPAKA_FN_INLINE constexpr auto getReducer(onAcc::concepts::Acc auto const& acc, auto&& reduceOp) const
						            requires(isSpecializationOf_v<ALPAKA_TYPEOF(reduceOp), ScalarFunc>)
						        {
						            using Reducer = ScalarReducer<ALPAKA_TYPEOF(acc), ALPAKA_TYPEOF(reduceOp)>;
						            return Reducer{acc, reduceOp};
						        }

						        constexpr auto const& asParent() const
						        {
						            return static_cast<T_Parent const&>(*this);
						        }

						        /** Reduce the transformed data with SIMD packs and process the leftover elements with SIMD width one.
						         *
						         * The last (fast moving) dimension is split into a part which is processed with functor calls consuming
						         * `T_simdWidth * T_numSimdPerFnCall` elements each and a remainder which is processed element wise.
						         * Both parts are reduced into the same simdized accumulator which is collapsed into the result at the end.
						         *
						         * @tparam T_simdWidth number of elements of one SIMD pack
						         * @tparam T_numSimdPerFnCall number of SIMD packs which are processed per loop iteration
						         * @tparam T_MemAlignment alignment tag handed to every created SimdPtr
						         * @param acc accelerator the algorithm is executed with
						         * @param numElements number of elements to process in each dimension
						         * @param neutralElement start value of the reduction, the result is a copy of the same type
						         * @param userReduceFunc reduce functor, adapted by getReducer() before usage
						         * @param func transform functor which is called with SimdPtr arguments
						         * @param data0 first data source
						         * @param dataN additional data sources
						         * @return the value accumulated over all indices produced by the index maps
						         */
						        template<uint32_t T_simdWidth, uint32_t T_numSimdPerFnCall, alpaka::concepts::Alignment T_MemAlignment>
						        ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto reduceSimdPackExecution(
						            auto const& acc,
						            alpaka::concepts::Vector auto numElements,
						            auto const& neutralElement,
						            auto&& userReduceFunc,
						            auto&& func,
						            alpaka::concepts::IDataSource auto&& data0,
						            alpaka::concepts::IDataSource auto&&... dataN) const
						        {
						            auto reduceFunc = getReducer(acc, userReduceFunc);

						            auto const workGroup = asParent().getWorkGroup();

						            // we SIMDfy only over the fast moving dimension (columns of memory)
						            auto const wSize = workGroup.size(acc).back();

						            /* Number of data elements processed per functor call. */
						            auto const numElementsPerFnCall = T_simdWidth * T_numSimdPerFnCall;
						            /** To avoid an overflow in the index range we divide first by the number of elements per
						             * function call and then by the number of workers.
						             */
						            auto const numSimdPackLoops = numElements.back() / numElementsPerFnCall / wSize;

						            // number of elements to jump over to start the remainder loop
						            auto const remainderBegin = numSimdPackLoops * numElementsPerFnCall * wSize;

						            // we SIMDfy only over the fast moving dimension (columns of memory)
						            auto domainSize = numElements.rAssign(remainderBegin);
						            auto stride = ALPAKA_TYPEOF(numElements)::fill(1).rAssign(T_simdWidth);

						            using IdxType = ALPAKA_TYPEOF(numElements);
						            auto simdIdxContainer = onAcc::makeIdxMap(
						                acc,
						                workGroup,
						                IdxRange{IdxType::fill(0), domainSize, stride},
						                asParent().getTraversePolicy(),
						                asParent().getIdxLayoutPolicy());

						            using SimdReturn = ALPAKA_TYPEOF(makeSimdized<T_simdWidth>(neutralElement));
						            SimdReturn simdizedReducedValue = makeSimdized<T_simdWidth>(neutralElement);

						            /* The functors and data sources are handed to reduceNextSimdized() as lvalues. They are used in every
						             * loop iteration and again in the remainder loop, forwarding them (possibly as rvalue) more than once
						             * would be a use after move hazard.
						             */
						            if constexpr(
						                domainSize.dim() > 1u && std::is_same_v<ALPAKA_TYPEOF(asParent().getTraversePolicy()), traverse::Flat>)
						            {
						                /* For cases where we traverse with the flat policy, we cannot assume that we can blindly increase the
						                 * iterator later N times. This could happen in cases where we have enough concurrency. We evaluate for
						                 * SIMD operations only the fast moving dimension but with the flat policy flattening the worker group
						                 * and use all workers on a linear domain. The loop must therefore be split into iterating over all
						                 * slow dimensions and an inner loop iterating over the fast moving dimension. For this we need to
						                 * build our own groups out of the user-provided workgroup.
						                 */
						                // build a worker group with slow-moving dimension threads for the outer loop
						                using index_type = typename IdxType::type;
						                auto wIdxOuter = workGroup.idx(acc).rAssign(index_type{0});
						                auto wSizeOuter = workGroup.size(acc).rAssign(index_type{1});
						                auto domSize = domainSize.rAssign(index_type{1});

						                auto wOuter = WorkerGroup{wIdxOuter, wSizeOuter};

						                for(auto rowIdx : onAcc::makeIdxMap(
						                        acc,
						                        wOuter,
						                        IdxRange{domSize},
						                        asParent().getTraversePolicy(),
						                        asParent().getIdxLayoutPolicy()))
						                {
						                    // build a worker group with fast-moving dimension threads for the inner loop
						                    auto wIdxInner = ALPAKA_TYPEOF(domainSize)::fill(0).rAssign(workGroup.idx(acc).back());
						                    auto wSizeInner = ALPAKA_TYPEOF(domainSize)::fill(1).rAssign(workGroup.size(acc).back());
						                    auto wInner = WorkerGroup{wIdxInner, wSizeInner};

						                    // iterate over the fast-moving dimension only
						                    auto simdIdxContainerFastDim = onAcc::makeIdxMap(
						                        acc,
						                        wInner,
						                        IdxRange{rowIdx, domainSize, stride},
						                        asParent().getTraversePolicy(),
						                        asParent().getIdxLayoutPolicy())[CVec<uint32_t, ALPAKA_TYPEOF(domainSize)::dim() - 1u>{}];

						                    // the iterator is advanced inside of reduceNextSimdized()
						                    for(auto iter = simdIdxContainerFastDim.begin(); iter != simdIdxContainerFastDim.end();)
						                    {
						                        reduceNextSimdized<T_simdWidth, T_numSimdPerFnCall, T_MemAlignment>(
						                            acc,
						                            iter,
						                            simdizedReducedValue,
						                            reduceFunc,
						                            func,
						                            data0,
						                            dataN...);
						                    }
						                }
						            }
						            else
						            {
						                // the iterator is advanced inside of reduceNextSimdized()
						                for(auto iter = simdIdxContainer.begin(); iter != simdIdxContainer.end();)
						                {
						                    reduceNextSimdized<T_simdWidth, T_numSimdPerFnCall, T_MemAlignment>(
						                        acc,
						                        iter,
						                        simdizedReducedValue,
						                        reduceFunc,
						                        func,
						                        data0,
						                        dataN...);
						                }
						            }

						            // first index of the remainder: zero in all dimensions except the last one
						            ALPAKA_TYPEOF(numElements) remainderBeginIdx = numElements.fill(0).rAssign(remainderBegin);

						            for(auto idx : onAcc::makeIdxMap(
						                    acc,
						                    workGroup,
						                    IdxRange{remainderBeginIdx, numElements},
						                    asParent().getTraversePolicy(),
						                    asParent().getIdxLayoutPolicy()))
						            {
						                auto transformResult = callFunctor(
						                    acc,
						                    func,
						                    SimdPtr{data0, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}},
						                    SimdPtr{dataN, idx, T_MemAlignment{}, CVec<uint32_t, 1u>{}}...);

						                // the remainder is reduced into lane zero of the accumulator
						                simdizedInvoke(
						                    [reduceFunc](auto& lhs, alpaka::concepts::Simd auto const& rhs)
						                    {
						                        // std simd non-const operator[] is returning a smart reference, therefore we need
						                        // std::as_const to enforce returning a copy of the value.
						                        lhs[0] = reduceFunc(std::as_const(lhs)[0], rhs[0]);
						                    },
						                    simdizedReducedValue,
						                    transformResult);
						            }

						            /* Start from a copy of the neutral element, same as in transformReduce(): the result type does not
						             * need to be default constructible and is never left uninitialized.
						             */
						            auto result = neutralElement;
						            simdizedInvoke(
						                [reduceFunc](auto& lhs, alpaka::concepts::Simd auto const& rhs) { lhs = rhs.reduce(reduceFunc); },
						                result,
						                simdizedReducedValue);
						            return result;
						        }
						    };
						} // namespace alpaka::onAcc::internal
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/SimdTransformReduce.hpp ==
						// ============================================================================


					// #include <bit>    // amalgamate: file already included
					// #include <cstdint>    // amalgamate: file already included

					namespace alpaka::onAcc
					{
					    /** Creates a functor that operates on contiguous data concurrently.
					     *
					     * The class is automatically configured to use the best fitting SIMD width for the given data type and is able to
					     * expose instruction level parallelism.
					     *
					     * @tparam T_WorkGroup Description of the participating threads. More than one thread can have the same index
					     * within the group. All workers with the same id will get the same index as result.
					     * @tparam T_Traverse Policy to configure the method used to find the next valid index for a worker. @see namespace
					     * traverse
					     * @tparam T_IdxLayout Policy to define how indices will be mapped to worker threads. @see namespace layout
					     */
					    template<
					        typename T_WorkGroup,
					        concepts::IdxTraversing T_Traverse = traverse::Flat,
					        concepts::IdxMapping T_IdxLayout = layout::Optimized>
					    struct SimdAlgo
					        : protected internal::SimdConcurrent<SimdAlgo<T_WorkGroup, T_Traverse, T_IdxLayout>>
					        , protected internal::SimdTransformReduce<SimdAlgo<T_WorkGroup, T_Traverse, T_IdxLayout>>
					    {
					        /** Create the algorithm object for the given worker group.
					         *
					         * @param workGroup worker group which is copied into the object
					         * @param traverse traverse policy, the value is not stored (presumably only used to deduce T_Traverse)
					         * @param idxLayout index layout policy, the value is not stored (presumably only used to deduce
					         * T_IdxLayout)
					         */
					        constexpr SimdAlgo(
					            T_WorkGroup const workGroup,
					            T_Traverse traverse = T_Traverse{},
					            T_IdxLayout idxLayout = T_IdxLayout{})
					            : m_workGroup{workGroup}
					        {
					            // the policies are re-created from their types by the getters, the passed values are not needed
					            alpaka::unused(traverse, idxLayout);
					        }

					        /** @return copy of the worker group which was passed to the constructor */
					        constexpr T_WorkGroup getWorkGroup() const
					        {
					            return m_workGroup;
					        }

					        /** @return value-initialized traverse policy, the instance passed to the constructor is not stored */
					        constexpr T_Traverse getTraversePolicy() const
					        {
					            return T_Traverse{};
					        }

					        /** @return value-initialized index layout policy, the instance passed to the constructor is not stored */
					        constexpr T_IdxLayout getIdxLayoutPolicy() const
					        {
					            return T_IdxLayout{};
					        }

					        /** execute the functor concurrently over the given data.
					         *
					         * @attention The number of elements to process is derived from the first MdSpan object.
					         *            All other MdSpan objects must have at least the same number of elements.
					         *            The optimal concurrency is also derived from the first MdSpan.
					         *
					         * @param func the functor to be executed
					         * @param data0 the first data to be processed
					         * @param dataN the remaining data to be processed
					         *
					         * @{
					         */
					        ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr void concurrent(
					            auto const& acc,
					            auto&& func,
					            alpaka::concepts::IDataSource auto&& data0,
					            alpaka::concepts::IDataSource auto&&... dataN) const
					        {
					            // the extents of the first data source define the number of elements to process
					            auto const extentsOfFirstSource = data0.getExtents();
					            concurrent(
					                acc,
					                extentsOfFirstSource,
					                ALPAKA_FORWARD(func),
					                ALPAKA_FORWARD(data0),
					                ALPAKA_FORWARD(dataN)...);
					        }

					        /**
					         * Overload with explicit extents. The concurrency budget in bytes is computed as
					         * getNumElemPerThread<ValueType>(api, deviceKind) * sizeof(ValueType), where ValueType is the value type
					         * of data0 and api/deviceKind are taken from the accelerator.
					         *
					         * @param acc accelerator, passed through unchanged
					         * @param extents number of elements to process in each dimension
					         * @param func the functor to be executed
					         * @param data0 the first data to be processed
					         * @param dataN the remaining data to be processed
					         */
					        ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto concurrent(
					            auto const& acc,
					            alpaka::concepts::Vector auto extents,
					            auto&& func,
					            alpaka::concepts::IDataSource auto&& data0,
					            alpaka::concepts::IDataSource auto&&... dataN) const -> void
					        {
					            using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;
					            using Api = ALPAKA_TYPEOF(acc.getApi());
					            using DeviceKind = ALPAKA_TYPEOF(acc.getDeviceKind());

					            // default byte budget for this API / device kind combination
					            constexpr auto maxConcurrencyInByte
					                = alpaka::getNumElemPerThread<ValueType>(Api{}, DeviceKind{}) * sizeof(ValueType);

					            concurrent<maxConcurrencyInByte>(
					                acc,
					                extents,
					                ALPAKA_FORWARD(func),
					                ALPAKA_FORWARD(data0),
					                ALPAKA_FORWARD(dataN)...);
					        }

					        /** @} */

					        /** Execute the functor concurrently over the given data with an explicit concurrency budget.
					         *
					         * Convenience overload: the extents are taken from data0 and the call is forwarded to the overload with
					         * explicit extents.
					         *
					         * @attention The number of elements to process is derived from the first MdSpan object.
					         *            All other MdSpan objects must have at least the same number of elements.
					         *
					         * @tparam T_maxConcurrencyInByte
					         *    Maximum number of bytes to be used for concurrency.
					         *    Concurrency bytes describe a virtual simd pack size which is not exceeded.
					         *    Internally a best fitting SIMD width is calculated and instruction parallelism is exposed based on
					         *    T_maxConcurrencyInByte.
					         * @tparam T_MemAlignment alignment of the memory, if no alignment is given the alignment will be derived
					         * from the MdSpan data descriptions
					         * @param acc accelerator, passed through unchanged
					         * @param func the functor to be executed
					         * @param data0 the first data to be processed
					         * @param dataN the remaining data to be processed
					         *
					         * @{
					         */
					        template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
					        ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto concurrent(
					            auto const& acc,
					            auto&& func,
					            alpaka::concepts::IDataSource auto&& data0,
					            alpaka::concepts::IDataSource auto&&... dataN) const -> void
					        {
					            // query the extents before the data sources are forwarded
					            auto const extents = data0.getExtents();
					            concurrent<T_maxConcurrencyInByte, T_MemAlignment>(
					                acc,
					                extents,
					                ALPAKA_FORWARD(func),
					                ALPAKA_FORWARD(data0),
					                ALPAKA_FORWARD(dataN)...);
					        }

					        /**
					         * Overload with explicit extents and explicit concurrency budget. All arguments are handed to the
					         * concurrent() implementation of the ConcurrentAlgo base class.
					         *
					         * @param acc accelerator, passed through unchanged
					         * @param extents number of elements to process in each dimension
					         * @param func the functor to be executed
					         * @param data0 the first data to be processed
					         * @param dataN the remaining data to be processed
					         */
					        template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
					        ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto concurrent(
					            auto const& acc,
					            alpaka::concepts::Vector auto extents,
					            auto&& func,
					            alpaka::concepts::IDataSource auto&& data0,
					            alpaka::concepts::IDataSource auto&&... dataN) const -> void
					        {
					            // qualified call: select the base class implementation, not one of the overloads above
					            ConcurrentAlgo::template concurrent<T_maxConcurrencyInByte, T_MemAlignment>(
					                acc,
					                extents,
					                ALPAKA_FORWARD(func),
					                ALPAKA_FORWARD(data0),
					                ALPAKA_FORWARD(dataN)...);
					        }

					        /** @} */


					        /** @brief Transform the input data and reduce it to a single value.
					         *
					         * @attention If no extent is given the number of elements to process is derived from the first MdSpan object.
					         *            All other MdSpan objects must have at least the same number of elements.
					         *
					         * @param acc accelerator, passed through unchanged
					         * @param neutralElement the neutral element for the reduction operation
					         * @param reduceFunc The binary reduction operation to be executed, e.g. std::plus. The functor should support
					         * Simd packages.
					         * @param transformFunc N-ary functor to be executed, values of all containers will be passed to the functor
					         * as arguments. The functor should support Simd packages. If not, you can enforce the element-wise
					         * execution by wrapping it into ScalarFunc. If you would like to support stencil executions wrap the
					         * functor into StencilFunc. StencilFunc is getting all arguments as SimdPtr. If StencilFunc is used you
					         * should take care to not read outside of valid memory ranges by using sub-views to your input and output
					         * data. Optionally a transform functor can have an accelerator as first argument.
					         * If the result of this functor is a structured value, providing an overload to simdize the type
					         * can improve the performance, see alpaka::makeSimdized.
					         * @param data0 the first data to be processed
					         * @param dataN the remaining data to be processed
					         * @return A single reduced value.
					         */
					        ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(
					            auto const& acc,
					            auto const& neutralElement,
					            auto&& reduceFunc,
					            auto&& transformFunc,
					            alpaka::concepts::IDataSource auto&& data0,
					            alpaka::concepts::IDataSource auto&&... dataN) const
					        {
					            // query the extents before the data sources are forwarded
					            auto const extents = data0.getExtents();
					            return transformReduce(
					                acc,
					                extents,
					                neutralElement,
					                ALPAKA_FORWARD(reduceFunc),
					                ALPAKA_FORWARD(transformFunc),
					                ALPAKA_FORWARD(data0),
					                ALPAKA_FORWARD(dataN)...);
					        }

					        /**
					         * @copydoc transformReduce()
					         *
					         * The concurrency budget in bytes is computed as
					         * getNumElemPerThread<ValueType>(api, deviceKind) * sizeof(ValueType), where ValueType is the value type
					         * of data0 and api/deviceKind are taken from the accelerator.
					         *
					         * @param extents number of elements to process in each dimension
					         */
					        ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(
					            auto const& acc,
					            alpaka::concepts::Vector auto extents,
					            auto const& neutralElement,
					            auto&& reduceFunc,
					            auto&& transformFunc,
					            alpaka::concepts::IDataSource auto&& data0,
					            alpaka::concepts::IDataSource auto&&... dataN) const
					        {
					            using ValueType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(data0)>;
					            using Api = ALPAKA_TYPEOF(acc.getApi());
					            using DeviceKind = ALPAKA_TYPEOF(acc.getDeviceKind());

					            // default byte budget for this API / device kind combination
					            constexpr auto maxConcurrencyInByte
					                = alpaka::getNumElemPerThread<ValueType>(Api{}, DeviceKind{}) * sizeof(ValueType);

					            return transformReduce<maxConcurrencyInByte>(
					                acc,
					                extents,
					                neutralElement,
					                ALPAKA_FORWARD(reduceFunc),
					                ALPAKA_FORWARD(transformFunc),
					                ALPAKA_FORWARD(data0),
					                ALPAKA_FORWARD(dataN)...);
					        }

					        /**
					         * @copydoc transformReduce()
					         *
					         * Convenience overload: the extents are taken from data0 and the call is forwarded to the overload with
					         * explicit extents.
					         *
					         * @tparam T_maxConcurrencyInByte
					         *    Maximum number of bytes to be used for concurrency.
					         *    Concurrency bytes describe a virtual simd pack size which is not exceeded.
					         *    Internally a best fitting SIMD width is calculated and instruction parallelism is exposed based on
					         *    T_maxConcurrencyInByte.
					         * @tparam T_MemAlignment alignment of the memory, if no alignment is given the alignment will be derived
					         * from the MdSpan data descriptions
					         */
					        template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
					        ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(
					            auto const& acc,
					            auto const& neutralElement,
					            auto&& reduceFunc,
					            auto&& transformFunc,
					            alpaka::concepts::IDataSource auto&& data0,
					            alpaka::concepts::IDataSource auto&&... dataN) const
					        {
					            // query the extents before the data sources are forwarded
					            auto const extents = data0.getExtents();
					            return transformReduce<T_maxConcurrencyInByte, T_MemAlignment>(
					                acc,
					                extents,
					                neutralElement,
					                ALPAKA_FORWARD(reduceFunc),
					                ALPAKA_FORWARD(transformFunc),
					                ALPAKA_FORWARD(data0),
					                ALPAKA_FORWARD(dataN)...);
					        }

					        /**
					         * @copydoc transformReduce()
					         *
					         * All arguments are handed to the transformReduce() implementation of the ReduceAlgo base class.
					         *
					         * @tparam T_maxConcurrencyInByte
					         *    Maximum number of bytes to be used for concurrency.
					         *    Concurrency bytes describe a virtual simd pack size which is not exceeded.
					         *    Internally a best fitting SIMD width is calculated and instruction parallelism is exposed based on
					         *    T_maxConcurrencyInByte.
					         * @tparam T_MemAlignment alignment of the memory, if no alignment is given the alignment will be derived
					         * from the MdSpan data descriptions
					         * @param extents number of elements to process in each dimension
					         */
					        template<uint32_t T_maxConcurrencyInByte, alpaka::concepts::Alignment T_MemAlignment = AutoAligned>
					        ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto transformReduce(
					            auto const& acc,
					            alpaka::concepts::Vector auto extents,
					            auto const& neutralElement,
					            auto&& reduceFunc,
					            auto&& transformFunc,
					            alpaka::concepts::IDataSource auto&& data0,
					            alpaka::concepts::IDataSource auto&&... dataN) const
					        {
					            // qualified call: select the base class implementation, not one of the overloads above
					            return ReduceAlgo::template transformReduce<T_maxConcurrencyInByte, T_MemAlignment>(
					                acc,
					                extents,
					                neutralElement,
					                ALPAKA_FORWARD(reduceFunc),
					                ALPAKA_FORWARD(transformFunc),
					                ALPAKA_FORWARD(data0),
					                ALPAKA_FORWARD(dataN)...);
					        }

					    private:
					        //! Base class the concurrent() overloads delegate to; it is parameterized with this SimdAlgo type.
					        using ConcurrentAlgo = internal::SimdConcurrent<SimdAlgo<T_WorkGroup, T_Traverse, T_IdxLayout>>;
					        //! Base class the transformReduce() overloads delegate to; it is parameterized with this SimdAlgo type.
					        using ReduceAlgo = internal::SimdTransformReduce<SimdAlgo<T_WorkGroup, T_Traverse, T_IdxLayout>>;

					        // Both bases get access to the private members of this class.
					        // NOTE(review): presumably needed for the private SIMD configuration helpers below - confirm.
					        friend ConcurrentAlgo;
					        friend ReduceAlgo;

					        /** Number of T_Type elements for a SIMD pack limited by cache line size and concurrency budget.
					         *
					         * The byte limit is the smaller of T_cacheLineInByte and T_maxConcurrencyInByte; it is divided by
					         * sizeof(T_Type) via alpaka::divExZero.
					         * NOTE(review): divExZero is defined elsewhere, presumably it never yields zero - confirm.
					         *
					         * @tparam T_Type element type
					         * @tparam T_maxConcurrencyInByte concurrency budget in bytes
					         * @tparam T_cacheLineInByte cache line size in bytes
					         * @return result of the division, type as returned by alpaka::divExZero
					         */
					        template<typename T_Type, uint32_t T_maxConcurrencyInByte, uint32_t T_cacheLineInByte>
					        static constexpr auto calcSimdWidth()
					        {
					            return alpaka::divExZero(
					                std::min(T_cacheLineInByte, T_maxConcurrencyInByte),
					                static_cast<uint32_t>(sizeof(T_Type)));
					        }

					        /** Result of the SIMD configuration calculation for elements of type T_Type.
					         *
					         * Plain aggregate: initialize it as {simdWidth, numSimdPacksPerFnCall}.
					         */
					        template<typename T_Type>
					        struct SimdPackConfig
					        {
					            //! element type the configuration was calculated for
					            using value_type = T_Type;

					            //! number of elements in a single SIMD pack
					            std::uint32_t simdWidth;

					            //! number of SIMD packs used for a single functor invocation
					            std::uint32_t numSimdPacksPerFnCall;
					        };

					        /** Generate a SIMD config for the API and device kind.
					         *
					         * Produces a SIMD configuration based on technical constraints:
					         *  - the SIMD width is the architecture SIMD width, limited by the byte budget, at least one element and
					         *    rounded down to a power of two
					         *  - if the budget covers at least one cache line, the number of SIMD packs is rounded down to a
					         *    multiple of the packs per cache line, otherwise all packs fitting into the budget are used
					         *
					         * @tparam T_ValueType element type
					         * @param api API used to query the architecture SIMD width and the cache line size
					         * @param deviceKind device kind used to query the architecture SIMD width and the cache line size
					         * @param maxConcurrencyInByte The upper limit in bytes a SIMD configuration must not exceed, except a single
					         * value is larger. This parameter is used to control the register pressure.
					         *
					         * @return a configuration with the width of a single SIMD pack and the number of SIMD packs which should
					         * be used in parallel for a single invocation.
					         */
					        template<typename T_ValueType>
					        [[nodiscard]] static consteval SimdPackConfig<T_ValueType> calcSimdPackConfig(
					            alpaka::concepts::Api auto api,
					            alpaka::concepts::DeviceKind auto deviceKind,
					            uint32_t maxConcurrencyInByte)
					        {
					            constexpr uint32_t maxArchSimdWidth = getArchSimdWidth<T_ValueType>(api, deviceKind);
					            constexpr uint32_t cachelineBytes = getCachelineSize(api, deviceKind);

					            // Widest pack (in elements) the byte budget permits.
					            uint32_t const widthByBudget = maxConcurrencyInByte / sizeof(T_ValueType);

					            // Hardware width limited by the budget, never below one element, rounded down to a power of two.
					            uint32_t const simdWidth = std::bit_floor(std::max(std::min(maxArchSimdWidth, widthByBudget), 1u));

					            uint32_t const packBytes = simdWidth * sizeof(T_ValueType);

					            // Packs fitting into the budget and packs needed to cover one cache line.
					            uint32_t const packsInBudget = alpaka::divExZero(maxConcurrencyInByte, packBytes);
					            uint32_t const packsPerCacheLine = alpaka::divExZero(cachelineBytes, packBytes);

					            // Prefer the largest cache-line multiple that fits into the budget; below one cache line keep the
					            // plain budget value.
					            uint32_t const packsPerFnCall
					                = packsInBudget >= packsPerCacheLine
					                      ? std::max((packsInBudget / packsPerCacheLine) * packsPerCacheLine, 1u)
					                      : packsInBudget;

					            return {simdWidth, packsPerFnCall};
					        }

					        //! Work group handed to the constructor; returned by value from getWorkGroup().
					        T_WorkGroup m_workGroup;
					    };
					} // namespace alpaka::onAcc
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/SimdAlgo.hpp ==
					// ============================================================================

				// #include "alpaka/onAcc/interface.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/FrameSpec.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/internal/interface.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/logger/logger.hpp"    // amalgamate: file already inlined

				// #include <algorithm>    // amalgamate: file already included

				namespace alpaka::internal::generic
				{
				    namespace math
				    {
				        /** Generic NaN test.
				         *
				         * @param value value to classify
				         * @return result of alpaka::math::internal::ieeeIsnan(value)
				         */
				        template<typename T>
				        ALPAKA_FN_HOST_ACC constexpr auto isnan(T const& value) -> bool
				        {
				            return alpaka::math::internal::ieeeIsnan(value);
				        }

				        /** Generic infinity test.
				         *
				         * @param value value to classify
				         * @return result of alpaka::math::internal::ieeeIsinf(value)
				         */
				        template<typename T>
				        ALPAKA_FN_HOST_ACC constexpr auto isinf(T const& value) -> bool
				        {
				            return alpaka::math::internal::ieeeIsinf(value);
				        }

				        /** Generic finiteness test.
				         *
				         * @param value value to classify
				         * @return result of alpaka::math::internal::ieeeIsfinite(value)
				         */
				        template<typename T>
				        ALPAKA_FN_HOST_ACC constexpr auto isfinite(T const& value) -> bool
				        {
				            return alpaka::math::internal::ieeeIsfinite(value);
				        }
				    } // namespace math

				    /** Kernel assigning one value to each element of the destination.
				     *
				     * The work is distributed with onAcc::SimdAlgo over onAcc::worker::threadsInGrid; for every SIMD pointer
				     * into the destination a SIMD pack filled with the value is stored.
				     *
				     * @todo replace the kernel as soon as we have an algorithm forEach callable from host
				     */
				    struct SimdFillKernel
				    {
				        /**
				         * @param acc accelerator the kernel runs on
				         * @param dest destination whose elements are overwritten
				         * @param value value written to every element
				         */
				        ALPAKA_FN_ACC auto operator()(auto const& acc, alpaka::concepts::IMdSpan auto dest, auto const value) const
				            -> void
				        {
				            auto const gridAlgo = onAcc::SimdAlgo{onAcc::worker::threadsInGrid};
				            gridAlgo.concurrent(
				                acc,
				                dest.getExtents(),
				                [value](onAcc::concepts::Acc auto const&, auto destSimdPtr) constexpr
				                {
				                    // pack type is whatever a load through the SIMD pointer yields
				                    using Pack = ALPAKA_TYPEOF(destSimdPtr.load());
				                    destSimdPtr = Pack::fill(value);
				                },
				                dest);
				        }
				    };

				    /** Enqueue a kernel which assigns elementValue to every element of dest.
				     *
				     * The frame specification is derived with onHost::internal::getSimdFrameSpec from the device of the
				     * queue, the executor and the extents of dest; the SimdFillKernel is then enqueued into internalQueue.
				     *
				     * @tparam T_Value value type of the destination
				     * @param internalQueue queue the fill kernel is enqueued into
				     * @param executor executor used to derive the frame specification
				     * @param dest destination whose elements are overwritten
				     * @param elementValue value written to every element
				     */
				    template<typename T_Value>
				    inline void fill(
				        auto& internalQueue,
				        auto executor,
				        alpaka::concepts::IMdSpan<T_Value> auto&& dest,
				        T_Value elementValue)
				    {
				        ALPAKA_LOG_FUNCTION(onHost::logger::memory);

				        auto extents = onHost::getExtents(dest);
				        auto frameSpec = onHost::internal::getSimdFrameSpec<T_Value>(
				            *onHost::internal::getDevice(internalQueue),
				            executor,
				            extents);

				        ALPAKA_LOG_INFO(
				            onHost::logger::memory,
				            [&]()
				            {
				                // Every label is followed by its value; the former dangling "elementsPerFrameItem" label
				                // (printed without any value) was dropped.
				                std::stringstream ss;
				                ss << "fill{ extents=" << extents << ", dst=" << dest
				                   << ", value_type=" << onHost::demangledName(elementValue) << ", frameSpec=" << frameSpec << " }";
				                return ss.str();
				            });

				        onHost::internal::enqueue(internalQueue, frameSpec, KernelBundle{SimdFillKernel{}, dest, elementValue});
				    }
				} // namespace alpaka::internal::generic
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/generic.hpp ==
				// ============================================================================

			// #include "alpaka/api/host/Api.hpp"    // amalgamate: file already inlined
			// #include "alpaka/api/host/Event.hpp"    // amalgamate: file already inlined
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/exec/OmpBlocks.hpp ==
				// ==
				/* Copyright 2024 René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/IdxLayer.hpp ==
					// ==
					/* Copyright 2024 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
					// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
					// #include "alpaka/core/Tag.hpp"    // amalgamate: file already inlined
					// #include "alpaka/core/util.hpp"    // amalgamate: file already inlined

					// #include <cassert>    // amalgamate: file already included
					// #include <tuple>    // amalgamate: file already included

					namespace alpaka::onAcc
					{
					    namespace cpu
					    {
					        template<typename IndexVecType>
					        struct OneLayer
					        {
					            constexpr OneLayer() = default;

					            constexpr auto idx() const
					            {
					                return IndexVecType::fill(0);
					            }

					            constexpr auto idx() const requires alpaka::concepts::CVector<IndexVecType>
					            {
					                return IndexVecType::template fill<0>();
					            }

					            constexpr auto count() const
					            {
					                return IndexVecType::fill(1);
					            }

					            constexpr auto count() const requires alpaka::concepts::CVector<IndexVecType>
					            {
					                return IndexVecType::template fill<1u>();
					            }
					        };

					        /** Index layer storing an index object and a count object.
					         *
					         * Both accessors pass the stored member through unWrapp() and return exactly what it returns.
					         * NOTE(review): unWrapp is defined elsewhere; presumably it unwraps reference-like wrappers - confirm.
					         *
					         * @tparam T_Idx type of the stored index
					         * @tparam T_Count type of the stored count
					         */
					        template<typename T_Idx, typename T_Count>
					        struct GenericLayer
					        {
					            /**
					             * @param idx index object, stored by value
					             * @param count count object, stored by value
					             */
					            constexpr GenericLayer(T_Idx idx, T_Count count) : m_idx(idx), m_count(count)
					            {
					            }

					            //! @return unWrapp() applied to the stored index
					            constexpr auto idx() const -> decltype(auto)
					            {
					                return unWrapp(m_idx);
					            }

					            //! @return unWrapp() applied to the stored count
					            constexpr auto count() const -> decltype(auto)
					            {
					                return unWrapp(m_count);
					            }

					            //! stored index object
					            T_Idx m_idx;
					            //! stored count object
					            T_Count m_count;
					        };
					    } // namespace cpu
					} // namespace alpaka::onAcc
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/IdxLayer.hpp ==
					// ============================================================================

					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/block/mem/SingleThreadStaticShared.hpp ==
					// ==
					/* Copyright 2024 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/block/mem/SharedStorage.hpp ==
						// ==
						/* Copyright 2022 Jeffrey Kelling, Rene Widera, Bernhard Manfred Gruber
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/core/Assert.hpp"    // amalgamate: file already inlined

						// #include <algorithm>    // amalgamate: file already included
						// #include <array>    // amalgamate: file already included
						// #include <cstdint>    // amalgamate: file already included
						// #include <functional>    // amalgamate: file already included
						// #include <limits>    // amalgamate: file already included
						#include <type_traits>

						#ifndef ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB
						#    define ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB 64u
						#endif

						namespace alpaka::onAcc::cpu::detail
						{
						    //! Implementation of static block shared memory provider.
						    //!
						    //! Variables are carved out of a fixed-size byte array owned by this object. Every variable is preceded
						    //! by a MetaData header holding the variable id, the size of its data chunk and the offset of the chunk
						    //! end, so a variable can be found again by walking from header to header.
						    //!
						    //! All allocating methods are const and operate on mutable members.
						    //!
						    //! \tparam TMinDataAlignBytes minimal alignment in bytes of headers and variables
						    template<std::size_t TMinDataAlignBytes>
						    class SharedStorage
						    {
						        struct alignas(TMinDataAlignBytes) MetaData
						        {
						            //! Unique id of the data chunk following this header.
						            size_t id = 0u;
						            //! Offset of the end of the data chunk, relative to m_data.
						            //! The next meta data header starts at this offset after it has been aligned.
						            std::uint32_t offset = 0u;
						            //! Size in bytes of the data chunk; the chunk starts at `offset - numBytes`.
						            std::uint32_t numBytes = 0u;
						        };

						        static constexpr std::uint32_t metaDataSize = sizeof(MetaData);

						    public:
						        SharedStorage() = default;

						        //! Allocate a variable of type T (sizeof(T) bytes).
						        //!
						        //! \tparam T type of the variable
						        //! \param id unique id of the variable
						        template<typename T>
						        void alloc(size_t id) const
						        {
						            allocChunk<T>(id, static_cast<std::uint32_t>(sizeof(T)));
						        }

						        //! Allocate a chunk of numBytes bytes whose start is aligned for T.
						        //!
						        //! \tparam T element type used for the alignment of the chunk
						        //! \param id unique id of the chunk
						        //! \param numBytes size of the chunk in bytes
						        template<typename T>
						        void allocDynamic(size_t id, uint32_t numBytes) const
						        {
						            allocChunk<T>(id, numBytes);
						        }

						        //! Give the pointer to an existing variable
						        //!
						        //! @tparam T type of the variable
						        //! @param id unique id of the variable
						        //! @return pointer to the start of the data chunk, nullptr if no variable with this id exists
						        template<typename T>
						        auto getVarPtr(size_t id) const -> T*
						        {
						            // Offset in bytes to the next unaligned meta data header behind the variable.
						            std::uint32_t off = 0;

						            // Iterate over allocated data only
						            while(off < m_allocdBytes)
						            {
						                // Adjust offset to be aligned
						                std::uint32_t const alignedMetaDataOffset = varChunkEnd<MetaData>(off, metaDataSize) - metaDataSize;
						                ALPAKA_ASSERT_ACC((alignedMetaDataOffset + metaDataSize) <= m_allocdBytes);
						                auto* metaDataPtr = reinterpret_cast<MetaData*>(data() + alignedMetaDataOffset);
						                off = metaDataPtr->offset;

						                // The chunk size comes from the header: a dynamic chunk can be larger than sizeof(T), so
						                // `off - sizeof(T)` would point to the last element instead of the first one.
						                if(metaDataPtr->id == id)
						                    return reinterpret_cast<T*>(data() + (off - metaDataPtr->numBytes));
						            }

						            // Variable not found.
						            return nullptr;
						        }

						        //! Get last allocated variable.
						        //!
						        //! @tparam T type the pointer is cast to
						        //! @return pointer to the start of the most recently allocated chunk
						        template<typename T>
						        auto getLatestVarPtr() const -> T*
						        {
						            return reinterpret_cast<T*>(data() + (m_allocdBytes - m_latestChunkBytes));
						        }

						    private:
						        //! Append a meta data header followed by a data chunk of numBytes bytes aligned for T.
						        template<typename T>
						        void allocChunk(size_t id, std::uint32_t numBytes) const
						        {
						            // Add meta data chunk in front of the user data
						            m_allocdBytes = varChunkEnd<MetaData>(m_allocdBytes, metaDataSize);
						            m_latestChunkBytes = metaDataSize;
						            ALPAKA_ASSERT_ACC(m_allocdBytes <= totalSharedBytes);
						            auto* meta = getLatestVarPtr<MetaData>();

						            // Allocate variable
						            m_allocdBytes = varChunkEnd<T>(m_allocdBytes, numBytes);
						            m_latestChunkBytes = numBytes;
						            ALPAKA_ASSERT_ACC(m_allocdBytes <= totalSharedBytes);

						            // Update meta data with id, end offset and size of the allocated variable.
						            meta->id = id;
						            meta->offset = m_allocdBytes;
						            meta->numBytes = numBytes;
						        }

						        //! Pointer to the first byte of the storage.
						        uint8_t* data() const
						        {
						            return m_data.data();
						        }

						        //! Byte offset to the end of the memory chunk
						        //!
						        //! Calculate bytes required to store a type with an aligned starting address in m_data.
						        //! Start offset to the origin of the user data chunk can be calculated with `result - numBytes`.
						        //! The padding is always before the origin of the user data chunk and can be zero byte.
						        //!
						        //! \tparam T type should fit into the chunk
						        //! \param byteOffset Current byte offset.
						        //! \param numBytes Number of bytes to allocate, should be at least sizeof(T).
						        //! \result Byte offset to the end of the data chunk, relative to m_data.
						        template<typename T>
						        auto varChunkEnd(uint32_t byteOffset, uint32_t numBytes) const -> std::uint32_t
						        {
						            auto const ptr = reinterpret_cast<std::size_t>(data() + byteOffset);
						            constexpr size_t align = std::max(TMinDataAlignBytes, alignof(T));
						            std::size_t const newPtrAdress = ((ptr + align - 1u) / align) * align + numBytes;
						            return static_cast<uint32_t>(newPtrAdress - reinterpret_cast<std::size_t>(data()));
						        }

						        static constexpr std::uint32_t totalSharedBytes
						            = static_cast<std::uint32_t>(ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB << 10u);
						        //! Memory layout
						        //! |Header|Padding|Variable|Padding|Header|....uninitialized Data ....
						        //! Size of padding can be zero if data after padding is already aligned.
						        mutable std::array<uint8_t, totalSharedBytes> m_data;

						        //! Offset in bytes relative to m_data to next free data area.
						        mutable std::uint32_t m_allocdBytes = 0u;

						        //! Size in bytes of the chunk allocated last (header or variable), used by getLatestVarPtr().
						        mutable std::uint32_t m_latestChunkBytes = 0u;
						    };
						} // namespace alpaka::onAcc::cpu::detail
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/block/mem/SharedStorage.hpp ==
						// ============================================================================

					// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

					// #include <cstdint>    // amalgamate: file already included

					namespace alpaka::onAcc
					{
					    namespace cpu
					    {
					        /** Shared memory provider built on detail::SharedStorage.
					         *
					         * Variables are identified by the compile time id T_unique: the first request allocates the variable,
					         * later requests with the same id return the already allocated one.
					         *
					         * @tparam TDataAlignBytes minimal data alignment handed to detail::SharedStorage
					         */
					        template<std::size_t TDataAlignBytes>
					        struct SingleThreadStaticShared : private detail::SharedStorage<TDataAlignBytes>
					        {
					            using Base = detail::SharedStorage<TDataAlignBytes>;

					            /** Get the variable with id T_unique, allocating it on first use.
					             *
					             * @tparam T type of the variable
					             * @tparam T_unique unique id of the variable
					             * @return reference to the variable
					             */
					            template<typename T, size_t T_unique>
					            auto allocVar() -> T&
					            {
					                if(T* const known = Base::template getVarPtr<T>(T_unique); known != nullptr)
					                    return *known;

					                Base::template alloc<T>(T_unique);
					                T* const created = Base::template getLatestVarPtr<T>();
					                ALPAKA_ASSERT(created != nullptr);
					                return *created;
					            }

					            /** Get the dynamically sized chunk with id T_unique, allocating it on first use.
					             *
					             * numBytes is only used for the first request of an id; for an already known id the existing
					             * chunk is returned as reported by the storage.
					             *
					             * @tparam T element type of the chunk
					             * @tparam T_unique unique id of the chunk
					             * @param numBytes size in bytes to allocate if the id is not known yet
					             * @return pointer reported by the storage: getVarPtr() for a known id, getLatestVarPtr() after a
					             * fresh allocation
					             */
					            template<typename T, size_t T_unique>
					            auto allocDynamic(uint32_t numBytes) -> T*
					            {
					                if(T* const known = Base::template getVarPtr<T>(T_unique); known != nullptr)
					                    return known;

					                Base::template allocDynamic<T>(T_unique, numBytes);
					                T* const created = Base::template getLatestVarPtr<T>();
					                ALPAKA_ASSERT(created != nullptr);
					                return created;
					            }

					            //! No-op: nothing is released or cleared.
					            void reset()
					            {
					            }
					        };
					    } // namespace cpu
					} // namespace alpaka::onAcc
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/block/mem/SingleThreadStaticShared.hpp ==
					// ============================================================================

					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/block/sync/NoOp.hpp ==
					// ==
					/* Copyright 2024 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
					namespace alpaka::onAcc
					{
					    namespace cpu
					    {
					        /** Callable that does nothing.
					         *
					         * NOTE(review): located under block/sync, presumably used where no block synchronization is
					         * required - confirm against the users of this type.
					         */
					        struct NoOp
					        {
					            //! Intentionally empty call operator.
					            constexpr auto operator()() const -> void
					            {
					            }
					        };
					    } // namespace cpu
					} // namespace alpaka::onAcc
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/block/sync/NoOp.hpp ==
					// ============================================================================

					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/hwloc/utility.hpp ==
					// ==
					/* Copyright 2026 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
					// #include "alpaka/api/host/hwloc/hwlocConfig.hpp"    // amalgamate: file already inlined
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/sysInfo.hpp ==
						// ==
						/* Copyright 2022 Benjamin Worpitz, Daniel Vollmer, Erik Zenker, René Widera, Bernhard Manfred Gruber, Andrea Bocci
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

						#if ALPAKA_OS_WINDOWS || ALPAKA_OS_CYGWIN
						#    ifndef NOMINMAX
						#        define NOMINMAX
						#    endif
						#    ifndef WIN32_LEAN_AND_MEAN
						#        define WIN32_LEAN_AND_MEAN
						#    endif
						// We could use some more macros to reduce the number of sub-headers included, but this would restrict user code.
						#    include <windows.h>
						#elif ALPAKA_OS_LINUX || ALPAKA_OS_IOS
						#    include <sys/param.h>
						#    include <sys/types.h>
						#    include <unistd.h>

						// #    include <cstdint>    // amalgamate: file already included
						#    if ALPAKA_OS_IOS
						#        include <sys/sysctl.h>
						#    endif
						#endif

						#if ALPAKA_OS_LINUX
						#    include <fstream>
						#endif

						// #include <cstdint>    // amalgamate: file already included
						// #include <cstring>    // amalgamate: file already included
						#include <stdexcept>
						// #include <string>    // amalgamate: file already included

						#if ALPAKA_ARCH_X86
						#    if ALPAKA_COMP_GNUC || ALPAKA_COMP_CLANG || ALPAKA_COMP_PGI
						#        include <cpuid.h>
						#    elif ALPAKA_COMP_MSVC || defined(ALPAKA_COMP_MSVC_EMULATED)
						#        include <intrin.h>
						#    endif
						#endif

						namespace alpaka::onHost
						{
						    //! Value stored in ex[0], ex[2] and ex[3] when no cpuid intrinsic is available.
						    constexpr int NO_CPUID{0};
						    //! Marker stored in ex[1] when the target architecture is not x86.
						    constexpr int UNKNOWN_CPU{0};
						    //! Marker stored in ex[1] when the target is x86 but the compiler offers no known cpuid intrinsic.
						    constexpr int UNKNOWN_COMPILER{1};

						    /** Execute the cpuid query for the given leaf and sub-leaf.
						     *
						     * On x86 with a supported compiler the four result registers are written to ex[0..3].
						     * Otherwise ex[0], ex[2] and ex[3] are set to NO_CPUID and ex[1] names the reason
						     * (UNKNOWN_COMPILER on x86, UNKNOWN_CPU on every other architecture).
						     */
						#if ALPAKA_ARCH_X86 && (ALPAKA_COMP_GNUC || ALPAKA_COMP_CLANG || ALPAKA_COMP_PGI)
						    inline auto cpuid(std::uint32_t level, std::uint32_t subfunction, std::uint32_t ex[4]) -> void
						    {
						        __cpuid_count(level, subfunction, ex[0], ex[1], ex[2], ex[3]);
						    }
						#elif ALPAKA_ARCH_X86 && (ALPAKA_COMP_MSVC || defined(ALPAKA_COMP_MSVC_EMULATED))
						    inline auto cpuid(std::uint32_t level, std::uint32_t subfunction, std::uint32_t ex[4]) -> void
						    {
						        __cpuidex(reinterpret_cast<int*>(ex), level, subfunction);
						    }
						#else
						    inline auto cpuid(std::uint32_t, std::uint32_t, std::uint32_t ex[4]) -> void
						    {
						        ex[0] = NO_CPUID;
						        ex[2] = NO_CPUID;
						        ex[3] = NO_CPUID;
						#    if ALPAKA_ARCH_X86
						        // x86 target, but no intrinsic known for this compiler
						        ex[1] = UNKNOWN_COMPILER;
						#    else
						        // not an x86 target at all
						        ex[1] = UNKNOWN_CPU;
						#    endif
						    }
						#endif
						    //! \return The name of the CPU the code is running on.
						    //!
						    //! If the extended cpuid leaves are not available, one of the placeholders "<unknown: compiler>",
						    //! "<unknown: CPU>" or "<unknown>" is returned, depending on the marker cpuid() stored in ex[1].
						    inline auto getCpuName() -> std::string
						    {
						        // Leaf 0x8000'0000 reports the highest available extended leaf in ex[0].
						        std::uint32_t ex[4] = {0};
						        cpuid(0x8000'0000, 0, ex);
						        std::uint32_t const nExIds(ex[0]);

						        if(!nExIds)
						        {
						            switch(ex[1])
						            {
						            case UNKNOWN_COMPILER:
						                return "<unknown: compiler>";
						            case UNKNOWN_CPU:
						                return "<unknown: CPU>";
						            default:
						                return "<unknown>";
						            }
						        }
						#if ALPAKA_ARCH_X86
						        // The brand string is spread over three consecutive leaves with 16 bytes each.
						        constexpr std::uint32_t firstBrandLeaf = 0x8000'0002;
						        constexpr std::uint32_t lastBrandLeaf = 0x8000'0004;
						        // Zero-initialised and larger than the 48 copied bytes, therefore always NUL terminated.
						        char cpuBrandString[0x40] = {0};

						        // Only query the leaves that are used and never go beyond what the CPU reports. Clamping the
						        // upper bound also guarantees termination if the reported maximum is 0xFFFF'FFFF.
						        std::uint32_t const lastLeaf = nExIds < lastBrandLeaf ? nExIds : lastBrandLeaf;
						        for(std::uint32_t leaf = firstBrandLeaf; leaf <= lastLeaf; ++leaf)
						        {
						            cpuid(leaf, 0, ex);
						            std::memcpy(cpuBrandString + (leaf - firstBrandLeaf) * sizeof(ex), ex, sizeof(ex));
						        }
						        return std::string(cpuBrandString);
						#else
						        return std::string("unknown");
						#endif
						    }

						    //! \return Pagesize in bytes used by the system.
						    //!
						    //! Windows and Cygwin query GetSystemInfo(); Linux and iOS use sysconf(_SC_PAGESIZE) when that
						    //! constant is defined and fall back to the legacy getpagesize() otherwise.
						    inline size_t getPageSize()
						    {
						#if ALPAKA_OS_WINDOWS || ALPAKA_OS_CYGWIN
						        SYSTEM_INFO si;
						        GetSystemInfo(&si);
						        return si.dwPageSize;
						#elif ALPAKA_OS_LINUX || ALPAKA_OS_IOS
						#    if defined(_SC_PAGESIZE)
						        return static_cast<std::size_t>(sysconf(_SC_PAGESIZE));
						#    else
						        // this is legacy and only used as fallback
						        return static_cast<std::size_t>(getpagesize());
						#    endif
						#else
						#    error "getPageSize not implemented for this system!"
						        return 0;
						#endif
						    }

						    //! \return The total number of bytes of global memory.
						    //! \throws std::logic_error if a sysctl() based query fails.
						    //! Adapted from David Robert Nadeau:
						    //! http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system
						    inline auto getGlobalMemCapacityBytes() -> std::size_t
						    {
						#if ALPAKA_OS_WINDOWS
						        MEMORYSTATUSEX status;
						        status.dwLength = sizeof(status);
						        GlobalMemoryStatusEx(&status);
						        return static_cast<std::size_t>(status.ullTotalPhys);

						#elif ALPAKA_OS_CYGWIN
						        // New 64-bit MEMORYSTATUSEX isn't available.
						        MEMORYSTATUS status;
						        status.dwLength = sizeof(status);
						        GlobalMemoryStatus(&status);
						        return static_cast<std::size_t>(status.dwTotalPhys);

						#elif ALPAKA_OS_LINUX || ALPAKA_OS_IOS
						        // Unix : Prefer sysctl() over sysconf() except sysctl() with HW_REALMEM and HW_PHYSMEM which are not
						        // always reliable
						#    if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64))
						        int mib[2]
						            = {CTL_HW,
						#        if defined(HW_MEMSIZE) // OSX
						               HW_MEMSIZE
						#        elif defined(HW_PHYSMEM64) // NetBSD, OpenBSD.
						               HW_PHYSMEM64
						#        endif
						            };
						        std::uint64_t size(0);
						        std::size_t sizeLen{sizeof(size)};
						        if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
						            throw std::logic_error("getGlobalMemCapacityBytes failed calling sysctl!");
						        return static_cast<std::size_t>(size);

						#    elif defined(_SC_AIX_REALMEM) // AIX.
						        return static_cast<std::size_t>(sysconf(_SC_AIX_REALMEM)) * static_cast<std::size_t>(1024);

						#    elif defined(_SC_PHYS_PAGES) // Linux, FreeBSD, OpenBSD, Solaris.
						        return static_cast<std::size_t>(sysconf(_SC_PHYS_PAGES)) * getPageSize();

						#    elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM))
						        // FreeBSD, DragonFly BSD, NetBSD, OpenBSD, and OSX.
						        int mib[2]
						            = {CTL_HW,
						#        if defined(HW_REALMEM) // FreeBSD.
						               HW_REALMEM
						#        elif defined(HW_PHYSMEM) // Others.
						               HW_PHYSMEM
						#        endif
						            };
						        std::uint32_t size(0);
						        // sysctl() writes the length of the returned value back, so this must not be const.
						        std::size_t sizeLen{sizeof(size)};
						        if(sysctl(mib, 2, &size, &sizeLen, nullptr, 0) < 0)
						            throw std::logic_error("getGlobalMemCapacityBytes failed calling sysctl!");
						        return static_cast<std::size_t>(size);

						#    else
						        // Without this branch the function would end without returning a value.
						#        error "getGlobalMemCapacityBytes not implemented for this system!"
						#    endif

						#else
						#    error "getGlobalMemCapacityBytes not implemented for this system!"
						#endif
						    }

						    //! \return The free number of bytes of global memory.
						    //! \throws std::logic_error if not implemented on the system and std::runtime_error on other errors.
						    inline auto getFreeGlobalMemBytes() -> std::size_t
						    {
						#if ALPAKA_OS_WINDOWS
						        MEMORYSTATUSEX status;
						        status.dwLength = sizeof(status);
						        GlobalMemoryStatusEx(&status);
						        return static_cast<std::size_t>(status.ullAvailPhys);
						#elif ALPAKA_OS_LINUX
						#    if defined(_SC_AVPHYS_PAGES)
						        return static_cast<std::size_t>(sysconf(_SC_AVPHYS_PAGES)) * getPageSize();
						#    else
						        // this is legacy and only used as fallback
						        return static_cast<std::size_t>(get_avphys_pages()) * getPageSize();
						#    endif
						#elif ALPAKA_OS_IOS
						        int free_pages = 0;
						        std::size_t len = sizeof(free_pages);
						        if(sysctlbyname("vm.page_free_count", &free_pages, &len, nullptr, 0) < 0)
						        {
						            throw std::logic_error("getFreeGlobalMemSizeBytes failed calling sysctl(vm.page_free_count)!");
						        }

						        return static_cast<std::size_t>(free_pages) * getPageSize();
						#else
						#    error "getFreeGlobalMemSizeBytes not implemented for this system!"
						#endif
						    }

						} // namespace alpaka::onHost
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/sysInfo.hpp ==
						// ============================================================================

					// #include "alpaka/core/util.hpp"    // amalgamate: file already inlined
					// #include "alpaka/unused.hpp"    // amalgamate: file already inlined

					// #include <fstream>    // amalgamate: file already included
					#include <optional>
					// #include <sstream>    // amalgamate: file already included
					// #include <string>    // amalgamate: file already included
					#include <thread>

					/** Implement functions required to set thread affinity and pin memory.
					 *
					 * There is always a fallback implementation to be able to run without hwloc.
					 * In this case NUMA selection is not possible and all cores will be taken into account.
					 */
					namespace alpaka::onHost::internal::hwloc
					{
					    /** Sentinel NUMA index that stands for "all NUMA domains".
					     *
					     * alpaka always addresses NUMA domains by their index.
					     * Code that configures something based on a NUMA index should compare the index against this value
					     * first and use all cores if both are equal.
					     */
					    constexpr uint32_t allNumaDomains{std::numeric_limits<uint32_t>::max()};

					#if ALPAKA_HAS_HWLOC
					    /** Singleton that owns the loaded hwloc topology.
					     *
					     * The topology is created once, on the first call of instance(), and reused afterwards.
					     * Building the topology can be expensive, therefore repeated queries share this cached handle.
					     */
					    class TopologyCache
					    {
					        //! Handle of the cached topology, released in the destructor.
					        hwloc_topology_t m_topology{};

					        /** Initialize and load the topology.
					         *
					         * @throws std::runtime_error if hwloc_topology_init() or hwloc_topology_load() fails
					         */
					        TopologyCache()
					        {
					            bool const initialized = hwloc_topology_init(&m_topology) == 0;
					            if(!initialized)
					            {
					                throw std::runtime_error("hwloc_topology_init failed");
					            }

					            bool const loaded = hwloc_topology_load(m_topology) == 0;
					            if(!loaded)
					            {
					                // the destructor does not run for a throwing constructor, release the handle here
					                hwloc_topology_destroy(m_topology);
					                throw std::runtime_error("hwloc_topology_load failed");
					            }
					        }

					        ~TopologyCache()
					        {
					            if(m_topology == nullptr)
					            {
					                return;
					            }
					            hwloc_topology_destroy(m_topology);
					        }

					        // the single instance is neither copyable nor movable
					        TopologyCache(TopologyCache const&) = delete;
					        TopologyCache& operator=(TopologyCache const&) = delete;
					        TopologyCache(TopologyCache&&) = delete;
					        TopologyCache& operator=(TopologyCache&&) = delete;

					    public:
					        //! @return the single cache object, created on first use
					        static TopologyCache& instance()
					        {
					            static TopologyCache cache;
					            return cache;
					        }

					        //! @return the cached topology handle
					        hwloc_topology_t get() const noexcept
					        {
					            return m_topology;
					        }

					        /** Look up the hwloc object of a NUMA domain.
					         *
					         * @param numaIdx index of the NUMA domain
					         * @throws std::out_of_range if hwloc has no NUMA node object for this index
					         */
					        hwloc_obj_t getNumaObj(uint32_t numaIdx) const
					        {
					            hwloc_obj_t const numaObj
					                = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_NUMANODE, static_cast<unsigned>(numaIdx));
					            if(numaObj != nullptr)
					            {
					                return numaObj;
					            }
					            throw std::out_of_range("NUMA domain index out of range: " + std::to_string(numaIdx));
					        }

					        /** Count the NUMA node objects of the topology.
					         *
					         * @throws std::runtime_error if hwloc reports a negative count
					         */
					        uint32_t getNumNumaDomains() const
					        {
					            int const numNodes = hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_NUMANODE);
					            if(numNodes >= 0)
					            {
					                return static_cast<uint32_t>(numNodes);
					            }
					            throw std::runtime_error("hwloc_get_nbobjs_by_type(HWLOC_OBJ_NUMANODE) failed");
					        }
					    };

					    /** Throw a std::runtime_error describing the current errno.
					     *
					     * @param what prefix of the exception message; ": " and std::strerror(errno) are appended
					     */
					    [[noreturn]] inline void throwErrno(char const* what)
					    {
					        // fetch the description first, before any other call can touch errno
					        char const* const reason = std::strerror(errno);

					        std::string message{what};
					        message += ": ";
					        message += reason;
					        throw std::runtime_error(message);
					    }

					    /** Convenience accessor for the topology handle held by TopologyCache. */
					    inline hwloc_topology_t getTopology()
					    {
					        auto const& cache = TopologyCache::instance();
					        return cache.get();
					    }

					    /** Look up the hwloc object of a NUMA domain in the cached topology.
					     *
					     * @param numaIdx index of the NUMA domain, forwarded to TopologyCache::getNumaObj()
					     */
					    inline hwloc_obj_t getNumaObj(uint32_t numaIdx)
					    {
					        auto const& cache = TopologyCache::instance();
					        return cache.getNumaObj(numaIdx);
					    }
					#endif

					    /** Get the number of NUMA domains.
					     *
					     * Without hwloc support a single domain is reported.
					     */
					    inline uint32_t getNumNumaDomains()
					    {
					#if !ALPAKA_HAS_HWLOC
					        return 1;
					#else
					        auto const& cache = TopologyCache::instance();
					        return cache.getNumNumaDomains();
					#endif
					    }

					    /** Parse the OS NUMA information.
					     *
					     * hwloc is not providing the available free memory in a NUMA domain.
					     * Therefore we fall back to check the NUMA node information in the OS directly.
					     *
					     * Only lines of the form `Node <n> <key> <value> kB` are accepted, so the key must name an entry that
					     * is reported in kB.
					     *
					     * @param osNodeIndex The index of the NUMA domain in the OS.
					     * @param key The key you want to read out including the trailing colon, e.g. 'MemFree:' or 'MemTotal:'
					     * @return the value converted to bytes, or std::nullopt if the file can not be opened or no matching
					     *         line is found
					     */
					    inline std::optional<size_t> parseNodeMemInfoValueBytes(unsigned osNodeIndex, std::string_view key)
					    {
					        std::ifstream in("/sys/devices/system/node/node" + std::to_string(osNodeIndex) + "/meminfo");
					        if(!in)
					        {
					            return std::nullopt;
					        }

					        std::string line;
					        while(std::getline(in, line))
					        {
					            // Cheap pre-filter; std::string::find() accepts the string_view directly, so no temporary
					            // std::string has to be created for every line.
					            if(line.find(key) == std::string::npos)
					            {
					                continue;
					            }

					            // Example line:
					            // Node 0 MemFree:        123456 kB
					            std::istringstream iss(line);
					            std::string nodeWord;
					            unsigned nodeNumber = 0;
					            std::string field;
					            size_t valueKB = 0;
					            std::string unit;
					            bool const parsed = static_cast<bool>(iss >> nodeWord >> nodeNumber >> field >> valueKB >> unit);
					            // the pre-filter matches substrings, therefore the field has to be compared exactly
					            if(parsed && field == key && unit == "kB")
					            {
					                return valueKB * 1024ULL;
					            }
					        }

					        return std::nullopt;
					    }

					    /** Set the affinity of the current thread to all cores of the NUMA domain
					     *
					     * Without hwloc support this function does nothing.
					     *
					     * @param numaIdx numa index starting with zero, or allNumaDomains to use all cores
					     * @throws std::runtime_error if the topology or the NUMA node has no cpuset or if binding fails
					     * @throws std::bad_alloc if the cpuset can not be duplicated
					     */
					    inline void setThreadAffinity(uint32_t numaIdx)
					    {
					#if ALPAKA_HAS_HWLOC
					        hwloc_cpuset_t cpuset = nullptr;

					        if(numaIdx == allNumaDomains)
					        {
					            hwloc_const_cpuset_t const fullSet = hwloc_topology_get_complete_cpuset(getTopology());
					            if(fullSet == nullptr)
					            {
					                throw std::runtime_error("Topology has no complete cpuset");
					            }

					            cpuset = hwloc_bitmap_dup(fullSet);
					        }
					        else
					        {
					            hwloc_obj_t const node = getNumaObj(numaIdx);
					            if(node->cpuset == nullptr)
					            {
					                throw std::runtime_error("NUMA node has no cpuset");
					            }

					            cpuset = hwloc_bitmap_dup(node->cpuset);
					        }

					        if(cpuset == nullptr)
					        {
					            throw std::bad_alloc();
					        }

					        int const rc = hwloc_set_cpubind(getTopology(), cpuset, HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT);
					        // Remember errno immediately: releasing the bitmap below may overwrite it before it is reported.
					        int const bindErrno = errno;

					        hwloc_bitmap_free(cpuset);

					        if(rc != 0)
					        {
					            errno = bindErrno;
					            throwErrno("hwloc_set_cpubind failed");
					        }
					#else
					        alpaka::unused(numaIdx);
					        return;
					#endif
					    }

					    /** Set the NUMA domain for the memory range described by ptr and bytes
					     *
					     * Without hwloc support this function does nothing.
					     *
					     * @attention This method should be called before the memory is touched, else it has no effect.
					     *
					     * @param ptr pointer address to pin, nullptr is a valid input and results in a no-op
					     * @param bytes the number of bytes to pin starting from the ptr address, zero results in a no-op
					     * @param numaIdx numa index starting with zero, or allNumaDomains to not pin anything.
					     * @throws std::runtime_error if the NUMA node has no nodeset or if binding the memory fails; with
					     *         ALPAKA_HOST_MEM_PINNING_CAN_FAIL defined the errors EPERM, ENOSYS and EXDEV are ignored
					     * @throws std::bad_alloc if the nodeset can not be duplicated
					     */
					    template<typename T>
					    inline void pinPointer(T* const ptr, size_t bytes, uint32_t numaIdx)
					    {
					#if ALPAKA_HAS_HWLOC
					        if(numaIdx == allNumaDomains)
					            return;

					        if(ptr == nullptr || bytes == 0u)
					            return;

					        hwloc_obj_t const node = getNumaObj(numaIdx);
					        if(node->nodeset == nullptr)
					        {
					            throw std::runtime_error("NUMA node has no nodeset");
					        }

					        hwloc_nodeset_t nodeset = hwloc_bitmap_dup(node->nodeset);
					        if(nodeset == nullptr)
					        {
					            throw std::bad_alloc();
					        }

					        int const rc = hwloc_set_area_membind(
					            getTopology(),
					            alpaka::toVoidPtr(ptr),
					            bytes,
					            nodeset,
					            HWLOC_MEMBIND_BIND,
					            HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_STRICT);
					        // Remember errno immediately: releasing the bitmap below may overwrite it before it is evaluated.
					        int const bindErrno = errno;

					        hwloc_bitmap_free(nodeset);

					        if(rc != 0)
					        {
					#    ifdef ALPAKA_HOST_MEM_PINNING_CAN_FAIL
					            // missing privileges, e.g. within a container
					            bool const notPermitted = bindErrno == EPERM;
					            // unsupported platform
					            bool const notImplemented = bindErrno == ENOSYS;
					            // NUMA node is not allowed by cpuset/cgroup
					            bool const nodeNotAllowed = bindErrno == EXDEV;
					            if(notPermitted || notImplemented || nodeNotAllowed)
					            {
					                return;
					            }
					#    endif
					            errno = bindErrno;
					            throwErrno("hwloc_set_area_membind failed");
					        }
					#else
					        alpaka::unused(ptr, bytes, numaIdx);
					        return;
					#endif
					    }

					    /** Return the number of cores which have direct access to the NUMA domain
					     *
					     *  Here "cores" means logical CPUs / processing units, so SMT siblings are counted too.
					     *  Without hwloc support the C++ hardware concurrency is returned for every index.
					     *
					     *  @param numaIdx numa index starting with zero, or allNumaDomains to get the C++ hardware concurrency.
					     *  @throws std::runtime_error if the NUMA node has no cpuset or its weight can not be determined
					     */
					    inline uint32_t getNumCores(uint32_t numaIdx)
					    {
					#if ALPAKA_HAS_HWLOC
					        if(numaIdx != allNumaDomains)
					        {
					            hwloc_obj_t const numaNode = getNumaObj(numaIdx);
					            if(numaNode->cpuset == nullptr)
					            {
					                throw std::runtime_error("NUMA node has no cpuset");
					            }

					            // number of bits set in the cpuset, i.e. the processing units of this node
					            int const weight = hwloc_bitmap_weight(numaNode->cpuset);
					            if(weight < 0)
					            {
					                throw std::runtime_error("hwloc_bitmap_weight failed");
					            }

					            return static_cast<uint32_t>(weight);
					        }
					#else
					        alpaka::unused(numaIdx);
					#endif
					        return std::thread::hardware_concurrency();
					    }

					    /** Return the memory capacity of the NUMA domain in bytes
					     *
					     * Without hwloc support the total CPU memory capacity is returned for every index.
					     *
					     * @param numaIdx numa index starting with zero, or allNumaDomains to get total CPU memory capacity.
					     * @throws std::runtime_error if the NUMA node carries no attributes
					     */
					    inline size_t getMemCapacityBytes(uint32_t numaIdx)
					    {
					#if ALPAKA_HAS_HWLOC
					        if(numaIdx != allNumaDomains)
					        {
					            hwloc_obj_t const numaNode = getNumaObj(numaIdx);
					            if(numaNode->attr == nullptr)
					            {
					                throw std::runtime_error("NUMA node has no attributes");
					            }

					            auto const localBytes = numaNode->attr->numanode.local_memory;
					            return static_cast<size_t>(localBytes);
					        }
					#else
					        alpaka::unused(numaIdx);
					#endif
					        return alpaka::onHost::getGlobalMemCapacityBytes();
					    }

					    /** Return the number of free bytes in the NUMA domain.
					     *
					     *  The per-domain value is read from /sys/devices/system/node/nodeX/meminfo (Linux only).
					     *  Without hwloc support the total free CPU memory is returned for every index.
					     *
					     *  @param numaIdx numa index starting with zero, or allNumaDomains to get total free CPU memory.
					     *  @throws std::runtime_error if the MemFree entry of the node can not be read
					     */
					    inline size_t getFreeGlobalMemBytes(uint32_t numaIdx)
					    {
					#if ALPAKA_HAS_HWLOC
					        if(numaIdx != allNumaDomains)
					        {
					            hwloc_obj_t const numaNode = getNumaObj(numaIdx);
					            // the meminfo files are named after the OS index of the node
					            auto const memFree = parseNodeMemInfoValueBytes(numaNode->os_index, "MemFree:");
					            if(memFree.has_value())
					            {
					                return *memFree;
					            }
					            throw std::runtime_error(
					                "Could not read per-node MemFree from /sys/devices/system/node/node"
					                + std::to_string(numaNode->os_index) + "/meminfo");
					        }
					#else
					        alpaka::unused(numaIdx);
					#endif
					        return alpaka::onHost::getFreeGlobalMemBytes();
					    }
					} // namespace alpaka::onHost::internal::hwloc
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/hwloc/utility.hpp ==
					// ============================================================================

				// #include "alpaka/core/Dict.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
				// #include "alpaka/meta/NdLoop.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/ThreadSpec.hpp"    // amalgamate: file already inlined
				// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

				// #include <cassert>    // amalgamate: file already included
				// #include <stdexcept>    // amalgamate: file already included
				// #include <tuple>    // amalgamate: file already included
				#include <type_traits>

				#if ALPAKA_OMP

				namespace alpaka::onHost
				{
				    namespace cpu
				    {
				        /** Executor functor that runs the thread blocks of a kernel inside an OpenMP parallel region.
				         *
				         * Each block consists of exactly one thread; the blocks are distributed over the OpenMP threads
				         * with a work-sharing loop.
				         */
				        template<onHost::concepts::ThreadSpec T_ThreadSpec>
				        struct OmpBlocks
				        {
				            /** Store the launch configuration.
				             *
				             * @param threadBlocking thread specification, must describe one thread per block
				             * @param numaIdx NUMA index handed to setThreadAffinity()
				             * @param setThreadAffinity if true every OpenMP thread sets its affinity before executing blocks
				             * @throws std::runtime_error if the block extent is not 1
				             */
				            constexpr OmpBlocks(T_ThreadSpec threadBlocking, uint32_t numaIdx, bool setThreadAffinity)
				                : m_threadBlocking{std::move(threadBlocking)}
				                , m_numaIdx{numaIdx}
				                , m_setThreadAffinity{setThreadAffinity}
				            {
				                bool const singleThreadPerBlock = m_threadBlocking.getNumThreads().product() == 1u;
				                if(!singleThreadPerBlock)
				                {
				                    throw std::runtime_error("Thread block extent must be 1.");
				                }
				            }

				            /** Execute the kernel once for every block.
				             *
				             * @param kernelBundle callable invoked with the accelerator of the executing OpenMP thread
				             * @param dict dictionary joined into the accelerator, also used to look up the device kind
				             */
				            void operator()(auto const& kernelBundle, auto const& dict) const
				            {
				                using NumThreadsVecType = typename T_ThreadSpec::NumThreadsVecType;
				                using ThreadIdxType = typename NumThreadsVecType::type;
				#    pragma omp parallel
				                {
				                    if(m_setThreadAffinity)
				                    {
				                        internal::hwloc::setThreadAffinity(m_numaIdx);
				                    }

				                    // All objects below are private to the OpenMP thread.
				                    // The block index is copied from the block count to derive the correct index type.
				                    auto blockIdx = m_threadBlocking.getNumBlocks();
				                    auto blockCount = m_threadBlocking.getNumBlocks();

				                    constexpr uint32_t simdWidth
				                        = alpaka::getArchSimdWidth<uint8_t>(api::host, ALPAKA_TYPEOF(dict[object::deviceKind]){});
				                    auto blockSharedMem = onAcc::cpu::SingleThreadStaticShared<simdWidth>{};

				                    // entries describing the index layers, the shared memory and the synchronization
				                    auto const blockLayerEntry = DictEntry{
				                        layer::block,
				                        onAcc::cpu::GenericLayer{std::cref(blockIdx), std::cref(blockCount)}};
				                    auto const threadLayerEntry = DictEntry{layer::thread, onAcc::cpu::OneLayer<NumThreadsVecType>{}};
				                    auto const blockSharedMemEntry = DictEntry{layer::shared, std::ref(blockSharedMem)};
				                    auto const blockSyncEntry = DictEntry{action::threadBlockSync, onAcc::cpu::NoOp{}};
				                    auto const warpSizeEntry = DictEntry{object::warpSize, std::integral_constant<uint32_t, 1u>{}};

				                    // entries for the dynamic shared memory
				                    uint32_t dynSharedMemBytes = onHost::getDynSharedMemBytes(m_threadBlocking, kernelBundle);
				                    auto const dynSharedMemEntry = DictEntry{layer::dynShared, std::ref(blockSharedMem)};
				                    auto const dynSharedMemBytesEntry
				                        = DictEntry{object::dynSharedMemBytes, std::ref(dynSharedMemBytes)};

				                    /* The dynamic shared memory entries are appended only if the user defined them. Without
				                     * them a kernel accessing dynamic shared memory runs into a clean static assert. */
				                    auto additionalDict = conditionalAppendDict<
				                        trait::HasUserDefinedDynSharedMemBytes<T_ThreadSpec, ALPAKA_TYPEOF(kernelBundle)>::value>(
				                        dict,
				                        Dict{dynSharedMemEntry, dynSharedMemBytesEntry});

				                    auto acc = onAcc::Acc(joinDict(
				                        Dict{blockLayerEntry, threadLayerEntry, blockSharedMemEntry, blockSyncEntry, warpSizeEntry},
				                        additionalDict));

				#    pragma omp for nowait
				                    for(ThreadIdxType linearBlockIdx = 0; linearBlockIdx < blockCount.product(); ++linearBlockIdx)
				                    {
				                        // the accelerator refers to blockIdx, updating it selects the block to execute
				                        blockIdx = mapToND(blockCount, linearBlockIdx);
				                        kernelBundle(acc);
				                        blockSharedMem.reset();
				                    }
				                }
				            }

				            T_ThreadSpec m_threadBlocking;
				            uint32_t m_numaIdx;
				            bool m_setThreadAffinity;
				        };
				    } // namespace cpu

				    /** Create the executor functor for thread specifications using the exec::CpuOmpBlocks executor.
				     *
				     * @param threadSpec thread specification copied into the functor
				     * @param numaIdx NUMA index forwarded to the functor
				     * @param setThreadAffinity forwarded to the functor
				     * @return a cpu::OmpBlocks instance
				     */
				    inline auto makeAcc(
				        alpaka::onHost::concepts::ThreadSpec auto const& threadSpec,
				        uint32_t numaIdx,
				        bool setThreadAffinity) requires std::same_as<ALPAKA_TYPEOF(threadSpec.getExecutor()), exec::CpuOmpBlocks>
				    {
				        return cpu::OmpBlocks{threadSpec, numaIdx, setThreadAffinity};
				    }
				} // namespace alpaka::onHost

				#endif
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/exec/OmpBlocks.hpp ==
				// ============================================================================

				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/exec/Serial.hpp ==
				// ==
				/* Copyright 2024 René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/api/host/IdxLayer.hpp"    // amalgamate: file already inlined
				// #include "alpaka/api/host/block/mem/SingleThreadStaticShared.hpp"    // amalgamate: file already inlined
				// #include "alpaka/api/host/block/sync/NoOp.hpp"    // amalgamate: file already inlined
				// #include "alpaka/api/host/hwloc/utility.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/Dict.hpp"    // amalgamate: file already inlined
				// #include "alpaka/meta/NdLoop.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/ThreadSpec.hpp"    // amalgamate: file already inlined
				// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

				// #include <cassert>    // amalgamate: file already included
				// #include <tuple>    // amalgamate: file already included
				#include <type_traits>

				namespace alpaka::onHost
				{
				    namespace cpu
				    {
				        /** Executor functor that runs all thread blocks of a kernel one after another in the calling thread.
				         *
				         * Each block consists of exactly one thread.
				         */
				        template<onHost::concepts::ThreadSpec T_ThreadSpec>
				        struct Serial
				        {
				            using NumThreadsVecType = typename T_ThreadSpec::NumThreadsVecType;

				            /** Store the launch configuration.
				             *
				             * @param threadBlocking thread specification, must describe one thread per block
				             * @param numaIdx NUMA index handed to setThreadAffinity()
				             * @param setThreadAffinity if true the calling thread sets its affinity before executing blocks
				             * @throws std::runtime_error if the block extent is not 1
				             */
				            constexpr Serial(T_ThreadSpec threadBlocking, uint32_t numaIdx, bool setThreadAffinity)
				                : m_threadBlocking{std::move(threadBlocking)}
				                , m_numaIdx{numaIdx}
				                , m_setThreadAffinity{setThreadAffinity}
				            {
				                bool const singleThreadPerBlock = m_threadBlocking.getNumThreads().product() == 1u;
				                if(!singleThreadPerBlock)
				                {
				                    throw std::runtime_error("Thread block extent must be 1.");
				                }
				            }

				            /** Execute the kernel once for every block.
				             *
				             * @param kernelBundle callable invoked with the accelerator
				             * @param dict dictionary joined into the accelerator, also used to look up the device kind
				             */
				            void operator()(auto const& kernelBundle, auto const& dict) const
				            {
				                if(m_setThreadAffinity)
				                {
				                    internal::hwloc::setThreadAffinity(m_numaIdx);
				                }

				                // The block index is copied from the block count to derive the correct index type.
				                auto blockIdx = m_threadBlocking.getNumBlocks();
				                constexpr uint32_t simdWidth
				                    = alpaka::getArchSimdWidth<uint8_t>(api::host, ALPAKA_TYPEOF(dict[object::deviceKind]){});
				                auto blockSharedMem = onAcc::cpu::SingleThreadStaticShared<simdWidth>{};

				                // entries for the dynamic shared memory
				                uint32_t dynSharedMemBytes = onHost::getDynSharedMemBytes(m_threadBlocking, kernelBundle);
				                auto const dynSharedMemEntry = DictEntry{layer::dynShared, std::ref(blockSharedMem)};
				                auto const dynSharedMemBytesEntry
				                    = DictEntry{object::dynSharedMemBytes, std::ref(dynSharedMemBytes)};

				                /* The dynamic shared memory entries are appended only if the user defined them. Without them a
				                 * kernel accessing dynamic shared memory runs into a clean static assert. */
				                auto additionalDict = conditionalAppendDict<
				                    trait::HasUserDefinedDynSharedMemBytes<T_ThreadSpec, ALPAKA_TYPEOF(kernelBundle)>::value>(
				                    dict,
				                    Dict{dynSharedMemEntry, dynSharedMemBytesEntry});

				                // entries describing the index layers, the shared memory and the synchronization
				                auto const blockLayerEntry = DictEntry{
				                    layer::block,
				                    onAcc::cpu::GenericLayer{std::cref(blockIdx), std::cref(m_threadBlocking.getNumBlocks())}};
				                auto const threadLayerEntry = DictEntry{layer::thread, onAcc::cpu::OneLayer<NumThreadsVecType>{}};
				                auto const blockSharedMemEntry = DictEntry{layer::shared, std::ref(blockSharedMem)};
				                auto const blockSyncEntry = DictEntry{action::threadBlockSync, onAcc::cpu::NoOp{}};
				                auto const warpSizeEntry = DictEntry{object::warpSize, std::integral_constant<uint32_t, 1u>{}};

				                auto acc = onAcc::Acc(joinDict(
				                    Dict{blockLayerEntry, threadLayerEntry, blockSharedMemEntry, blockSyncEntry, warpSizeEntry},
				                    additionalDict));

				                // invoked for every block; the shared memory is reset after each block
				                auto const runBlock = [&](auto const&)
				                {
				                    kernelBundle(acc);
				                    acc[layer::shared].reset();
				                };
				                meta::ndLoopIncIdx(blockIdx, m_threadBlocking.getNumBlocks(), runBlock);
				            }

				            T_ThreadSpec m_threadBlocking;
				            uint32_t m_numaIdx;
				            bool m_setThreadAffinity;
				        };

				    /** Create the executor object for the serial CPU executor.
				     *
				     * This overload only participates if the executor of the thread specification is exec::CpuSerial.
				     *
				     * @param threadSpec thread specification, forwarded to cpu::Serial
				     * @param numaIdx NUMA index, forwarded to cpu::Serial
				     * @param setThreadAffinity thread affinity switch, forwarded to cpu::Serial
				     * @return a cpu::Serial instance constructed from the three arguments
				     */
				    inline auto makeAcc(
				        alpaka::onHost::concepts::ThreadSpec auto const& threadSpec,
				        uint32_t numaIdx,
				        bool setThreadAffinity) requires std::same_as<ALPAKA_TYPEOF(threadSpec.getExecutor()), exec::CpuSerial>
				    {
				        return cpu::Serial(threadSpec, numaIdx, setThreadAffinity);
				    }
				} // namespace alpaka::onHost
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/exec/Serial.hpp ==
				// ============================================================================

				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/exec/TbbBlocks.hpp ==
				// ==
				/* Copyright 2024 Mehmet Yusufoglu, René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/api/host/IdxLayer.hpp"    // amalgamate: file already inlined
				// #include "alpaka/api/host/block/mem/SingleThreadStaticShared.hpp"    // amalgamate: file already inlined
				// #include "alpaka/api/host/block/sync/NoOp.hpp"    // amalgamate: file already inlined
				// #include "alpaka/api/host/executor.hpp"    // amalgamate: file already inlined
				// #include "alpaka/api/host/hwloc/utility.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/Dict.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/ThreadSpec.hpp"    // amalgamate: file already inlined
				// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

				// #include <cstddef>    // amalgamate: file already included
				// #include <stdexcept>    // amalgamate: file already included
				#include <type_traits>

				#if ALPAKA_TBB
				#    include <oneapi/tbb/blocked_range.h>
				#    include <oneapi/tbb/parallel_for.h>
				#    include <oneapi/tbb/task_group.h>

				namespace alpaka::onHost
				{
				    namespace cpu
				    {
				        /** Executor distributing the thread blocks of a kernel over TBB worker threads.
				         *
				         * Each thread block is executed as one iteration of a oneapi::tbb::parallel_for.
				         * A thread block always consists of exactly one thread.
				         */
				        template<onHost::concepts::ThreadSpec T_ThreadSpec>
				        struct TbbBlocks
				        {
				            using NumThreadsVecType = typename T_ThreadSpec::NumThreadsVecType;

				            /** Construct the executor with the thread blocking configuration chosen by the queue.
				             *
				             * @param threadBlocking number of blocks and threads per block
				             * @param numaIdx NUMA index used to constrain the TBB arena, see operator()
				             * @param setThreadAffinity if false the NUMA index is ignored
				             * @throws std::runtime_error if the thread block extent contains more than one thread
				             */
				            constexpr TbbBlocks(T_ThreadSpec threadBlocking, uint32_t numaIdx, bool setThreadAffinity)
				                : m_threadBlocking(std::move(threadBlocking))
				                , m_numaIdx{numaIdx}
				                , m_setThreadAffinity{setThreadAffinity}
				            {
				                if(m_threadBlocking.getNumThreads().product() != 1u)
				                {
				                    throw std::runtime_error("Thread block extent must be 1.");
				                }
				            }

				            /** Execute the kernel once for each thread block.
				             *
				             * @param kernelBundle callable which is invoked with the accelerator object of each block
				             * @param dict dictionary with additional entries which are merged into the accelerator
				             * @throws std::out_of_range if thread affinity is requested and the NUMA index is not smaller than
				             * the number of NUMA nodes reported by TBB
				             */
				            void operator()(auto const& kernelBundle, auto const& dict) const
				            {
				                auto blockCount = m_threadBlocking.getNumBlocks();

				                constexpr uint32_t simdWidth
				                    = alpaka::getArchSimdWidth<uint8_t>(api::host, ALPAKA_TYPEOF(dict[object::deviceKind]){});

				                // Work description, executed either within the current arena or a NUMA constrained arena.
				                auto kernel = [&]
				                {
				                    using ThreadIdxType = typename NumThreadsVecType::type;
				                    ThreadIdxType const linearNumBlocks = blockCount.product();

				                    oneapi::tbb::parallel_for(
				                        static_cast<ThreadIdxType>(0),
				                        linearNumBlocks,
				                        [&](ThreadIdxType i)
				                        {
				                            // translate the linear iteration index into a multi-dimensional block index
				                            auto const blockIdx = mapToND(blockCount, i);

				                            // each iteration creates its own shared memory object
				                            auto blockSharedMem = onAcc::cpu::SingleThreadStaticShared<simdWidth>{};
				                            // Compose the accelerator dictionary entries consumed by the kernel.
				                            auto const blockLayerEntry
				                                = DictEntry{layer::block, onAcc::cpu::GenericLayer{std::cref(blockIdx), blockCount}};
				                            auto const threadLayerEntry
				                                = DictEntry{layer::thread, onAcc::cpu::OneLayer<NumThreadsVecType>{}};
				                            auto const blockSharedMemEntry = DictEntry{layer::shared, std::ref(blockSharedMem)};
				                            auto const blockSyncEntry = DictEntry{action::threadBlockSync, onAcc::cpu::NoOp{}};

				                            // dynamic shared mem
				                            uint32_t blockDynSharedMemBytes
				                                = onHost::getDynSharedMemBytes(m_threadBlocking, kernelBundle);
				                            auto const blockDynSharedMemEntry = DictEntry{layer::dynShared, std::ref(blockSharedMem)};
				                            auto const blockDynSharedMemBytesEntry
				                                = DictEntry{object::dynSharedMemBytes, std::ref(blockDynSharedMemBytes)};

				                            // the dynamic shared memory entries are selected at compile time via the trait
				                            auto additionalDict = conditionalAppendDict<
				                                trait::HasUserDefinedDynSharedMemBytes<T_ThreadSpec, ALPAKA_TYPEOF(kernelBundle)>::
				                                    value>(dict, Dict{blockDynSharedMemEntry, blockDynSharedMemBytesEntry});

				                            auto const warpSizeEntry
				                                = DictEntry{object::warpSize, std::integral_constant<uint32_t, 1u>{}};

				                            auto acc = onAcc::Acc(joinDict(
				                                Dict{
				                                    blockLayerEntry,
				                                    threadLayerEntry,
				                                    blockSharedMemEntry,
				                                    blockSyncEntry,
				                                    warpSizeEntry},
				                                additionalDict));

				                            kernelBundle(acc);
				                        });
				                };

				                if(m_numaIdx != internal::hwloc::allNumaDomains && m_setThreadAffinity)
				                {
				                    // The arena is only required in this branch, it is constrained to the requested NUMA node.
				                    oneapi::tbb::task_arena tbbArena;

				                    auto const& tbbNumaNodes = oneapi::tbb::info::numa_nodes();
				                    if(m_numaIdx >= tbbNumaNodes.size())
				                        throw std::out_of_range("Invalid NUMA index");
				                    auto tbbNumaIdx = tbbNumaNodes[m_numaIdx];
				                    tbbArena.initialize(oneapi::tbb::task_arena::constraints{}.set_numa_id(tbbNumaIdx));
				                    tbbArena.execute([&] { oneapi::tbb::this_task_arena::isolate(kernel); });
				                }
				                else
				                {
				                    oneapi::tbb::this_task_arena::isolate(kernel);
				                }
				            }

				            //! number of blocks and threads per block
				            T_ThreadSpec m_threadBlocking;
				            //! NUMA index, internal::hwloc::allNumaDomains disables the NUMA constrained execution
				            uint32_t m_numaIdx;
				            //! the NUMA index is only taken into account if this flag is true
				            bool m_setThreadAffinity;
				        };
				    } // namespace cpu

				    /** Create the executor object for the TBB blocks CPU executor.
				     *
				     * This overload only participates if the executor of the thread specification is exec::CpuTbbBlocks.
				     *
				     * @param threadSpec thread specification, forwarded to cpu::TbbBlocks
				     * @param numaIdx NUMA index, forwarded to cpu::TbbBlocks
				     * @param setThreadAffinity thread affinity switch, forwarded to cpu::TbbBlocks
				     * @return a cpu::TbbBlocks instance constructed from the three arguments
				     */
				    inline auto makeAcc(
				        alpaka::onHost::concepts::ThreadSpec auto const& threadSpec,
				        uint32_t numaIdx,
				        bool setThreadAffinity) requires std::same_as<ALPAKA_TYPEOF(threadSpec.getExecutor()), exec::CpuTbbBlocks>
				    {
				        return cpu::TbbBlocks(threadSpec, numaIdx, setThreadAffinity);
				    }
				} // namespace alpaka::onHost
				#endif
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/exec/TbbBlocks.hpp ==
				// ============================================================================

				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/util.hpp ==
				// ==
				/* Copyright 2025 René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */


				// #pragma once
				// #include "alpaka/CVec.hpp"    // amalgamate: file already inlined
				// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
				// #include "alpaka/mem/DataPitches.hpp"    // amalgamate: file already inlined
				// #include "alpaka/tag.hpp"    // amalgamate: file already inlined
				// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

				// #include <cstdint>    // amalgamate: file already included
				// #include <utility>    // amalgamate: file already included

				namespace alpaka::api::util
				{
				    namespace detail
				    {
				        /** Recursive helper halving one component of a compile-time vector per step until the product of all
				         * components is below or equal to the limit.
				         *
				         * The component to halve walks from index zero upwards to the last index, then downwards to zero
				         * and upwards again.
				         *
				         * @tparam T_limit upper bound for the product of all components
				         * @tparam T_index index of the component which is halved in this step
				         * @tparam T_increment value added to T_index to derive the component of the next step
				         * @tparam T_idx indices of all components
				         * @param input compile-time vector to adjust
				         * @return compile-time vector with a product of all components below or equal to T_limit
				         */
				        template<
				            std::integral auto T_limit,
				            std::integral auto T_index,
				            std::integral auto T_increment,
				            std::integral auto... T_idx>
				        consteval auto adjustToLimit(concepts::CVector auto const input, std::index_sequence<T_idx...>)
				        {
				            if constexpr(input.product() <= static_cast<typename ALPAKA_TYPEOF(input)::type>(T_limit))
				                return input;
				            else
				            {
				                constexpr uint32_t dim = static_cast<uint32_t>(sizeof...(T_idx));

				                // halve the component selected by T_index, all other components are taken over unchanged
				                constexpr auto newValue = CVec<
				                    typename ALPAKA_TYPEOF(input)::type,
				                    (T_idx == T_index ? divExZero(input[T_idx], static_cast<typename ALPAKA_TYPEOF(input)::type>(2))
				                                      : input[T_idx])...>{};

				                constexpr auto nextIdx = T_index + T_increment;

				                /* All three continuations must be alternatives of one constexpr-if chain. A return statement
				                 * placed after the chain is instantiated even if a branch before was taken, which would
				                 * instantiate this function with an index outside of the vector and recurse without an end.
				                 */
				                if constexpr(nextIdx == dim)
				                {
				                    // upper end reached: process the last component again and walk downwards
				                    constexpr auto nextIncrement = dim == 1u ? 0u : -1u;

				                    return adjustToLimit<T_limit, (dim == 1 ? 0 : dim - 1u), nextIncrement>(
				                        newValue,
				                        std::index_sequence<T_idx...>{});
				                }
				                else if constexpr(nextIdx == 0u)
				                {
				                    // lower end reached: walk upwards again
				                    return adjustToLimit<T_limit, nextIdx, 1u>(newValue, std::index_sequence<T_idx...>{});
				                }
				                else
				                {
				                    // keep the current direction
				                    constexpr auto nextIncrement = dim == 1u ? 0u : T_increment;

				                    return adjustToLimit<T_limit, nextIdx, nextIncrement>(newValue, std::index_sequence<T_idx...>{});
				                }
				            }
				        }
				    } // namespace detail

				    /** adjust the input vector to a given limit by halving all components
				     * until the product of these is below or equal to the limit
				     *
				     * @tparam T_limit upper bound for the product of all components
				     * @tparam T_index not used, the adjustment always starts with component zero; the parameter is kept for
				     * source compatibility and defaulted so that only the limit must be provided
				     * @tparam T_increment not used, the adjustment always starts walking upwards; the parameter is kept for
				     * source compatibility and defaulted so that only the limit must be provided
				     * @param input compile-time vector to adjust
				     * @return compile-time vector with a product of all components below or equal to T_limit
				     */
				    template<std::integral auto T_limit, std::integral auto T_index = 0u, std::integral auto T_increment = 1u>
				    consteval auto adjustToLimit(concepts::CVector auto const input)
				    {
				        return detail::adjustToLimit<T_limit, 0u, 1u>(input, std::make_index_sequence<input.dim()>{});
				    }

				    /** adjust the input vector to a given limit by halving the largest dimension until the product of all components
				     * is below or equal to the limit
				     *
				     * If several components hold the largest value the component with the lowest index is halved.
				     *
				     * @param input vector to adjust, taken by value
				     * @param limit upper bound for the product of all components, converted to the value type of the vector
				     * @return adjusted copy of the input vector
				     *
				     * NOTE(review): the loop only terminates if halving via divExZero() is able to bring the product down to
				     * the limit; presumably a limit smaller than one can never be reached -- confirm against divExZero().
				     */
				    inline auto adjustToLimit(concepts::Vector auto input, std::integral auto const limit)
				    {
				        using IdxType = typename ALPAKA_TYPEOF(input)::type;
				        constexpr uint32_t dim = input.dim();
				        IdxType const limitValue = static_cast<IdxType>(limit);

				        while(input.product() > limitValue)
				        {
				            // search the first component with the largest value
				            uint32_t maxIdx = 0u;
				            auto maxValue = input[0];
				            for(auto i = 1u; i < dim; ++i)
				                if(maxValue < input[i])
				                {
				                    maxIdx = i;
				                    maxValue = input[i];
				                }
				            // the loop condition guarantees that the product is still above the limit, no second check is required
				            input[maxIdx] = divExZero(input[maxIdx], IdxType{2u});
				        }
				        return input;
				    }

				    /** provides a memory description to create multidimensional linewise aligned memory within a one dimensional
				     * byte area
				     *
				     * @param alignment data alignment in bytes
				     * @return tuple with the linearized data blob size in bytes and multi-dimensional pitches,
				     * std::tuple(numBytes,pitcheMD)
				     */
				    template<typename T_ValueType, alpaka::concepts::Vector T_Extents>
				    inline auto emulatedAlignedMemDescription(uint32_t alignmentInByte, T_Extents extents)
				    {
				        constexpr auto dim = T_Extents::dim();
				        if constexpr(dim == 1u)
				        {
				            size_t memSizeInByte = static_cast<size_t>(extents.x()) * sizeof(T_ValueType);
				            alpaka::concepts::Vector auto pitches = typename T_Extents::UniVec{sizeof(T_ValueType)};
				            return std::make_tuple(memSizeInByte, pitches);
				        }
				        else
				        {
				            using IdxType = typename T_Extents::type;
				            auto alignment = static_cast<IdxType>(alignmentInByte);

				            IdxType rowExtentInBytes = extents.x() * static_cast<IdxType>(sizeof(T_ValueType));
				            IdxType rowPitchInBytes = alpaka::divCeil(rowExtentInBytes, alignment) * alignment;
				            auto pitches = alpaka::calculatePitches<T_ValueType>(extents, rowPitchInBytes);

				            size_t memSizeInByte = static_cast<size_t>(pitches[0]) * static_cast<size_t>(extents[0]);
				            return std::make_tuple(memSizeInByte, pitches);
				        }
				    }

				    /** Calculate the largest power of two which is less than or equal to the given value.
				     *
				     * @param value upper bound for the result
				     * @return largest power of two not exceeding value, one if value is zero
				     */
				    consteval uint32_t highestPowerOfTwo(uint32_t value)
				    {
				        uint32_t result = 1u;
				        /* 'result * 2 <= value' is evaluated as 'result <= value / 2'. Shifting result instead would wrap
				         * around to zero for values with the highest bit set and the loop would never terminate.
				         */
				        while(result <= (value >> 1u))
				        {
				            result <<= 1u;
				        }
				        return result;
				    }

				    /** Calculate the best alignment for SIMD optimized memory allocation
				     *
				     * @param api the API to use
				     * @param deviceKind the device kind to use
				     * @return the best alignment in bytes, will be a power of two value
				     */
				    template<typename T_ValueType>
				    inline constexpr auto simdOptimizedAlignment(auto api, alpaka::concepts::DeviceKind auto deviceKind)
				    {
				        constexpr uint32_t typeAlignmentBytes = alignof(T_ValueType);
				        constexpr uint32_t simdPackBytes
				            = alpaka::getArchSimdWidth<T_ValueType>(api, deviceKind) * sizeof(T_ValueType);
				        constexpr uint32_t bestSimdPackBytes = highestPowerOfTwo(simdPackBytes);
				        constexpr uint32_t optimalAlignment = std::max(bestSimdPackBytes, typeAlignmentBytes);
				        constexpr uint32_t adjustedAlignment = getAdjustedAlignment<T_ValueType>(api, deviceKind, optimalAlignment);
				        return adjustedAlignment;
				    }
				} // namespace alpaka::api::util
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/util.hpp ==
				// ============================================================================

				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/CallbackThread.hpp ==
				// ==
				/* Copyright 2022 Antonio Di Pilato
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/api/host/hwloc/utility.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

				// #include <cassert>    // amalgamate: file already included
				#include <condition_variable>
				// #include <functional>    // amalgamate: file already included
				// #include <future>    // amalgamate: file already included
				// #include <iostream>    // amalgamate: file already included
				// #include <mutex>    // amalgamate: file already included
				#include <queue>
				// #include <thread>    // amalgamate: file already included

				namespace alpaka::core
				{
				    /** A thread queue executing tasks asynchronously.
				     *
				     * This object should be used as a member of objects which are secured by smart pointers to avoid that a task is
				     * taking over the ownership of the callback thread and therefore can destroy itself before all tasks are executed.
				     *
				     * Tasks are executed one after another in the order of their submission by a single worker thread. The worker
				     * thread is created with the first submitted task.
				     */
				    class CallbackThread
				    {
				#if ALPAKA_COMP_CLANG
				#    pragma clang diagnostic push
				#    pragma clang diagnostic ignored "-Wweak-vtables"
				#endif
				        // A custom class is used because std::function<F> requires F to be copyable, and std::packaged_task provides a
				        // std::future which will keep the task alive and we cannot control the moment the future is set.
				        //! \todo with C++23 std::move_only_function should be used
				        struct Task
				#if ALPAKA_COMP_CLANG
				#    pragma clang diagnostic pop
				#endif
				        {
				            virtual ~Task() = default;
				            //! execute the task
				            virtual void run() = 0;
				        };

				        /** Task implementation owning the callable which is executed by run(). */
				        template<typename Function>
				        struct FunctionHolder : Task
				        {
				            Function m_func;

				            template<typename FunctionFwd>
				            explicit FunctionHolder(FunctionFwd&& func) : m_func{std::forward<FunctionFwd>(func)}
				            {
				            }

				            void run() override
				            {
				                // if m_func throws, let it propagate
				                m_func();
				            }
				        };

				        //! a task together with the promise which is fulfilled after the task was executed
				        using TaskPackage = std::pair<std::unique_ptr<Task>, std::promise<void>>;

				        /** Data shared between the CallbackThread object and the worker thread.
				         *
				         * The worker thread holds its own shared pointer to the state, therefore the state stays valid even if the
				         * worker thread gets detached in the destructor of CallbackThread.
				         */
				        struct State
				        {
				            //! pending tasks, the task in execution stays at the front until it is finished
				            std::queue<TaskPackage> m_tasks;
				            //! protects m_tasks
				            std::mutex m_mutex;
				            //! signals new tasks and stop requests to the worker thread
				            std::condition_variable m_cond;
				        };

				    public:
				        /** Create a callback thread where the worker thread sets its affinity to a NUMA domain.
				         *
				         * The worker thread is not started by the constructor.
				         *
				         * @param numaIdx passed to onHost::internal::hwloc::setThreadAffinity() by the worker thread, except it is
				         * equal to onHost::internal::hwloc::allNumaDomains
				         */
				        CallbackThread(uint32_t numaIdx) : m_state(std::make_shared<State>()), m_numaIdx{numaIdx}
				        {
				        }

				        /** Create a callback thread where the worker thread does not set its affinity.
				         *
				         * The worker thread is not started by the constructor.
				         */
				        CallbackThread() : m_state(std::make_shared<State>())
				        {
				        }

				        /** Request the worker thread to stop and wait until it is finished.
				         *
				         * The worker thread executes all tasks which are still queued before it leaves its loop.
				         * If the destructor is executed by the worker thread itself the thread is detached instead of joined.
				         */
				        ~CallbackThread()
				        {
				            {
				                // The stop is requested while the lock is held, therefore the worker thread is either in front of
				                // evaluating the wait condition or is waiting and receives the notification.
				                std::unique_lock<std::mutex> lock{m_state->m_mutex};
				                m_thread.request_stop();
				                // wakeup the thread in case it is waiting
				                m_state->m_cond.notify_one();
				            }

				            if(m_thread.joinable())
				            {
				                if(std::this_thread::get_id() == m_thread.get_id())
				                {
				                    /* We can not join ourselves.
				                     * We can only end here if a task that the callback thread is executing is capturing the object
				                     * which is holding the callback thread.
				                     */
				                    m_thread.detach();
				                }
				                else
				                    m_thread.join();
				            }
				        }

				        /** Enqueue a task for asynchronous execution.
				         *
				         * The first call starts the worker thread.
				         *
				         * @param nf callable without arguments returning void, a decayed copy is stored within the queue
				         * @return future which becomes ready after the task was executed; an exception thrown by the task is
				         * stored in the future
				         */
				        //! It is guaranteed that the task is fully destroyed before the future's result is set.
				        template<typename NullaryFunction>
				        auto submit(NullaryFunction&& nf) -> std::future<void>
				        {
				            using DecayedFunction = std::decay_t<NullaryFunction>;
				            static_assert(
				                std::is_void_v<std::invoke_result_t<DecayedFunction>>,
				                "Submitted function must not have any arguments and return void.");

				            // FunctionHolder stores a copy of the user's task, but may be constructed from an expiring value to avoid
				            // the copy. We do NOT store a reference to the users task, which could dangle if the user isn't careful.
				            auto tp = std::pair(
				                std::make_unique<FunctionHolder<DecayedFunction>>(std::forward<NullaryFunction>(nf)),
				                std::promise<void>{});
				            // the future must be queried before the promise is moved into the queue
				            auto f = tp.second.get_future();
				            {
				                std::unique_lock<std::mutex> lock{m_state->m_mutex};
				                m_state->m_tasks.emplace(std::move(tp));
				                // the worker thread is created lazily with the first task
				                if(!m_thread.joinable())
				                    startWorkerThread();
				                m_state->m_cond.notify_one();
				            }

				            return f;
				        }

				        /** Check if the task queue is empty.
				         *
				         * A task is removed from the queue after it was executed, therefore the queue is not empty as long as a
				         * task is running.
				         *
				         * @return true if the queue does not contain any task, else false
				         */
				        bool isEmpty() const
				        {
				            std::unique_lock<std::mutex> lock{m_state->m_mutex};
				            return m_state->m_tasks.empty();
				        }

				    private:
				        //! worker thread, not joinable until the first task is submitted
				        std::jthread m_thread;
				        /** Hold data shared between this class and the thread processing the tasks. */
				        std::shared_ptr<State> m_state;
				        //! NUMA index used for the affinity of the worker thread, allNumaDomains disables setting the affinity
				        uint32_t m_numaIdx = onHost::internal::hwloc::allNumaDomains;

				        /** Start the worker thread executing the queued tasks.
				         *
				         * The thread function captures a copy of the shared state and of the NUMA index but not this object.
				         */
				        auto startWorkerThread() -> void
				        {
				            m_thread = std::jthread(
				                [state = m_state, numaIdx = m_numaIdx](std::stop_token st)
				                {
				                    if(numaIdx != onHost::internal::hwloc::allNumaDomains)
				                        onHost::internal::hwloc::setThreadAffinity(numaIdx);

				                    while(true)
				                    {
				                        std::promise<void> taskPromise;
				                        std::exception_ptr eptr;
				                        {
				                            // Task is destroyed before promise is updated but after the queue state is up to date.
				                            std::unique_ptr<Task> task = nullptr;
				                            {
				                                std::unique_lock<std::mutex> lock{state->m_mutex};
				                                state->m_cond.wait(
				                                    lock,
				                                    [&state, &st] { return st.stop_requested() || !state->m_tasks.empty(); });

				                                // a stop request is only followed after all queued tasks are executed
				                                if(st.stop_requested() && state->m_tasks.empty())
				                                    break;

				                                // The moved-from entry stays in the queue until the task is finished.
				                                task = std::move(state->m_tasks.front().first);
				                                taskPromise = std::move(state->m_tasks.front().second);
				                            }
				                            assert(task);
				                            // the task is executed without holding the lock, new tasks can be submitted meanwhile
				                            try
				                            {
				                                task->run();
				                            }
				                            catch(...)
				                            {
				                                // keep the exception, it is forwarded to the future after the task is destroyed
				                                eptr = std::current_exception();
				                            }
				                            {
				                                std::unique_lock<std::mutex> lock{state->m_mutex};
				                                // Pop empty data from the queue, task and promise will be destroyed later in a
				                                // well-defined order.
				                                state->m_tasks.pop();
				                            }
				                            // Task will be destroyed here, the queue status is already updated.
				                        }
				                        // In case the executed task is the last task in the queue the waiting threads will see the
				                        // queue as empty.
				                        if(eptr)
				                            taskPromise.set_exception(std::move(eptr));
				                        else
				                            taskPromise.set_value();
				                    }
				                });
				        }
				    };
				} // namespace alpaka::core
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/CallbackThread.hpp ==
				// ============================================================================

				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/alignedAlloc.hpp ==
				// ==
				/* Copyright 2022 René Widera, Bernhard Manfred Gruber
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

				// #include <cstddef>    // amalgamate: file already included
				// #include <new>    // amalgamate: file already included

				namespace alpaka::core
				{
				    /** Allocate memory with a given alignment.
				     *
				     * @param alignment alignment in bytes, passed as std::align_val_t to the aligned operator new
				     * @param size number of bytes to allocate
				     * @return pointer to the allocated memory, nullptr if size is zero
				     */
				    ALPAKA_FN_INLINE ALPAKA_FN_HOST auto alignedAlloc(size_t alignment, size_t size) -> void*
				    {
				        // an empty allocation is represented by a null pointer, the allocator is not called
				        if(size == 0u)
				            return nullptr;

				        return ::operator new(size, std::align_val_t{alignment});
				    }

				    /** Free memory with a given alignment.
				     *
				     * @param alignment alignment in bytes, passed as std::align_val_t to the aligned operator delete
				     * @param ptr pointer to the memory to free, a null pointer is ignored
				     */
				    ALPAKA_FN_INLINE ALPAKA_FN_HOST void alignedFree(size_t alignment, auto ptr)
				        requires(std::is_pointer_v<ALPAKA_TYPEOF(ptr)>)
				    {
				        // nothing to release for a null pointer
				        if(ptr == nullptr)
				            return;

				        ::operator delete(toVoidPtr(ptr), std::align_val_t{alignment});
				    }
				} // namespace alpaka::core
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/alignedAlloc.hpp ==
				// ============================================================================

			// #include "alpaka/interface.hpp"    // amalgamate: file already inlined
			// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
			// #include "alpaka/meta/NdLoop.hpp"    // amalgamate: file already inlined
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/globalMem.hpp ==
				// ==
				/* Copyright 2025 René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
				// #include "alpaka/concepts/types.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/PP.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
				// #include "alpaka/mem/MdSpan.hpp"    // amalgamate: file already inlined
				// #include "alpaka/mem/MdSpanArray.hpp"    // amalgamate: file already inlined

				/** @file global device memory implementation for all APIs
				 *
				 * We need many preprocessor macros to handle the device global feature.
				 * The reason is that we would like to have the possibility to create a variable where the same name can be used on all
				 * devices. Each device will have its own instance of memory and via a global instance GlobalDeviceMemoryWrapper we
				 * redirect queries to the corresponding instance of memory based on the alpaka API.
				 *
				 * OneAPI Sycl is the only API which does not allow querying the device pointer of a global variable from the host.
				 * That's why we have special implementations of onHost::memcpy() which use the device global object directly.
				 * That's also the reason why we can not use the global memory for onHost::fill() or onHost::memset().
				 */
				namespace alpaka::onHost::internal
				{
				    //! Forward declaration only, the definition is not part of this file section.
				    struct MemcpyDeviceGlobal;
				} // namespace alpaka::onHost::internal

				/** Create a device global variable for the API host
				 *
				 * Defines the variable `name` of the type alpaka::onAcc::internal::GlobalDeviceMemoryDataWrapper within the
				 * namespace alpaka_onHost.
				 *
				 * @param attributes tokens placed in front of the type of the variable
				 * @param dataType type of the wrapped data, passed through ALPAKA_PP_REMOVE_BRACKETS
				 * @param name name of the variable
				 * @param ... optional initial values, if provided they are passed as braced initializer list to the variable
				 */
				#define ALPAKA_DEVICE_GLOBAL_DATA_HOST(attributes, dataType, name, ...)                                               \
				    namespace alpaka_onHost                                                                                           \
				    {                                                                                                                 \
				        [[maybe_unused]] attributes alpaka::onAcc::internal::GlobalDeviceMemoryDataWrapper<                           \
				            ALPAKA_PP_REMOVE_BRACKETS(dataType)> name __VA_OPT__({__VA_ARGS__});                                      \
				    }

				/** Create a forward declaration of a device global variable for the API host
				 *
				 * Declares the variable `name` as extern within the namespace alpaka_onHost, no initial values can be provided.
				 *
				 * @param attributes tokens placed between `extern` and the type of the variable
				 * @param dataType type of the wrapped data, passed through ALPAKA_PP_REMOVE_BRACKETS
				 * @param name name of the variable
				 */
				#define ALPAKA_DEVICE_GLOBAL_DATA_HOST_EXTERN(attributes, dataType, name)                                             \
				    namespace alpaka_onHost                                                                                           \
				    {                                                                                                                 \
				        extern attributes alpaka::onAcc::internal::GlobalDeviceMemoryDataWrapper<ALPAKA_PP_REMOVE_BRACKETS(dataType)> \
				            name;                                                                                                     \
				    }

				#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
				/** Create a device global variable for the API cuda/hip
				 *
				 * Defines the `__device__` variable `name` of the type alpaka::onAcc::internal::GlobalDeviceMemoryDataWrapper
				 * within the namespace alpaka_onAccCudaHip.
				 *
				 * @param attributes tokens placed between `__device__` and the type of the variable
				 * @param dataType type of the wrapped data, passed through ALPAKA_PP_REMOVE_BRACKETS
				 * @param name name of the variable
				 * @param ... optional initial values, if provided they are passed as braced initializer list to the variable
				 */
				#    define ALPAKA_DEVICE_GLOBAL_DATA_CUDA_HIP(attributes, dataType, name, ...)                                       \
				        namespace alpaka_onAccCudaHip                                                                                 \
				        {                                                                                                             \
				            __device__ attributes                                                                                     \
				                alpaka::onAcc::internal::GlobalDeviceMemoryDataWrapper<ALPAKA_PP_REMOVE_BRACKETS(dataType)>           \
				                    name __VA_OPT__({__VA_ARGS__});                                                                   \
				        }

				/** Access operator for usage in AlpakaGlobalStorage for API cuda/hip
				 *
				 * Expands to the const member function templates get() and getHandle(). Both are restricted to the APIs
				 * alpaka::api::Cuda and alpaka::api::Hip and return a reference to the member `value` of the variable
				 * alpaka_onAccCudaHip::name. Only the macro parameter `name` is used by the expansion.
				 */
				#    define ALPAKA_DEVICE_GLOBAL_GET_CUDA_HIP(attributes, dataType, name, ...)                                        \
				        template<typename T_Api>                                                                                      \
				        requires(std::is_same_v<alpaka::api::Cuda, T_Api> || std::is_same_v<alpaka::api::Hip, T_Api>)                 \
				        constexpr auto& get(T_Api) const                                                                              \
				        {                                                                                                             \
				            return alpaka_onAccCudaHip::name.value;                                                                   \
				        }                                                                                                             \
				        template<typename T_Api>                                                                                      \
				        requires(std::is_same_v<alpaka::api::Cuda, T_Api> || std::is_same_v<alpaka::api::Hip, T_Api>)                 \
				        constexpr auto& getHandle(T_Api) const                                                                        \
				        {                                                                                                             \
				            return alpaka_onAccCudaHip::name.value;                                                                   \
				        }

				#else
				/* Neither CUDA nor HIP is enabled: both macros expand to nothing. */
				#    define ALPAKA_DEVICE_GLOBAL_DATA_CUDA_HIP(attributes, dataType, name, ...)
				#    define ALPAKA_DEVICE_GLOBAL_GET_CUDA_HIP(attributes, dataType, name, ...)
				#endif

				#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
				/* Define the external device symbol only if relocatable device code (rdc) is enabled. If rdc is disabled, nvcc
				 * changes the keyword 'extern' to 'static', which results in redefinition compile errors. To make HIP and CUDA
				 * behave equally, the symbol is not exposed for HIP either if rdc is disabled.
				 */
				#    if defined(__CUDACC_RDC__) || defined(__CLANG_RDC__)
				/** Create a forward declaration of a device global variable for the API cuda/hip
				 *
				 * Expands to an `extern __device__` declaration of `name` inside the namespace alpaka_onAccCudaHip. The
				 * declared type is alpaka::onAcc::internal::GlobalDeviceMemoryDataWrapper of `dataType`.
				 */
				#        define ALPAKA_DEVICE_GLOBAL_DATA_CUDA_HIP_EXTERN(attributes, dataType, name)                                 \
				            namespace alpaka_onAccCudaHip                                                                             \
				            {                                                                                                         \
				                extern __device__ attributes                                                                          \
				                    alpaka::onAcc::internal::GlobalDeviceMemoryDataWrapper<ALPAKA_PP_REMOVE_BRACKETS(dataType)>       \
				                        name;                                                                                         \
				            }
				#    else
				/* Neither __CUDACC_RDC__ nor __CLANG_RDC__ is defined: the forward declaration expands to nothing. */
				#        define ALPAKA_DEVICE_GLOBAL_DATA_CUDA_HIP_EXTERN(attributes, dataType, name)
				#    endif
				#else
				/* Neither ALPAKA_LANG_CUDA nor ALPAKA_LANG_HIP is set: the forward declaration expands to nothing. */
				#    define ALPAKA_DEVICE_GLOBAL_DATA_CUDA_HIP_EXTERN(attributes, dataType, name)
				#endif

				#if ALPAKA_LANG_ONEAPI
				/** Create a device global variable for the API oneApi
				 *
				 * Expands to the definition of `name` inside the namespace alpaka_onAccOneAPI. The variable is a
				 * sycl::ext::oneapi::experimental::device_global holding an
				 * alpaka::onAcc::internal::GlobalDeviceMemoryDataWrapper of `dataType`.
				 * If variadic arguments are passed, they are used to brace-initialize the wrapper; without variadic arguments
				 * no initializer is emitted.
				 */
				#    define ALPAKA_DEVICE_GLOBAL_DATA_ONEAPI(attributes, dataType, name, ...)                                         \
				        namespace alpaka_onAccOneAPI                                                                                  \
				        {                                                                                                             \
				            [[maybe_unused]] attributes sycl::ext::oneapi::experimental::device_global<                               \
				                alpaka::onAcc::internal::GlobalDeviceMemoryDataWrapper<ALPAKA_PP_REMOVE_BRACKETS(dataType)>> name     \
				                __VA_OPT__(                                                                                           \
				                    {alpaka::onAcc::internal::GlobalDeviceMemoryDataWrapper<ALPAKA_PP_REMOVE_BRACKETS(dataType)>{     \
				                        __VA_ARGS__}});                                                                               \
				        }

				/** Create a forward declaration of a device global variable for the API oneApi
				 *
				 * Expands to an `extern` declaration of `name` inside the namespace alpaka_onAccOneAPI with the same type that
				 * ALPAKA_DEVICE_GLOBAL_DATA_ONEAPI uses for the definition.
				 */
				#    define ALPAKA_DEVICE_GLOBAL_DATA_ONEAPI_EXTERN(attributes, dataType, name)                                       \
				        namespace alpaka_onAccOneAPI                                                                                  \
				        {                                                                                                             \
				            extern attributes sycl::ext::oneapi::experimental::device_global<                                         \
				                alpaka::onAcc::internal::GlobalDeviceMemoryDataWrapper<ALPAKA_PP_REMOVE_BRACKETS(dataType)>>          \
				                name;                                                                                                 \
				        }

				/** Access operators for usage in AlpakaGlobalStorage for API oneApi
				 *
				 * Expands to two member function templates which are only enabled if the type of the passed API tag is
				 * alpaka::api::OneApi:
				 *   - `get` returns a reference to the `value` member of the object returned by `name.get()`
				 *   - `getHandle` returns a reference to the device_global variable `alpaka_onAccOneAPI::name` itself
				 *
				 * Only `name` is used by the expansion; `attributes`, `dataType` and the variadic arguments are accepted but
				 * not used here.
				 */
				#    define ALPAKA_DEVICE_GLOBAL_GET_ONEAPI(attributes, dataType, name, ...)                                          \
				        template<typename T_Api>                                                                                      \
				        requires(std::is_same_v<alpaka::api::OneApi, T_Api>)                                                          \
				        constexpr auto& get(T_Api) const                                                                              \
				        {                                                                                                             \
				            return alpaka_onAccOneAPI::name.get().value;                                                              \
				        }                                                                                                             \
				        template<typename T_Api>                                                                                      \
				        requires(std::is_same_v<alpaka::api::OneApi, T_Api>)                                                          \
				        constexpr auto& getHandle(T_Api) const                                                                        \
				        {                                                                                                             \
				            return alpaka_onAccOneAPI::name;                                                                          \
				        }
				#else
				/* ALPAKA_LANG_ONEAPI is not set: all oneApi device global macros expand to nothing. */
				#    define ALPAKA_DEVICE_GLOBAL_DATA_ONEAPI(attributes, dataType, name, ...)
				#    define ALPAKA_DEVICE_GLOBAL_DATA_ONEAPI_EXTERN(attributes, dataType, name)
				#    define ALPAKA_DEVICE_GLOBAL_GET_ONEAPI(attributes, dataType, name, ...)
				#endif

				namespace alpaka::onAcc::internal
				{
				    /** Wrapper holding the payload of a device global memory variable.
				     *
				     * SYCL oneAPI stores device globals in a special type which does not support C array initialization,
				     * therefore the payload is wrapped into this class. Every argument passed to the constructor is forwarded
				     * to the initializer of the wrapped data member.
				     */
				    template<typename T>
				    struct GlobalDeviceMemoryDataWrapper
				    {
				        /** the wrapped payload */
				        T value;

				        /** Brace-initialize the payload with all given arguments */
				        constexpr GlobalDeviceMemoryDataWrapper(auto const&... initArgs) : value{ALPAKA_FORWARD(initArgs)...}
				        {
				        }

				        /** @return read only pointer to the wrapped payload */
				        T const* data() const
				        {
				            return &value;
				        }

				        /** @return pointer to the wrapped payload */
				        T* data()
				        {
				            return &value;
				        }
				    };

				    /** GlobalDeviceMemoryDataWrapper for C static arrays.
				     *
				     * A C static array cannot be initialized through a constructor, therefore this specialization declares no
				     * constructor and the data member is initialized directly.
				     */
				    template<alpaka::concepts::CStaticArray T>
				    struct GlobalDeviceMemoryDataWrapper<T>
				    {
				        /** element type of the array after all extents are removed */
				        using value_type = std::remove_all_extents_t<T>;

				        /** the wrapped array */
				        T value;

				        /** @return read only pointer to the array storage, typed as pointer to the element type */
				        value_type const* data() const
				        {
				            return reinterpret_cast<value_type const*>(&value);
				        }

				        /** @return pointer to the array storage, typed as pointer to the element type */
				        value_type* data()
				        {
				            return reinterpret_cast<value_type*>(&value);
				        }
				    };

				    /** Helper class to provide access to device global memory variables
				     *
				     * All accessors forward to the `get`/`getHandle` member functions of the privately inherited T_Storage.
				     *
				     * @tparam T_Storage type providing `get(api)` and `getHandle(api)`
				     * @tparam T_Type type exposed as `type` and used as the target of the conversion operators
				     */
				    template<typename T_Storage, typename T_Type>
				    struct GlobalDeviceMemoryWrapper : private T_Storage
				    {
				    private:
				        // MemcpyDeviceGlobal is allowed to call the private getHandle()
				        friend struct onHost::internal::MemcpyDeviceGlobal;

				        /** Get the handle to call native API specific memcopy for global device memory operation
				         *
				         * @attention This method is for internal usage only.
				         *
				         * @param api API tag forwarded to T_Storage::getHandle
				         * @return type depends on the native API e.g Cuda, OneApi, ...
				         */
				        template<alpaka::concepts::Api T_Api>
				        constexpr decltype(auto) getHandle(T_Api api) const
				        {
				            return T_Storage::getHandle(api);
				        }

				    public:
				        using type = T_Type;

				        /** Access the stored data
				         *
				         * @return result of T_Storage::get called with the result of thisApi()
				         */
				        constexpr decltype(auto) get() const
				        {
				            return T_Storage::get(thisApi());
				        }

				        /** Access the stored data if `type` is an array
				         *
				         * This overload carries an additional constraint and is therefore preferred over the unconstrained get()
				         * for array types.
				         *
				         * @return alpaka::MdSpanArray constructed from the result of T_Storage::get
				         */
				        constexpr decltype(auto) get() const requires(std::is_array_v<type>)
				        {
				            // a static array of type C also uses size_t to define its length
				            return alpaka::MdSpanArray<type, size_t>{T_Storage::get(thisApi())};
				        }

				        /** Implicit conversion to a reference of the stored data */
				        constexpr operator type&()
				        {
				            return T_Storage::get(thisApi());
				        }

				        /** Implicit conversion to a read only reference of the stored data */
				        constexpr operator type const&() const
				        {
				            return T_Storage::get(thisApi());
				        }
				    };
				} // namespace alpaka::onAcc::internal
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/globalMem.hpp ==
				// ============================================================================

			// #include "alpaka/onHost/FrameSpec.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onHost/Handle.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onHost/internal/interface.hpp"    // amalgamate: file already inlined
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/mem/SharedBuffer.hpp ==
				// ==
				/* Copyright 2024 René Widera, Bernhard Manfred Gruber
				 * SPDX-License-Identifier: MPL-2.0
				 */


				// #pragma once
				// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
				// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/View.hpp ==
					// ==
					/* Copyright 2024 Bernhard Manfred Gruber, René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
					// #include "alpaka/api/concepts/api.hpp"    // amalgamate: file already inlined
					// #include "alpaka/interface.hpp"    // amalgamate: file already inlined
					// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
					// #include "alpaka/mem/BoundaryIter.hpp"    // amalgamate: file already inlined
					// #include "alpaka/mem/MdSpan.hpp"    // amalgamate: file already inlined
					// #include "alpaka/mem/concepts/IMdSpan.hpp"    // amalgamate: file already inlined
					// #include "alpaka/mem/concepts/detail/InnerTypeAllowedCast.hpp"    // amalgamate: file already inlined
					// #include "alpaka/mem/trait.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined

					// #include <cstdint>    // amalgamate: file already included
					// #include <functional>    // amalgamate: file already included

					namespace alpaka
					{
					    /** @brief Non owning view to data
					     *
					     * This view is only holding a pointer to real data, copying the view is cheap.
					     * Const-ness of the view instance is propagated to the data region.
					     *
					     * This satisfies the alpaka::concepts::IView concept and, therefore, also the alpaka::concepts::IMdSpan concept.
					     *
					     * This is the forward declaration which carries the default template argument for the alignment. The
					     * definition follows after the makeView() helper functions.
					     */
					    template<
					        alpaka::concepts::Api T_Api,
					        typename T_Type,
					        alpaka::concepts::Vector T_Extents,
					        alpaka::concepts::Alignment T_MemAlignment = Alignment<>>
					    struct View;

					    /** Create a view from a pointer and extents.
					     *
					     * The pitches of the view are computed from the extents with alpaka::calculatePitchesFromExtents.
					     *
					     * @param anyWithApi object from which the API is derived via getApi()
					     * @param pointer pointer to the first element
					     * @param extents number of elements for each dimension
					     * @param memAlignment alignment description forwarded to the view
					     * @return View referring to the given memory
					     */
					    template<typename T_ValueType, concepts::Alignment T_MemAlignment = Alignment<>>
					    inline constexpr auto makeView(
					        auto&& anyWithApi,
					        T_ValueType* pointer,
					        concepts::Vector auto const& extents,
					        T_MemAlignment const memAlignment = T_MemAlignment{})
					    {
					        auto const derivedPitches = alpaka::calculatePitchesFromExtents<T_ValueType>(extents);
					        return View{getApi(ALPAKA_FORWARD(anyWithApi)), pointer, extents, derivedPitches, memAlignment};
					    }

					    template<typename T_ValueType, concepts::Alignment T_MemAlignment = Alignment<>>
					    inline constexpr auto makeView(
					        auto&& anyWithApi,
					        T_ValueType* pointer,
					        concepts::Vector auto const& extents,
					        concepts::Vector auto const& pitches,
					        T_MemAlignment const memAlignment = T_MemAlignment{})
					    {
					        static_assert(std::is_same_v<ALPAKA_TYPEOF(extents), ALPAKA_TYPEOF(pitches)>);
					        return View{getApi(ALPAKA_FORWARD(anyWithApi)), pointer, extents, pitches, memAlignment};
					    }

					    /** Create a view from any object providing API, data pointer, extents, pitches and alignment.
					     *
					     * The properties are queried in the order API, data, extents, pitches, alignment.
					     *
					     * @param any object queried via internal::getApi, onHost::data, onHost::getExtents, onHost::getPitches and
					     *            alpaka::getAlignment
					     * @return View built from the queried properties
					     */
					    inline constexpr auto makeView(auto&& any)
					    {
					        auto const api = internal::getApi(ALPAKA_FORWARD(any));
					        auto const dataPtr = onHost::data(ALPAKA_FORWARD(any));
					        auto const extentsMd = onHost::getExtents(ALPAKA_FORWARD(any));
					        auto const pitchesMd = onHost::getPitches(ALPAKA_FORWARD(any));
					        auto const alignment = alpaka::getAlignment(ALPAKA_FORWARD(any));
					        return View{api, dataPtr, extentsMd, pitchesMd, alignment};
					    }

					    /** Definition of the non owning view.
					     *
					     * The view derives from MdSpan and uses `T_Extents::UniVec` for both vector template arguments of the base.
					     */
					    template<
					        alpaka::concepts::Api T_Api,
					        typename T_Type,
					        alpaka::concepts::Vector T_Extents,
					        alpaka::concepts::Alignment T_MemAlignment>
					    struct View : MdSpan<T_Type, typename T_Extents::UniVec, typename T_Extents::UniVec, T_MemAlignment>
					    {
					    private:
					        using BaseMdSpan = MdSpan<T_Type, typename T_Extents::UniVec, typename T_Extents::UniVec, T_MemAlignment>;

					    public:
					        /** Creates a view
					         *
					         * @param any object providing the API; the value itself is not used inside the constructor
					         * @param data handle to the physical data
					         * @param extents n-dimensional extents in elements of the view. Must satisfy `n <= number_of_elements` in the
					         * data handle.
					         * @param pitches n-dimensional pitches of the view; the value type is checked at compile time against the
					         * value type of the extents with isLosslesslyConvertible_v
					         * @param memAlignment alignment description forwarded to the base MdSpan
					         */
					        template<
					            alpaka::concepts::HasApi T_Any,
					            alpaka::concepts::Vector T_UserExtents,
					            alpaka::concepts::Vector T_UserPitches>
					        constexpr View(
					            T_Any const& any,
					            T_Type* data,
					            T_UserExtents const& extents,
					            T_UserPitches const& pitches,
					            T_MemAlignment const memAlignment = T_MemAlignment{})
					            : BaseMdSpan{
					                  data,
					                  typename T_UserExtents::UniVec{extents},
					                  typename T_UserPitches::UniVec{pitches},
					                  memAlignment}
					        {
					            alpaka::unused(any);
					            static_assert(
					                isLosslesslyConvertible_v<typename T_UserPitches::type, typename T_UserExtents::type>,
					                "extent type and pitch type must be lossless convertible");
					        }

					        /** Converting copy constructor from a view with a different inner type
					         *
					         * Only available if InnerTypeAllowedCast<T_Type, T_Type_Other> is fulfilled.
					         */
					        template<typename T_Type_Other>
					        requires alpaka::internal::concepts::InnerTypeAllowedCast<T_Type, T_Type_Other>
					        constexpr View(View<T_Api, T_Type_Other, T_Extents, T_MemAlignment> const& other)
					            : BaseMdSpan{static_cast<BaseMdSpan>(other)}
					        {
					        }

					        constexpr View(View const&) = default;

					        /** Converting move constructor from a view with a different inner type
					         *
					         * Only available if InnerTypeAllowedCast<T_Type, T_Type_Other> is fulfilled.
					         */
					        // NOTE(review): static_cast<BaseMdSpan>(other) already yields a temporary, the std::move around it looks
					        // redundant - confirm before changing.
					        template<typename T_Type_Other>
					        requires alpaka::internal::concepts::InnerTypeAllowedCast<T_Type, T_Type_Other>
					        constexpr View(View<T_Api, T_Type_Other, T_Extents, T_MemAlignment>&& other)
					            : BaseMdSpan{std::move(static_cast<BaseMdSpan>(other))}
					        {
					        }

					        constexpr View(View&&) = default;

					        /** Assignment operator keeping const-ness
					         *
					         * @attention the assign operator is not removing inner const-ness because the type signature is not changed.
					         */
					        constexpr View& operator=(View const&) = default;

					        constexpr View& operator=(View&&) = default;

					        /** @return default constructed API tag of this view, evaluated at compile time */
					        static consteval T_Api getApi()
					        {
					            return T_Api{};
					        }

					        /** @return result of BaseMdSpan::getConstMdSpan() */
					        constexpr alpaka::concepts::IMdSpan auto getMdSpan() const
					        {
					            return BaseMdSpan::getConstMdSpan();
					        }

					        /** @return copy of the MdSpan base of this view */
					        constexpr alpaka::concepts::IMdSpan auto getMdSpan()
					        {
					            return BaseMdSpan{*this};
					        }

					        /** create a read only view
					         *
					         * The returned view uses the const qualified value type together with the data pointer, extents and
					         * pitches of this view.
					         */
					        constexpr auto getConstView() const
					        {
					            using ConstValueType = std::add_const_t<typename BaseMdSpan::value_type>;
					            return View<T_Api, ConstValueType, T_Extents, T_MemAlignment>{
					                T_Api{},
					                static_cast<ConstValueType*>(this->data()),
					                this->getExtents(),
					                this->getPitches(),
					                T_MemAlignment{}};
					        }

					        /** @brief Creates a sub view to a part of the memory.
					         *
					         * The sub view has the same dimension as the original.
					         * The data pointer, the pitches and the alignment type of the original view are kept.
					         *
					         * @param extents Number of elements for each dimension. Each number must be less than or equal to
					         * the number of elements in the original dimension (checked with assert only).
					         * @return View which is pointing only to a part of the original view.
					         *
					         * @{
					         */
					        constexpr auto getSubView(alpaka::concepts::VectorOrScalar auto const& extents) const
					        {
					            static_assert(alpaka::trait::getDim_v<ALPAKA_TYPEOF(extents)> == T_Extents::dim());
					            Vec extentMd = extents;
					            assert((extentMd <= this->getExtents()).reduce(std::logical_and{}));
					            return makeView(T_Api{}, this->data(), extentMd, this->getPitches(), T_MemAlignment{});
					        }

					        constexpr auto getSubView(alpaka::concepts::VectorOrScalar auto const& extents)
					        {
					            static_assert(alpaka::trait::getDim_v<ALPAKA_TYPEOF(extents)> == T_Extents::dim());
					            Vec extentMd = extents;
					            assert((extentMd <= this->getExtents()).reduce(std::logical_and{}));
					            return makeView(T_Api{}, this->data(), extentMd, this->getPitches(), T_MemAlignment{});
					        }

					        /** @} */

					        /** @brief Creates a sub view to a part of the memory.
					         *
					         * The sub view has the same dimension as the original. The offset defines the first coordinate of
					         * each dimension. The `offset + extents - 1` defines the last element for each dimension in the
					         * original view. Offset plus extents should not exceed the extents of the original view
					         * (checked with assert only).
					         *
					         * @param offset offset in elements to the original view
					         * @param extents number of elements for each dimension
					         * @return View which is pointing only to a part of the original view with a shifted origin pointer.
					         *         The alignment of the sub view is reduced to the element alignment.
					         *
					         * @{
					         */
					        constexpr auto getSubView(
					            alpaka::concepts::VectorOrScalar auto const& offset,
					            alpaka::concepts::VectorOrScalar auto const& extents) const
					        {
					            static_assert(alpaka::trait::getDim_v<ALPAKA_TYPEOF(extents)> == T_Extents::dim());
					            static_assert(alpaka::trait::getDim_v<ALPAKA_TYPEOF(offset)> == T_Extents::dim());

					            Vec offsetMd = offset;
					            Vec extentMd = extents;
					            assert((offsetMd + extentMd <= this->getExtents()).reduce(std::logical_and{}));
					            // address of the element at the offset becomes the origin of the sub view
					            auto shiftedPtr = &(*this)[offsetMd];
					            return makeView(T_Api{}, shiftedPtr, extentMd, this->getPitches(), Alignment<>{});
					        }

					        constexpr auto getSubView(
					            alpaka::concepts::VectorOrScalar auto const& offset,
					            alpaka::concepts::VectorOrScalar auto const& extents)
					        {
					            static_assert(alpaka::trait::getDim_v<ALPAKA_TYPEOF(extents)> == T_Extents::dim());
					            static_assert(alpaka::trait::getDim_v<ALPAKA_TYPEOF(offset)> == T_Extents::dim());

					            Vec offsetMd = offset;
					            Vec extentMd = extents;
					            assert((offsetMd + extentMd <= this->getExtents()).reduce(std::logical_and{}));
					            // address of the element at the offset becomes the origin of the sub view
					            auto shiftedPtr = &(*this)[offsetMd];
					            return makeView(T_Api{}, shiftedPtr, extentMd, this->getPitches(), Alignment<>{});
					        }

					        /** @} */

					        /** @brief Creates a sub view selected by a boundary direction.
					         *
					         * For each dimension the entry of `boundaryDir.data` selects the range:
					         *   - LOWER: starts at 0 and spans the lower halo size
					         *   - UPPER: spans the upper halo size and ends at the extent of the view
					         *   - MIDDLE: starts after the lower halo and ends before the upper halo
					         *
					         * Offset and extents are computed as uint32_t vectors and passed to getSubView(offset, extents).
					         *
					         * @param boundaryDir direction and halo sizes for each dimension
					         * @return result of getSubView(offset, extents) for the selected range
					         * @throws std::invalid_argument if a direction entry is none of LOWER, UPPER or MIDDLE
					         */
					        template<alpaka::concepts::Vector LowHaloVecType, alpaka::concepts::Vector UpHaloVecType>
					        constexpr auto getSubView(
					            alpaka::BoundaryDirection<View::dim(), LowHaloVecType, UpHaloVecType> boundaryDir) const
					        {
					            constexpr uint32_t dim = View::dim();
					            auto offset = alpaka::Vec<uint32_t, dim>{};
					            auto extents = alpaka::Vec<uint32_t, dim>{};

					            for(uint32_t i = 0; i < dim; ++i)
					            {
					                switch(boundaryDir.data[i])
					                {
					                case BoundaryType::LOWER:
					                    offset[i] = 0;
					                    extents[i] = boundaryDir.lowerHaloSize[i];
					                    break;
					                case BoundaryType::UPPER:
					                    offset[i] = this->getExtents()[i] - boundaryDir.upperHaloSize[i];
					                    extents[i] = boundaryDir.upperHaloSize[i];
					                    break;
					                case BoundaryType::MIDDLE:
					                    offset[i] = boundaryDir.lowerHaloSize[i];
					                    extents[i] = this->getExtents()[i] - boundaryDir.lowerHaloSize[i] - boundaryDir.upperHaloSize[i];
					                    break;
					                default:
					                    throw std::invalid_argument("invalid direction");
					                }
					            }
					            return getSubView(offset, extents);
					        }
					    };

					    /** Write a textual description of a view into a stream.
					     *
					     * The description contains dimension, API name, extents, pitches and alignment of the view.
					     *
					     * @param s stream to write to
					     * @param view view to describe
					     * @return the stream passed as `s`
					     */
					    template<typename T_Api, typename T_Type, concepts::Vector T_Extents, concepts::Alignment T_MemAlignment>
					    std::ostream& operator<<(std::ostream& s, View<T_Api, T_Type, T_Extents, T_MemAlignment> const& view)
					    {
					        s << "View{ dim=" << ALPAKA_TYPEOF(view)::dim();
					        s << ", api= " << onHost::getName(T_Api{});
					        s << ", extents=" << view.getExtents().toString();
					        s << ", pitches=" << view.getPitches().toString();
					        s << " , alignment=" << T_MemAlignment::template get<T_Type>();
					        s << " }";
					        return s;
					    }

					    /** Deduction guide for the construction of a View with an explicit memory alignment.
					     *
					     * The API is derived from the result type of getApi() called with T_Any. The vector type of the view is
					     * `T_UserPitches::UniVec`.
					     */
					    // NOTE(review): the vector type is taken from the pitches type and not from the extents type - presumably
					    // intentional, confirm.
					    template<
					        alpaka::concepts::HasApi T_Any,
					        typename T_Type,
					        alpaka::concepts::Vector T_UserExtents,
					        alpaka::concepts::Vector T_UserPitches,
					        alpaka::concepts::Alignment T_MemAlignment>
					    ALPAKA_FN_HOST_ACC View(
					        T_Any const&,
					        T_Type*,
					        T_UserExtents const&,
					        T_UserPitches const&,
					        T_MemAlignment const memAlignment)
					        -> View<ALPAKA_TYPEOF(getApi(std::declval<T_Any>())), T_Type, typename T_UserPitches::UniVec, T_MemAlignment>;

					    /** Deduction guide for the construction of a View without a memory alignment argument.
					     *
					     * Same as the guide with alignment, the alignment of the deduced view is `Alignment<>`.
					     */
					    template<
					        alpaka::concepts::HasApi T_Any,
					        typename T_Type,
					        alpaka::concepts::Vector T_UserExtents,
					        alpaka::concepts::Vector T_UserPitches>
					    ALPAKA_FN_HOST_ACC View(T_Any, T_Type*, T_UserExtents const&, T_UserPitches const&)
					        -> View<ALPAKA_TYPEOF(getApi(std::declval<T_Any>())), T_Type, typename T_UserPitches::UniVec, Alignment<>>;
					} // namespace alpaka

					namespace alpaka::internal
					{
					    /** API trait for alpaka::View.
					     *
					     * The trait is defined outside of the view to support constexpr evaluation. The API is fully described by
					     * the template argument T_Api, therefore the passed view instance is not inspected.
					     */
					    template<
					        alpaka::concepts::Api T_Api,
					        typename T_Type,
					        alpaka::concepts::Vector T_Extents,
					        alpaka::concepts::Alignment T_MemAlignment>
					    struct GetApi::Op<alpaka::View<T_Api, T_Type, T_Extents, T_MemAlignment>>
					    {
					        /** @return default constructed API tag of the view type */
					        constexpr T_Api operator()(auto&& view) const
					        {
					            alpaka::unused(view);
					            return T_Api{};
					        }
					    };

					    /** Marks alpaka::View as copy constructable data source.
					     *
					     * Additionally provides the view types with const and with mutable inner value type.
					     */
					    template<
					        alpaka::concepts::Api T_Api,
					        typename T_Type,
					        alpaka::concepts::Vector T_Extents,
					        alpaka::concepts::Alignment T_MemAlignment>
					    struct CopyConstructableDataSource<View<T_Api, T_Type, T_Extents, T_MemAlignment>> : std::true_type
					    {
					        /** view type with const qualified value type */
					        using InnerConst = View<T_Api, std::add_const_t<T_Type>, T_Extents, T_MemAlignment>;
					        /** view type with the const qualifier removed from the value type */
					        using InnerMutable = View<T_Api, std::remove_const_t<T_Type>, T_Extents, T_MemAlignment>;
					    };
					} // namespace alpaka::internal
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/View.hpp ==
					// ============================================================================

				// #include "alpaka/mem/concepts/detail/InnerTypeAllowedCast.hpp"    // amalgamate: file already inlined
				// #include "alpaka/mem/trait.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/Device.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/Handle.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/concepts.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/mem/ManagedDealloc.hpp ==
					// ==
					/* Copyright 2025 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
					// #include <functional>    // amalgamate: file already included
					// #include <memory>    // amalgamate: file already included

					namespace alpaka::onHost::internal
					{
					    /** Manage the deallocation of memory
					     *
					     * This class is used to manage the deallocation of memory in a shared_ptr.
					     * It takes a function that will be called when the shared_ptr is destroyed.
					     * This is useful for managing memory that needs to be deallocated
					     * when the shared_ptr goes out of scope.
					     *
					     * On destruction all registered actions are executed in the order they were added, afterwards the
					     * deallocation function is called.
					     */
					    struct ManagedDealloc : std::enable_shared_from_this<ManagedDealloc>
					    {
					        /**
					         * Constructor
					         * @param freeOp Function to be called when the shared_ptr is destroyed after all actions are executed.
					         *               All dependencies required to deallocate the memory must be held by freeOp.
					         *               An empty function is allowed and is skipped on destruction.
					         */
					        ManagedDealloc(std::function<void()> freeOp) : freeOp{std::move(freeOp)}
					        {
					        }

					        /** Executes all registered actions and afterwards the deallocation function.
					         *
					         * Empty callables are skipped. Invoking an empty std::function throws std::bad_function_call and an
					         * exception leaving a destructor terminates the program.
					         */
					        ~ManagedDealloc()
					        {
					            // Execute all actions before freeing the memory
					            for(auto const& action : actions)
					            {
					                if(action)
					                {
					                    action();
					                }
					            }
					            if(freeOp)
					            {
					                freeOp();
					            }
					        }

					        /** Add an action to be executed when the shared_ptr is destroyed.
					         *
					         * The registration is protected by a mutex, therefore actions can be added from different threads.
					         *
					         * @param action Callable to execute on destruction. An empty function is allowed and is skipped on
					         *               destruction.
					         */
					        void addAction(std::function<void()> action)
					        {
					            std::lock_guard<std::mutex> lock{actionGuard};
					            actions.emplace_back(std::move(action));
					        }

					        /** Get a shared pointer which shares the ownership of this instance.
					         *
					         * @return result of shared_from_this(); the instance must already be owned by a std::shared_ptr
					         */
					        std::shared_ptr<ManagedDealloc> getSharedPtr()
					        {
					            return this->shared_from_this();
					        }

					    private:
					        /** deallocation function, called last in the destructor */
					        std::function<void()> freeOp;
					        /** protects the registration of actions */
					        std::mutex actionGuard;
					        /** actions executed in the destructor before freeOp */
					        std::vector<std::function<void()>> actions;
					    };
					} // namespace alpaka::onHost::internal
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/mem/ManagedDealloc.hpp ==
					// ============================================================================


				// #include <cstdint>    // amalgamate: file already included
				// #include <functional>    // amalgamate: file already included
				// #include <memory>    // amalgamate: file already included
				// #include <sstream>    // amalgamate: file already included

				namespace alpaka::onHost
				{
				    /** Life time managed buffer with contiguous data
				     *
				     * This buffer owns the data and will deallocate it when last copy is destroyed.
				     * Const-ness of the buffer instance is propagated to the data region.
				     * A copy of this instance will only perform a shallow copy, to perform a deep copy to duplicate the data you
				     * should use @c onHost::memcpy.
				     */
				    template<
				        alpaka::concepts::Api T_Api,
				        typename T_Type,
				        alpaka::concepts::Vector T_Extents,
				        alpaka::concepts::Alignment T_MemAlignment = Alignment<>>
				    struct SharedBuffer : View<T_Api, T_Type, T_Extents, T_MemAlignment>
				    {
				    private:
				        using BaseView = View<T_Api, T_Type, T_Extents, T_MemAlignment>;

				        /** Constructor with existing managed deleter */
				        SharedBuffer(
				            T_Api const api,
				            T_Type* data,
				            T_Extents const& extents,
				            T_Extents const& pitches,
				            std::shared_ptr<internal::ManagedDealloc> managedDeleter,
				            T_MemAlignment const memAlignment)
				            : BaseView{api, data, extents, pitches, memAlignment}
				            , m_deleter{std::move(managedDeleter)}
				        {
				        }

				        // friend declaration is required that any type of SharedBuffer can access the private constructor
				        template<
				            alpaka::concepts::Api T_OtherApi,
				            typename T_OtherType,
				            alpaka::concepts::Vector T_OtherExtents,
				            alpaka::concepts::Alignment T_OtherMemAlignment2>
				        friend struct SharedBuffer;

				        template<
				            alpaka::concepts::Api T_OtherApi,
				            typename T_OtherType,
				            alpaka::concepts::Vector T_OtherExtents,
				            alpaka::concepts::Alignment T_OtherMemAlignment2>
				        friend std::ostream& operator<<(
				            std::ostream& s,
				            SharedBuffer<T_OtherApi, T_OtherType, T_OtherExtents, T_OtherMemAlignment2> const& buffer);

				    public:
				        template<
				            alpaka::concepts::HasApi T_Any,
				            alpaka::concepts::Vector T_UserExtents,
				            alpaka::concepts::Vector T_UserPitches>
				        SharedBuffer(
				            T_Any const& any,
				            T_Type* data,
				            T_UserExtents const& extents,
				            T_UserPitches const& pitches,
				            std::invocable<> auto deleter,
				            T_MemAlignment const memAlignment = Alignment{})
				            : BaseView{any, data, extents, pitches, memAlignment}
				            , m_deleter{std::make_shared<internal::ManagedDealloc>(deleter)}
				        {
				            static_assert(
				                isLosslesslyConvertible_v<typename T_UserPitches::type, typename T_UserExtents::type>,
				                "extent type and pitch type must be lossless convertible");
				        }

				        template<typename T_Type_Other>
				        requires alpaka::internal::concepts::InnerTypeAllowedCast<T_Type, T_Type_Other>
				        SharedBuffer(SharedBuffer<T_Api, T_Type_Other, T_Extents, T_MemAlignment> const& other)
				            : BaseView{static_cast<BaseView>(other)}
				            , m_deleter(other.m_deleter)
				        {
				        }

				        SharedBuffer(SharedBuffer const&) = default;

				        /** Copy assignment operator keeping const-ness
				         *
				         * @attention The assignment operator does not remove the inner const-ness because the type signature is
				         * not changed.
				         */
				        SharedBuffer& operator=(SharedBuffer const& otherSharedBuffer) = default;

				        /** Converting move constructor from a shared buffer with another element type.
				         *
				         * Only available if alpaka::internal::concepts::InnerTypeAllowedCast<T_Type, T_Type_Other> is fulfilled.
				         * The deleter handle is moved out of @p other.
				         *
				         * @param other buffer to move from
				         */
				        template<typename T_Type_Other>
				        requires alpaka::internal::concepts::InnerTypeAllowedCast<T_Type, T_Type_Other>
				        SharedBuffer(SharedBuffer<T_Api, T_Type_Other, T_Extents, T_MemAlignment>&& other)
				            // static_cast to a non-reference type already yields a prvalue, an additional std::move would only
				            // force a superfluous move construction of the view
				            : BaseView{static_cast<BaseView>(other)}
				            , m_deleter(std::move(other.m_deleter))
				        {
				        }

				        /** Move constructor, the deleter handle (m_deleter) is moved out of the source. */
				        SharedBuffer(SharedBuffer&&) = default;

				        /** Move assignment operator, the element type (and therefore the inner const-ness) is not changed. */
				        SharedBuffer& operator=(SharedBuffer&&) = default;

				        /** Get a view to the memory of a const buffer.
				         *
				         * @return result of BaseView::getConstView()
				         */
				        auto getView() const
				        {
				            return BaseView::getConstView();
				        }

				        /** Get a view to the memory of the buffer.
				         *
				         * @return copy of the base view (BaseView) of this buffer; the returned object does not carry the
				         *         deleter handle of the shared buffer
				         */
				        auto getView()
				        {
				            return static_cast<BaseView>(*this);
				        }

				        /** create a read shared buffer view */
				        auto getConstSharedBuffer() const
				        {
				            using ConstValueType = std::add_const_t<typename BaseView::value_type>;
				            return SharedBuffer<T_Api, ConstValueType, T_Extents, T_MemAlignment>(
				                T_Api{},
				                static_cast<ConstValueType*>(this->data()),
				                this->getExtents(),
				                this->getPitches(),
				                m_deleter,
				                T_MemAlignment{});
				        }

				        /** Creates a shared buffer pointing to a part of the memory of a const buffer.
				         *
				         * The origin pointer, the pitches, the alignment and the deleter handle of this buffer are reused, only
				         * the extents differ.
				         *
				         * @param extents number of elements for each dimension, must not exceed the extents of this buffer
				         *                (checked with assert)
				         * @return shared buffer which is pointing only to a part of the original buffer.
				         */
				        auto getSubSharedBuffer(alpaka::concepts::VectorOrScalar auto const& extents) const
				        {
				            Vec subExtents = extents;
				            assert((subExtents <= this->getExtents()).reduce(std::logical_and{}));

				            using ValueType = std::remove_pointer_t<ALPAKA_TYPEOF(this->data())>;
				            using SubBuffer = SharedBuffer<T_Api, ValueType, T_Extents, T_MemAlignment>;
				            return SubBuffer{T_Api{}, this->data(), subExtents, this->getPitches(), m_deleter, T_MemAlignment{}};
				        }

				        /** Creates a shared buffer pointing to a part of the memory.
				         *
				         * The origin pointer, the pitches, the alignment and the deleter handle of this buffer are reused, only
				         * the extents differ.
				         *
				         * @param extents number of elements for each dimension, must not exceed the extents of this buffer
				         *                (checked with assert)
				         * @return shared buffer which is pointing only to a part of the original buffer.
				         */
				        auto getSubSharedBuffer(alpaka::concepts::VectorOrScalar auto const& extents)
				        {
				            Vec subExtents = extents;
				            assert((subExtents <= this->getExtents()).reduce(std::logical_and{}));

				            using ValueType = std::remove_pointer_t<ALPAKA_TYPEOF(this->data())>;
				            using SubBuffer = SharedBuffer<T_Api, ValueType, T_Extents, T_MemAlignment>;
				            return SubBuffer{T_Api{}, this->data(), subExtents, this->getPitches(), m_deleter, T_MemAlignment{}};
				        }

				        /** Creates a shared sub-buffer pointing to a part of the memory of a const buffer.
				         *
				         * @param offsets offset in elements to the original buffer
				         * @param extents number of elements for each dimension; offsets + extents must not exceed the extents of
				         *                this buffer (checked with assert)
				         * @return Buffer which is pointing only to a part of the original buffer with a shifted origin pointer.
				         *         The alignment of the sub-buffer is reduced to the element alignment (Alignment<>).
				         */
				        auto getSubSharedBuffer(
				            alpaka::concepts::VectorOrScalar auto const& offsets,
				            alpaka::concepts::VectorOrScalar auto const& extents) const
				        {
				            Vec subOffsets = offsets;
				            Vec subExtents = extents;
				            assert((subOffsets + subExtents <= this->getExtents()).reduce(std::logical_and{}));

				            // address of the first element of the sub-buffer
				            auto originPtr = &(*this)[subOffsets];
				            using ValueType = std::remove_pointer_t<ALPAKA_TYPEOF(originPtr)>;
				            using SubBuffer = SharedBuffer<T_Api, ValueType, T_Extents, Alignment<>>;
				            return SubBuffer{T_Api{}, originPtr, subExtents, this->getPitches(), m_deleter, Alignment<>{}};
				        }

				        /** Creates a shared sub-buffer pointing to a part of the memory.
				         *
				         * @param offsets offset in elements to the original buffer
				         * @param extents number of elements for each dimension; offsets + extents must not exceed the extents of
				         *                this buffer (checked with assert)
				         * @return Buffer which is pointing only to a part of the original buffer with a shifted origin pointer.
				         *         The alignment of the sub-buffer is reduced to the element alignment (Alignment<>).
				         */
				        auto getSubSharedBuffer(
				            alpaka::concepts::VectorOrScalar auto const& offsets,
				            alpaka::concepts::VectorOrScalar auto const& extents)
				        {
				            Vec subOffsets = offsets;
				            Vec subExtents = extents;
				            assert((subOffsets + subExtents <= this->getExtents()).reduce(std::logical_and{}));

				            // address of the first element of the sub-buffer
				            auto originPtr = &(*this)[subOffsets];
				            using ValueType = std::remove_pointer_t<ALPAKA_TYPEOF(originPtr)>;
				            using SubBuffer = SharedBuffer<T_Api, ValueType, T_Extents, Alignment<>>;
				            return SubBuffer{T_Api{}, originPtr, subExtents, this->getPitches(), m_deleter, Alignment<>{}};
				        }

				        /** Adds a destructor action to the shared buffer
				         *
				         * The action will be executed when the buffer is destroyed.
				         * This can be used to add additional cleanup actions e.g. waiting on a specific queue.
				         * Actions are executed in FIFO order.
				         *
				         * The action is stored in the deleter object (m_deleter) which is shared between all copies of this
				         * buffer.
				         *
				         * @param action callable to execute on destruction
				         */
				        void addDestructorAction(std::function<void()>&& action)
				        {
				            m_deleter->addAction(ALPAKA_FORWARD(action));
				        }

				        /** Add a destructor action which waits for the given object.
				         *
				         * Registers an action via addDestructorAction() which calls onHost::wait(any).
				         *
				         * @param any object to wait for, a copy of it is stored in the action
				         */
				        void destructorWaitFor(auto const& any)
				        {
				            addDestructorAction([any]() { onHost::wait(any); });
				        }

				        /** Keep the buffer alive until at least the current spot in the queue, even if it runs out of scope.
				         *
				         * This ensures that the buffer is and stays valid in previously enqueued kernels. There is *no* guarantee
				         * that the buffer is deleted immediately when the last reference to it is deleted.
				         *
				         * This differs from `destructorWaitFor`, because that function waits, while `keepAlive` does not block
				         * anything, it just extends the lifetime.
				         *
				         * @attention Do not apply this function to a buffer allocated with alpaka::onHost::allocDeferred, see
				         * https://github.com/alpaka-group/alpaka3/issues/394
				         *
				         * @param queue The queue to enqueue to.
				         */
				        void keepAlive(auto& queue)
				        {
				            // The enqueued empty task owns a copy of the deleter handle. As long as this copy lives (which is as
				            // long as it takes the queue to get to this point) the buffer will stay valid.
				            queue.enqueueHostFnDeferred([lifetimeGuard = m_deleter] {});
				        }

				        /** Return the number of owners of the deleter handle.
				         *
				         * This is the number of SharedBuffers which point to the same memory plus all other copies of the handle,
				         * e.g. the copy stored in a task enqueued by keepAlive().
				         *
				         * @return use count of the internal shared pointer
				         */
				        [[nodiscard]] constexpr long getUseCount() const noexcept
				        {
				            return m_deleter.use_count();
				        }

				        /** True if SharedBuffer is pointing to valid memory.
				         *
				         * The check is based on the deleter handle: a buffer without deleter handle (e.g. after it was moved
				         * from) evaluates to false.
				         */
				        [[nodiscard]] constexpr explicit operator bool() const noexcept
				        {
				            return m_deleter != nullptr;
				        }

				    private:
				        /** Transforms a pointer type into a pointer to the const-qualified pointee type (T* -> T const*).
				         *
				         * @todo move this to traits or somewhere else that it can be used everywhere
				         */
				        template<alpaka::concepts::Pointer T>
				        using ConstPtr_t = std::add_pointer_t<std::add_const_t<std::remove_pointer_t<T>>>;

				        /** Deleter object shared between all buffers pointing to the same memory; destructor actions are
				         * registered on it and its use count is reported by getUseCount(). */
				        std::shared_ptr<internal::ManagedDealloc> m_deleter;
				    }; // struct SharedBuffer

				    /** Deduction guide for the construction with an explicit memory alignment.
				     *
				     * The API is deduced from getApi() applied to the first argument, the extents type of the buffer is
				     * T_UserPitches::UniVec and the alignment is taken from the last argument.
				     */
				    template<
				        alpaka::concepts::HasApi T_Any,
				        typename T_Type,
				        alpaka::concepts::Vector T_UserExtents,
				        alpaka::concepts::Vector T_UserPitches,
				        alpaka::concepts::Alignment T_MemAlignment>
				    SharedBuffer(
				        T_Any const&,
				        T_Type*,
				        T_UserExtents const&,
				        T_UserPitches const&,
				        std::invocable<> auto,
				        T_MemAlignment const)
				        -> SharedBuffer<
				            ALPAKA_TYPEOF(getApi(std::declval<T_Any>())),
				            T_Type,
				            typename T_UserPitches::UniVec,
				            T_MemAlignment>;

				    /** Deduction guide for the construction without an explicit memory alignment.
				     *
				     * The API is deduced from getApi() applied to the first argument, the extents type of the buffer is
				     * T_UserPitches::UniVec and the alignment is Alignment<>.
				     */
				    template<
				        alpaka::concepts::HasApi T_Any,
				        typename T_Type,
				        alpaka::concepts::Vector T_UserExtents,
				        alpaka::concepts::Vector T_UserPitches>
				    SharedBuffer(T_Any const&, T_Type*, T_UserExtents const&, T_UserPitches const&, std::invocable<> auto)
				        -> SharedBuffer<
				            ALPAKA_TYPEOF(getApi(std::declval<T_Any>())),
				            T_Type,
				            typename T_UserPitches::UniVec,
				            Alignment<>>;

				    /** MakeAccessibleOnAcc specialization for SharedBuffer.
				     *
				     * A shared buffer is passed on as the view returned by its getView() method.
				     */
				    template<
				        alpaka::concepts::Api T_Api,
				        typename T_Type,
				        alpaka::concepts::Vector T_Extents,
				        alpaka::concepts::Alignment T_MemAlignment>
				    struct MakeAccessibleOnAcc::Op<SharedBuffer<T_Api, T_Type, T_Extents, T_MemAlignment>>
				    {
				        /** @param any the shared buffer
				         *  @return result of any.getView()
				         */
				        auto operator()(auto&& any) const
				        {
				            return any.getView();
				        }
				    };

				    /** Write a textual description of the shared buffer to a stream.
				     *
				     * The output contains the dimension, the API name, the extents, the pitches and the alignment.
				     *
				     * @param s stream to write to
				     * @param buff buffer to describe
				     * @return the stream @p s
				     */
				    template<
				        alpaka::concepts::Api T_Api,
				        typename T_Type,
				        alpaka::concepts::Vector T_Extents,
				        alpaka::concepts::Alignment T_MemAlignment>
				    std::ostream& operator<<(std::ostream& s, SharedBuffer<T_Api, T_Type, T_Extents, T_MemAlignment> const& buff)
				    {
				        s << "SharedBuffer{ dim=" << ALPAKA_TYPEOF(buff)::dim();
				        s << ", api= " << onHost::getName(T_Api{});
				        s << ", extents=" << buff.getExtents().toString();
				        s << ", pitches=" << buff.getPitches().toString();
				        s << " , alignment=" << T_MemAlignment::template get<T_Type>() << " }";
				        return s;
				    }

				} // namespace alpaka::onHost

				namespace alpaka::internal
				{
				    /** GetApi specialization for SharedBuffer.
				     *
				     * The API trait is defined outside of the class to support constexpr evaluation.
				     */
				    template<
				        alpaka::concepts::Api T_Api,
				        typename T_Type,
				        alpaka::concepts::Vector T_Extents,
				        alpaka::concepts::Alignment T_MemAlignment>
				    struct GetApi::Op<onHost::SharedBuffer<T_Api, T_Type, T_Extents, T_MemAlignment>>
				    {
				        /** @param buffer the shared buffer, only its type is relevant
				         *  @return default constructed API object of type T_Api
				         */
				        inline constexpr auto operator()(auto&& buffer) const
				        {
				            alpaka::unused(buffer);
				            return T_Api{};
				        }
				    };

				    /** Marks SharedBuffer as copy constructable data source (derives from std::true_type). */
				    template<
				        alpaka::concepts::Api T_Api,
				        typename T_Type,
				        alpaka::concepts::Vector T_Extents,
				        alpaka::concepts::Alignment T_MemAlignment>
				    struct CopyConstructableDataSource<onHost::SharedBuffer<T_Api, T_Type, T_Extents, T_MemAlignment>> : std::true_type
				    {
				        //! shared buffer type with the const qualifier removed from the element type
				        using InnerMutable = onHost::SharedBuffer<T_Api, std::remove_const_t<T_Type>, T_Extents, T_MemAlignment>;
				        //! shared buffer type with a const-qualified element type
				        using InnerConst = onHost::SharedBuffer<T_Api, std::add_const_t<T_Type>, T_Extents, T_MemAlignment>;
				    };

				} // namespace alpaka::internal
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/mem/SharedBuffer.hpp ==
				// ============================================================================


			// #include <cstdint>    // amalgamate: file already included
			// #include <cstring>    // amalgamate: file already included
			// #include <future>    // amalgamate: file already included
			// #include <sstream>    // amalgamate: file already included

			namespace alpaka::onHost
			{
			    namespace cpu
			    {
			        /** Queue executing tasks on the host (CPU).
			         *
			         * A non-blocking queue passes all tasks to @c m_workerThread. A blocking queue executes the tasks directly
			         * within submit(), serialized by @c m_mutex.
			         *
			         * The queue is neither copyable nor movable.
			         */
			        template<typename T_Device>
			        struct Queue : std::enable_shared_from_this<Queue<T_Device>>
			        {
			        public:
			            /** Create a queue.
			             *
			             * @param device handle of the device the queue belongs to
			             * @param idx index of the queue, used for comparison, the name and as native handle
			             * @param numIdx index stored as NUMA index, also passed to the constructor of the worker thread
			             * @param isBlocking if true, tasks are executed directly in submit() instead of the worker thread
			             */
			            Queue(internal::concepts::DeviceHandle auto device, uint32_t const idx, uint32_t numIdx, bool isBlocking)
			                : m_device(std::move(device))
			                , m_idx(idx)
			                , m_numaIdx(numIdx)
			                , m_workerThread(numIdx)
			                , m_isBlocking(isBlocking)
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
			            }

			            /** Waits for the queue (internal::wait) before the members are destroyed. */
			            ~Queue()
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
			                internal::wait(*this);
			            }

			            Queue(Queue const&) = delete;
			            Queue& operator=(Queue const&) = delete;

			            Queue(Queue&&) = delete;
			            Queue& operator=(Queue&&) = delete;

			            /** Two queues are equal if the queue index and the device are equal. */
			            bool operator==(Queue const& other) const
			            {
			                return m_idx == other.m_idx && m_device == other.m_device;
			            }

			            bool operator!=(Queue const& other) const
			            {
			                return !(*this == other);
			            }

			        private:
			            /** Never called, only used to validate that the class fulfills the queue concept. */
			            void _()
			            {
			                static_assert(internal::concepts::Queue<Queue>);
			            }

			            Handle<T_Device> m_device;
			            uint32_t m_idx = 0u;
			            uint32_t m_numaIdx = 0u;
			            core::CallbackThread m_workerThread;
			            bool m_isBlocking{false};
			            /** Flag to show if a blocking task is executed
			             *
			             * This variable is only used if m_isBlocking == true.
			             *
			             * state: If true a thread is executing a blocking task, else false.
			             */
			            std::atomic<bool> m_isBlockingTaskExecuted{false};

			            /** Mutex to ensure sequential execution of tasks and operations if the queue is blocking.
			             *
			             * For a non-blocking queue @c m_workerThread is taking care of the execution order
			             */
			            std::mutex m_mutex;

			            /** Submit a task to the queue.
			             *
			             * Centralizes blocking / non-blocking behavior within the method to keep other code as easy as possible.
			             * For a blocking queue this method is NOT giving the control back to the caller until the operation is
			             * processed.
			             * All internal calls should use this method and not enqueue tasks directly in @c m_workerThread
			             *
			             * @param fn callable without arguments
			             * @return future of the task; for a blocking queue the future is already fulfilled. An exception
			             *         thrown by @p fn in a blocking queue is propagated to the caller.
			             */
			            template<typename T_Fn>
			            auto submit(T_Fn&& fn)
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
			                if(m_isBlocking)
			                {
			                    /* Resets the 'blocking task is executed' flag when the scope is left, also if the task throws.
			                     * Without the guard the flag would stay set after an exception and isQueueEmpty() would report
			                     * a busy queue until the next task is submitted.
			                     */
			                    struct BlockingTaskFlagReset
			                    {
			                        std::atomic<bool>& flag;

			                        ~BlockingTaskFlagReset()
			                        {
			                            flag = false;
			                        }
			                    };

			                    std::lock_guard<std::mutex> lk(m_mutex);
			                    m_isBlockingTaskExecuted = true;
			                    // declared after the lock: the flag is reset before the mutex is released
			                    BlockingTaskFlagReset const flagReset{m_isBlockingTaskExecuted};
			                    fn();
			                    // silent tsan warnings: The promise is fulfilled directly and only a future which is true is
			                    // returned, there can not be a data race in between.
			#if defined(__GNUC__) && !defined(__clang__)
			#    pragma GCC diagnostic push
			#    pragma GCC diagnostic ignored "-Wtsan"
			#endif
			                    // return a ready future-like placeholder; reuse CallbackThread interface minimally
			                    std::promise<void> p;
			                    auto f = p.get_future();
			                    p.set_value();
			#if defined(__GNUC__) && !defined(__clang__)
			#    pragma GCC diagnostic pop
			#endif
			                    // to keep the uniform interface with the non-blocking case,
			                    // return by moving the f since it is move-only
			                    return f;
			                }
			                // enqueue the task into the worker thread, callers can wait/chain later.
			                return m_workerThread.submit(std::forward<T_Fn>(fn));
			            }

			            friend struct alpaka::internal::GetName;

			            /** @return "host::Queue id=" followed by the queue index */
			            std::string getName() const
			            {
			                return std::string("host::Queue id=") + std::to_string(m_idx);
			            }

			            friend struct internal::GetNativeHandle;

			            /** @return the queue index, which serves as native handle */
			            [[nodiscard]] auto getNativeHandle() const noexcept
			            {
			                return m_idx;
			            }

			            friend struct internal::Enqueue;

			            /** Enqueue a kernel with a thread specification.
			             *
			             * Copies of the kernel bundle and the thread specification are stored in the submitted task.
			             *
			             * @param threadSpec thread specification, its executor must not be exec::anyExecutor
			             * @param kernelBundle kernel (and arguments) to execute
			             */
			            template<alpaka::onHost::concepts::ThreadSpec T_ThreadSpec>
			            void enqueue(T_ThreadSpec const& threadSpec, auto const& kernelBundle)
			            {
			                static_assert(
			                    ALPAKA_TYPEOF(threadSpec)::getExecutor() != exec::anyExecutor,
			                    "'exec::anyExecutor' can not be used to enqueue an kernel.");
			                ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::queue);
			                auto deviceKind = alpaka::getDeviceKind(m_device);

			                /* Only set the thread affinity if we use a blocking queue, else the affinity is already set in the
			                 * callback thread. The callback thread affinity will be given to all threads created by a task
			                 * executed by the callback thread. */
			                bool setThreadAffinity = m_isBlocking;
			                submit(
			                    [kernelBundle, threadSpec, deviceKind, numIdx = m_numaIdx, setThreadAffinity]()
			                    {
			                        auto moreLayer = Dict{
			                            DictEntry(object::launchedWidthFrameSpec, std::false_type{}),
			                            DictEntry(object::api, api::host),
			                            DictEntry(object::deviceKind, deviceKind),
			                            DictEntry(object::exec, threadSpec.getExecutor())};
			                        onAcc::Acc acc = makeAcc(threadSpec, numIdx, setThreadAffinity);
			                        acc(kernelBundle, moreLayer);
			                    });
			            }

			            /** Enqueue a kernel with a frame specification.
			             *
			             * The frame specification is translated into a thread specification with internal::adjustThreadSpec
			             * before the kernel is submitted.
			             *
			             * @param frameSpec frame specification, its executor must not be exec::anyExecutor
			             * @param kernelBundle kernel (and arguments) to execute
			             */
			            template<alpaka::onHost::concepts::FrameSpec T_FrameSpec>
			            void enqueue(T_FrameSpec const& frameSpec, auto const& kernelBundle)
			            {
			                static_assert(
			                    ALPAKA_TYPEOF(frameSpec)::getExecutor() != exec::anyExecutor,
			                    "'exec::anyExecutor' can not be used to enqueue an kernel.");
			                ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::queue);
			                auto adjustedThreadSpec = internal::adjustThreadSpec(*m_device.get(), frameSpec, kernelBundle);
			                auto deviceKind = alpaka::getDeviceKind(m_device);

			                /* Only set the thread affinity if we use a blocking queue, else the affinity is already set in the
			                 * callback thread. The callback thread affinity will be given to all threads created by a task
			                 * executed by the callback thread. */
			                bool setThreadAffinity = m_isBlocking;
			                submit(
			                    [kernelBundle, adjustedThreadSpec, deviceKind, numIdx = m_numaIdx, setThreadAffinity]()
			                    {
			                        auto moreLayer = Dict{
			                            DictEntry(object::launchedWidthFrameSpec, std::true_type{}),
			                            DictEntry(object::api, api::host),
			                            DictEntry(object::deviceKind, deviceKind),
			                            DictEntry(object::exec, adjustedThreadSpec.getExecutor())};
			                        onAcc::Acc acc = makeAcc(adjustedThreadSpec, numIdx, setThreadAffinity);
			                        acc(kernelBundle, moreLayer);
			                    });
			            }

			            /** execute a task in the queue
			             *
			             * @attention Do NOT enqueue a task which captures the queue internally to keep the queue alive as
			             * dependency. In this case the destructor of the queue is not called.
			             *
			             * @param task callable without arguments, a copy is stored in the submitted task
			             */
			            void enqueueHostFn(auto const& task)
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
			                submit([task]() { task(); });
			            }

			            /** Hand a task directly to the worker thread.
			             *
			             * In contrast to enqueueHostFn() the task is not routed through submit(), therefore it is not executed
			             * immediately by the caller even if the queue is blocking.
			             *
			             * @param task callable without arguments
			             */
			            void enqueueHostFnDeferred(auto const& task)
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
			                m_workerThread.submit(task);
			            }

			            friend struct alpaka::internal::GetDeviceType;

			            /** @return device kind of the device the queue belongs to */
			            auto getDeviceKind() const
			            {
			                return alpaka::internal::getDeviceKind(*m_device.get());
			            }

			            /** @return copy of the device handle */
			            auto getDevice() const
			            {
			                return m_device;
			            }

			            /** @return shared pointer to this queue (shared_from_this) */
			            std::shared_ptr<Queue> getSharedPtr()
			            {
			                return this->shared_from_this();
			            }

			            friend struct internal::IsQueueEmpty;

			            /** Checks if the queue is empty
			             *
			             * If m_isBlocking is true, only tasks will be taken into account, events will be ignored because they
			             * could not influence the usage of isQueueEmpty. If m_isBlocking is false, events will be taken into
			             * account because they are handled as normal tasks.
			             *
			             * @return true if no task is executed else false
			             */
			            bool isQueueEmpty() const
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
			                if(m_isBlocking)
			                {
			                    // check if the queue is currently executing a blocking task
			                    return !m_isBlockingTaskExecuted;
			                }
			                else
			                {
			                    return m_workerThread.isEmpty();
			                }
			            }

			            friend struct onHost::internal::GetDevice;

			            friend struct internal::Wait;
			            friend struct internal::WaitFor;
			            friend struct internal::Memcpy;
			            friend struct internal::MemcpyDeviceGlobal;
			            friend struct internal::Memset;
			            friend struct alpaka::internal::GetApi;
			            friend struct internal::AllocDeferred;
			        };
			    } // namespace cpu

			    namespace internal
			    {
			        /** Wait implementation for the CPU queue: blocks until the queue has processed its tasks. */
			        template<typename T_Device>
			        struct Wait::Op<cpu::Queue<T_Device>>
			        {
			            void operator()(cpu::Queue<T_Device>& queue) const
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
			                // an empty queue has nothing to wait for
			                if(queue.isQueueEmpty())
			                {
			                    return;
			                }
			                // enqueue an empty task as marker and wait for its future
			                auto marker = queue.submit([]() {});
			                marker.wait();
			            }
			        };

			        /** Enqueue an event into a CPU queue.
			         *
			         * Increments the enqueue counter of the event and replaces the future of the event. For a blocking queue
			         * the event is marked as ready immediately, for a non-blocking queue a task is submitted which marks the
			         * event as ready when the queue reaches it.
			         */
			        template<typename T_Device, typename T_Event>
			        struct Enqueue::Event<cpu::Queue<T_Device>, T_Event>
			        {
			            void operator()(cpu::Queue<T_Device>& queue, T_Event& event) const
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::event + onHost::logger::queue);
			                // open a scope to avoid logging while we hold the lock of the event
			                {
			                    // Setting the event state (e.g. the future) and enqueuing it has to be atomic.
			                    std::lock_guard<std::mutex> lk(event.m_mutex);

			                    ++event.m_enqueueCount;

			                    // remember the counter value of this enqueue operation
			                    auto const enqueueCount = event.m_enqueueCount;

			                    /* In case the queue is blocking we can not use queue.submit() because we hold the lock already.
			                     * The blocking queue executes the lambda directly which will create a deadlock.
			                     */
			                    if(queue.m_isBlocking)
			                    {
			                        // Nothing to do if it has been re-enqueued to a later position in the queue.
			                        // NOTE: enqueueCount was read from the event under the same lock directly above, therefore
			                        // this comparison is always true in this branch.
			                        if(enqueueCount == event.m_enqueueCount)
			                        {
			                            event.m_LastReadyEnqueueCount = std::max(enqueueCount, event.m_LastReadyEnqueueCount);
			                        }
			                        // apply a fulfilled future
			                        std::promise<void> p;
			                        p.set_value();
			                        event.m_future = p.get_future();
			                    }
			                    else
			                    {
			                        // the task keeps the event alive until it has been executed
			                        auto sharedEvent = event.getSharedPtr();
			                        // Enqueue a task that only updates the ready state of the event when it is executed.
			                        event.m_future = queue.submit(
			                            [sharedEvent, enqueueCount]() mutable
			                            {
			                                std::unique_lock<std::mutex> lk2(sharedEvent->m_mutex);

			                                // Nothing to do if it has been re-enqueued to a later position in the queue.
			                                if(enqueueCount == sharedEvent->m_enqueueCount)
			                                {
			                                    sharedEvent->m_LastReadyEnqueueCount
			                                        = std::max(enqueueCount, sharedEvent->m_LastReadyEnqueueCount);
			                                }
			                            });
			                    }
			                }
			            }
			        };

			        /** Let a CPU queue wait for an event.
			         *
			         * If the event is already ready nothing is done. A blocking queue waits directly in the calling thread
			         * for the future of the event, a non-blocking queue gets a task which waits for the future.
			         *
			         * @note The event parameter is typed as cpu::Event<T_Device>, T_Event is only used for the
			         * specialization.
			         */
			        template<typename T_Device, typename T_Event>
			        struct WaitFor::Op<cpu::Queue<T_Device>, T_Event>
			        {
			            void operator()(cpu::Queue<T_Device>& queue, cpu::Event<T_Device>& event) const
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::event + onHost::logger::queue);
			                // open a scope to avoid logging while we hold the lock of the event
			                {
			                    // Reading the event state and copying its future has to be atomic.
			                    std::unique_lock<std::mutex> lk(event.m_mutex);

			                    if(!event.isReady())
			                    {
			                        /* In case the queue is blocking we can not use queue.submit() because we hold the lock
			                         * already. The blocking queue executes the lambda directly which will create a deadlock.
			                         */
			                        if(queue.m_isBlocking)
			                        {
			                            // copy the future, then release the lock before blocking on it
			                            std::shared_future sFuture = event.m_future;
			                            lk.unlock();
			                            sFuture.get();
			                        }
			                        else
			                        {
			                            // the task keeps the event alive until it has been executed
			                            auto sharedEvent = event.getSharedPtr();
			                            auto oldFuture = event.m_future;

			                            // unlock here to avoid keeping the lock during the maybe expensive enqueue of the task
			                            lk.unlock();
			                            // Enqueue a task that waits for the given future of the event.
			                            queue.submit([sharedEvent, oldFuture]() { oldFuture.get(); });
			                        }
			                    }
			                }
			            }
			        };

			        /** Memcpy implementation for the CPU queue.
			         *
			         * One dimensional copies are executed with a single std::memcpy, multi-dimensional copies are executed
			         * row-wise with respect to the pitches of destination and source.
			         */
			        template<typename T_Device, typename T_Dest, typename T_Source, typename T_Extents>
			        struct Memcpy::Op<cpu::Queue<T_Device>, T_Dest, T_Source, T_Extents>
			        {
			            /** Submit the copy to the queue.
			             *
			             * @param queue queue which executes the copy
			             * @param dest destination of the copy
			             * @param source source of the copy
			             * @param extents number of elements to copy for each dimension
			             */
			            void operator()(cpu::Queue<T_Device>& queue, auto&& dest, T_Source const& source, T_Extents const& extents)
			                const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
			                constexpr auto dim = alpaka::trait::getDim_v<T_Extents>;

			                /* Get all required properties outside the lambda function to not extend the life-time of the data.
			                 * The life-time is not extended to have the same life-time behaviours with all backends.
			                 */
			                // dest is intentionally not forwarded: it is used again below to query the pitches
			                void* destPtr = toVoidPtr(alpaka::onHost::data(dest));
			                void const* srcPtr = toVoidPtr(alpaka::onHost::data(source));

			                if constexpr(dim == 1u)
			                {
			                    queue.submit(
			                        [extents, destPtr, srcPtr]()
			                        {
			                            auto const numBytes
			                                = static_cast<std::size_t>(extents.x()) * sizeof(alpaka::trait::GetValueType_t<T_Dest>);
			                            // Skip empty copies like the multi-dimensional path does: calling std::memcpy with
			                            // invalid or null pointers is undefined behavior even if the size is zero.
			                            if(numBytes != 0u)
			                            {
			                                std::memcpy(destPtr, srcPtr, numBytes);
			                            }
			                        });
			                }
			                else
			                {
			                    // memcpy is implemented as row wise copy therefore the last dimension is not required
			                    auto destPitchBytesWithoutColumn = dest.getPitches().eraseBack();
			                    auto sourcePitchBytesWithoutColumn = source.getPitches().eraseBack();

			                    queue.submit(
			                        [extents, destPtr, srcPtr, destPitchBytesWithoutColumn, sourcePitchBytesWithoutColumn]()
			                        {
			                            auto const dstExtentWithoutColumn = extents.eraseBack();
			                            if(static_cast<std::size_t>(extents.product()) != 0u)
			                            {
			                                // copy one row (last dimension) per index of the remaining dimensions
			                                meta::ndLoopIncIdx(
			                                    dstExtentWithoutColumn,
			                                    [&](auto const& idx)
			                                    {
			                                        std::memcpy(
			                                            reinterpret_cast<std::uint8_t*>(destPtr)
			                                                + (idx * destPitchBytesWithoutColumn).sum(),
			                                            reinterpret_cast<std::uint8_t const*>(srcPtr)
			                                                + (idx * sourcePitchBytesWithoutColumn).sum(),
			                                            static_cast<size_t>(extents.back())
			                                                * sizeof(alpaka::trait::GetValueType_t<T_Dest>));
			                                    });
			                            }
			                        });
			                }
			            }
			        };

			        /** Copy to device global memory.
			         *
			         * Copies sizeof(T) bytes from the source into the host handle of the global device memory wrapper.
			         * The source can be a raw pointer or an object supported by alpaka::onHost::data().
			         */
			        template<typename T_Device, typename T_Source, typename T_Storage, typename T>
			        struct internal::MemcpyDeviceGlobal::
			            Op<cpu::Queue<T_Device>, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>, T_Source>
			        {
			            void operator()(
			                cpu::Queue<T_Device>& queue,
			                onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> dest,
			                auto&& source) const
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
			                constexpr bool sourceIsRawPointer = std::is_pointer_v<ALPAKA_TYPEOF(source)>;

			                auto* globalMemPtr = dest.getHandle(api::host).data();
			                void const* hostMemPtr{nullptr};
			                if constexpr(sourceIsRawPointer)
			                {
			                    hostMemPtr = source;
			                }
			                else
			                {
			                    hostMemPtr = toVoidPtr(alpaka::onHost::data(ALPAKA_FORWARD(source)));
			                }

			                // only the raw pointers are captured, the life-time of the source is not extended
			                queue.submit(
			                    [globalMemPtr, hostMemPtr]()
			                    {
			                        std::memcpy(globalMemPtr, hostMemPtr, sizeof(T));
			                    });
			            }
			        };

			        /** Copy from device global memory.
			         *
			         * Copies sizeof(T) bytes from the host handle of the global device memory wrapper into the destination.
			         * The destination can be a raw pointer or an object supported by alpaka::onHost::data().
			         */
			        template<typename T_Device, typename T_Dest, typename T_Storage, typename T>
			        struct internal::MemcpyDeviceGlobal::
			            Op<cpu::Queue<T_Device>, T_Dest, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>>
			        {
			            void operator()(
			                cpu::Queue<T_Device>& queue,
			                auto&& dest,
			                onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> source) const
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
			                constexpr bool destIsRawPointer = std::is_pointer_v<ALPAKA_TYPEOF(dest)>;

			                void* hostMemPtr{nullptr};
			                if constexpr(destIsRawPointer)
			                {
			                    hostMemPtr = dest;
			                }
			                else
			                {
			                    hostMemPtr = toVoidPtr(alpaka::onHost::data(ALPAKA_FORWARD(dest)));
			                }
			                auto const* globalMemPtr = source.getHandle(api::host).data();

			                // only the raw pointers are captured, the life-time of the destination is not extended
			                queue.submit(
			                    [hostMemPtr, globalMemPtr]()
			                    {
			                        std::memcpy(hostMemPtr, globalMemPtr, sizeof(T));
			                    });
			            }
			        };

			        /** Byte-wise memset of a destination on a CPU queue.
			         *
			         * One-dimensional extents are set with a single std::memset. For higher dimensions the region is
			         * set row by row, where the row start is derived from the destination pitches.
			         */
			        template<typename T_Device, typename T_Dest, typename T_Extents>
			        struct Memset::Op<cpu::Queue<T_Device>, T_Dest, T_Extents>
			        {
			            /** @attention Do not use `requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>` here else gcc 11.X
			             * (tested 11.4 and 11.3) will run into an internal compiler segfault during the evaluation of the
			             * constraints */
			            void operator()(cpu::Queue<T_Device>& queue, auto&& dest, uint8_t byteValue, T_Extents const& extents)
			                const requires(std::is_same_v<ALPAKA_TYPEOF(dest), T_Dest>)
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
			                using ValueType = alpaka::trait::GetValueType_t<T_Dest>;
			                constexpr auto dim = alpaka::trait::getDim_v<T_Extents>;

			                void* destPtr = static_cast<void*>(alpaka::onHost::data(dest));

			                if constexpr(dim != 1u)
			                {
			                    // the rows are set one by one, therefore the pitch of the last dimension is not required
			                    auto rowPitchesInBytes = dest.getPitches().eraseBack();
			                    queue.submit(
			                        [extents, destPtr, rowPitchesInBytes, byteValue]()
			                        {
			                            auto const outerExtents = extents.eraseBack();
			                            // an empty region has no rows to set
			                            if(static_cast<std::size_t>(extents.product()) == 0u)
			                                return;

			                            auto* const firstByte = reinterpret_cast<std::uint8_t*>(destPtr);
			                            auto const bytesPerRow = static_cast<size_t>(extents.back()) * sizeof(ValueType);
			                            meta::ndLoopIncIdx(
			                                outerExtents,
			                                [&](auto const& rowIdx)
			                                {
			                                    std::memset(
			                                        firstByte + (rowIdx * rowPitchesInBytes).sum(),
			                                        byteValue,
			                                        bytesPerRow);
			                                });
			                        });
			                }
			                else
			                {
			                    // contiguous one-dimensional memory is set with a single call
			                    queue.submit(
			                        [extents, destPtr, byteValue]()
			                        {
			                            std::memset(destPtr, byteValue, extents.x() * sizeof(ValueType));
			                        });
			                }
			            }
			        };

			        /** Fill a destination with a single element value on a CPU queue.
			         *
			         * The call is only available if the passed destination has exactly the type T_Dest and its value type
			         * is T_Value. The work is delegated to alpaka::internal::generic::fill using the default executor of
			         * the device the queue belongs to.
			         */
			        template<typename T_Device, typename T_Dest, typename T_Value, typename T_Extents>
			        struct Fill::Op<cpu::Queue<T_Device>, T_Dest, T_Value, T_Extents>
			        {
			            void operator()(cpu::Queue<T_Device>& queue, auto&& dest, T_Value elementValue, T_Extents const& extents)
			                const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
			                               && std::same_as<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(dest)>, T_Value>
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
			                // avoid that we pass a SharedBuffer and convert non alpaka data views
			                alpaka::concepts::IView<T_Value> auto dataView = makeView(dest);

			                // only the sub view described by extents is filled
			                alpaka::internal::generic::fill(
			                    queue,
			                    defaultExecutor(getDevice(queue)),
			                    dataView.getSubView(extents),
			                    elementValue);
			            }
			        };

			        /** Allocate a buffer whose release is deferred to a CPU queue.
			         *
			         * The code is a copy of the Alloc::Op with the difference that the memory is not freed directly:
			         * the deleter of the returned buffer submits the free operation as a task to the queue. The
			         * allocation itself is performed immediately within the call.
			         */
			        template<typename T_Type, typename T_Device, alpaka::concepts::Vector T_Extents>
			        struct AllocDeferred::Op<T_Type, cpu::Queue<T_Device>, T_Extents>
			        {
			            /** Largest power of two which is less than or equal to value.
			             *
			             * @param value input value
			             * @return the largest power of two <= value, 1 if value is zero
			             */
			            static consteval uint32_t highestPowerOfTwo(uint32_t value)
			            {
			                uint32_t result = 1u;
			                // Comparing against value / 2 instead of shifting result first avoids the wrap-around to
			                // zero for value >= 2^31, which made the loop never terminate.
			                while(result <= value / 2u)
			                {
			                    result <<= 1u;
			                }
			                return result;
			            }

			            /** Allocate aligned memory for the given extents.
			             *
			             * @param queue queue which will execute the free operation of the memory
			             * @param extents number of elements per dimension
			             * @return shared buffer owning the allocated memory
			             */
			            auto operator()(cpu::Queue<T_Device>& queue, T_Extents const& extents) const
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
			                auto device = queue.getDevice();
			                constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
			                    ALPAKA_TYPEOF(getApi(device)){},
			                    ALPAKA_TYPEOF(getDeviceKind(device)){});
			                auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);

			                auto deviceDependency = onHost::Device{queue.getDevice()->getSharedPtr()};
			                auto queueDependency = queue.getSharedPtr();

			                T_Type* ptr = reinterpret_cast<T_Type*>(alpaka::core::alignedAlloc(alignment, memSizeInByte));
			                device->pinPointer(ptr, memSizeInByte);

			                // queueDependency is captured to keep the queue alive until the memory is deleted,
			                // the free operation itself is enqueued and not executed by the deleter directly
			                auto deleter = [ptr, queueDep = std::move(queueDependency)]()
			                { queueDep.get()->submit([ptr]() { alpaka::core::alignedFree(alignment, ptr); }); };

			                auto sharedBuffer = onHost::SharedBuffer{
			                    deviceDependency,
			                    ptr,
			                    extents,
			                    pitches,
			                    std::move(deleter),
			                    Alignment<alignment>{}};

			                ALPAKA_LOG_INFO(
			                    onHost::logger::memory + onHost::logger::queue,
			                    [&]()
			                    {
			                        std::stringstream ss;
			                        ss << sharedBuffer;
			                        return ss.str();
			                    });
			                return sharedBuffer;
			            }
			        };
			    } // namespace internal
			} // namespace alpaka::onHost

			namespace alpaka::internal
			{
			    /** The API of a CPU queue is the API of the device stored in the queue. */
			    template<typename T_Device>
			    struct GetApi::Op<onHost::cpu::Queue<T_Device>>
			    {
			        // a member function defined within the class is implicitly inline
			        constexpr auto operator()(auto&& queue) const
			        {
			            return alpaka::getApi(queue.m_device);
			        }
			    };
			} // namespace alpaka::internal
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/Queue.hpp ==
			// ============================================================================

		// #include "alpaka/api/host/hwloc/utility.hpp"    // amalgamate: file already inlined
		// #include "alpaka/api/host/sysInfo.hpp"    // amalgamate: file already inlined
		// #include "alpaka/api/util.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/alignedAlloc.hpp"    // amalgamate: file already inlined
		// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/Device.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/DeviceProperties.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/Handle.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/mem/SharedBuffer.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/trait.hpp"    // amalgamate: file already inlined
		// #include "alpaka/tag.hpp"    // amalgamate: file already inlined
		// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

		// #include <cstdint>    // amalgamate: file already included
		// #include <memory>    // amalgamate: file already included
		// #include <sstream>    // amalgamate: file already included

		namespace alpaka::onHost
		{
		    namespace cpu
		    {
		        /** CPU device of the host API.
		         *
		         * A device is neither copyable nor movable and is handled via shared pointers. It keeps weak
		         * references to all queues and events created through it. Most of the functionality is private and
		         * only reachable via the befriended interface structs.
		         */
		        template<typename T_Platform>
		        struct Device : std::enable_shared_from_this<Device<T_Platform>>
		        {
		        public:
		            /** Create a device.
		             *
		             * @param platform handle of the platform the device belongs to
		             * @param idx index of the device within the platform
		             * @param numaIdx NUMA domain index used for thread affinity and memory pinning
		             */
		            Device(internal::concepts::PlatformHandle auto platform, uint32_t const idx, uint32_t numaIdx)
		                : m_platform(std::move(platform))
		                , m_idx(idx)
		                , m_numaIdx(numaIdx)
		                , m_properties{internal::getDeviceProperties(*m_platform.get(), m_idx)}
		            {
		                ALPAKA_LOG_FUNCTION(onHost::logger::device);
		            }

		            ~Device()
		            {
		                ALPAKA_LOG_FUNCTION(onHost::logger::device);
		            }

		            Device(Device const&) = delete;
		            Device& operator=(Device const&) = delete;

		            Device(Device&&) = delete;
		            Device& operator=(Device&&) = delete;

		            /** Devices are compared by their index only. */
		            bool operator==(Device const& other) const
		            {
		                return m_idx == other.m_idx;
		            }

		            bool operator!=(Device const& other) const
		            {
		                return m_idx != other.m_idx;
		            }

		            /** Wait until all still existing queues created by this device finished their work. */
		            void wait()
		            {
		                ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
		                // Host device synchronization - wait on all queues associated with this device.
		                // IMPORTANT: Do not hold queuesGuard across potentially long waits; copy weak refs first.
		                std::vector<std::weak_ptr<cpu::Queue<Device>>> tmpQueues;
		                {
		                    std::lock_guard<std::mutex> lk{queuesGuard};
		                    tmpQueues = queues; // copy weak_ptr list
		                }
		                for(auto& weakQueue : tmpQueues)
		                {
		                    // queues which are already destroyed are skipped
		                    if(auto queue = weakQueue.lock())
		                    {
		                        internal::wait(*queue);
		                    }
		                }
		            }

		        private:
		            // compile time check that this type fulfills the device concept
		            void _()
		            {
		                static_assert(internal::concepts::Device<Device>);
		            }

		            Handle<T_Platform> m_platform;
		            uint32_t m_idx = 0u;
		            uint32_t m_numaIdx = internal::hwloc::allNumaDomains;
		            DeviceProperties m_properties;
		            // weak references to the queues and events created by this device
		            std::vector<std::weak_ptr<cpu::Queue<Device>>> queues;
		            std::vector<std::weak_ptr<cpu::Event<Device>>> events;
		            // protects queues and events
		            std::mutex queuesGuard;

		            std::shared_ptr<Device> getSharedPtr()
		            {
		                return this->shared_from_this();
		            }

		            template<typename T_Device>
		            friend struct Queue;

		            void setThreadAffinity() const
		            {
		                internal::hwloc::setThreadAffinity(m_numaIdx);
		            }

		            template<typename T>
		            void pinPointer(T* const ptr, size_t bytes)
		            {
		                internal::hwloc::pinPointer(ptr, bytes, m_numaIdx);
		            }

		            /** @return true if the device is bound to a specific NUMA domain */
		            bool isNumaAware() const
		            {
		                return m_numaIdx != internal::hwloc::allNumaDomains;
		            }

		            friend struct alpaka::internal::GetName;

		            std::string getName() const
		            {
		                return m_properties.name;
		            }

		            friend struct internal::GetNativeHandle;

		            /** @return the device index, which serves as native handle of a CPU device */
		            [[nodiscard]] uint32_t getNativeHandle() const noexcept
		            {
		                return m_idx;
		            }

		            friend struct internal::MakeQueue;

		            /** Create a new queue and register a weak reference to it.
		             *
		             * @param kind queueKind::blocking or queueKind::nonBlocking
		             * @return handle to the new queue
		             */
		            Handle<cpu::Queue<Device>> makeQueue(alpaka::concepts::QueueKind auto kind)
		            {
		                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
		                static_assert(
		                    kind == queueKind::blocking || kind == queueKind::nonBlocking,
		                    "Unsupported queue kind.");
		                auto thisHandle = this->getSharedPtr();
		                std::lock_guard<std::mutex> lk{queuesGuard};

		                constexpr bool isBlocking = kind == queueKind::blocking;
		                // the number of queues created so far is used as index of the new queue
		                auto newQueue = std::make_shared<cpu::Queue<Device>>(
		                    std::move(thisHandle),
		                    queues.size(),
		                    m_numaIdx,
		                    isBlocking);

		                queues.emplace_back(newQueue);
		                return newQueue;
		            }

		            friend struct internal::MakeEvent;

		            /** Create a new event and register a weak reference to it.
		             *
		             * @return handle to the new event
		             */
		            Handle<cpu::Event<Device>> makeEvent()
		            {
		                ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::event);
		                auto thisHandle = this->getSharedPtr();
		                std::lock_guard<std::mutex> lk{queuesGuard};
		                // The event index is derived from the number of events created so far. Using the number of
		                // queues would hand out the same index to all events created without a new queue in between.
		                auto newEvent = std::make_shared<cpu::Event<Device>>(std::move(thisHandle), events.size());

		                events.emplace_back(newEvent);
		                return newEvent;
		            }

		            friend struct alpaka::internal::GetDeviceType;

		            auto getDeviceKind() const
		            {
		                return alpaka::internal::getDeviceKind(*m_platform.get());
		            }

		            /** Free global memory in bytes; with hwloc support NUMA aware devices query their own domain. */
		            auto getFreeGlobalMemBytes() const
		            {
		#if ALPAKA_HAS_HWLOC
		                if(isNumaAware())
		                    return internal::hwloc::getFreeGlobalMemBytes(m_numaIdx);
		#endif
		                return onHost::getFreeGlobalMemBytes();
		            }

		            friend struct internal::Alloc;
		            friend struct alpaka::internal::GetApi;
		            friend struct internal::GetDeviceProperties;
		            friend struct internal::GetFreeGlobalMemBytes;
		            friend struct internal::AdjustThreadSpec;
		            friend struct internal::AllocDeferred;
		            friend struct internal::AllocUnified;
		            friend struct internal::AllocMapped;
		        };
		    } // namespace cpu

		    namespace trait

		    {
		        // The serial executor is supported by every CPU device.
		        template<typename T_Platform>
		        struct IsExecutorSupportedBy::Op<exec::CpuSerial, cpu::Device<T_Platform>> : std::true_type
		        {
		        };
		#if ALPAKA_OMP
		        // The OpenMP blocks executor is only marked as supported if ALPAKA_OMP is enabled.
		        template<typename T_Platform>
		        struct IsExecutorSupportedBy::Op<exec::CpuOmpBlocks, cpu::Device<T_Platform>> : std::true_type
		        {
		        };
		#endif
		#if ALPAKA_TBB
		        // The TBB blocks executor is only marked as supported if ALPAKA_TBB is enabled.
		        template<typename T_Platform>
		        struct IsExecutorSupportedBy::Op<exec::CpuTbbBlocks, cpu::Device<T_Platform>> : std::true_type
		        {
		        };
		#endif
		    } // namespace trait

		    namespace internal
		    {
		        /** Allocate aligned memory on a CPU device.
		         *
		         * The memory is pinned via the device and released with alignedFree as soon as the deleter of the
		         * returned buffer is executed.
		         */
		        template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
		        struct Alloc::Op<T_Type, cpu::Device<T_Platform>, T_Extents>
		        {
		            auto operator()(cpu::Device<T_Platform>& device, T_Extents const& extents) const
		            {
		                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);

		                // the alignment depends only on the types of the api and the device kind
		                using ApiType = ALPAKA_TYPEOF(getApi(device));
		                using DeviceKindType = ALPAKA_TYPEOF(getDeviceKind(device));
		                constexpr uint32_t alignment
		                    = api::util::simdOptimizedAlignment<T_Type>(ApiType{}, DeviceKindType{});

		                auto [numBytes, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);

		                auto deviceHandle = onHost::Device{device.getSharedPtr()};

		                T_Type* ptr = reinterpret_cast<T_Type*>(alpaka::core::alignedAlloc(alignment, numBytes));
		                device.pinPointer(ptr, numBytes);

		                // the device handle is part of the deleter to keep the device alive until the memory is deleted
		                auto releaseMemory = [ptr, deviceHandle]()
		                {
		                    alpaka::core::alignedFree(alignment, ptr);
		                };

		                auto sharedBuffer = onHost::SharedBuffer{
		                    deviceHandle,
		                    ptr,
		                    extents,
		                    pitches,
		                    std::move(releaseMemory),
		                    Alignment<alignment>{}};

		                ALPAKA_LOG_INFO(
		                    onHost::logger::memory + onHost::logger::device,
		                    [&]()
		                    {
		                        std::stringstream bufferDescription;
		                        bufferDescription << sharedBuffer;
		                        return bufferDescription.str();
		                    });
		                return sharedBuffer;
		            }
		        };

		        /** Unified memory on a CPU device is allocated exactly like normal device memory. */
		        template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
		        struct AllocUnified::Op<T_Type, cpu::Device<T_Platform>, T_Extents>
		        {
		            auto operator()(cpu::Device<T_Platform>& device, T_Extents const& extents) const
		            {
		                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
		                using DeviceAlloc = Alloc::Op<T_Type, cpu::Device<T_Platform>, T_Extents>;
		                return DeviceAlloc{}(device, extents);
		            }
		        };

		        /** Mapped memory on a CPU device is allocated exactly like normal device memory. */
		        template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
		        struct AllocMapped::Op<T_Type, cpu::Device<T_Platform>, T_Extents>
		        {
		            auto operator()(cpu::Device<T_Platform>& device, T_Extents const& extents) const
		            {
		                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
		                using DeviceAlloc = Alloc::Op<T_Type, cpu::Device<T_Platform>, T_Extents>;
		                return DeviceAlloc{}(device, extents);
		            }
		        };

		        /** Check if the data of a view can be accessed from a CPU device.
		         *
		         * The result depends only on types: the view must belong to the host api and the device kind must be
		         * cpu or numaCpu.
		         */
		        template<typename T_Platform, typename T_Any>
		        struct IsDataAccessible::FirstPath<cpu::Device<T_Platform>, T_Any>
		        {
		            bool operator()(cpu::Device<T_Platform>& device, T_Any const& view) const
		            {
		                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
		                using ViewApi = ALPAKA_TYPEOF(getApi(view));
		                using DevKind = ALPAKA_TYPEOF(getDeviceKind(device));

		                // evaluated at compile time, the function arguments are only used to derive the types
		                constexpr bool isAccessible
		                    = ViewApi{} == api::host && (DevKind{} == deviceKind::cpu || DevKind{} == deviceKind::numaCpu);
		                return isAccessible;
		            }
		        };

		        /** Thread specification for the serial CPU executor.
		         *
		         * The number of thread blocks and the number of threads per block are set to one in each dimension
		         * because there is no need to emulate blocks if we have only one thread.
		         */
		        template<
		            typename T_Platform,
		            alpaka::concepts::Vector T_NumFrames,
		            alpaka::concepts::Vector T_FrameExtents,
		            alpaka::concepts::KernelBundle T_KernelBundle>
		        struct AdjustThreadSpec::
		            Op<cpu::Device<T_Platform>, FrameSpec<T_NumFrames, T_FrameExtents, exec::CpuSerial>, T_KernelBundle>
		        {
		            using FrameSpecType = FrameSpec<T_NumFrames, T_FrameExtents, exec::CpuSerial>;

		            /** Overload selected if the frame extents are a compile time vector. */
		            auto operator()(
		                cpu::Device<T_Platform> const& device,
		                FrameSpecType const& frameSpec,
		                T_KernelBundle const& kernelBundle) const requires alpaka::concepts::CVector<T_FrameExtents>
		            {
		                alpaka::unused(device, kernelBundle);
		                ALPAKA_LOG_FUNCTION(onHost::logger::kernel);

		                /// @todo add shortcut to create a CVec with equal values
		                using IndexType = typename T_FrameExtents::type;
		                using CompileTimeVec = ALPAKA_TYPEOF(iotaCVec<IndexType, T_FrameExtents::dim()>());
		                auto const ones = CompileTimeVec::template fill<1u>();
		                return ThreadSpec{ones, ones, frameSpec.getExecutor()};
		            }

		            /** Overload for all other frame extents, the result is built the same way. */
		            auto operator()(
		                cpu::Device<T_Platform> const& device,
		                FrameSpecType const& frameSpec,
		                T_KernelBundle const& kernelBundle) const
		            {
		                alpaka::unused(device, kernelBundle);
		                ALPAKA_LOG_FUNCTION(onHost::logger::kernel);

		                /// @todo add shortcut to create a CVec with equal values
		                using IndexType = typename T_FrameExtents::type;
		                using CompileTimeVec = ALPAKA_TYPEOF(iotaCVec<IndexType, T_FrameExtents::dim()>());
		                auto const ones = CompileTimeVec::template fill<1u>();
		                return ThreadSpec{ones, ones, frameSpec.getExecutor()};
		            }
		        };

		        /** Thread specification for sequential executors on a CPU device.
		         *
		         * Each frame is mapped to one thread block and each block is executed with a single thread per
		         * dimension.
		         */
		        template<
		            typename T_Platform,
		            alpaka::concepts::Executor T_Executor,
		            alpaka::concepts::Vector T_NumFrames,
		            alpaka::concepts::Vector T_FrameExtents,
		            alpaka::concepts::KernelBundle T_KernelBundle>
		        requires exec::isSeqExecutor_v<T_Executor>
		        struct AdjustThreadSpec::
		            Op<cpu::Device<T_Platform>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
		        {
		            using FrameSpecType = FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>;

		            /** Overload selected if the frame extents are a compile time vector. */
		            auto operator()(
		                cpu::Device<T_Platform> const& device,
		                FrameSpecType const& frameSpec,
		                T_KernelBundle const& kernelBundle) const requires alpaka::concepts::CVector<T_FrameExtents>
		            {
		                alpaka::unused(device, kernelBundle);
		                ALPAKA_LOG_FUNCTION(onHost::logger::kernel);

		                // one thread block per frame, the threads per block are a compile time vector of ones
		                auto blocksPerGrid = frameSpec.getNumFrames();
		                return ThreadSpec{blocksPerGrid, T_FrameExtents::template fill<1u>(), frameSpec.getExecutor()};
		            }

		            /** Overload for frame extents which are only known at runtime. */
		            auto operator()(
		                cpu::Device<T_Platform> const& device,
		                FrameSpecType const& frameSpec,
		                T_KernelBundle const& kernelBundle) const
		            {
		                alpaka::unused(device, kernelBundle);
		                ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::kernel);

		                // one thread block per frame, the threads per block are a runtime vector of ones
		                using RuntimeVec = Vec<typename T_FrameExtents::type, T_FrameExtents::dim()>;
		                auto blocksPerGrid = frameSpec.getNumFrames();
		                auto const threadsPerBlock = RuntimeVec::fill(1);
		                return ThreadSpec{blocksPerGrid, threadsPerBlock, frameSpec.getExecutor()};
		            }
		        };

		        /** Provide a copy of the properties stored in a CPU device. */
		        template<typename T_Platform>
		        struct GetDeviceProperties::Op<cpu::Device<T_Platform>>
		        {
		            DeviceProperties operator()(cpu::Device<T_Platform> const& device) const
		            {
		                // the properties were queried once during the construction of the device
		                auto const& storedProperties = device.m_properties;
		                return storedProperties;
		            }
		        };
		    } // namespace internal
		} // namespace alpaka::onHost

		namespace alpaka::internal
		{
		    /** The API of a CPU device is the API of the platform stored in the device. */
		    template<typename T_Platform>
		    struct GetApi::Op<onHost::cpu::Device<T_Platform>>
		    {
		        // a member function defined within the class is implicitly inline
		        constexpr auto operator()(auto&& device) const
		        {
		            return alpaka::getApi(device.m_platform);
		        }
		    };
		} // namespace alpaka::internal
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/Device.hpp ==
		// ============================================================================

	// #include "alpaka/api/host/Event.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/Platform.hpp ==
		// ==
		/* Copyright 2024 René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/api/host/Api.hpp"    // amalgamate: file already inlined
		// #include "alpaka/api/host/Device.hpp"    // amalgamate: file already inlined
		// #include "alpaka/api/host/block/mem/SharedStorage.hpp"    // amalgamate: file already inlined
		// #include "alpaka/api/host/hwloc/utility.hpp"    // amalgamate: file already inlined
		// #include "alpaka/api/host/sysInfo.hpp"    // amalgamate: file already inlined
		// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/Handle.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/trait.hpp"    // amalgamate: file already inlined
		// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

		// #include <memory>    // amalgamate: file already included
		// #include <sstream>    // amalgamate: file already included

		namespace alpaka::onHost
		{
		    namespace cpu
		    {
		        /** Platform of the host API for a specific device kind.
		         *
		         * A platform is neither copyable nor movable and is handled via shared pointers. It keeps weak
		         * references to the devices it created, therefore a device which is still alive is handed out again
		         * instead of creating a second instance for the same index.
		         */
		        template<alpaka::concepts::DeviceKind T_DeviceKind>
		        struct Platform : std::enable_shared_from_this<Platform<T_DeviceKind>>
		        {
		        public:
		            Platform() = default;

		            Platform(Platform const&) = delete;
		            Platform& operator=(Platform const&) = delete;

		            Platform(Platform&&) = delete;
		            Platform& operator=(Platform&&) = delete;

		        private:
		            // compile time check that this type fulfills the platform concept
		            void _()
		            {
		                static_assert(internal::concepts::Platform<Platform>);
		            }

		            // weak references to the created devices, the position is equal to the device index
		            std::vector<std::weak_ptr<cpu::Device<Platform>>> devices;
		            // protects devices
		            std::mutex deviceGuard;

		            std::shared_ptr<Platform> getSharedPtr()
		            {
		                return this->shared_from_this();
		            }

		            friend struct alpaka::internal::GetName;

		            std::string getName() const
		            {
		                return "host::Platform";
		            }

		            friend struct internal::GetDeviceCount;

		            /** Number of devices provided by this platform.
		             *
		             * As a side effect the list of device references is enlarged to hold one entry per device.
		             *
		             * @return number of NUMA domains for numaCpu, one for all other supported device kinds and zero
		             * if the device kind is not supported by the host api
		             */
		            uint32_t getDeviceCount()
		            {
		                uint32_t devCount = 0u;

		                constexpr bool isSupportedDev = trait::IsDeviceSupportedBy::Op<T_DeviceKind, api::Host>::value;
		                if constexpr(isSupportedDev)
		                {
		                    if constexpr(T_DeviceKind{} == deviceKind::numaCpu)
		                    {
		                        devCount = alpaka::onHost::internal::hwloc::getNumNumaDomains();
		                    }
		                    else
		                        devCount = 1;

		                    // The size check must be performed under the lock too, else it races with a resize or
		                    // an element access executed concurrently by another thread.
		                    std::lock_guard<std::mutex> lk{deviceGuard};
		                    if(devices.size() < static_cast<size_t>(devCount))
		                    {
		                        devices.resize(devCount);
		                    }
		                }
		                return devCount;
		            }

		            friend struct internal::MakeDevice;

		            /** Get the device with the given index, the device is created if it is not alive.
		             *
		             * @param idx index of the device
		             * @return handle to the device
		             * @throws std::runtime_error if idx is not smaller than the number of devices
		             */
		            Handle<cpu::Device<Platform>> makeDevice(uint32_t const& idx)
		            {
		                ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
		                // must be called before deviceGuard is locked because the mutex is taken there too
		                uint32_t const numDevices = getDeviceCount();
		                if(idx >= numDevices)
		                {
		                    std::stringstream ssErr;
		                    ssErr << "Unable to return device handle with index " << idx << " because there are only "
		                          << numDevices << " devices of type '" << alpaka::onHost::getStaticName(T_DeviceKind{})
		                          << "' !";
		                    throw std::runtime_error(ssErr.str());
		                }
		                std::lock_guard<std::mutex> lk{deviceGuard};

		                // reuse the device if it is still alive
		                if(auto sharedPtr = devices[idx].lock())
		                {
		                    return sharedPtr;
		                }
		                auto thisHandle = getSharedPtr();
		                // only NUMA devices are bound to a domain, the device index is equal to the domain index
		                uint32_t numaIdx = internal::hwloc::allNumaDomains;
		                if constexpr(T_DeviceKind{} == deviceKind::numaCpu)
		                {
		                    numaIdx = idx;
		                }
		                auto newDevice = std::make_shared<cpu::Device<Platform>>(std::move(thisHandle), idx, numaIdx);
		                devices[idx] = newDevice;
		                return newDevice;
		            }

		            friend struct internal::GetDeviceProperties;

		            friend struct alpaka::internal::GetDeviceType;

		            T_DeviceKind getDeviceKind() const
		            {
		                return T_DeviceKind{};
		            }
		        };
		    } // namespace cpu

		    namespace internal
		    {
		        /** Create the host platform for a device kind, the instance is provided by make_sharedSingleton. */
		        template<alpaka::concepts::DeviceKind T_DeviceKind>
		        struct MakePlatform::Op<api::Host, T_DeviceKind>
		        {
		            auto operator()(api::Host, T_DeviceKind) const
		            {
		                using PlatformType = cpu::Platform<T_DeviceKind>;
		                return make_sharedSingleton<PlatformType>();
		            }
		        };

		        /** Query the properties of a device of the host platform.
		         *
		         * For the device kind numaCpu the core count and the memory capacity describe the NUMA domain of the
		         * device, for all other device kinds they are queried for all NUMA domains.
		         */
		        template<alpaka::concepts::DeviceKind T_DeviceKind>
		        struct GetDeviceProperties::Op<cpu::Platform<T_DeviceKind>>
		        {
		            /** @param platform unused, the properties do not depend on the platform instance
		             *  @param deviceIdx index of the device, only used for the device kind numaCpu
		             *  @return properties of the device
		             */
		            DeviceProperties operator()(cpu::Platform<T_DeviceKind> const& platform, uint32_t deviceIdx) const
		            {
		                alpaka::unused(platform);
		                ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);

		                // Select the NUMA domain once instead of querying all domains first and overwriting the
		                // results afterwards.
		                uint32_t numaDomain = hwloc::allNumaDomains;
		                if constexpr(T_DeviceKind{} == deviceKind::numaCpu)
		                {
		                    // the deviceIdx is equal to the numa domain index
		                    numaDomain = deviceIdx;
		                }
		                else
		                    alpaka::unused(deviceIdx);

		                auto prop = DeviceProperties{};
		                prop.name = getCpuName();
		                prop.warpSize = 1u;
		                prop.multiProcessorCount = hwloc::getNumCores(numaDomain);
		                prop.globalMemCapacityBytes = hwloc::getMemCapacityBytes(numaDomain);
		                prop.sharedMemPerBlockBytes = ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB * 1024u;

		                // the reported limit is the largest value representable by the index type
		                auto const fillWithMax = [](uint32_t* data, uint32_t numDims)
		                {
		                    for(uint32_t d = 0u; d < numDims; ++d)
		                        data[d] = std::numeric_limits<uint32_t>::max();
		                };

		                prop.maxThreadsPerBlock = std::numeric_limits<uint32_t>::max();
		                prop.fnMaxThreadsPerBlock = fillWithMax;

		                prop.maxBlocksPerGrid = std::numeric_limits<uint32_t>::max();
		                prop.fnMaxBlocksPerGrid = fillWithMax;

		                return prop;
		            }
		        };
		    } // namespace internal
		} // namespace alpaka::onHost

		namespace alpaka::internal
		{
		    /** The API of a CPU platform is always api::Host. */
		    template<alpaka::concepts::DeviceKind T_DeviceKind>
		    struct GetApi::Op<onHost::cpu::Platform<T_DeviceKind>>
		    {
		        // a member function defined within the class is implicitly inline
		        constexpr auto operator()(auto&& platform) const
		        {
		            // the result is independent of the platform instance
		            alpaka::unused(platform);
		            return api::Host{};
		        }
		    };
		} // namespace alpaka::internal
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/Platform.hpp ==
		// ============================================================================

	// #include "alpaka/api/host/Queue.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/atomic.hpp ==
		// ==
		/* Copyright 2022 Felice Pantaleo, Andrea Bocci, Jan Stephan
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/api/host/tag.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/internal/interface.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/scope.hpp"    // amalgamate: file already inlined
		// #include "alpaka/operation.hpp"    // amalgamate: file already inlined

		// #include <array>    // amalgamate: file already included
		// #include <atomic>    // amalgamate: file already included
		#include <type_traits>


		#ifdef ALPAKA_DISABLE_STD_ATOMIC_REF
		#    include <boost/atomic.hpp>
		#endif

		namespace alpaka::onAcc
		{
		    namespace detail
		    {
		        // Selects the atomic_ref implementation used by the host atomics in this file:
		        // Boost.Atomic when ALPAKA_DISABLE_STD_ATOMIC_REF is defined, the standard library otherwise.
		        // memory_order_relaxed is taken from the same library so it always matches the selected atomic_ref.
		#if defined(ALPAKA_DISABLE_STD_ATOMIC_REF)
		        template<typename T>
		        using atomic_ref = boost::atomic_ref<T>;
		        constexpr auto memory_order_relaxed = boost::memory_order_relaxed;
		#else
		        template<typename T>
		        using atomic_ref = std::atomic_ref<T>;
		        constexpr auto memory_order_relaxed = std::memory_order_relaxed;
		#endif
		    } // namespace detail

		    //! Tag type naming the atomic_ref based atomic operations for CPU accelerators.
		    //
		    //  Atomics can be used in the grids, blocks and threads hierarchy levels.
		    //
		    //  NOTE(review): the operation specializations that follow in this file are keyed on
		    //  internal::StlAtomic, not on this class; confirm whether this tag is still used elsewhere.

		    class AtomicAtomicRef
		    {
		    };

		    template<typename T>
		    void isSupportedByAtomicAtomicRef()
		    {
		        static_assert(
		            std::is_trivially_copyable_v<T> && detail::atomic_ref<T>::required_alignment <= alignof(T),
		            "Type not supported by AtomicAtomicRef, please recompile defining "
		            "ALPAKA_DISABLE_ATOMIC_ATOMICREF.");
		    }

		    namespace internalCompute
		    {
		        //! Host implementation of operation::Add: atomic fetch-and-add on *addr.
		        template<typename T, typename T_Scope>
		        struct Atomic::Op<operation::Add, internal::StlAtomic, T, T_Scope>
		        {
		            //! Adds value to *addr with relaxed ordering and returns the value stored before the addition.
		            ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
		            {
		                isSupportedByAtomicAtomicRef<T>();
		                return detail::atomic_ref<T>{*addr}.fetch_add(value, detail::memory_order_relaxed);
		            }
		        };

		        //! Host implementation of operation::Sub: atomic fetch-and-subtract on *addr.
		        template<typename T, typename T_Scope>
		        struct Atomic::Op<alpaka::operation::Sub, internal::StlAtomic, T, T_Scope>
		        {
		            //! Subtracts value from *addr with relaxed ordering and returns the value stored before.
		            ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
		            {
		                isSupportedByAtomicAtomicRef<T>();
		                return detail::atomic_ref<T>{*addr}.fetch_sub(value, detail::memory_order_relaxed);
		            }
		        };

		        //! Host implementation of operation::Min via a compare-exchange retry loop.
		        template<typename T, typename T_Scope>
		        struct Atomic::Op<alpaka::operation::Min, internal::StlAtomic, T, T_Scope>
		        {
		            //! Stores std::min(*addr, value) into *addr and returns the value observed before the update.
		            ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
		            {
		                isSupportedByAtomicAtomicRef<T>();
		                detail::atomic_ref<T> target{*addr};
		                // initial read uses the default ordering of load(), as the implicit conversion would
		                T observed = target.load();
		                T desired = std::min(observed, value);
		                // a failed exchange refreshes `observed`, so the candidate must be recomputed before retrying
		                while(!target.compare_exchange_weak(observed, desired, detail::memory_order_relaxed))
		                {
		                    desired = std::min(observed, value);
		                }
		                return observed;
		            }
		        };

		        //! Host implementation of operation::Max via a compare-exchange retry loop.
		        template<typename T, typename T_Scope>
		        struct Atomic::Op<alpaka::operation::Max, internal::StlAtomic, T, T_Scope>
		        {
		            //! Stores std::max(*addr, value) into *addr and returns the value observed before the update.
		            ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
		            {
		                isSupportedByAtomicAtomicRef<T>();
		                detail::atomic_ref<T> target{*addr};
		                // initial read uses the default ordering of load(), as the implicit conversion would
		                T observed = target.load();
		                T desired = std::max(observed, value);
		                // a failed exchange refreshes `observed`, so the candidate must be recomputed before retrying
		                while(!target.compare_exchange_weak(observed, desired, detail::memory_order_relaxed))
		                {
		                    desired = std::max(observed, value);
		                }
		                return observed;
		            }
		        };

		        //! Host implementation of operation::Exch: atomically replaces *addr with value.
		        template<typename T, typename T_Scope>
		        struct Atomic::Op<alpaka::operation::Exch, internal::StlAtomic, T, T_Scope>
		        {
		            //! Stores value into *addr and returns the value that was stored before.
		            //
		            //  Uses the native exchange of atomic_ref with relaxed ordering instead of a hand-written
		            //  compare-exchange retry loop; this matches the ordering used by the fetch_* based operations.
		            ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
		            {
		                isSupportedByAtomicAtomicRef<T>();
		                detail::atomic_ref<T> ref(*addr);
		                return ref.exchange(value, detail::memory_order_relaxed);
		            }
		        };

		        //! Host implementation of operation::Inc: wrapping increment via a compare-exchange retry loop.
		        template<typename T, typename T_Scope>
		        struct Atomic::Op<alpaka::operation::Inc, internal::StlAtomic, T, T_Scope>
		        {
		            //! Replaces *addr by 0 if it is >= value, otherwise by *addr + 1; returns the value observed before.
		            ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
		            {
		                isSupportedByAtomicAtomicRef<T>();
		                detail::atomic_ref<T> target{*addr};
		                // initial read uses the default ordering of load(), as the implicit conversion would
		                T observed = target.load();
		                T desired;
		                do
		                {
		                    if(observed >= value)
		                    {
		                        // the limit is reached: wrap around to zero
		                        desired = T{0};
		                    }
		                    else
		                    {
		                        desired = observed + T{1};
		                    }
		                } while(!target.compare_exchange_weak(observed, desired, detail::memory_order_relaxed));
		                return observed;
		            }
		        };

		        //! Host implementation of operation::Dec: wrapping decrement via a compare-exchange retry loop.
		        template<typename T, typename T_Scope>
		        struct Atomic::Op<alpaka::operation::Dec, internal::StlAtomic, T, T_Scope>
		        {
		            //! Replaces *addr by value if it is 0 or > value, otherwise by *addr - 1; returns the value observed
		            //! before.
		            ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
		            {
		                isSupportedByAtomicAtomicRef<T>();
		                detail::atomic_ref<T> target{*addr};
		                // initial read uses the default ordering of load(), as the implicit conversion would
		                T observed = target.load();
		                T desired;
		                do
		                {
		                    bool const wraps = (observed == T{0}) || (observed > value);
		                    if(wraps)
		                    {
		                        desired = value;
		                    }
		                    else
		                    {
		                        desired = observed - T{1};
		                    }
		                } while(!target.compare_exchange_weak(observed, desired, detail::memory_order_relaxed));
		                return observed;
		            }
		        };

		        //! Host implementation of operation::And: atomic fetch-and-AND on *addr.
		        template<typename T, typename T_Scope>
		        struct Atomic::Op<alpaka::operation::And, internal::StlAtomic, T, T_Scope>
		        {
		            //! Combines *addr with value by bitwise AND (relaxed ordering) and returns the value stored before.
		            ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
		            {
		                isSupportedByAtomicAtomicRef<T>();
		                return detail::atomic_ref<T>{*addr}.fetch_and(value, detail::memory_order_relaxed);
		            }
		        };

		        //! Host implementation of operation::Or: atomic fetch-and-OR on *addr.
		        template<typename T, typename T_Scope>
		        struct Atomic::Op<alpaka::operation::Or, internal::StlAtomic, T, T_Scope>
		        {
		            //! Combines *addr with value by bitwise OR (relaxed ordering) and returns the value stored before.
		            ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
		            {
		                isSupportedByAtomicAtomicRef<T>();
		                return detail::atomic_ref<T>{*addr}.fetch_or(value, detail::memory_order_relaxed);
		            }
		        };

		        //! Host implementation of operation::Xor: atomic fetch-and-XOR on *addr.
		        template<typename T, typename T_Scope>
		        struct Atomic::Op<alpaka::operation::Xor, internal::StlAtomic, T, T_Scope>
		        {
		            //! Combines *addr with value by bitwise XOR (relaxed ordering) and returns the value stored before.
		            ALPAKA_FN_HOST static auto atomicOp(internal::StlAtomic const&, T* const addr, T const& value) -> T
		            {
		                isSupportedByAtomicAtomicRef<T>();
		                return detail::atomic_ref<T>{*addr}.fetch_xor(value, detail::memory_order_relaxed);
		            }
		        };

		        //! Host implementation of operation::Cas: value-based compare-and-swap via a compare-exchange retry loop.
		        template<typename T, typename T_Scope>
		        struct Atomic::Op<alpaka::operation::Cas, internal::StlAtomic, T, T_Scope>
		        {
		            //! Replaces *addr by value if it compares equal (operator==) to compare; returns the value observed
		            //! before.
		            ALPAKA_FN_HOST static auto atomicOp(
		                internal::StlAtomic const&,
		                T* const addr,
		                T const& compare,
		                T const& value) -> T
		            {
		                isSupportedByAtomicAtomicRef<T>();
		                detail::atomic_ref<T> target{*addr};
		                // initial read uses the default ordering of load(), as the implicit conversion would
		                T observed = target.load();
		                T desired;
		                do
		                {
		                    // the equality test may involve floating point types, so -Wfloat-equal is silenced around it
		#if ALPAKA_COMP_GNUC || ALPAKA_COMP_CLANG
		#    pragma GCC diagnostic push
		#    pragma GCC diagnostic ignored "-Wfloat-equal"
		#endif
		                    bool const matches = static_cast<bool>(observed == compare);
		#if ALPAKA_COMP_GNUC || ALPAKA_COMP_CLANG
		#    pragma GCC diagnostic pop
		#endif
		                    // on a mismatch the observed value is written back unchanged
		                    desired = matches ? value : observed;
		                } while(!target.compare_exchange_weak(observed, desired, detail::memory_order_relaxed));
		                return observed;
		            }
		        };
		    } // namespace internalCompute
		} // namespace alpaka::onAcc
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/atomic.hpp ==
		// ============================================================================

		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/memFence.hpp ==
		// ==
		/* Copyright 2025 Mehmet Yusufoglu, René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/api/host/Api.hpp"    // amalgamate: file already inlined
		// #include "alpaka/api/host/executor.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/memoryOrder.hpp ==
			// ==
			/* Copyright 2025 Mehmet Yusufoglu, René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/memoryOrder.hpp ==
				// ==
				/* Copyright 2025 Mehmet Yusufoglu, René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include <string>    // amalgamate: file already included

				/**
				 * @brief Provides memory orders for atomic and memory fence operations.
				 *
				 * This namespace defines tag types that select the ordering constraint of a memory operation,
				 * together with a concept that constrains APIs to these tags.
				 * The provided orders are:
				 * - SeqCst: Sequentially consistent ordering.
				 * - AcqRel: Acquire-release ordering.
				 * - Release: Release ordering.
				 * - Acquire: Acquire ordering.
				 * - Relaxed: Atomicity only, without additional ordering guarantees.
				 *
				 * Visibility scopes (block, device, system) are a separate concept and are not defined here.
				 *
				 * @see alpaka::onAcc::atomicAdd, alpaka::onAcc::memFence
				 */
				namespace alpaka::onAcc
				{
				    namespace order
				    {

				        /**
				         * @brief Common base of all memory order tag types.
				         *
				         * Deriving from this tag marks a type as a memory order, so APIs can restrict their
				         * arguments to memory orders.
				         */
				        struct MemoryOrderTag
				        {
				        };

				        /**
				         * @brief Tag for sequentially consistent ordering.
				         *
				         * The strongest ordering: all sequentially consistent operations take part in a single
				         * global order.
				         */
				        struct SeqCst : MemoryOrderTag
				        {
				            /** @return the name of this tag, "SeqCst" */
				            static auto getName() -> std::string
				            {
				                return std::string("SeqCst");
				            }
				        };

				        /** Ready-made instance of the SeqCst tag. */
				        inline constexpr auto seq_cst = SeqCst{};

				        /**
				         * @brief Tag for acquire-release ordering.
				         *
				         * Combines acquire and release semantics; typically used for read-modify-write operations.
				         */
				        struct AcqRel : MemoryOrderTag
				        {
				            /** @return the name of this tag, "AcqRel" */
				            static auto getName() -> std::string
				            {
				                return std::string("AcqRel");
				            }
				        };

				        /** Ready-made instance of the AcqRel tag. */
				        inline constexpr auto acq_rel = AcqRel{};

				        /**
				         * @brief Tag for release ordering.
				         *
				         * Writes issued before the operation become visible before the release operation itself
				         * becomes visible.
				         */
				        struct Release : MemoryOrderTag
				        {
				            /** @return the name of this tag, "Release" */
				            static auto getName() -> std::string
				            {
				                return std::string("Release");
				            }
				        };

				        /** Ready-made instance of the Release tag. */
				        inline constexpr auto release = Release{};

				        /**
				         * @brief Tag for acquire ordering.
				         *
				         * Reads and writes issued after the operation observe the effects that became visible
				         * before the acquire operation.
				         */
				        struct Acquire : MemoryOrderTag
				        {
				            /** @return the name of this tag, "Acquire" */
				            static auto getName() -> std::string
				            {
				                return std::string("Acquire");
				            }
				        };

				        /** Ready-made instance of the Acquire tag. */
				        inline constexpr auto acquire = Acquire{};

				        /**
				         * @brief Tag for relaxed ordering.
				         *
				         * Guarantees atomicity only; no further ordering is implied.
				         */
				        struct Relaxed : MemoryOrderTag
				        {
				            /** @return the name of this tag, "Relaxed" */
				            static auto getName() -> std::string
				            {
				                return std::string("Relaxed");
				            }
				        };

				        /** Ready-made instance of the Relaxed tag. */
				        inline constexpr auto relaxed = Relaxed{};
				    } // namespace order

				    namespace concepts
				    {
				        /** Satisfied by every type derived from order::MemoryOrderTag (checked via std::derived_from). */
				        template<typename T>
				        concept MemoryOrder = std::derived_from<T, order::MemoryOrderTag>;
				    } // namespace concepts

				} // namespace alpaka::onAcc
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/memoryOrder.hpp ==
				// ============================================================================


			// #include <atomic>    // amalgamate: file already included
			#include <type_traits>

			namespace alpaka::onAcc::internalCompute
			{
			    /** Translates alpaka memory order tags into the matching std::memory_order value. */
			    struct MemOrderHost
			    {
			        /** Map a memory order tag to std::memory_order.
			         *
			         * @tparam TMemOrder one of order::SeqCst, order::AcqRel, order::Release, order::Acquire or
			         * order::Relaxed; only the type of the argument is used
			         * @return the std::memory_order enumerator corresponding to the tag
			         *
			         * Any other type satisfying concepts::MemoryOrder is rejected at compile time. Without the final
			         * branch the return type would silently deduce to void for such a type.
			         */
			        template<concepts::MemoryOrder TMemOrder>
			        static constexpr auto get(TMemOrder const)
			        {
			            if constexpr(std::same_as<TMemOrder, order::SeqCst>)
			            {
			                return std::memory_order::seq_cst;
			            }
			            else if constexpr(std::same_as<TMemOrder, order::AcqRel>)
			            {
			                return std::memory_order::acq_rel;
			            }
			            else if constexpr(std::same_as<TMemOrder, order::Release>)
			            {
			                return std::memory_order::release;
			            }
			            else if constexpr(std::same_as<TMemOrder, order::Acquire>)
			            {
			                return std::memory_order::acquire;
			            }
			            else if constexpr(std::same_as<TMemOrder, order::Relaxed>)
			            {
			                return std::memory_order::relaxed;
			            }
			            else
			            {
			                // dependent condition: only fires when this branch is instantiated
			                static_assert(
			                    sizeof(TMemOrder) && false,
			                    "Memory order tag has no std::memory_order mapping for the host API.");
			            }
			        }
			    };
			} // namespace alpaka::onAcc::internalCompute
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/memoryOrder.hpp ==
			// ============================================================================

		// #include "alpaka/api/host/tag.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/memoryOrder.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/scope.hpp"    // amalgamate: file already inlined
		// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

		// #include <atomic>    // amalgamate: file already included
		#include <type_traits>

		namespace alpaka::onAcc::internalCompute
		{
		    namespace detail
		    {
		        // suppress warning: `warning: 'atomic_thread_fence' is not supported with '-fsanitize=thread' [-Wtsan]`
		#if defined(__GNUC__) && !defined(__clang__)
		#    pragma GCC diagnostic push
		#    pragma GCC diagnostic ignored "-Wtsan"
		#endif

		        constexpr void hostMemoryFenceImpl(auto const&, auto const scope, concepts::MemoryOrder auto const order)
		        {
		            using ScopeT = std::remove_cvref_t<decltype(scope)>;

		            // Block scope requires no fence since threads within a block are simulated/single-threaded
		            if constexpr(!std::same_as<ScopeT, scope::Block>)
		            {
		                std::atomic_thread_fence(MemOrderHost::get(order));
		            }
		        }
		#if defined(__GNUC__) && !defined(__clang__)
		#    pragma GCC diagnostic pop
		#endif
		    } // namespace detail

		    /** Memory fence for the host API: forwards to the host fence helper together with the accelerator's
		     * executor object.
		     */
		    template<typename T_Scope, concepts::MemoryOrder T_Order>
		    struct MemoryFence::Op<api::Host, T_Scope, T_Order>
		    {
		        void operator()(
		            onAcc::concepts::Acc<api::Host> auto const& acc,
		            T_Scope const scope,
		            T_Order const order) const
		        {
		            auto const& executor = acc[object::exec];
		            detail::hostMemoryFenceImpl(executor, scope, order);
		        }
		    };
		} // namespace alpaka::onAcc::internalCompute
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/memFence.hpp ==
		// ============================================================================

	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cpu.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/warp.hpp ==
	// ==
	/* Copyright 2025 Mehmet Yusufoglu, René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/api/cuda/Api.hpp"    // amalgamate: file already inlined
	// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
	// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/warp.hpp ==
		// ==
		/* Copyright 2025 Mehmet Yusufoglu, René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 *
		 * Defines the Alpaka3 warp traits for kernel calls for votes and shuffles.
		 * Central helpers (all, any, ballot, shfl…) forward to trait::Op specializations.
		 *
		 * Dispatch now depends on api::X and deviceKind::Y tags, keeping the layer backend free.
		 * Provides a single entry point that works for host and GPU backends alike.
		 *
		 * Legacy Alpaka preferred the dispatch through ConceptWarp implementations per accelerator and used
		 * interface::ImplementationBase indirection to select the correct implementation.
		 */

		// #pragma once
		// #include "alpaka/api/concepts/api.hpp"    // amalgamate: file already inlined
		// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
		// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

		// #include <cstdint>    // amalgamate: file already included

		namespace alpaka::onAcc::warp::internal
		{
		    /** Return the warp size reported by the accelerator type via T_Acc::getWarpSize(). */
		    template<alpaka::onAcc::concepts::Acc T_Acc>
		    constexpr auto getSize() -> uint32_t
		    {
		        return T_Acc::getWarpSize();
		    }

		    /** Retrieve a bit-mask describing which warp lanes are active. */
		    struct Activemask
		    {
		        template<alpaka::onAcc::concepts::Acc T_Acc, alpaka::concepts::Api T_Api>
		        struct Op
		        {
		            constexpr auto operator()(T_Acc const&, T_Api) const
		            {
		                static_assert(sizeof(T_Acc) && false, "Missing warp Activemask implementation for the accelerator.");
		                return 0u;
		            }
		        };
		    };

		    struct GetLaneIdx
		    {
		        template<alpaka::onAcc::concepts::Acc T_Acc, alpaka::concepts::Api T_Api>
		        struct Op
		        {
		            constexpr auto operator()(T_Acc const&, T_Api) const
		            {
		                static_assert(sizeof(T_Acc) && false, "Missing warp GetLaneIdx implementation for the accelerator.");
		                return 0u;
		            }
		        };
		    };

		    /** Return the lane index of the current thread within its warp.
		     *
		     * Dispatches to the GetLaneIdx::Op specialization selected by the accelerator type and its API tag.
		     */
		    constexpr uint32_t getLaneIdx(alpaka::onAcc::concepts::Acc auto const& acc)
		    {
		        using ApiTag = ALPAKA_TYPEOF(acc[object::api]);
		        using LaneIdxOp = GetLaneIdx::Op<ALPAKA_TYPEOF(acc), ApiTag>;
		        return LaneIdxOp{}(acc, ApiTag{});
		    }

		    struct GetWarpIdx
		    {
		        template<alpaka::onAcc::concepts::Acc T_Acc, alpaka::concepts::Api T_Api>
		        struct Op
		        {
		            constexpr auto operator()(T_Acc const&, T_Api) const
		            {
		                static_assert(sizeof(T_Acc) && false, "Missing warp GetWarpIdx implementation for the accelerator.");
		                return 0u;
		            }
		        };
		    };

		    /** Return the warp index within the block.
		     *
		     * Dispatches to the GetWarpIdx::Op specialization selected by the accelerator type and its API tag.
		     */
		    constexpr uint32_t getWarpIdx(alpaka::onAcc::concepts::Acc auto const& acc)
		    {
		        using ApiTag = ALPAKA_TYPEOF(acc[object::api]);
		        using WarpIdxOp = GetWarpIdx::Op<ALPAKA_TYPEOF(acc), ApiTag>;
		        return WarpIdxOp{}(acc, ApiTag{});
		    }

		    struct All
		    {
		        template<alpaka::onAcc::concepts::Acc T_Acc, alpaka::concepts::Api T_Api>
		        struct Op
		        {
		            constexpr bool operator()(T_Acc const&, T_Api, int32_t predicate) const
		            {
		                alpaka::unused(predicate);
		                static_assert(sizeof(T_Acc) && false, "Missing warp All implementation for the accelerator.");
		                return false;
		            }
		        };
		    };

		    struct Any
		    {
		        template<alpaka::onAcc::concepts::Acc T_Acc, alpaka::concepts::Api T_Api>
		        struct Op
		        {
		            constexpr bool operator()(T_Acc const&, T_Api, int32_t predicate) const
		            {
		                alpaka::unused(predicate);
		                static_assert(sizeof(T_Acc) && false, "Missing warp Any implementation for the accelerator.");
		                return false;
		            }
		        };
		    };

		    struct Ballot
		    {
		        template<alpaka::onAcc::concepts::Acc T_Acc, alpaka::concepts::Api T_Api>
		        struct Op
		        {
		            constexpr auto operator()(T_Acc const&, T_Api, int32_t predicate) const
		            {
		                alpaka::unused(predicate);
		                static_assert(sizeof(T_Acc) && false, "Missing warp Ballot implementation for the accelerator.");
		                return 0;
		            }
		        };
		    };

		    /** Trait: warp shuffle reading from a source lane.
		     *
		     * The primary template is a fallback that fails to compile when instantiated; APIs provide
		     * specializations of Op.
		     */
		    struct Shfl
		    {
		        template<alpaka::onAcc::concepts::Acc T_Acc, alpaka::concepts::Api T_Api, typename T>
		        struct Op
		        {
		            constexpr T operator()(
		                T_Acc const&,
		                T_Api,
		                [[maybe_unused]] T const& value,
		                [[maybe_unused]] uint32_t srcLane,
		                [[maybe_unused]] uint32_t width) const
		            {
		                // dependent on T_Acc, so the assertion only fires on instantiation
		                constexpr bool hasImplementation = sizeof(T_Acc) && false;
		                static_assert(hasImplementation, "Missing warp Shfl implementation for the accelerator.");
		                return T{};
		            }
		        };
		    };

		    /** Trait: warp shuffle "down" by a lane delta.
		     *
		     * The primary template is a fallback that fails to compile when instantiated; APIs provide
		     * specializations of Op.
		     */
		    struct ShflDown
		    {
		        template<alpaka::onAcc::concepts::Acc T_Acc, alpaka::concepts::Api T_Api, typename T>
		        struct Op
		        {
		            constexpr T operator()(
		                T_Acc const&,
		                T_Api,
		                [[maybe_unused]] T const& value,
		                [[maybe_unused]] uint32_t delta,
		                [[maybe_unused]] uint32_t width) const
		            {
		                // dependent on T_Acc, so the assertion only fires on instantiation
		                constexpr bool hasImplementation = sizeof(T_Acc) && false;
		                static_assert(hasImplementation, "Missing warp ShflDown implementation for the accelerator.");
		                return T{};
		            }
		        };
		    };

		    /** Trait: warp shuffle "up" by a lane delta.
		     *
		     * The primary template is a fallback that fails to compile when instantiated; APIs provide
		     * specializations of Op.
		     */
		    struct ShflUp
		    {
		        template<alpaka::onAcc::concepts::Acc T_Acc, alpaka::concepts::Api T_Api, typename T>
		        struct Op
		        {
		            constexpr T operator()(
		                T_Acc const&,
		                T_Api,
		                [[maybe_unused]] T const& value,
		                [[maybe_unused]] uint32_t delta,
		                [[maybe_unused]] uint32_t width) const
		            {
		                // dependent on T_Acc, so the assertion only fires on instantiation
		                constexpr bool hasImplementation = sizeof(T_Acc) && false;
		                static_assert(hasImplementation, "Missing warp ShflUp implementation for the accelerator.");
		                return T{};
		            }
		        };
		    };

		    /** Trait: warp shuffle with a lane mask (xor variant).
		     *
		     * The primary template is a fallback that fails to compile when instantiated; APIs provide
		     * specializations of Op.
		     */
		    struct ShflXor
		    {
		        template<alpaka::onAcc::concepts::Acc T_Acc, alpaka::concepts::Api T_Api, typename T>
		        struct Op
		        {
		            constexpr T operator()(
		                T_Acc const&,
		                T_Api,
		                [[maybe_unused]] T const& value,
		                [[maybe_unused]] uint32_t laneMask,
		                [[maybe_unused]] uint32_t width) const
		            {
		                // dependent on T_Acc, so the assertion only fires on instantiation
		                constexpr bool hasImplementation = sizeof(T_Acc) && false;
		                static_assert(hasImplementation, "Missing warp ShflXor implementation for the accelerator.");
		                return T{};
		            }
		        };
		    };
		} // namespace alpaka::onAcc::warp::internal
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/warp.hpp ==
		// ============================================================================


	// #include <cstdint>    // amalgamate: file already included
	#include <type_traits>

	#if ALPAKA_LANG_CUDA
	namespace alpaka::onAcc::warp::internal
	{
	    /** CUDA: the active mask is whatever the __activemask() intrinsic reports. */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct Activemask::Op<T_Acc, api::Cuda>
	    {
	        constexpr __device__ auto operator()(T_Acc const&, api::Cuda) const
	        {
	            auto const activeLanes = __activemask();
	            return activeLanes;
	        }
	    };

	    /** CUDA: read the lane index from the PTX special register `laneid` via inline assembly. */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct GetLaneIdx::Op<T_Acc, api::Cuda>
	    {
	        constexpr __device__ auto operator()(T_Acc const&, api::Cuda) const
	        {
	            // written only by the inline assembly below
	            unsigned lIdx;
	            // The two branches differ in how the register name is spelled: the NVCC branch uses `%laneid`
	            // (and marks the statement volatile), the other branch escapes the percent sign as `%%laneid`.
	#    if ALPAKA_COMP_NVCC
	            asm volatile("mov.u32 %0, %laneid;" : "=r"(lIdx));
	#    else
	            asm("mov.u32 %0, %%laneid;" : "=r"(lIdx));
	#    endif
	            return lIdx;
	        }
	    };

	    /** CUDA: warp index = linearized thread index within the block divided by the warp size. */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct GetWarpIdx::Op<T_Acc, api::Cuda>
	    {
	        constexpr __device__ uint32_t operator()(T_Acc const& acc, api::Cuda) const
	        {
	            alpaka::concepts::Vector auto threadsInBlock
	                = acc.getExtentsOf(onAcc::origin::block, onAcc::unit::threads);
	            alpaka::concepts::Vector auto threadPosition
	                = acc.getIdxWithin(alpaka::onAcc::origin::block, alpaka::onAcc::unit::threads);
	            // the warp size is required to be a compile-time constant here
	            constexpr uint32_t lanesPerWarp = onAcc::warp::internal::getSize<ALPAKA_TYPEOF(acc)>();
	            return linearize(threadsInBlock, threadPosition) / lanesPerWarp;
	        }
	    };

	    /** CUDA: vote "all" through __all_sync, using the mask returned by __activemask(). */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct All::Op<T_Acc, api::Cuda>
	    {
	        constexpr __device__ bool operator()(T_Acc const&, api::Cuda, int32_t predicate) const
	        {
	            auto const activeLanes = __activemask();
	            auto const vote = __all_sync(activeLanes, static_cast<int>(predicate));
	            return vote != 0;
	        }
	    };

	    /** CUDA: vote "any" through __any_sync, using the mask returned by __activemask(). */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct Any::Op<T_Acc, api::Cuda>
	    {
	        constexpr __device__ bool operator()(T_Acc const&, api::Cuda, int32_t predicate) const
	        {
	            auto const activeLanes = __activemask();
	            auto const vote = __any_sync(activeLanes, static_cast<int>(predicate));
	            return vote != 0;
	        }
	    };

	    /** CUDA: vote "ballot" through __ballot_sync, using the mask returned by __activemask(). */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct Ballot::Op<T_Acc, api::Cuda>
	    {
	        constexpr __device__ auto operator()(T_Acc const&, api::Cuda, int32_t predicate) const
	        {
	            auto const activeLanes = __activemask();
	            return __ballot_sync(activeLanes, static_cast<int>(predicate));
	        }
	    };

	    /** CUDA: shuffle through __shfl_sync, using the mask returned by __activemask(). */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct Shfl::Op<T_Acc, api::Cuda, T>
	    {
	        constexpr __device__ T
	        operator()(T_Acc const&, api::Cuda, T const& value, uint32_t srcLane, uint32_t width) const
	        {
	            auto const activeLanes = __activemask();
	            // the intrinsic takes signed lane and width arguments
	            auto const lane = static_cast<int>(srcLane);
	            auto const segment = static_cast<int>(width);
	            return __shfl_sync(activeLanes, value, lane, segment);
	        }
	    };

	    /** CUDA: shuffle-down through __shfl_down_sync, using the mask returned by __activemask(). */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct ShflDown::Op<T_Acc, api::Cuda, T>
	    {
	        constexpr __device__ T
	        operator()(T_Acc const&, api::Cuda, T const& value, uint32_t delta, uint32_t width) const
	        {
	            auto const activeLanes = __activemask();
	            // the intrinsic takes signed delta and width arguments
	            auto const offset = static_cast<int>(delta);
	            auto const segment = static_cast<int>(width);
	            return __shfl_down_sync(activeLanes, value, offset, segment);
	        }
	    };

	    /** CUDA: shuffle-up through __shfl_up_sync, using the mask returned by __activemask(). */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct ShflUp::Op<T_Acc, api::Cuda, T>
	    {
	        constexpr __device__ T
	        operator()(T_Acc const&, api::Cuda, T const& value, uint32_t delta, uint32_t width) const
	        {
	            auto const activeLanes = __activemask();
	            // the intrinsic takes signed delta and width arguments
	            auto const offset = static_cast<int>(delta);
	            auto const segment = static_cast<int>(width);
	            return __shfl_up_sync(activeLanes, value, offset, segment);
	        }
	    };

	    /** CUDA: shuffle-xor through __shfl_xor_sync, using the mask returned by __activemask(). */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct ShflXor::Op<T_Acc, api::Cuda, T>
	    {
	        constexpr __device__ T
	        operator()(T_Acc const&, api::Cuda, T const& value, uint32_t laneMask, uint32_t width) const
	        {
	            auto const activeLanes = __activemask();
	            // the intrinsic takes signed lane-mask and width arguments
	            auto const xorMask = static_cast<int>(laneMask);
	            auto const segment = static_cast<int>(width);
	            return __shfl_xor_sync(activeLanes, value, xorMask, segment);
	        }
	    };
	} // namespace alpaka::onAcc::warp::internal
	#endif
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/warp.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/warp.hpp ==
	// ==
	/* Copyright 2025 Mehmet Yusufoglu, René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/api/hip/Api.hpp"    // amalgamate: file already inlined
	// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
	// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onAcc/internal/warp.hpp"    // amalgamate: file already inlined

	// #include <cstdint>    // amalgamate: file already included
	#include <type_traits>

	#if ALPAKA_LANG_HIP
	namespace alpaka::onAcc::warp::internal
	{
	    /** HIP: the active mask is obtained as the result of __ballot with a constant predicate of 1. */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct Activemask::Op<T_Acc, api::Hip>
	    {
	        constexpr __device__ auto operator()(T_Acc const&, api::Hip) const
	        {
	            auto const activeLanes = __ballot(1u);
	            return activeLanes;
	        }
	    };

	    /** HIP: the lane index is whatever the __lane_id() intrinsic reports. */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct GetLaneIdx::Op<T_Acc, api::Hip>
	    {
	        constexpr __device__ auto operator()(T_Acc const&, api::Hip) const
	        {
	            // On the host side deduction path the value is wrong, which is acceptable there.
	            auto const laneIdx = __lane_id();
	            return laneIdx;
	        }
	    };

	    /** HIP implementation of the warp index query.
	     *
	     * The thread index within the block is linearized against the block's thread extents and divided by the
	     * compile time warp size of the accelerator.
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct GetWarpIdx::Op<T_Acc, api::Hip>
	    {
	        constexpr __device__ uint32_t operator()(T_Acc const& acc, api::Hip) const
	        {
	            // thread extents of the block and the position of the calling thread inside the block
	            alpaka::concepts::Vector auto threadsPerBlock
	                = acc.getExtentsOf(onAcc::origin::block, onAcc::unit::threads);
	            alpaka::concepts::Vector auto threadInBlock
	                = acc.getIdxWithin(alpaka::onAcc::origin::block, alpaka::onAcc::unit::threads);

	            constexpr uint32_t lanesPerWarp = onAcc::warp::internal::getSize<ALPAKA_TYPEOF(acc)>();
	            auto const linearThreadIdx = linearize(threadsPerBlock, threadInBlock);
	            return linearThreadIdx / lanesPerWarp;
	        }
	    };

	    /** HIP implementation of the warp wide "all" vote.
	     *
	     * Forwards the predicate to `__all` and converts the integer vote result into a bool.
	     *
	     * @param predicate value contributed by the calling lane
	     * @return true if `__all` reports a non-zero result
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct All::Op<T_Acc, api::Hip>
	    {
	        constexpr __device__ bool operator()(T_Acc const&, api::Hip, int32_t predicate) const
	        {
	            // compare against a signed zero: the vote result is a signed integer, same as in Any::Op
	            return __all(static_cast<int>(predicate)) != 0;
	        }
	    };

	    /** HIP implementation of the warp wide "any" vote.
	     *
	     * Forwards the predicate to `__any` and reports whether the vote result is non-zero.
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct Any::Op<T_Acc, api::Hip>
	    {
	        constexpr __device__ bool operator()(T_Acc const&, api::Hip, int32_t predicate) const
	        {
	            auto const voteResult = __any(static_cast<int>(predicate));
	            return voteResult != 0;
	        }
	    };

	    /** HIP implementation of the warp ballot.
	     *
	     * Forwards the predicate to `__ballot` and returns the resulting mask with the type of the intrinsic.
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct Ballot::Op<T_Acc, api::Hip>
	    {
	        constexpr __device__ auto operator()(T_Acc const&, api::Hip, int32_t predicate) const
	        {
	            auto const votes = __ballot(static_cast<int>(predicate));
	            return votes;
	        }
	    };

	    /** HIP implementation of the indexed warp shuffle, forwarding to `__shfl`. */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct Shfl::Op<T_Acc, api::Hip, T>
	    {
	        constexpr __device__ T
	        operator()(T_Acc const&, api::Hip, T const& value, uint32_t srcLane, uint32_t width) const
	        {
	            // the intrinsic is called with signed integers for the source lane and the width
	            auto const sourceLane = static_cast<int>(srcLane);
	            auto const segmentWidth = static_cast<int>(width);
	            return __shfl(value, sourceLane, segmentWidth);
	        }
	    };

	    /** HIP implementation of the downward warp shuffle, forwarding to `__shfl_down`. */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct ShflDown::Op<T_Acc, api::Hip, T>
	    {
	        constexpr __device__ T operator()(T_Acc const&, api::Hip, T const& value, uint32_t delta, uint32_t width) const
	        {
	            // the intrinsic is called with signed integers for the lane offset and the width
	            auto const laneOffset = static_cast<int>(delta);
	            auto const segmentWidth = static_cast<int>(width);
	            return __shfl_down(value, laneOffset, segmentWidth);
	        }
	    };

	    /** HIP implementation of the upward warp shuffle, forwarding to `__shfl_up`. */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct ShflUp::Op<T_Acc, api::Hip, T>
	    {
	        constexpr __device__ T operator()(T_Acc const&, api::Hip, T const& value, uint32_t delta, uint32_t width) const
	        {
	            // the intrinsic is called with signed integers for the lane offset and the width
	            auto const laneOffset = static_cast<int>(delta);
	            auto const segmentWidth = static_cast<int>(width);
	            return __shfl_up(value, laneOffset, segmentWidth);
	        }
	    };

	    /** HIP implementation of the XOR warp shuffle, forwarding to `__shfl_xor`. */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct ShflXor::Op<T_Acc, api::Hip, T>
	    {
	        constexpr __device__ T
	        operator()(T_Acc const&, api::Hip, T const& value, uint32_t laneMask, uint32_t width) const
	        {
	            // the intrinsic is called with signed integers for the lane mask and the width
	            auto const xorMask = static_cast<int>(laneMask);
	            auto const segmentWidth = static_cast<int>(width);
	            return __shfl_xor(value, xorMask, segmentWidth);
	        }
	    };
	} // namespace alpaka::onAcc::warp::internal
	#endif
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/warp.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/warp.hpp ==
	// ==
	/* Copyright 2025 Mehmet Yusufoglu, René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 *
	 * Provides warp trait fallbacks for scalar host execution.
	 */

	// #pragma once
	// #include "alpaka/api/host/Api.hpp"    // amalgamate: file already inlined
	// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onAcc/internal/warp.hpp"    // amalgamate: file already inlined

	// #include <cstdint>    // amalgamate: file already included

	namespace alpaka::onAcc::warp::internal
	{
	    /** Host fallback for the active lane mask: always reports a mask with only bit 0 set. */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct Activemask::Op<T_Acc, api::Host>
	    {
	        constexpr auto operator()(T_Acc const& acc, api::Host) const
	        {
	            // the accelerator is not needed to answer the query
	            alpaka::unused(acc);
	            constexpr uint32_t singleLaneMask = 1u;
	            return singleLaneMask;
	        }
	    };

	    /** Host fallback for the lane index: always reports lane 0. */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct GetLaneIdx::Op<T_Acc, api::Host>
	    {
	        constexpr auto operator()(T_Acc const& acc, api::Host) const
	        {
	            // the accelerator is not needed to answer the query
	            alpaka::unused(acc);
	            constexpr uint32_t laneIdx = 0u;
	            return laneIdx;
	        }
	    };

	    /** Host fallback for the warp index: always reports warp 0. */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct GetWarpIdx::Op<T_Acc, api::Host>
	    {
	        constexpr auto operator()(T_Acc const& acc, api::Host) const
	        {
	            // the accelerator is not needed to answer the query
	            alpaka::unused(acc);
	            constexpr uint32_t warpIdx = 0u;
	            return warpIdx;
	        }
	    };

	    /** Host fallback for the "all" vote: the result is the caller's own predicate converted to bool. */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct All::Op<T_Acc, api::Host>
	    {
	        constexpr bool operator()(T_Acc const& acc, api::Host, int32_t predicate) const
	        {
	            alpaka::unused(acc);
	            bool const predicateIsSet = (predicate != 0);
	            return predicateIsSet;
	        }
	    };

	    /** Host fallback for the "any" vote: the result is the caller's own predicate converted to bool. */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct Any::Op<T_Acc, api::Host>
	    {
	        constexpr bool operator()(T_Acc const& acc, api::Host, int32_t predicate) const
	        {
	            alpaka::unused(acc);
	            bool const predicateIsSet = (predicate != 0);
	            return predicateIsSet;
	        }
	    };

	    /** Host fallback for the ballot: returns 1u if the predicate is non-zero, otherwise 0u. */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct Ballot::Op<T_Acc, api::Host>
	    {
	        constexpr auto operator()(T_Acc const& acc, api::Host, int32_t predicate) const
	        {
	            alpaka::unused(acc);
	            if(predicate != 0)
	            {
	                return 1u;
	            }
	            return 0u;
	        }
	    };

	    /** Host fallback for the indexed shuffle: the input value is handed back unchanged. */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct Shfl::Op<T_Acc, api::Host, T>
	    {
	        constexpr T operator()(T_Acc const& acc, api::Host, T const& value, uint32_t srcLane, uint32_t width) const
	        {
	            // neither the accelerator nor the shuffle parameters influence the result
	            alpaka::unused(acc, srcLane, width);

	            return value;
	        }
	    };

	    /** Host fallback for the downward shuffle: the input value is handed back unchanged. */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct ShflDown::Op<T_Acc, api::Host, T>
	    {
	        constexpr T operator()(T_Acc const& acc, api::Host, T const& value, uint32_t delta, uint32_t width) const
	        {
	            // neither the accelerator nor the shuffle parameters influence the result
	            alpaka::unused(acc, delta, width);

	            return value;
	        }
	    };

	    /** Host fallback for the upward shuffle: the input value is handed back unchanged. */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct ShflUp::Op<T_Acc, api::Host, T>
	    {
	        constexpr T operator()(T_Acc const& acc, api::Host, T const& value, uint32_t delta, uint32_t width) const
	        {
	            // neither the accelerator nor the shuffle parameters influence the result
	            alpaka::unused(acc, delta, width);

	            return value;
	        }
	    };

	    /** Host fallback for the XOR shuffle: the input value is handed back unchanged. */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct ShflXor::Op<T_Acc, api::Host, T>
	    {
	        constexpr T operator()(T_Acc const& acc, api::Host, T const& value, uint32_t laneMask, uint32_t width) const
	        {
	            // neither the accelerator nor the shuffle parameters influence the result
	            alpaka::unused(acc, laneMask, width);

	            return value;
	        }
	    };
	} // namespace alpaka::onAcc::warp::internal
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/warp.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi.hpp ==
	// ==
	/* Copyright 2025 Simeon Ehrig
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/api/oneApi/Api.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/Device.hpp ==
		// ==
		/* Copyright 2024 René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/api/host/sysInfo.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/Device.hpp ==
			// ==
			/* Copyright 2025 Simeon Ehrig, René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/Queue.hpp ==
				// ==
				/* Copyright 2025 Simeon Ehrig, René Widera, Mehmet Yusufoglu, Andrea Bocci
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/Event.hpp ==
					// ==
					/* Copyright 2025 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
					// #include "alpaka/api/util.hpp"    // amalgamate: file already inlined
					// #include "alpaka/core/CallbackThread.hpp"    // amalgamate: file already inlined
					// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
					// #include "alpaka/interface.hpp"    // amalgamate: file already inlined
					// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/concepts.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/internal/interface.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/logger/logger.hpp"    // amalgamate: file already inlined

					// #include <algorithm>    // amalgamate: file already included
					#include <shared_mutex>
					// #include <sstream>    // amalgamate: file already included

					#if ALPAKA_LANG_SYCL

					#    include <sycl/sycl.hpp>

					namespace alpaka::onHost
					{
					    namespace syclGeneric
					    {
					        template<typename T_Device>
					        struct Event : std::enable_shared_from_this<Event<T_Device>>
					        {
					        private:
					            friend struct alpaka::internal::GetApi;

					        public:
					            Event(internal::concepts::DeviceHandle auto device, uint32_t const idx)
					                : m_device(std::move(device))
					                , m_idx(idx)
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::event);
					            }

					            Event(Event const&) = delete;
					            Event& operator=(Event const&) = delete;

					            Event(Event&&) = delete;
					            Event& operator=(Event&&) = delete;

					            ~Event()
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::event);
					                try
					                {
					                    getEvent().wait_and_throw();
					                }
					                catch(sycl::exception const& err)
					                {
					                    std::cerr << "Caught SYCL exception while destructing a SYCL event: " << err.what() << " ("
					                              << err.code() << ')' << std::endl;
					                }
					                catch(std::exception const& err)
					                {
					                    std::cerr << "The following runtime error(s) occurred while destructing a SYCL event:"
					                              << err.what() << std::endl;
					                }
					            }

					            std::shared_ptr<Event> getSharedPtr()
					            {
					                return this->shared_from_this();
					            }

					            [[nodiscard]] auto getNativeHandle() const noexcept
					            {
					                return getEvent();
					            }

					            void wait()
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::event);
					                getEvent().wait_and_throw();
					            }

					            std::string getName() const
					            {
					                std::stringstream ss;
					                ss << "Queue<" << getApi(m_device).getName() << ">";
					                ss << " id=" << m_idx;
					                return ss.str();
					            }

					        private:
					            friend struct alpaka::internal::GetDeviceType;
					            friend struct alpaka::onHost::internal::Enqueue;

					            auto getDeviceKind() const
					            {
					                return alpaka::internal::getDeviceKind(*m_device.get());
					            }

					            auto getDevice() const
					            {
					                return m_device;
					            }

					            friend struct onHost::internal::GetDevice;

					            friend struct onHost::internal::IsEventComplete;

					            /** Check if the event is complete.
					             *
					             * @return true if the event is complete, false otherwise
					             */
					            bool isEventComplete() noexcept
					            {
					                auto const status = getEvent().template get_info<sycl::info::event::command_execution_status>();
					                return (status == sycl::info::event_command_status::complete);
					            }

					            friend struct internal::WaitFor;
					            friend struct internal::Wait;

					            void setEvent(sycl::event const& event)
					            {
					                std::unique_lock<std::shared_mutex> lock{m_eventGuard};
					                m_event = event;
					            }

					            sycl::event getEvent() const
					            {
					                std::shared_lock<std::shared_mutex> lock{m_eventGuard};
					                return m_event;
					            }

					            Handle<T_Device> m_device;
					            //! secure that two threads can change the event at the same time
					            mutable std::shared_mutex m_eventGuard;

					            //! You should not use the event directly, use always getEvent() or setEvent()
					            sycl::event m_event{};
					            uint32_t m_idx = 0u;
					        };


					    } // namespace syclGeneric
					} // namespace alpaka::onHost

					namespace alpaka::internal

					{
					    /** API lookup for SYCL events: an event reports the API of the device it is bound to. */
					    template<typename T_Device>
					    struct GetApi::Op<alpaka::onHost::syclGeneric::Event<T_Device>>
					    {
					        constexpr auto operator()(auto&& event) const
					        {
					            // bind the (private) device handle first, then query its API
					            auto& deviceHandle = event.m_device;
					            return alpaka::getApi(deviceHandle);
					        }
					    };
					} // namespace alpaka::internal

					#endif
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/Event.hpp ==
					// ============================================================================

				// #include "alpaka/api/util.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/CallbackThread.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
				// #include "alpaka/interface.hpp"    // amalgamate: file already inlined
				// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/concepts.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/internal/interface.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/mem/SharedBuffer.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/trait.hpp"    // amalgamate: file already inlined

				// #include <algorithm>    // amalgamate: file already included
				// #include <future>    // amalgamate: file already included
				#include <shared_mutex>
				// #include <sstream>    // amalgamate: file already included
				#include <type_traits>

				#if ALPAKA_LANG_SYCL

				// #    include <sycl/sycl.hpp>    // amalgamate: file already included

				namespace alpaka::onHost
				{
				    namespace syclGeneric
				    {
				        /** Dispatch a compile time warp size to the kernel
				         *
				         * The runtime provided warp size of the device is transformed into a compile time warp size.
				         * During the kernel (lambda) call in cgh.parallel_for() the lambda must be annotated with
				         * `[[sycl::reqd_sub_group_size(WARP_SIZE)]]`. In cases where the warp size is not supported by the device a
				         * compiler warning will be shown, therefore a second stage during the call of parallel_for() is required
				         * where we check, based on macro defines provided by the compiler, which subgroup sizes (warp sizes) are
				         * supported for the device the kernel is currently compiled for. In cases where the macro definition to
				         * detect the target device is not in the list (file: core/syclConfig.hpp) we allow all subgroup sizes
				         * generated from the runtime dispatcher in this trait. This is also the case if we do not compile ahead
				         * of time for a device.
				         * @attention If a warning `-Wincorrect-sub-group-size` is shown this means we generated a kernel with an
				         * unsupported warp size, triggered by the on host runtime dispatch in this trait.
				         *
				         * The reason why we do not want to execute the runtime dispatch within the parallel_for, equal to what
				         * mainline alpaka is doing, is that any kernel instance should have only one code path to avoid possible
				         * register pressure due to a code path which may never be called but is generated in the kernel.
				         * This complicated approach gives us the guarantee that the runtime device warp size is used during the
				         * kernel generation.
				         */
				        struct Warpsize
				        {
				            /** Dispatcher selected by device kind.
				             *
				             * The primary template only declares the call operator; the usable implementations are the
				             * explicit specializations per device kind.
				             *
				             * NOTE(review): this declaration takes (deviceKind, fn) while the specializations in this file
				             * take an additional `uint32_t warpSize` argument - confirm whether the declaration should be
				             * aligned with them.
				             */
				            template<alpaka::concepts::DeviceKind T_DeviceKind>
				            struct Dispatch
				            {
				                auto operator()(T_DeviceKind deviceKind, auto&& fn) const;
				            };
				        };

				        /** Warp size dispatcher for CPU devices.
				         *
				         * Calls `fn` with a `std::integral_constant<uint32_t, N>` where N is the runtime `warpSize`.
				         * Handled sizes: 1, 2, 4, 8, 16 and 32.
				         *
				         * @param fn callable invoked with the compile time warp size; its result is returned
				         * @param warpSize warp size queried at runtime
				         * @throws std::runtime_error if `warpSize` is not one of the handled sizes
				         */
				        template<>
				        struct Warpsize::Dispatch<alpaka::deviceKind::Cpu>
				        {
				            auto operator()(alpaka::deviceKind::Cpu, auto&& fn, uint32_t warpSize) const
				            {
				                switch(warpSize)
				                {
				                case 1u:
				                    return fn(std::integral_constant<uint32_t, 1u>{});
				                case 2u:
				                    return fn(std::integral_constant<uint32_t, 2u>{});
				                case 4u:
				                    return fn(std::integral_constant<uint32_t, 4u>{});
				                case 8u:
				                    return fn(std::integral_constant<uint32_t, 8u>{});
				                case 16u:
				                    return fn(std::integral_constant<uint32_t, 16u>{});
				                case 32u:
				                    return fn(std::integral_constant<uint32_t, 32u>{});
				                default:
				                    // no fallback: an unknown warp size is always reported as an error
				                    throw std::runtime_error(
				                        std::string("Sycl warp size runtime dispatch, unsupported warpSize: ")
				                        + std::to_string(warpSize));
				                }
				            }
				        };

				        /** Warp size dispatcher for Intel GPU devices.
				         *
				         * Calls `fn` with a `std::integral_constant<uint32_t, N>` where N is the runtime `warpSize`.
				         * Handled sizes: 8, 16 and 32.
				         *
				         * @param fn callable invoked with the compile time warp size; its result is returned
				         * @param warpSize warp size queried at runtime
				         * @throws std::runtime_error if `warpSize` is not one of the handled sizes
				         */
				        template<>
				        struct Warpsize::Dispatch<alpaka::deviceKind::IntelGpu>
				        {
				            auto operator()(alpaka::deviceKind::IntelGpu, auto&& fn, uint32_t warpSize) const
				            {
				                switch(warpSize)
				                {
				                case 8u:
				                    return fn(std::integral_constant<uint32_t, 8u>{});
				                case 16u:
				                    return fn(std::integral_constant<uint32_t, 16u>{});
				                case 32u:
				                    return fn(std::integral_constant<uint32_t, 32u>{});
				                default:
				                    // no fallback: an unknown warp size is always reported as an error
				                    throw std::runtime_error(
				                        std::string("Sycl warp size runtime dispatch, unsupported warpSize: ")
				                        + std::to_string(warpSize));
				                }
				            }
				        };

				        /** Warp size dispatcher for AMD GPU devices.
				         *
				         * Calls `fn` with a `std::integral_constant<uint32_t, N>` where N is the runtime `warpSize`.
				         * Handled sizes: 32 and 64.
				         *
				         * @param fn callable invoked with the compile time warp size; its result is returned
				         * @param warpSize warp size queried at runtime
				         * @throws std::runtime_error if `warpSize` is not one of the handled sizes
				         */
				        template<>
				        struct Warpsize::Dispatch<alpaka::deviceKind::AmdGpu>
				        {
				            auto operator()(alpaka::deviceKind::AmdGpu, auto&& fn, uint32_t warpSize) const
				            {
				                switch(warpSize)
				                {
				                case 32u:
				                    return fn(std::integral_constant<uint32_t, 32u>{});
				                case 64u:
				                    return fn(std::integral_constant<uint32_t, 64u>{});
				                default:
				                    // no fallback: an unknown warp size is always reported as an error
				                    throw std::runtime_error(
				                        std::string("Sycl warp size runtime dispatch, unsupported warpSize: ")
				                        + std::to_string(warpSize));
				                }
				            }
				        };

				        /** Warp size dispatcher for NVIDIA GPU devices.
				         *
				         * Calls `fn` with `std::integral_constant<uint32_t, 32u>` if the runtime `warpSize` is 32.
				         *
				         * @param fn callable invoked with the compile time warp size; its result is returned
				         * @param warpSize warp size queried at runtime
				         * @throws std::runtime_error if `warpSize` is not 32
				         */
				        template<>
				        struct Warpsize::Dispatch<alpaka::deviceKind::NvidiaGpu>
				        {
				            auto operator()(alpaka::deviceKind::NvidiaGpu, auto&& fn, uint32_t warpSize) const
				            {
				                switch(warpSize)
				                {
				                case 32u:
				                    return fn(std::integral_constant<uint32_t, 32u>{});
				                default:
				                    // no fallback: an unknown warp size is always reported as an error
				                    throw std::runtime_error(
				                        std::string("Sycl warp size runtime dispatch, unsupported warpSize: ")
				                        + std::to_string(warpSize));
				                }
				            }
				        };

				        template<typename T_Device>
				        struct Queue : std::enable_shared_from_this<Queue<T_Device>>
				        {
				        private:
				            friend struct alpaka::internal::GetApi;

				            template<alpaka::concepts::Vector TVec>
				            static constexpr auto vecToSyclRange(TVec vec)
				            {
				                constexpr auto dim = std::decay_t<TVec>::dim();
				                return [&vec]<auto... I>(std::index_sequence<I...>)
				                // TODO: check if this is the correct order
				                { return sycl::range<dim>(vec[I]...); }(std::make_index_sequence<dim>{});
				            };

				            inline constexpr auto dispatchWarpSize(auto&& fn) const
				            {
				                auto warpSize
				                    = internal::GetDeviceProperties::Op<ALPAKA_TYPEOF(*m_device.get())>{}(*m_device.get()).warpSize;

				                return Warpsize::Dispatch<ALPAKA_TYPEOF(getDeviceKind())>{}(
				                    getDeviceKind(),
				                    ALPAKA_FORWARD(fn),
				                    warpSize);
				            }


				        public:
				            Queue(internal::concepts::DeviceHandle auto device, uint32_t const idx, bool isBlocking)
				                : m_device(std::move(device))
				                , m_idx(idx)
				                , m_queue(
				                      m_device->getNativeHandle().second,
				                      m_device->getNativeHandle().first,
				                      {sycl::property::queue::in_order{}})
				                , m_isBlocking(isBlocking)
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
				            }

				            [[nodiscard]] bool isBlocking() const noexcept
				            {
				                return m_isBlocking;
				            }

				            Queue(Queue const&) = delete;
				            Queue& operator=(Queue const&) = delete;

				            Queue(Queue&&) = delete;
				            Queue& operator=(Queue&&) = delete;

				            ~Queue()
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
				                try
				                {
				                    m_queue.wait_and_throw();
				                }
				                catch(sycl::exception const& err)
				                {
				                    std::cerr << "Caught SYCL exception while destructing a SYCL queue: " << err.what() << " ("
				                              << err.code() << ')' << std::endl;
				                }
				                catch(std::exception const& err)
				                {
				                    std::cerr << "The following runtime error(s) occurred while destructing a SYCL queue:"
				                              << err.what() << std::endl;
				                }
				            }

				            std::shared_ptr<Queue> getSharedPtr()
				            {
				                return this->shared_from_this();
				            }

				            [[nodiscard]] auto getNativeHandle() const noexcept
				            {
				                return m_queue;
				            }

				            void wait()
				            {
				                m_queue.wait_and_throw();
				            }

				            std::string getName() const
				            {
				                std::stringstream ss;
				                ss << "Queue<" << getApi(m_device).getName() << ">";
				                ss << " id=" << m_idx;
				                return ss.str();
				            }

				        private:
				            friend struct alpaka::internal::GetDeviceType;
				            friend struct alpaka::onHost::internal::Enqueue;
				            friend struct onHost::internal::AllocDeferred;

				            auto getDeviceKind() const
				            {
				                return alpaka::internal::getDeviceKind(*m_device.get());
				            }

				            auto getDevice() const
				            {
				                return m_device;
				            }

				            friend struct onHost::internal::GetDevice;

				            friend struct alpaka::onHost::internal::WaitFor;

				            void waitFor(syclGeneric::Event<T_Device>& event)
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::event + onHost::logger::queue);
				                sycl::event sycl_event = event.getNativeHandle();
				                sycl::event ev = m_queue.submit([sycl_event](sycl::handler& cgh) { cgh.depends_on(sycl_event); });
				                setLastEvent(ev);
				                if(isBlocking())
				                    ev.wait_and_throw();
				            }

				            friend struct internal::IsQueueEmpty;

				            /** Test of all tasks in the queue are finished
				             *
				             * @attention We are testing for the last event of last enqueued alpaka event or action. The function
				             * cannot check events that were queued directly into the native queue, bypassing alpaka.
				             */
				            bool isQueueEmpty() const
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::queue);

				                auto const status = getLastEvent().template get_info<sycl::info::event::command_execution_status>();
				                return status == sycl::info::event_command_status::complete;
				            }

				            //! Thread safe getter for the last sycl event.
				            sycl::event getLastEvent() const
				            {
				                std::shared_lock<std::shared_mutex> lock{m_eventGuard};
				                return m_lastEvent;
				            }

				            /** Thread safe setter for the last sycl event
				             *
				             * To track dependencies this method must be called with any event returned by native sycl calls.
				             */
				            void setLastEvent(sycl::event const& ev) const
				            {
				                std::unique_lock<std::shared_mutex> lock{m_eventGuard};
				                m_lastEvent = ev;
				            }

				            friend struct alpaka::onHost::internal::Memset;
				            friend struct alpaka::onHost::internal::Memcpy;
				            friend struct alpaka::onHost::internal::MemcpyDeviceGlobal;
				            friend struct alpaka::onHost::internal::Alloc;
				            friend struct alpaka::onHost::internal::AllocDeferred;
				            friend struct alpaka::onHost::internal::AllocMapped;
				            friend struct alpaka::onHost::internal::Fill;

				            Handle<T_Device> m_device;
				            uint32_t m_idx = 0u;
				            sycl::queue m_queue;
				            // secure that two threads can change the event at the same time
				            mutable std::shared_mutex m_eventGuard;
				            /** Event which is representing the last enqueued task/action by alpaka
				             *
				             * @attention You should not use the event directly, use always getLastEvent() or setLastEvent().
				             * Tasks enqueued via the native handle outside of alpaka, will not be tracked by this event, therefore it
				             * can be possible that the queue is not empty but the event is already marked as complete. If you need to
				             * track also tasks enqueued outside of alpaka you should use onHost::wait(auto&&).
				             */
				            mutable sycl::event m_lastEvent;
				            core::CallbackThread m_callBackThread;
				            bool m_isBlocking{false};
				        };

				    } // namespace syclGeneric

				    /** Enqueue a host task into a SYCL queue and let the native host task wait for its completion.
				     *
				     * The task is executed on the callback thread of the queue; the native SYCL host task blocks until the
				     * returned future is ready. The event of the submission becomes the last event of the queue and, for a
				     * blocking queue, the call waits for it.
				     */
				    template<typename T_Device, typename T_Task>
				    struct internal::Enqueue::HostTask<syclGeneric::Queue<T_Device>, T_Task>
				    {
				        void operator()(syclGeneric::Queue<T_Device>& queue, T_Task const& task) const
				        {
				            ALPAKA_LOG_FUNCTION(onHost::logger::queue);
				            /* Using the queue by reference is fine here: if the alpaka queue is destroyed while the native sycl host
				             * task is executed, the sycl queue is still valid because the destructor of the alpaka queue waits until
				             * all native sycl queue tasks are processed. Accessing the callback thread is still allowed at this point
				             * in time. Capturing the queue as handle (shared pointer) would result in a deadlock: the native sycl
				             * host task must not be the one destroying the alpaka queue, because the queue destructor waits for the
				             * native sycl queue, which would then never finish. */
				            sycl::event ev = queue.m_queue.submit(
				                [&queue, task](sycl::handler& cgh)
				                {
				                    cgh.host_task(
				                        [&queue, task]
				                        {
				                            // `task` is a const capture of this lambda, so it can only be copied (not moved)
				                            auto f = queue.m_callBackThread.submit([t = task] { t(); });
				                            f.wait();
				                        });
				                });
				            queue.setLastEvent(ev);
				            if(queue.isBlocking())
				                ev.wait_and_throw();
				        }
				    };

				    /** Enqueue a host task into a SYCL queue without waiting for the task itself.
				     *
				     * Same as Enqueue::HostTask, except that the native SYCL host task only hands the task over to the callback
				     * thread of the queue and does not wait for the returned future.
				     */
				    template<typename T_Device, typename T_Task>
				    struct internal::Enqueue::HostTaskDeferred<syclGeneric::Queue<T_Device>, T_Task>
				    {
				        // same as for Enqueue::HostTask, but not waiting for the task to finish
				        void operator()(syclGeneric::Queue<T_Device>& queue, T_Task const& task) const
				        {
				            ALPAKA_LOG_FUNCTION(onHost::logger::queue);
				            /* Using the queue by reference is fine here: if the alpaka queue is destroyed while the native sycl host
				             * task is executed, the sycl queue is still valid because the destructor of the alpaka queue waits until
				             * all native sycl queue tasks are processed. Accessing the callback thread is still allowed at this point
				             * in time. Capturing the queue as handle (shared pointer) would result in a deadlock: the native sycl
				             * host task must not be the one destroying the alpaka queue, because the queue destructor waits for the
				             * native sycl queue, which would then never finish. */
				            sycl::event ev = queue.m_queue.submit(
				                [&queue, task](sycl::handler& cgh)
				                {
				                    // `task` is a const capture of the host task lambda, so it can only be copied (not moved)
				                    cgh.host_task([&queue, task]() { queue.m_callBackThread.submit([t = task] { t(); }); });
				                });
				            queue.setLastEvent(ev);
				            if(queue.isBlocking())
				                ev.wait_and_throw();
				        }
				    };

				    /** Enqueue an alpaka event into a generic SYCL queue.
				     *
				     * The alpaka event is bound to the sycl::event of an empty kernel submitted to the native queue.
				     */
				    template<typename T_Device, typename T_Event>
				    struct internal::Enqueue::Event<syclGeneric::Queue<T_Device>, T_Event>
				    {
				        void operator()(syclGeneric::Queue<T_Device>& queue, T_Event& event) const
				        {
				            ALPAKA_LOG_FUNCTION(onHost::logger::event + onHost::logger::queue);

				            /* The last event of the queue is intentionally not reused: an emulated event (empty kernel) also
				             * covers tasks which were added to the native sycl queue outside of alpaka. */
				            sycl::event marker = queue.m_queue.submit(
				                [](sycl::handler& handler)
				                {
				                    handler.single_task([]() {});
				                });
				            event.setEvent(marker);

				            // only blocking queues wait for the marker kernel
				            if(!queue.isBlocking())
				            {
				                return;
				            }
				            marker.wait_and_throw();
				        }
				    };

				    /** Byte-wise memset of one-dimensional memory executed within a generic SYCL queue. */
				    template<typename T_Device, typename T_Dest, typename T_Extents>
				    requires(alpaka::trait::getDim_v<T_Extents> == 1u)
				    struct internal::Memset::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Extents>
				    {
				        /** Set `extents.x()` elements of `dest` byte-wise to `byteValue`.
				         *
				         * The resulting event is stored as last event of the queue; blocking queues wait for it.
				         */
				        void operator()(syclGeneric::Queue<T_Device>& queue, auto&& dest, uint8_t byteValue, T_Extents const& extents)
				            const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
				        {
				            ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
				            // TODO: implement generic version for multidimensional memory
				            using DestValueType = alpaka::trait::GetValueType_t<T_Dest>;
				            auto const numBytes = extents.x() * sizeof(DestValueType);

				            sycl::queue nativeQueue = queue.getNativeHandle();
				            sycl::event memsetEvent = nativeQueue.memset(internal::Data::data(dest), byteValue, numBytes);
				            queue.setLastEvent(memsetEvent);

				            if(queue.isBlocking())
				            {
				                memsetEvent.wait_and_throw();
				            }
				        }
				    };

				    /** Copy of one-dimensional memory executed within a generic SYCL queue. */
				    template<typename T_Device, typename T_Dest, typename T_Source, typename T_Extents>
				    requires(alpaka::trait::getDim_v<T_Extents> == 1u)
				    struct internal::Memcpy::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Source, T_Extents>
				    {
				        /** Copy `extents.x()` elements (sized by the value type of T_Dest) from `source` to `dest`.
				         *
				         * The resulting event is stored as last event of the queue; blocking queues wait for it.
				         */
				        void operator()(
				            syclGeneric::Queue<T_Device>& queue,
				            auto&& dest,
				            T_Source const& source,
				            T_Extents const& extents) const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
				        {
				            ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
				            // TODO: implement generic version for multidimensional memory
				            using DestValueType = alpaka::trait::GetValueType_t<T_Dest>;
				            auto const numBytes = extents.x() * sizeof(DestValueType);

				            sycl::queue nativeQueue = queue.getNativeHandle();
				            sycl::event copyEvent = nativeQueue.memcpy(
				                toVoidPtr(internal::Data::data(dest)),
				                toVoidPtr(internal::Data::data(source)),
				                numBytes);
				            queue.setLastEvent(copyEvent);

				            if(queue.isBlocking())
				            {
				                copyEvent.wait_and_throw();
				            }
				        }
				    };

				    /** Element-wise fill of one-dimensional memory executed within a generic SYCL queue. */
				    template<typename T_Device, typename T_Dest, typename T_Value, typename T_Extents>
				    requires(alpaka::trait::getDim_v<T_Extents> == 1u)
				    struct internal::Fill::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Value, T_Extents>
				    {
				        /** Assign `elementValue` to `extents.x()` elements of `dest`.
				         *
				         * The resulting event is stored as last event of the queue; blocking queues wait for it.
				         */
				        void operator()(
				            syclGeneric::Queue<T_Device>& queue,
				            auto&& dest,
				            T_Value elementValue,
				            T_Extents const& extents) const
				            requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
				                     && std::same_as<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(dest)>, T_Value>
				        {
				            ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);

				            sycl::queue nativeQueue = queue.getNativeHandle();
				            sycl::event fillEvent = nativeQueue.fill(internal::Data::data(dest), elementValue, extents.x());
				            queue.setLastEvent(fillEvent);

				            if(queue.isBlocking())
				            {
				                fillEvent.wait_and_throw();
				            }
				        }
				    };

				    /** Allocate device memory whose release is ordered within a queue.
				     *
				     * The code is a copy of the Alloc::Op with the difference that the memory is allocated and freed
				     * within a queue.
				     */
				    template<typename T_Type, typename T_Device, alpaka::concepts::Vector T_Extents>
				    struct internal::AllocDeferred::Op<T_Type, syclGeneric::Queue<T_Device>, T_Extents>
				    {
				        /** Allocate aligned device memory for `extents` elements of T_Type.
				         *
				         * @param queue queue used for the allocation and, later, for releasing the memory
				         * @param extents number of elements per dimension
				         * @return shared buffer owning the memory; its deleter keeps the queue alive
				         * @throws std::runtime_error if a non-empty allocation fails
				         */
				        auto operator()(syclGeneric::Queue<T_Device>& queue, T_Extents const& extents) const
				        {
				            ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
				            auto device = queue.getDevice();
				            constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
				                ALPAKA_TYPEOF(getApi(device)){},
				                ALPAKA_TYPEOF(getDeviceKind(device)){});
				            auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);

				            auto deviceDependency = onHost::Device{queue.getDevice()->getSharedPtr()};
				            sycl::queue sycl_queue = queue.getNativeHandle();
				            auto queueDependency = queue.getSharedPtr();


				            T_Type* ptr = static_cast<T_Type*>(sycl::aligned_alloc_device(alignment, memSizeInByte, sycl_queue));
				            /* SYCL USM allocation functions report failure by returning nullptr. Without this check a failed
				             * allocation would silently be wrapped into a buffer. A nullptr for an empty request is tolerated. */
				            if(ptr == nullptr && static_cast<std::size_t>(memSizeInByte) != 0u)
				            {
				                throw std::runtime_error("Sycl deferred device memory allocation failed.");
				            }

				            // guarantees that the allocation is blocking the queue if necessary.
				            if(queue.isBlocking())
				                sycl_queue.wait_and_throw();

				            auto deleter = [queueDep = std::move(queueDependency), ptr]()
				            {
				                sycl::queue sycl_queue = queueDep->getNativeHandle();
				                /* In cases where the deleter lifetime is extended, e.g. by using keepAlive() on a buffer, it can be
				                 * that the queue callback thread is holding the last instance of the deleter. keepAlive() is executed
				                 * within a sycl host task and it is forbidden to create another host task in a host task, the result
				                 * would be a deadlock. Therefore, we submit the work to free the memory first to the callback thread
				                 * which is then enqueuing the host task. This means that we can guarantee that the memory is freed
				                 * after all work, enqueued at the moment where the deleter is executed, in the sycl queue is
				                 * finished. The memory will be freed a little bit later than it could in cases where other threads
				                 * enqueue new kernels or tasks into the sycl queue while the callback thread is creating the host
				                 * task.
				                 */
				                queueDep->m_callBackThread.submit(
				                    [sycl_queue, ptr]() mutable
				                    {
				                        sycl_queue.submit([&](sycl::handler& cgh)
				                                          { cgh.host_task([=]() { sycl::free(toVoidPtr(ptr), sycl_queue); }); });
				                    });
				            };

				            auto sharedBuffer = onHost::SharedBuffer{
				                deviceDependency,
				                ptr,
				                extents,
				                pitches,
				                std::move(deleter),
				                Alignment<alignment>{}};
				            return sharedBuffer;
				        }
				    };
				} // namespace alpaka::onHost

				namespace alpaka::internal

				{
				    /** Api lookup for a generic SYCL queue: the api of a queue is the api of its device. */
				    template<typename T_Device>
				    struct GetApi::Op<alpaka::onHost::syclGeneric::Queue<T_Device>>
				    {
				        inline constexpr auto operator()(auto&& queue) const
				        {
				            // forward the query to the device stored in the queue
				            auto& owningDevice = queue.m_device;
				            return alpaka::getApi(owningDevice);
				        }
				    };
				} // namespace alpaka::internal

				#endif
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/Queue.hpp ==
				// ============================================================================

			// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
			// #include "alpaka/api/syclGeneric/Event.hpp"    // amalgamate: file already inlined
			// #include "alpaka/api/syclGeneric/Queue.hpp"    // amalgamate: file already inlined
			// #include "alpaka/api/util.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onHost/mem/SharedBuffer.hpp"    // amalgamate: file already inlined

			#if ALPAKA_LANG_SYCL

			// #    include <sycl/sycl.hpp>    // amalgamate: file already included

			namespace alpaka::onHost
			{
			    namespace syclGeneric
			    {
			        /** Device of the generic SYCL backend.
			         *
			         * Wraps a native sycl::device together with a handle to the platform it originates from and creates the
			         * queues and events belonging to this device. The type is neither copyable nor movable. getSharedPtr()
			         * relies on shared_from_this(), therefore instances must be owned by a std::shared_ptr.
			         *
			         * The device only keeps weak references to the queues and events it created. Expired references are removed
			         * whenever a new queue or event is created. Otherwise the bookkeeping would grow for the whole lifetime of
			         * the device and the weak references would keep the std::make_shared storage of every destroyed object
			         * allocated.
			         */
			        template<typename T_Platform>
			        struct Device : std::enable_shared_from_this<Device<T_Platform>>
			        {
			        public:
			            /** @param platform handle of the platform owning the device
			             *  @param dev native sycl device
			             *  @param idx index of the device, used to query the device properties from the platform
			             */
			            Device(internal::concepts::PlatformHandle auto platform, auto const& dev, uint32_t const idx)
			                : m_platform(std::move(platform))
			                , m_idx(idx)
			                , m_sycl_dev(dev)
			                , m_properties{internal::getDeviceProperties(*m_platform.get(), m_idx)}
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::device);
			            }

			            ~Device()
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::device);
			            }

			            Device(Device const&) = delete;
			            Device& operator=(Device const&) = delete;

			            Device(Device&&) = delete;
			            Device& operator=(Device&&) = delete;

			            /** @return name reported by the native sycl device */
			            auto getName() const
			            {
			                return m_sycl_dev.get_info<sycl::info::device::name>();
			            }

			            /** @return shared pointer to this device, obtained via shared_from_this() */
			            std::shared_ptr<Device<T_Platform>> getSharedPtr()
			            {
			                return this->shared_from_this();
			            }

			            /** Create a new queue on this device.
			             *
			             * @param kind queueKind::blocking or queueKind::nonBlocking, other kinds are rejected at compile time
			             * @return handle of the new queue; the device itself only keeps a weak reference
			             */
			            [[nodiscard]] Handle<syclGeneric::Queue<Device>> makeQueue(alpaka::concepts::QueueKind auto kind)
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::queue + onHost::logger::device);
			                static_assert(
			                    kind == queueKind::blocking || kind == queueKind::nonBlocking,
			                    "Unsupported queue kind.");
			                auto thisHandle = this->getSharedPtr();
			                std::lock_guard<std::mutex> lk{m_writeGuard};

			                // release the bookkeeping of queues which do not exist anymore
			                dropExpired(queues);

			                constexpr bool isBlocking = kind == queueKind::blocking;
			                // the counter yields the same index sequence (0, 1, 2, ...) the never shrinking vector size provided
			                auto newQueue
			                    = std::make_shared<syclGeneric::Queue<Device>>(std::move(thisHandle), m_nextQueueIdx++, isBlocking);

			                queues.emplace_back(newQueue);
			                return newQueue;
			            }

			            /** @return copy of the native sycl device and the sycl context provided by the platform */
			            [[nodiscard]] std::pair<sycl::device, sycl::context> getNativeHandle() const noexcept
			            {
			                return {m_sycl_dev, m_platform->getContext()};
			            }

			            /** Wait for all queues created by this device which are still alive. */
			            void wait()
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::device);
			                // Copy queue weak refs under lock then release to avoid blocking other operations while waiting.
			                std::vector<std::weak_ptr<syclGeneric::Queue<Device>>> tmpQueues;
			                {
			                    std::lock_guard<std::mutex> lk{m_writeGuard};
			                    tmpQueues = queues;
			                }
			                for(auto& weakQueue : tmpQueues)
			                {
			                    if(auto queue = weakQueue.lock())
			                    {
			                        queue->wait();
			                    }
			                }
			            }

			        private:
			            friend struct internal::MakeEvent;

			            /** Create a new event on this device; the device itself only keeps a weak reference. */
			            Handle<syclGeneric::Event<Device>> makeEvent()
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::event + onHost::logger::device);
			                auto thisHandle = this->getSharedPtr();
			                std::lock_guard<std::mutex> lk{m_writeGuard};

			                // release the bookkeeping of events which do not exist anymore
			                dropExpired(events);

			                // the counter yields the same index sequence (0, 1, 2, ...) the never shrinking vector size provided
			                auto newEvent = std::make_shared<syclGeneric::Event<Device>>(std::move(thisHandle), m_nextEventIdx++);

			                events.emplace_back(newEvent);
			                return newEvent;
			            }

			            /** Remove all entries whose referenced object is already destroyed.
			             *
			             * The caller must hold m_writeGuard.
			             */
			            template<typename T_Tracked>
			            static void dropExpired(std::vector<std::weak_ptr<T_Tracked>>& tracked)
			            {
			                std::erase_if(tracked, [](auto const& entry) { return entry.expired(); });
			            }

			            // never called, only used to validate that this type fulfills the Device concept
			            void _()
			            {
			                static_assert(internal::concepts::Device<Device>);
			            }

			            friend struct alpaka::internal::GetDeviceType;

			            /** @return device kind reported by the platform */
			            auto getDeviceKind() const
			            {
			                return alpaka::internal::getDeviceKind(*m_platform.get());
			            }

			            Handle<T_Platform> m_platform;
			            uint32_t m_idx = 0u;
			            sycl::device m_sycl_dev;

			            // weak references to the alive queues/events created by this device, guarded by m_writeGuard
			            std::vector<std::weak_ptr<syclGeneric::Queue<Device>>> queues;
			            std::vector<std::weak_ptr<syclGeneric::Event<Device>>> events;
			            // index handed to the next created queue/event, guarded by m_writeGuard
			            std::size_t m_nextQueueIdx = 0u;
			            std::size_t m_nextEventIdx = 0u;
			            std::mutex m_writeGuard;

			            // queried once from the platform during construction
			            DeviceProperties m_properties;

			            friend struct alpaka::internal::GetApi;
			            friend struct internal::GetDeviceProperties;
			            friend struct internal::GetFreeGlobalMemBytes;
			            friend struct internal::AdjustThreadSpec;
			            friend struct onHost::internal::AllocDeferred;
			            friend struct onHost::internal::AllocUnified;
			            friend struct onHost::internal::AllocMapped;
			            friend struct onHost::internal::IsDataAccessible;
			        };
			    } // namespace syclGeneric

			    namespace internal
			    {

			        /** Allocation of device memory on a generic SYCL device. */
			        template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
			        struct Alloc::Op<T_Type, syclGeneric::Device<T_Platform>, T_Extents>
			        {
			            /** Allocate aligned device memory for `extents` elements of T_Type.
			             *
			             * @param device device the memory is allocated on
			             * @param extents number of elements per dimension
			             * @return shared buffer owning the memory; the memory is released via sycl::free
			             * @throws std::runtime_error if a non-empty allocation fails
			             */
			            auto operator()(syclGeneric::Device<T_Platform>& device, T_Extents const& extents) const
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
			                constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
			                    ALPAKA_TYPEOF(getApi(device)){},
			                    ALPAKA_TYPEOF(getDeviceKind(device)){});
			                auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);

			                auto deviceDependency = onHost::Device{device.getSharedPtr()};
			                auto [sycl_device, sycl_context] = device.getNativeHandle();

			                T_Type* ptr = static_cast<T_Type*>(
			                    sycl::aligned_alloc_device(alignment, memSizeInByte, sycl_device, sycl_context));
			                /* SYCL USM allocation functions report failure by returning nullptr. Without this check a failed
			                 * allocation would silently be wrapped into a buffer. A nullptr for an empty request is tolerated. */
			                if(ptr == nullptr && static_cast<std::size_t>(memSizeInByte) != 0u)
			                {
			                    throw std::runtime_error("Sycl device memory allocation failed.");
			                }
			                // the context is copied into the deleter because it is required to release the memory
			                auto deleter = [ctx = sycl_context, ptr]() { sycl::free(toVoidPtr(ptr), ctx); };

			                auto sharedBuffer = onHost::SharedBuffer{
			                    deviceDependency,
			                    ptr,
			                    extents,
			                    pitches,
			                    std::move(deleter),
			                    Alignment<alignment>{}};
			                return sharedBuffer;
			            }
			        };

			        /** Allocation of unified (USM shared) memory on a generic SYCL device. */
			        template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
			        struct AllocUnified::Op<T_Type, syclGeneric::Device<T_Platform>, T_Extents>
			        {
			            /** Allocate aligned shared memory for `extents` elements of T_Type.
			             *
			             * @param device device the memory is associated with
			             * @param extents number of elements per dimension
			             * @return shared buffer owning the memory; the memory is released via sycl::free
			             * @throws std::runtime_error if the device lacks sycl::aspect::usm_shared_allocations or if a non-empty
			             *         allocation fails
			             */
			            auto operator()(syclGeneric::Device<T_Platform>& device, T_Extents const& extents) const
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
			                constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
			                    ALPAKA_TYPEOF(getApi(device)){},
			                    ALPAKA_TYPEOF(getDeviceKind(device)){});
			                auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);

			                auto deviceDependency = onHost::Device{device.getSharedPtr()};
			                auto [sycl_device, sycl_context] = device.getNativeHandle();

			                bool isManagedMemorySupported = sycl_device.has(sycl::aspect::usm_shared_allocations);
			                if(!isManagedMemorySupported)
			                {
			                    throw std::runtime_error("Sycl device does not support unified memory allocations.");
			                }

			                T_Type* ptr = static_cast<T_Type*>(
			                    sycl::aligned_alloc_shared(alignment, memSizeInByte, sycl_device, sycl_context));
			                /* SYCL USM allocation functions report failure by returning nullptr. Without this check a failed
			                 * allocation would silently be wrapped into a buffer. A nullptr for an empty request is tolerated. */
			                if(ptr == nullptr && static_cast<std::size_t>(memSizeInByte) != 0u)
			                {
			                    throw std::runtime_error("Sycl unified memory allocation failed.");
			                }
			                // the context is copied into the deleter because it is required to release the memory
			                auto deleter = [ctx = sycl_context, ptr]() { sycl::free(toVoidPtr(ptr), ctx); };

			                auto sharedBuffer = onHost::SharedBuffer{
			                    deviceDependency,
			                    ptr,
			                    extents,
			                    pitches,
			                    std::move(deleter),
			                    Alignment<alignment>{}};
			                return sharedBuffer;
			            }
			        };

			        /** Allocation of mapped (USM host) memory within the context of a generic SYCL device. */
			        template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
			        struct AllocMapped::Op<T_Type, syclGeneric::Device<T_Platform>, T_Extents>
			        {
			            /** Allocate aligned host memory for `extents` elements of T_Type.
			             *
			             * @param device device whose sycl context is used for the allocation
			             * @param extents number of elements per dimension
			             * @return shared buffer owning the memory; the memory is released via sycl::free
			             * @throws std::runtime_error if a non-empty allocation fails
			             */
			            auto operator()(syclGeneric::Device<T_Platform>& device, T_Extents const& extents) const
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
			                constexpr uint32_t alignment = api::util::simdOptimizedAlignment<T_Type>(
			                    ALPAKA_TYPEOF(getApi(device)){},
			                    ALPAKA_TYPEOF(getDeviceKind(device)){});
			                auto [memSizeInByte, pitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);

			                auto deviceDependency = onHost::Device{device.getSharedPtr()};
			                // only the context is required, the native device is ignored
			                auto [_, sycl_context] = device.getNativeHandle();

			                T_Type* ptr
			                    = static_cast<T_Type*>(sycl::aligned_alloc_host(alignment, memSizeInByte, sycl_context));
			                /* SYCL USM allocation functions report failure by returning nullptr. Without this check a failed
			                 * allocation would silently be wrapped into a buffer. A nullptr for an empty request is tolerated. */
			                if(ptr == nullptr && static_cast<std::size_t>(memSizeInByte) != 0u)
			                {
			                    throw std::runtime_error("Sycl mapped memory allocation failed.");
			                }
			                // the context is copied into the deleter because it is required to release the memory
			                auto deleter = [ctx = sycl_context, ptr]() { sycl::free(toVoidPtr(ptr), ctx); };

			                auto sharedBuffer = onHost::SharedBuffer{
			                    deviceDependency,
			                    ptr,
			                    extents,
			                    pitches,
			                    std::move(deleter),
			                    Alignment<alignment>{}};
			                return sharedBuffer;
			            }
			        };

			        /** Check whether the memory referenced by a view can be accessed from a generic SYCL device. */
			        template<typename T_Platform, typename T_Any>
			        struct IsDataAccessible::FirstPath<syclGeneric::Device<T_Platform>, T_Any>
			        {
			            /** @return true if the data was allocated for `device`, is a USM shared allocation, or is unknown to the
			             *          SYCL context while the view belongs to the host api and the device is a (numa) cpu device
			             */
			            bool operator()(syclGeneric::Device<T_Platform>& device, T_Any const& view) const
			            {
			                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
			                auto [nativeDevice, nativeContext] = device.getNativeHandle();
			                auto const dataPtr = data(view);
			                auto const allocKind = sycl::get_pointer_type(dataPtr, nativeContext);

			                if(allocKind == sycl::usm::alloc::unknown)
			                {
			                    // assume that a sycl cpu device can always access host memory
			                    if constexpr(
			                        ALPAKA_TYPEOF(getApi(view)){} == api::host
			                        && (ALPAKA_TYPEOF(getDeviceKind(device)){} == deviceKind::cpu
			                            || ALPAKA_TYPEOF(getDeviceKind(device)){} == deviceKind::numaCpu))
			                    {
			                        return true;
			                    }
			                    else
			                    {
			                        return false;
			                    }
			                }

			                try
			                {
			                    if(sycl::get_pointer_device(dataPtr, nativeContext) == nativeDevice)
			                    {
			                        // sycl device allocated the memory
			                        return true;
			                    }
			                }
			                catch(...)
			                {
			                    // a failing device lookup is ignored, the decision falls back to the allocation kind
			                }

			                // shared allocations are accessible within the device context
			                return allocKind == sycl::usm::alloc::shared;
			            }
			        };

			        /** Property lookup for a generic SYCL device. */
			        template<typename T_Platform>
			        struct GetDeviceProperties::Op<syclGeneric::Device<T_Platform>>
			        {
			            /** @return copy of the properties stored in the device during its construction */
			            DeviceProperties operator()(syclGeneric::Device<T_Platform> const& device) const
			            {
			                DeviceProperties cachedProperties = device.m_properties;
			                return cachedProperties;
			            }
			        };

			        /** Derive a thread specification from a frame specification for a generic SYCL device.
			         *
			         * The frame extents are used as number of threads per block and limited to the maximum supported number of
			         * threads per block.
			         */
			        template<
			            typename T_Platform,
			            alpaka::concepts::Executor T_Executor,
			            alpaka::concepts::Vector T_NumFrames,
			            alpaka::concepts::Vector T_FrameExtents,
			            alpaka::concepts::KernelBundle T_KernelBundle>
			        struct AdjustThreadSpec::
			            Op<syclGeneric::Device<T_Platform>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
			        {
			            using FrameSpecType = FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>;

			            /** Overload for compile-time frame extents: the limit is applied within a constant expression. */
			            auto operator()(
			                syclGeneric::Device<T_Platform> const& device,
			                FrameSpecType const& frameSpec,
			                T_KernelBundle const& kernelBundle) const requires alpaka::concepts::CVector<T_FrameExtents>
			            {
			                alpaka::unused(device, kernelBundle);
			                ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::device);

			                using DeviceKindType = ALPAKA_TYPEOF(getDeviceKind(device));
			                using ApiType = ALPAKA_TYPEOF(getApi(device));

			                auto requestedThreads = frameSpec.getFrameExtents();
			                // the limit depends only on api, device kind and executor and is therefore known at compile time
			                constexpr auto limitedThreads = api::util::adjustToLimit<
			                    alpaka::onHost::getMaxThreadsPerBlock(ApiType{}, DeviceKindType{}, T_Executor{}),
			                    0u,
			                    1u>(requestedThreads);
			                return ThreadSpec{frameSpec.getNumFrames(), limitedThreads};
			            }

			            /** Overload for runtime frame extents: the limit is taken from the properties stored in the device. */
			            auto operator()(
			                syclGeneric::Device<T_Platform> const& device,
			                FrameSpecType const& frameSpec,
			                T_KernelBundle const& kernelBundle) const
			            {
			                alpaka::unused(kernelBundle);
			                ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::device);

			                auto const threadLimit = device.m_properties.maxThreadsPerBlock;
			                auto requestedThreads = frameSpec.getFrameExtents();

			                auto limitedThreads = api::util::adjustToLimit(requestedThreads, threadLimit);
			                return ThreadSpec{frameSpec.getNumFrames(), limitedThreads};
			            }
			        };

			    } // namespace internal
			} // namespace alpaka::onHost

			namespace alpaka::internal
			{
			    /** Api lookup for a generic SYCL device: the api of a device is the api of its platform. */
			    template<typename T_Platform>
			    struct GetApi::Op<onHost::syclGeneric::Device<T_Platform>>
			    {
			        decltype(auto) operator()(auto&& device) const
			        {
			            // forward the query to the platform referenced by the device
			            auto& owningPlatform = *device.m_platform.get();
			            return internal::getApi(owningPlatform);
			        }
			    };
			} // namespace alpaka::internal

			#endif
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/Device.hpp ==
			// ============================================================================

		// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/trait.hpp"    // amalgamate: file already inlined
		// #include "executor.hpp"    // amalgamate: file already inlined

		#if ALPAKA_LANG_ONEAPI

		// #    include <sycl/sycl.hpp>    // amalgamate: file already included

		namespace alpaka::onHost::trait
		{
		    /** Marks the OneApi executor as supported by every generic SYCL device (the trait derives from
		     * std::true_type for any T_Platform).
		     */
		    template<typename T_Platform>
		    struct IsExecutorSupportedBy::Op<alpaka::exec::OneApi, alpaka::onHost::syclGeneric::Device<T_Platform>>
		        : std::true_type
		    {
		    };
		} // namespace alpaka::onHost::trait

		namespace alpaka::onHost::internal
		{
		    /** Query of the free global memory of a generic SYCL device (OneApi). */
		    template<typename T_Platform>
		    struct GetFreeGlobalMemBytes::Op<syclGeneric::Device<T_Platform>>
		    {
		        /** @return free memory in bytes, taken from the host for cpu devices and from the Intel SYCL extension
		         *          `free_memory` device info otherwise
		         */
		        size_t operator()(syclGeneric::Device<T_Platform> const& device) const
		        {
		            /* OneApi for CPU is not defining the ext_intel_free_memory aspect, therefore we fall back to query it
		             * directly from the host.
		             */
		            if constexpr(ALPAKA_TYPEOF(device.getDeviceKind()){} == deviceKind::cpu)
		            {
		                return onHost::getFreeGlobalMemBytes();
		            }
		            else
		            {
		                // the explicit else keeps the extension query out of the cpu code path instead of leaving it
		                // behind as unreachable code
		                sycl::device const dev = std::get<0>(device.getNativeHandle());
		                return dev.get_info<sycl::ext::intel::info::device::free_memory>();
		            }
		        }
		    };
		} // namespace alpaka::onHost::internal

		#endif
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/Device.hpp ==
		// ============================================================================

		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/Platform.hpp ==
		// ==
		/* Copyright 2025 Simeon Ehrig
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/api/oneApi/Api.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/Platform.hpp ==
			// ==
			/* Copyright 2025 Simeon Ehrig
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			// #include "alpaka/api/syclGeneric/Device.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/Dict.hpp"    // amalgamate: file already inlined
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/Sycl.hpp ==
				// ==
				/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
				// #include "alpaka/meta/IntegerSequence.hpp"    // amalgamate: file already inlined

				// #include <array>    // amalgamate: file already included
				// #include <cstddef>    // amalgamate: file already included
				#include <cstdio> // the #define printf(...) breaks <cstdio> if it is included afterwards
				// #include <iostream>    // amalgamate: file already included
				// #include <stdexcept>    // amalgamate: file already included
				// #include <string>    // amalgamate: file already included
				#include <type_traits>
				// #include <utility>    // amalgamate: file already included

				#if ALPAKA_LANG_SYCL

				// #    include <sycl/sycl.hpp>    // amalgamate: file already included

				// if SYCL is enabled with the AMD backend the printf will be killed because of missing compiler support
				// (with __AMDGCN__ defined every printf(...) expands to nothing)
				#    ifdef __AMDGCN__
				#        define printf(...)
				#    else

				// Type of the format string handed to the SYCL printf: in device-only compilation the pointer type carries
				// clang's opencl_constant attribute, otherwise it is a plain C string pointer.
				#        ifdef __SYCL_DEVICE_ONLY__
				using AlpakaFormat = char const* [[clang::opencl_constant]];
				#        else
				using AlpakaFormat = char const*;
				#        endif

				// clang: silence the warning about the GNU `##__VA_ARGS__` extension used by the macro below
				#        if ALPAKA_COMP_CLANG
				#            pragma clang diagnostic push
				#            pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
				#        endif

				// Redirect printf to sycl::ext::oneapi::experimental::printf. The format string is stored in a function-local
				// static of type AlpakaFormat before it is forwarded together with the remaining arguments.
				#        define printf(FORMAT, ...)                                                                                   \
				            do                                                                                                        \
				            {                                                                                                         \
				                static auto const format = AlpakaFormat{FORMAT};                                                      \
				                sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__);                                       \
				            } while(false)

				#        if ALPAKA_COMP_CLANG
				#            pragma clang diagnostic pop
				#        endif

				#    endif

				// SYCL vector types trait specializations.
				namespace alpaka
				{
				    namespace detail
				    {
				        // Yields true if T is identical to at least one of the types in Ts; an empty list yields false.
				        template<typename T, typename... Ts>
				        struct is_any : std::disjunction<std::is_same<T, Ts>...>
				        {
				        };
				    } // namespace detail

				    //! Trait which is true if T is one of the SYCL types listed below.
				    //!
				    //! In contrast to CUDA SYCL doesn't know 1D vectors. It does
				    //! support OpenCL's data types which have additional requirements
				    //! on top of those in the C++ standard. Note that SYCL's equivalent
				    //! to CUDA's dim3 type is a different class type and thus not used
				    //! here.
				    template<typename T>
				    struct IsSyclBuiltInType
				        : detail::is_any<
				              T,
				              // built-in scalar types - only sycl::half is listed here; the standard C++ built-in types,
				              // std::size_t and std::byte are not part of this list
				              sycl::half,

				              // 2 component vector types
				              sycl::char2,
				              sycl::uchar2,
				              sycl::short2,
				              sycl::ushort2,
				              sycl::int2,
				              sycl::uint2,
				              sycl::long2,
				              sycl::ulong2,
				              sycl::float2,
				              sycl::double2,
				              sycl::half2,

				              // 3 component vector types
				              sycl::char3,
				              sycl::uchar3,
				              sycl::short3,
				              sycl::ushort3,
				              sycl::int3,
				              sycl::uint3,
				              sycl::long3,
				              sycl::ulong3,
				              sycl::float3,
				              sycl::double3,
				              sycl::half3,

				              // 4 component vector types
				              sycl::char4,
				              sycl::uchar4,
				              sycl::short4,
				              sycl::ushort4,
				              sycl::int4,
				              sycl::uint4,
				              sycl::long4,
				              sycl::ulong4,
				              sycl::float4,
				              sycl::double4,
				              sycl::half4,

				              // 8 component vector types
				              sycl::char8,
				              sycl::uchar8,
				              sycl::short8,
				              sycl::ushort8,
				              sycl::int8,
				              sycl::uint8,
				              sycl::long8,
				              sycl::ulong8,
				              sycl::float8,
				              sycl::double8,
				              sycl::half8,

				              // 16 component vector types
				              sycl::char16,
				              sycl::uchar16,
				              sycl::short16,
				              sycl::ushort16,
				              sycl::int16,
				              sycl::uint16,
				              sycl::long16,
				              sycl::ulong16,
				              sycl::float16,
				              sycl::double16,
				              sycl::half16>
				    {
				    };
				} // namespace alpaka
				#endif
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/Sycl.hpp ==
				// ============================================================================

			// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined

			#if ALPAKA_LANG_SYCL

			// #    include <sycl/sycl.hpp>    // amalgamate: file already included

			#    include <map>
			// #    include <memory>    // amalgamate: file already included
			#    include <numeric>
			// #    include <optional>    // amalgamate: file already included

			namespace alpaka
			{
			    namespace detail
			    {
			        template<typename T_DeviceKind>
			        struct SYCLDeviceSelector;

			        struct Context
			        {
			            Context() = default;

			            sycl::platform getPlatformByName(std::string const& platformName)
			            {
			                auto platforms = sycl::platform::get_platforms();

			                for(auto const& platform : platforms)
			                {
			                    if(platform.get_info<sycl::info::platform::name>() == platformName)
			                    {
			                        return platform;
			                    }
			                }

			                throw std::runtime_error("Platform not found");
			            }

			            auto getContext(sycl::platform platform)
			            {
			                std::string platformName = platform.get_info<sycl::info::platform::name>();
			                if(contextMap.contains(platformName))
			                {
			                    return contextMap[platformName];
			                }

			                std::vector<sycl::device> devices;
			                try
			                {
			                    devices = platform.get_devices();
			                }
			                catch(...)
			                {
			                    devices.clear();
			                }
			                if(devices.size())
			                {
			                    auto context = sycl::context{
			                        platform.get_devices(),
			                        [](sycl::exception_list exceptions)
			                        {
			                            auto ss_err = std::stringstream{};
			                            ss_err << "Caught asynchronous SYCL exception(s):\n";
			                            for(std::exception_ptr e : exceptions)
			                            {
			                                try
			                                {
			                                    std::rethrow_exception(e);
			                                }
			                                catch(sycl::exception const& err)
			                                {
			                                    ss_err << err.what() << " (" << err.code() << ")\n";
			                                }
			                            }
			                            throw std::runtime_error(ss_err.str());
			                        }};
			                    return contextMap[platformName] = context;
			                }
			                return sycl::context{};
			            }

			            std::map<std::string, sycl::context> contextMap;
			        };
			    } // namespace detail

			    namespace onHost

			    {
			        namespace syclGeneric
			        {
			            /** Platform of a SYCL based API for one device kind.
			             *
			             * The constructor selects the native SYCL platform with detail::SYCLDeviceSelector<T_DeviceKind>,
			             * queries its devices and requests the context from the shared detail::Context. If one of these steps
			             * throws, or if kernels can not be built for the devices, the platform stays usable but empty: it has
			             * no context and reports zero devices.
			             *
			             * The platform is neither copyable nor movable. getSharedPtr() uses shared_from_this(), therefore the
			             * instance must be owned by a std::shared_ptr.
			             *
			             * @tparam T_ApiInterface API this platform belongs to, must provide a static getName()
			             * @tparam T_DeviceKind kind of the devices this platform represents
			             */
			            template<typename T_ApiInterface, alpaka::concepts::DeviceKind T_DeviceKind>
			            struct Platform : std::enable_shared_from_this<Platform<T_ApiInterface, T_DeviceKind>>
			            {
			            private:
			                /** Checks if kernels can be compiled for the context and given list of devices
			                 *
			                 * It is possible that we can create a context and a list of devices for an API and device kind.
			                 * That does not mean we can compile kernels for the devices.
			                 * A reason can be:
			                 *   - the compile flags are not set to build kernels for the given device kind
			                 *   - dependencies e.g. CUDA/HIP are not available
			                 *
			                 * Any exception raised by the SYCL runtime during the check is treated as "can not compile".
			                 *
			                 * @param devs devices the kernels must be buildable for
			                 * @param ctx context the devices belong to
			                 * @return true if we can build kernels for the devices, else false.
			                 */
			                bool checkIfKernelsCanBeCompiled(std::vector<sycl::device> const& devs, sycl::context ctx)
			                {
			                    if(devs.empty())
			                        return false;
			                    try
			                    {
			                        auto kernelIds = sycl::get_kernel_ids();

			                        // No application kernels exist, so there is nothing to validate.
			                        if(kernelIds.empty())
			                            return true;
			                        // Check if we have already pre-compiled binaries/executables for the devices.
			                        if(sycl::has_kernel_bundle<sycl::bundle_state::executable>(ctx, devs))
			                        {
			                            // an executable exists already
			                            return true;
			                        }
			                        // Check if we can compile for the devices.
			                        if(sycl::has_kernel_bundle<sycl::bundle_state::input>(ctx, devs))
			                        {
			                            auto input = sycl::get_kernel_bundle<sycl::bundle_state::input>(ctx, devs);

			                            auto executable = sycl::build(input, devs);

			                            // return true if we can build the kernels for the devices
			                            return !executable.empty();
			                        }
			                        return false;
			                    }
			                    catch(...)
			                    {
			                        return false;
			                    }
			                }

			            public:
			                /** Set up the native platform, devices and context.
			                 *
			                 * The constructor never throws because of SYCL errors: every failure results in an empty platform.
			                 */
			                Platform() : contextManager{make_sharedSingleton<detail::Context>()}
			                {
			                    try
			                    {
			                        syclPlatform = sycl::platform{detail::SYCLDeviceSelector<T_DeviceKind>{}};
			                        syclDevices = syclPlatform->get_devices();
			                        devices.resize(syclDevices.size());
			                        syclContext = contextManager->getContext(syclPlatform.value());

			                        /* If no call before fired an exception we need to check if we can build kernels for the
			                         * context and devices. If we are not able to compile kernels for the devices, we throw
			                         * to reset the context and device list.
			                         */
			                        if(!checkIfKernelsCanBeCompiled(syclDevices, *syclContext))
			                        {
			                            // note the blank in front of "and", it separates the API name from the conjunction
			                            auto msg
			                                = (std::string("kernel_bundle_building for ") + T_ApiInterface::getName() + " and "
			                                   + T_DeviceKind::getName() + " failed");
			                            throw std::runtime_error(msg);
			                        }
			                    }
			                    catch(...)
			                    {
			                        /* Reset all members, to show that the platform does not have a valid context and devices.
			                         * If later the number of devices is queried it will return that zero devices are available.
			                         */
			                        syclContext.reset();
			                        syclPlatform.reset();
			                        syclDevices.clear();
			                        devices.clear();
			                    }
			                }

			                Platform(Platform const&) = delete;
			                Platform& operator=(Platform const&) = delete;

			                Platform(Platform&&) = delete;
			                Platform& operator=(Platform&&) = delete;

			                //! @return shared pointer to this platform, obtained via shared_from_this()
			                std::shared_ptr<Platform<T_ApiInterface, T_DeviceKind>> getSharedPtr()
			                {
			                    return this->shared_from_this();
			                }

			                /** Get a copy of the SYCL context of this platform.
			                 *
			                 * @throws std::runtime_error if the platform has no valid context, e.g. because the setup in the
			                 *         constructor failed
			                 */
			                auto getContext() const
			                {
			                    if(!syclContext.has_value())
			                        throw std::runtime_error("The underlying SYCL context is invalid.");
			                    return syclContext.value();
			                }

			                /** Number of devices available on this platform.
			                 *
			                 * @return zero if T_DeviceKind is not supported by the API of this platform
			                 *         (trait::IsDeviceSupportedBy), else the number of devices found during construction
			                 */
			                uint32_t getDeviceCount() const
			                {
			                    ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
			                    constexpr bool isSupportedDev = trait::IsDeviceSupportedBy::
			                        Op<T_DeviceKind, ALPAKA_TYPEOF(alpaka::internal::getApi(std::declval<Platform>()))>::value;
			                    if constexpr(isSupportedDev)
			                    {
			                        auto numDevices = devices.size();
			                        return static_cast<uint32_t>(numDevices);
			                    }
			                    return 0u;
			                }

			                /** Get the handle of the device with the given index.
			                 *
			                 * Devices are tracked with weak pointers: as long as a handle for idx is alive the same device
			                 * instance is returned, otherwise a new device is created. Lookup and creation are serialized
			                 * with deviceGuard.
			                 *
			                 * @param idx index of the device, must be smaller than getDeviceCount()
			                 * @throws std::runtime_error if idx is out of range
			                 */
			                Handle<syclGeneric::Device<Platform<T_ApiInterface, T_DeviceKind>>> makeDevice(uint32_t const& idx)
			                {
			                    ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
			                    uint32_t const numDevices = getDeviceCount();
			                    if(idx >= numDevices)
			                    {
			                        std::stringstream ssErr;
			                        ssErr << "Unable to return device handle for SYCL device with index " << idx
			                              << " because there are only " << numDevices << " devices!";
			                        throw std::runtime_error(ssErr.str());
			                    }

			                    std::lock_guard<std::mutex> lk{deviceGuard};

			                    if(auto sharedPtr = devices[idx].lock())
			                    {
			                        return sharedPtr;
			                    }

			                    // getSharedPtr() already yields a temporary, wrapping it into std::move() has no effect
			                    auto newDevice = std::make_shared<syclGeneric::Device<Platform<T_ApiInterface, T_DeviceKind>>>(
			                        getSharedPtr(),
			                        syclDevices[idx],
			                        idx);
			                    devices[idx] = newDevice;
			                    return newDevice;
			                }

			                //! @return demangled type name of this platform specialization
			                static constexpr auto getName()
			                {
			                    return onHost::demangledName<syclGeneric::Platform<T_ApiInterface, T_DeviceKind>>();
			                }

			                friend struct internal::GetDeviceProperties::Op<syclGeneric::Platform<T_ApiInterface, T_DeviceKind>>;

			            private:
			                friend struct onHost::internal::IsDataAccessible;
			                friend struct GetDeviceProperties;

			                // The context manager is required to be able to use the same sycl context for different device types
			                std::shared_ptr<alpaka::detail::Context> contextManager;
			                // context of the native platform, empty if the setup in the constructor failed
			                std::optional<sycl::context> syclContext;
			                // native sycl platform for the corresponding device kind this platform is representing
			                std::optional<sycl::platform> syclPlatform;
			                // native sycl devices for the corresponding device kind this platform is representing
			                std::vector<sycl::device> syclDevices;
			                // alpaka devices for the internal hierarchy
			                std::vector<std::weak_ptr<syclGeneric::Device<Platform<T_ApiInterface, T_DeviceKind>>>> devices;

			                // serializes the lookup and creation of devices in makeDevice()
			                std::mutex deviceGuard;

			                // compile time check that this type fulfills the platform concept
			                void _()
			                {
			                    static_assert(internal::concepts::Platform<Platform>);
			                }
			            };
			        } // namespace syclGeneric

			        namespace internal
			        {
			            template<typename T_ApiInterface, alpaka::concepts::DeviceKind T_DeviceKind>
			            struct GetDeviceProperties::Op<syclGeneric::Platform<T_ApiInterface, T_DeviceKind>>
			            {
			                /** Collect the properties of one native SYCL device of the platform.
			                 *
			                 * @param platform platform that owns the device
			                 * @param deviceIdx index into the native device list of the platform
			                 * @return properties queried from the SYCL device
			                 * @throws std::runtime_error if deviceIdx is not a valid index
			                 */
			                DeviceProperties operator()(
			                    syclGeneric::Platform<T_ApiInterface, T_DeviceKind> const& platform,
			                    uint32_t deviceIdx) const
			                {
			                    ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
			                    if(deviceIdx >= platform.syclDevices.size())
			                    {
			                        std::stringstream ssErr;
			                        ssErr << "Unable to return device properties for SYCL device with index " << deviceIdx
			                              << " because there are only " << platform.getDeviceCount() << " devices!";
			                        throw std::runtime_error(ssErr.str());
			                    }
			                    sycl::device const dev = platform.syclDevices[deviceIdx];

			                    auto prop = DeviceProperties{};
			                    prop.name = dev.get_info<sycl::info::device::name>();

			                    // @todo do not reduce the sub-group sizes to a single value, return all values
			                    std::vector<std::size_t> subGroupSizes = dev.get_info<sycl::info::device::sub_group_sizes>();
			                    std::size_t largestSubGroup = 0;
			                    for(std::size_t const candidate : subGroupSizes)
			                    {
			                        largestSubGroup = std::max(largestSubGroup, candidate);
			                        // The CPU runtime supports a sub-group size of 64, but the SYCL implementation
			                        // currently does not, therefore the value is limited to 32 for CPU devices.
			                        if constexpr(T_DeviceKind{} == deviceKind::cpu)
			                        {
			                            if(largestSubGroup > 32)
			                                largestSubGroup = 32;
			                        }
			                    }
			                    prop.warpSize = static_cast<uint32_t>(largestSubGroup);

			                    prop.multiProcessorCount = dev.get_info<sycl::info::device::max_compute_units>();
			                    prop.globalMemCapacityBytes = dev.get_info<sycl::info::device::global_mem_size>();
			                    prop.sharedMemPerBlockBytes = dev.get_info<sycl::info::device::local_mem_size>();

			                    prop.maxThreadsPerBlock = dev.get_info<sycl::info::device::max_work_group_size>();
			                    // captured by value in the lambda below
			                    auto perDimLimit = dev.get_info<sycl::info::device::max_work_item_sizes<3>>();
			                    // the index order of sycl is equal to the index order of alpaka
			                    prop.fnMaxThreadsPerBlock
			                        = [linearLimit = prop.maxThreadsPerBlock, perDimLimit](uint32_t* data, uint32_t numDims)
			                    {
			                        if(numDims > 3u)
			                        {
			                            /* For more than 3 dimensions alpaka is linearizing to one dimension, therefore we use the
			                             * maximum for each dimension. */
			                            for(uint32_t i = 0u; i < numDims; ++i)
			                                data[i] = linearLimit;
			                            return;
			                        }
			                        // align the last dimensions: the last entry of data receives the last SYCL dimension
			                        for(uint32_t i = 0u; i < numDims; ++i)
			                            data[numDims - 1u - i] = perDimLimit[3u - 1u - i];
			                    };

			                    prop.maxBlocksPerGrid = std::numeric_limits<uint32_t>::max();
			                    prop.fnMaxBlocksPerGrid = [](uint32_t* data, uint32_t numDims)
			                    {
			                        // no limit is queried from the device, every dimension reports the largest uint32_t
			                        for(uint32_t i = 0u; i < numDims; ++i)
			                            data[i] = std::numeric_limits<uint32_t>::max();
			                    };

			                    return prop;
			                }
			            };
			        } // namespace internal

			    } // namespace onHost
			} // namespace alpaka
			#endif
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/Platform.hpp ==
			// ============================================================================

		// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
		// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/internal/interface.hpp"    // amalgamate: file already inlined
		// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

		#if ALPAKA_LANG_ONEAPI

		namespace alpaka
		{
		    namespace detail
		    {
		        //! Device selector for CPU devices: scores a CPU device with 1 and every other device with -1.
		        template<>
		        struct SYCLDeviceSelector<deviceKind::Cpu>
		        {
		            auto operator()(sycl::device const& dev) const -> int
		            {
		                if(dev.is_cpu())
		                    return 1;
		                return -1;
		            }
		        };

		        //! Device selector for Intel GPUs: scores a GPU whose vendor string contains "Intel(R) Corporation" with 1
		        //! and every other device with -1.
		        template<>
		        struct SYCLDeviceSelector<deviceKind::IntelGpu>
		        {
		            auto operator()(sycl::device const& dev) const -> int
		            {
		                auto const vendorName = dev.get_info<sycl::info::device::vendor>();
		                if(!dev.is_gpu())
		                    return -1;

		                bool const vendorMatches = vendorName.find("Intel(R) Corporation") != std::string::npos;
		                return vendorMatches ? 1 : -1;
		            }
		        };

		        //! Device selector for NVIDIA GPUs: scores a GPU whose vendor string contains "NVIDIA" with 1 and every
		        //! other device with -1.
		        template<>
		        struct SYCLDeviceSelector<deviceKind::NvidiaGpu>
		        {
		            auto operator()(sycl::device const& dev) const -> int
		            {
		                auto const vendorName = dev.get_info<sycl::info::device::vendor>();
		                if(!dev.is_gpu())
		                    return -1;

		                bool const vendorMatches = vendorName.find("NVIDIA") != std::string::npos;
		                return vendorMatches ? 1 : -1;
		            }
		        };

		        //! Device selector for AMD GPUs: scores a GPU whose vendor string contains "AMD" with 1 and every other
		        //! device with -1.
		        template<>
		        struct SYCLDeviceSelector<deviceKind::AmdGpu>
		        {
		            auto operator()(sycl::device const& dev) const -> int
		            {
		                auto const vendorName = dev.get_info<sycl::info::device::vendor>();
		                if(!dev.is_gpu())
		                    return -1;

		                bool const vendorMatches = vendorName.find("AMD") != std::string::npos;
		                return vendorMatches ? 1 : -1;
		            }
		        };

		    } // namespace detail

		    namespace onHost
		    {
		        namespace internal
		        {
		            //! Creates the oneAPI platform for a device kind via onHost::make_sharedSingleton.
		            template<alpaka::concepts::DeviceKind T_DeviceKind>
		            struct MakePlatform::Op<api::OneApi, T_DeviceKind>
		            {
		                auto operator()(api::OneApi const&, T_DeviceKind) const
		                {
		                    using PlatformType = syclGeneric::Platform<api::OneApi, T_DeviceKind>;
		                    return onHost::make_sharedSingleton<PlatformType>();
		                }
		            };
		        } // namespace internal
		    } // namespace onHost

		    namespace internal
		    {
		        //! Reports api::OneApi as the API of a oneAPI SYCL platform.
		        template<alpaka::concepts::DeviceKind T_DeviceKind>
		        struct GetApi::Op<onHost::syclGeneric::Platform<api::OneApi, T_DeviceKind>>
		        {
		            decltype(auto) operator()(auto&& platform) const
		            {
		                // the API is fixed by the platform type, the instance is not inspected
		                alpaka::unused(platform);
		                using ApiType = api::OneApi;
		                return ApiType{};
		            }
		        };

		        //! Reports the device kind a oneAPI SYCL platform was instantiated with.
		        template<alpaka::concepts::DeviceKind T_DeviceKind>
		        struct GetDeviceType::Op<onHost::syclGeneric::Platform<api::OneApi, T_DeviceKind>>
		        {
		            decltype(auto) operator()(auto&& platform) const
		            {
		                // the device kind is fixed by the platform type, the instance is not inspected
		                alpaka::unused(platform);
		                using KindType = T_DeviceKind;
		                return KindType{};
		            }
		        };
		    } // namespace internal
		} // namespace alpaka

		namespace alpaka::onHost::internal
		{
		    template<alpaka::concepts::DeviceKind T_DeviceKind, typename T_Any>
		    struct IsDataAccessible::SecondPath<api::OneApi, T_DeviceKind, T_Any>
		    {
		        /** Query the USM allocation type of the data of a view in the context of one device kind.
		         *
		         * sycl_data_alloc_type is only overwritten if the query reports something else than
		         * sycl::usm::alloc::unknown. Every exception raised during the query is swallowed.
		         *
		         * @param deviceKind device kind whose oneAPI platform context is used for the query
		         * @param sycl_data_alloc_type in/out: allocation type found so far
		         * @param view view whose data pointer is inspected
		         */
		        static void getPtrType(auto deviceKind, auto& sycl_data_alloc_type, auto const& view)
		        {
		            try
		            {
		                auto kindPlatform
		                    = onHost::make_sharedSingleton<syclGeneric::Platform<api::OneApi, ALPAKA_TYPEOF(deviceKind)>>();
		                auto kindContext = kindPlatform->getContext();
		                auto queriedType = get_pointer_type(Data::data(view), kindContext);

		                if(queriedType == sycl::usm::alloc::unknown)
		                    return;
		                sycl_data_alloc_type = queriedType;
		            }
		            catch(...)
		            {
		                // Due to missing drivers or other issues the pointer type can not be queried. In this case the
		                // memory is assumed to be not accessible for the device.
		            }
		        }

		        /** Check if the data of a view is accessible from a device of the given kind.
		         *
		         * The allocation type is queried for every device kind supported by the API.
		         *
		         * @return true only for the device kinds cpu and numaCpu if the data is a shared or host USM allocation
		         */
		        bool operator()(api::OneApi usedApi, T_DeviceKind deviceKind, T_Any const& view) const
		        {
		            auto supportedKinds = onHost::supportedDevices(usedApi);
		            auto allocType = sycl::usm::alloc::unknown;
		            alpaka::apply(
		                [&allocType, &view](auto... kind) { (getPtrType(kind, allocType, view), ...); },
		                supportedKinds);

		            /* If the device kind is not a CPU and the USM allocation type is shared, we do not know if the memory
		             * is shared within the same sycl context. Therefore, shared and host allocated memory is only reported
		             * as accessible if the device kind is a CPU.
		             */
		            if(!(deviceKind == deviceKind::cpu || deviceKind == deviceKind::numaCpu))
		                return false;

		            return allocType == sycl::usm::alloc::shared || allocType == sycl::usm::alloc::host;
		        }
		    };
		} // namespace alpaka::onHost::internal

		#endif
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/Platform.hpp ==
		// ============================================================================

		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/Queue.hpp ==
		// ==
		/* Copyright 2025 Simeon Ehrig, René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/api/generic.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/StaticSharedMemory.hpp ==
			// ==
			/* Copyright 2025 Rene Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/Assert.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/Dict.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

			#if ALPAKA_LANG_ONEAPI

			// #    include <sycl/sycl.hpp>    // amalgamate: file already included

			// #    include <functional>    // amalgamate: file already included

			namespace alpaka::onAcc
			{
			    namespace oneApi
			    {
			        namespace detail
			        {
			            /** Pointer lookup table
			             *
			             * Provides a dynamic lookup table to map a unique id to a pointer.
			             *
			             * The table does not own memory: the entries live in the buffer handed to the constructor and
			             * are addressed as an array of MetaData. The number of used entries (m_numEntries) is a member of
			             * the table object itself and is not stored in that buffer.
			             */
			            class PtrLookupTable
			            {
			                //! One entry of the lookup table.
			                struct MetaData
			                {
			                    //! pointer to allocated data
			                    std::byte* ptr = nullptr;
			                    //! Unique id of the data chunk ptr refers to.
			                    size_t id = std::numeric_limits<size_t>::max();
			                };

			                //! size of one table entry in bytes
			                static constexpr uint32_t metaDataSize = sizeof(MetaData);

			            public:
			#    ifndef NDEBUG
			                /** Debug build: additionally stores how many entries fit into the buffer.
			                 *
			                 * @param mem buffer used to store the table entries
			                 * @param capacity size of the buffer in bytes, converted into a number of entries
			                 */
			                PtrLookupTable(std::byte* mem, uint32_t capacity)
			                    : m_mem(reinterpret_cast<MetaData*>(mem))
			                    , m_capacity(capacity / metaDataSize)
			                {
			                    // a buffer must be provided if and only if there is room for at least one entry
			                    ALPAKA_ASSERT_ACC((m_mem == nullptr) == (m_capacity == 0u));
			                }
			#    else
			                /** Release build: the capacity argument is ignored.
			                 *
			                 * @param mem buffer used to store the table entries
			                 */
			                PtrLookupTable(std::byte* mem, uint32_t) : m_mem(reinterpret_cast<MetaData*>(mem))
			                {
			                }
			#    endif

			                /** number of bytes required for bookkeeping of maxNumUniqueAllocations unique allocations
			                 *
			                 * @param maxNumUniqueAllocations number of unique allocations a user is allowed to perform
			                 * @return bytes required to store lookup meta data
			                 */
			                static consteval uint32_t sizeLookupBufferInBytes(uint32_t maxNumUniqueAllocations)
			                {
			                    return metaDataSize * maxNumUniqueAllocations;
			                }

			                /* With oneApi 2025.2 the behaviour of shared memory allocation has changed. It behaves like cuda
			                 * shared memory. Therefore, we need a unique data type to avoid pointer aliasing. Using the helper
			                 * class for data alignment is backward compatible to previous versions. The reason for using
			                 * std::byte is that this guarantees support for data types which are not trivially constructible.
			                 */
			                template<typename T, size_t T_id>
			                struct alignas(T) SharedMemData
			                {
			                    std::byte data[sizeof(T)];
			                };

			                /** Allocate storage for a variable and register it in the lookup table.
			                 *
			                 * The storage is requested with sycl::ext::oneapi::group_local_memory_for_overwrite for the type
			                 * SharedMemData<T, T_id>, no constructor of T is called here.
			                 *
			                 * @tparam T type of the variable
			                 * @tparam T_id unique id of the variable
			                 * @return pointer to the storage of the variable
			                 */
			                template<typename T, size_t T_id>
			                T* alloc() const
			                {
			                    auto group = sycl::ext::oneapi::this_work_item::get_work_group<1>();
			                    SharedMemData<T, T_id>* data
			                        = sycl::ext::oneapi::group_local_memory_for_overwrite<SharedMemData<T, T_id>>(group);

			                    // take the next free entry; m_numEntries is mutable because alloc() is a const method
			                    MetaData& metaDataEntry = m_mem[m_numEntries];
			                    ++m_numEntries;
			                    ALPAKA_ASSERT_ACC(m_numEntries <= m_capacity);

			                    // Update meta data with id and pointer to the current allocation
			                    if(group.get_local_linear_id() == 0u)
			                    {
			                        // only one thread must update the pointer in shared memory
			                        metaDataEntry.ptr = reinterpret_cast<std::byte*>(data);
			                    }
			                    // in contrast to the pointer, the id is written by every calling work-item
			                    metaDataEntry.id = T_id;

			                    return reinterpret_cast<T*>(data);
			                }

			                //! Give the pointer to an existing variable
			                //!
			                //! @tparam T type of the variable
			                //! @param id unique id of the variable
			                //! @return nullptr if a variable with the id does not exist
			                template<typename T>
			                auto getVarPtr(size_t id) const -> T*
			                {
			                    // Iterate over metadata
			                    for(uint32_t off = 0u; off < m_numEntries; ++off)
			                    {
			                        MetaData& metaDataEntry = m_mem[off];

			                        if(metaDataEntry.id == id)
			                            return reinterpret_cast<T*>(metaDataEntry.ptr);
			                    }

			                    // Variable not found.
			                    return nullptr;
			                }

			            private:
			                //! Number of unique meta data entries stored
			                mutable uint32_t m_numEntries = 0u;

			                //! Start of the bookkeeping buffer, addressed as an array of MetaData entries.
			                //!
			                //! NOTE(review): a previous version of this comment described the layout
			                //! |Header|Padding|Variable|Padding|Header|....uninitialized Data ....
			                //! alloc() stores only MetaData entries in this buffer and obtains the variables from
			                //! group_local_memory_for_overwrite, so that layout looks outdated. TODO confirm.
			                MetaData* const m_mem;
			#    ifndef NDEBUG
			                //! max number of meta data entries
			                uint32_t const m_capacity;
			#    endif
			            };
			        } // namespace detail

			        class StaticSharedMemory : private detail::PtrLookupTable
			        {
			        public:
			            /** number of bytes required for bookkeeping of mayNumberOfAllocations unique allcoations
			             *
			             * @param maxNumUniqueAllocations number of unique allocation a user is allowed to perform
			             * @return bytes required to store lookup meta data
			             */
			            static consteval uint32_t sizeLookupBufferInBytes(uint32_t maxNumUniqueAllocations)
			            {
			                return detail::PtrLookupTable::sizeLookupBufferInBytes(maxNumUniqueAllocations);
			            }

			            StaticSharedMemory(StaticSharedMemory const&) = delete;

			            /** Construct shared memory allocator
			             * @param accessor local memory accessor to store lookup meta data
			             *                 bytes required to store N unique allocation can be calculated with
			             * sizeLookupBufferInBytes()
			             */
			            StaticSharedMemory(sycl::local_accessor<std::byte> const& accessor)
			                : PtrLookupTable(
			                      reinterpret_cast<std::byte*>(accessor.get_multi_ptr<sycl::access::decorated::no>().get()),
			                      static_cast<uint32_t>(accessor.size()))

			            {
			            }

			            using Base = detail::PtrLookupTable;

			            template<typename T, size_t T_unique>
			            T& allocVar()
			            {
			                T* data = Base::template getVarPtr<T>(T_unique);

			                if(!data)
			                {
			                    data = Base::template alloc<T, T_unique>();
			                }
			                ALPAKA_ASSERT(data != nullptr);
			                return *data;
			            }
			        };

			    } // namespace oneApi
			} // namespace alpaka::onAcc

			#endif
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/StaticSharedMemory.hpp ==
			// ============================================================================

		// #include "alpaka/api/syclGeneric/Queue.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/onAcc.hpp ==
			// ==
			/* Copyright 2025 Simeon Ehrig
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/Assert.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/Dict.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

			#if ALPAKA_LANG_SYCL

			// #    include <sycl/sycl.hpp>    // amalgamate: file already included

			// #    include <functional>    // amalgamate: file already included

			namespace alpaka::onAcc
			{
			    namespace syclGeneric
			    {
			        /** Block index and block count provider on top of a sycl::nd_item.
			         *
			         * Only references to the nd_item and to the thread specification are stored, therefore both objects must
			         * outlive this instance.
			         *
			         * @tparam T_syclDim dimensionality of the wrapped sycl::nd_item
			         * @tparam T_OptimizedThreadSpec thread specification, must provide NumBlocksVecType, dim() and
			         * getNumBlocks()
			         */
			        template<auto T_syclDim, typename T_OptimizedThreadSpec>
			        class BlockLayer
			        {
			            // dimension of the alpaka objects
			            static constexpr uint32_t dim = T_OptimizedThreadSpec::dim();

			            using IdxType = typename T_OptimizedThreadSpec::NumBlocksVecType::type;

			            sycl::nd_item<T_syclDim> const& m_ndItem;
			            T_OptimizedThreadSpec const& m_threadSpec;

			        public:
			            BlockLayer(sycl::nd_item<T_syclDim> const& ndItem, T_OptimizedThreadSpec const& threadSpec)
			                : m_ndItem(ndItem)
			                , m_threadSpec(threadSpec)
			            {
			            }

			            /** Index of the current group.
			             *
			             * For one to three dimensions the result is assembled component-wise from nd_item::get_group().
			             * For all other dimensions the dimension-0 group index is passed together with the number of blocks
			             * to mapToND().
			             */
			            constexpr auto idx() const -> Vec<IdxType, dim>
			            {
			                if constexpr(dim == 0u || dim > 3u)
			                {
			                    return mapToND(m_threadSpec.getNumBlocks(), static_cast<IdxType>(m_ndItem.get_group(0)));
			                }
			                else if constexpr(dim == 1)
			                {
			                    return Vec<IdxType, 1u>{m_ndItem.get_group(0)};
			                }
			                else if constexpr(dim == 2)
			                {
			                    return Vec<IdxType, 2u>{m_ndItem.get_group(0), m_ndItem.get_group(1)};
			                }
			                else
			                {
			                    // remaining case: dim == 3
			                    return Vec<IdxType, 3u>{m_ndItem.get_group(0), m_ndItem.get_group(1), m_ndItem.get_group(2)};
			                }
			            }

			            /** Number of groups.
			             *
			             * For one to three dimensions the result is assembled component-wise from
			             * nd_item::get_group_range(). For all other dimensions the number of blocks stored in the thread
			             * specification is returned.
			             */
			            constexpr auto count() const -> Vec<IdxType, dim>
			            {
			                if constexpr(dim == 0u || dim > 3u)
			                {
			                    return m_threadSpec.getNumBlocks();
			                }
			                else if constexpr(dim == 1)
			                {
			                    return Vec<IdxType, 1u>{m_ndItem.get_group_range(0)};
			                }
			                else if constexpr(dim == 2)
			                {
			                    return Vec<IdxType, 2u>{m_ndItem.get_group_range(0), m_ndItem.get_group_range(1)};
			                }
			                else
			                {
			                    // remaining case: dim == 3
			                    return Vec<IdxType, 3u>{
			                        m_ndItem.get_group_range(0),
			                        m_ndItem.get_group_range(1),
			                        m_ndItem.get_group_range(2)};
			                }
			            }
			        };

			        /** Thread index and thread count provider (within a group) on top of a sycl::nd_item.
			         *
			         * Only references to the nd_item and to the thread specification are stored, therefore both objects must
			         * outlive this instance.
			         *
			         * @tparam T_syclDim dimensionality of the wrapped sycl::nd_item
			         * @tparam T_OptimizedThreadSpec thread specification, must provide NumThreadsVecType, dim() and
			         * getNumThreads()
			         */
			        template<auto T_syclDim, typename T_OptimizedThreadSpec>
			        class ThreadLayer
			        {
			            // dimension of the alpaka objects
			            static constexpr uint32_t dim = T_OptimizedThreadSpec::dim();

			            using IdxType = typename T_OptimizedThreadSpec::NumThreadsVecType::type;

			            sycl::nd_item<T_syclDim> const& m_ndItem;
			            T_OptimizedThreadSpec const& m_threadSpec;

			        public:
			            ThreadLayer(sycl::nd_item<T_syclDim> const& ndItem, T_OptimizedThreadSpec const& threadSpec)
			                : m_ndItem(ndItem)
			                , m_threadSpec(threadSpec)
			            {
			            }

			            /** Local index of the current thread.
			             *
			             * For one to three dimensions the result is assembled component-wise from nd_item::get_local_id().
			             * For all other dimensions the dimension-0 local id is passed together with the number of threads
			             * to mapToND().
			             */
			            constexpr auto idx() const -> Vec<IdxType, dim>
			            {
			                if constexpr(dim == 0u || dim > 3u)
			                {
			                    return mapToND(
			                        m_threadSpec.getNumThreads(),
			                        static_cast<IdxType>(m_ndItem.get_local_id(0)));
			                }
			                else if constexpr(dim == 1)
			                {
			                    return Vec<IdxType, 1u>{m_ndItem.get_local_id(0)};
			                }
			                else if constexpr(dim == 2)
			                {
			                    return Vec<IdxType, 2u>{m_ndItem.get_local_id(0), m_ndItem.get_local_id(1)};
			                }
			                else
			                {
			                    // remaining case: dim == 3
			                    return Vec<IdxType, 3u>{
			                        m_ndItem.get_local_id(0),
			                        m_ndItem.get_local_id(1),
			                        m_ndItem.get_local_id(2)};
			                }
			            }

			            /** Number of threads, queried at runtime.
			             *
			             * For one to three dimensions the result is assembled component-wise from
			             * nd_item::get_local_range(). For all other dimensions the number of threads stored in the thread
			             * specification is returned.
			             */
			            constexpr auto count() const -> Vec<IdxType, dim>
			            {
			                if constexpr(dim == 0u || dim > 3u)
			                {
			                    return m_threadSpec.getNumThreads();
			                }
			                else if constexpr(dim == 1)
			                {
			                    return Vec<IdxType, 1u>{m_ndItem.get_local_range(0)};
			                }
			                else if constexpr(dim == 2)
			                {
			                    return Vec<IdxType, 2u>{m_ndItem.get_local_range(0), m_ndItem.get_local_range(1)};
			                }
			                else
			                {
			                    // remaining case: dim == 3
			                    return Vec<IdxType, 3u>{
			                        m_ndItem.get_local_range(0),
			                        m_ndItem.get_local_range(1),
			                        m_ndItem.get_local_range(2)};
			                }
			            }

			            /** Number of threads for thread specifications whose NumThreadsVecType models
			             * alpaka::concepts::CVector.
			             *
			             * This constrained overload does not touch the nd_item, it returns a value-initialized instance of
			             * NumThreadsVecType.
			             */
			            constexpr auto count() const
			                requires alpaka::concepts::CVector<typename T_OptimizedThreadSpec::NumThreadsVecType>
			            {
			                using NumThreadsVec = typename T_OptimizedThreadSpec::NumThreadsVecType;
			                return NumThreadsVec{};
			            }
			        };

			        template<auto T_syclDim>
			        class Sync
			        {
			            sycl::nd_item<T_syclDim> const& m_item;

			        public:
			            Sync(sycl::nd_item<T_syclDim> const& item) : m_item(item)
			            {
			            }

			            void operator()() const
			            {
			                m_item.barrier();
			            }
			        };

			        /** Access to the memory behind a byte typed sycl::local_accessor.
			         *
			         * Only a reference to the accessor is stored, therefore the accessor must outlive this instance.
			         */
			        class DynamicSharedMemory
			        {
			            sycl::local_accessor<std::byte> const& m_accessor;

			        public:
			            DynamicSharedMemory(sycl::local_accessor<std::byte> const& accessor) : m_accessor(accessor)
			            {
			            }

			            /** Get the memory of the accessor reinterpreted as pointer to T.
			             *
			             * The unnamed size_t template parameter and the unnamed uint32_t argument are ignored: every call
			             * returns the pointer to the begin of the accessor memory, no bookkeeping is performed.
			             *
			             * NOTE(review): no alignment or size check is performed here, the caller is presumably responsible
			             * that T fits into the accessor memory.
			             *
			             * @tparam T element type the memory is interpreted as
			             * @return pointer to the first byte of the accessor memory, cast to T*
			             */
			            template<typename T, size_t>
			            T* allocDynamic(uint32_t)
			            {
			                return reinterpret_cast<T*>(m_accessor.get_multi_ptr<sycl::access::decorated::no>().get());
			            }

			            /** Size in bytes as reported by the accessor.
			             *
			             * The method only reads through the const reference to the accessor, therefore it is
			             * const-qualified and can be called on const instances too.
			             */
			            constexpr size_t byte_size() const noexcept
			            {
			                return m_accessor.byte_size();
			            }
			        };
			    } // namespace syclGeneric
			} // namespace alpaka::onAcc

			#endif
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/onAcc.hpp ==
			// ============================================================================

		// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/syclConfig.hpp ==
			// ==
			/* Copyright 2023 Andrea Bocci, Aurora Perego, René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

			#if ALPAKA_LANG_SYCL

			#    if defined(__SYCL_DEVICE_ONLY__)

			// defines can be taken from
			// https://github.com/llvm/llvm-project/blob/3cfe6aa46e06a8caa3f07057838d31c6ce840076/clang/include/clang/Basic/OffloadArch.h#L18-L28

			#        if /* Broadwell Intel graphics architecture */                                                               \
			            (defined(__SYCL_TARGET_INTEL_GPU_BDW__) && __SYCL_TARGET_INTEL_GPU_BDW__)                                 \
			            || /* Skylake Intel graphics architecture */                                                              \
			            (defined(__SYCL_TARGET_INTEL_GPU_SKL__) && __SYCL_TARGET_INTEL_GPU_SKL__)                                 \
			            || /* Kaby Lake Intel graphics architecture */                                                            \
			            (defined(__SYCL_TARGET_INTEL_GPU_KBL__) && __SYCL_TARGET_INTEL_GPU_KBL__)                                 \
			            || /* Coffee Lake Intel graphics architecture */                                                          \
			            (defined(__SYCL_TARGET_INTEL_GPU_CFL__) && __SYCL_TARGET_INTEL_GPU_CFL__)                                 \
			            || /* Apollo Lake Intel graphics architecture */                                                          \
			            (defined(__SYCL_TARGET_INTEL_GPU_APL__) && __SYCL_TARGET_INTEL_GPU_APL__)                                 \
			            || /* Gemini Lake Intel graphics architecture */                                                          \
			            (defined(__SYCL_TARGET_INTEL_GPU_GLK__) && __SYCL_TARGET_INTEL_GPU_GLK__)                                 \
			            || /* Whiskey Lake Intel graphics architecture */                                                         \
			            (defined(__SYCL_TARGET_INTEL_GPU_WHL__) && __SYCL_TARGET_INTEL_GPU_WHL__)                                 \
			            || /* Amber Lake Intel graphics architecture */                                                           \
			            (defined(__SYCL_TARGET_INTEL_GPU_AML__) && __SYCL_TARGET_INTEL_GPU_AML__)                                 \
			            || /* Comet Lake Intel graphics architecture */                                                           \
			            (defined(__SYCL_TARGET_INTEL_GPU_CML__) && __SYCL_TARGET_INTEL_GPU_CML__)                                 \
			            || /* Ice Lake Intel graphics architecture */                                                             \
			            (defined(__SYCL_TARGET_INTEL_GPU_ICLLP__) && __SYCL_TARGET_INTEL_GPU_ICLLP__)                             \
			            || /* Elkhart Lake or Jasper Lake Intel graphics architecture */                                          \
			            (defined(__SYCL_TARGET_INTEL_GPU_EHL__) && __SYCL_TARGET_INTEL_GPU_EHL__)                                 \
			            || /* Tiger Lake Intel graphics architecture */                                                           \
			            (defined(__SYCL_TARGET_INTEL_GPU_TGLLP__) && __SYCL_TARGET_INTEL_GPU_TGLLP__)                             \
			            || /* Rocket Lake Intel graphics architecture */                                                          \
			            (defined(__SYCL_TARGET_INTEL_GPU_RKL__) && __SYCL_TARGET_INTEL_GPU_RKL__)                                 \
			            || /* Alder Lake S or Raptor Lake S Intel graphics architecture */                                        \
			            (defined(__SYCL_TARGET_INTEL_GPU_ADL_S__) && __SYCL_TARGET_INTEL_GPU_ADL_S__)                             \
			            || /* Alder Lake P Intel graphics architecture */                                                         \
			            (defined(__SYCL_TARGET_INTEL_GPU_ADL_P__) && __SYCL_TARGET_INTEL_GPU_ADL_P__)                             \
			            || /* Alder Lake N Intel graphics architecture */                                                         \
			            (defined(__SYCL_TARGET_INTEL_GPU_ADL_N__) && __SYCL_TARGET_INTEL_GPU_ADL_N__)                             \
			            || /* DG1 Intel graphics architecture */                                                                  \
			            (defined(__SYCL_TARGET_INTEL_GPU_DG1__) && __SYCL_TARGET_INTEL_GPU_DG1__)                                 \
			            || /* Alchemist G10 Intel graphics architecture */                                                        \
			            (defined(__SYCL_TARGET_INTEL_GPU_ACM_G10__) && __SYCL_TARGET_INTEL_GPU_ACM_G10__)                         \
			            || /* Alchemist G11 Intel graphics architecture */                                                        \
			            (defined(__SYCL_TARGET_INTEL_GPU_ACM_G11__) && __SYCL_TARGET_INTEL_GPU_ACM_G11__)                         \
			            || /* Alchemist G12 Intel graphics architecture */                                                        \
			            (defined(__SYCL_TARGET_INTEL_GPU_ACM_G12__) && __SYCL_TARGET_INTEL_GPU_ACM_G12__)                         \
			            || /* Meteor Lake U/S or Arrow Lake U/S Intel graphics architecture */                                    \
			            (defined(__SYCL_TARGET_INTEL_GPU_MTL_U__) && __SYCL_TARGET_INTEL_GPU_MTL_U__)                             \
			            || /* Meteor Lake H Intel graphics architecture */                                                        \
			            (defined(__SYCL_TARGET_INTEL_GPU_MTL_H__) && __SYCL_TARGET_INTEL_GPU_MTL_H__)                             \
			            || /* Arrow Lake H Intel graphics architecture */                                                         \
			            (defined(__SYCL_TARGET_INTEL_GPU_ARL_H__) && __SYCL_TARGET_INTEL_GPU_ARL_H__)                             \
			            || /* Battlemage G21 Intel graphics architecture */                                                       \
			            (defined(__SYCL_TARGET_INTEL_GPU_BMG_G21__) && __SYCL_TARGET_INTEL_GPU_BMG_G21__)                         \
			            || /* Lunar Lake Intel graphics architecture */                                                           \
			            (defined(__SYCL_TARGET_INTEL_GPU_LNL_M__) && __SYCL_TARGET_INTEL_GPU_LNL_M__)

			#            define ALPAKA_SYCL_SUBGROUP_SIZE (8 | 16 | 32)

			#        elif /* Ponte Vecchio Intel graphics architecture */                                                         \
			            (defined(__SYCL_TARGET_INTEL_GPU_PVC__) && __SYCL_TARGET_INTEL_GPU_PVC__)                                 \
			            || /* Ponte Vecchio VG Intel graphics architecture */                                                     \
			            (defined(__SYCL_TARGET_INTEL_GPU_PVC_VG__) && __SYCL_TARGET_INTEL_GPU_PVC_VG__)

			#            define ALPAKA_SYCL_SUBGROUP_SIZE (16 | 32)

			#        elif(/* generate code ahead of time for x86_64 CPUs */                                                       \
			              defined(__SYCL_TARGET_INTEL_X86_64__) && __SYCL_TARGET_INTEL_X86_64__)
			// @attention for the CPU-side detachment of SYCL kernels we currently limit the CPU to a maximum sub-group (warp)
			// size of 32, therefore 64 is removed from this list
			#            define ALPAKA_SYCL_SUBGROUP_SIZE (1 | 2 | 4 | 8 | 16 | 32)

			#        elif /* NVIDIA Maxwell architecture (compute capability 5.0) */                                              \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_50__) && __SYCL_TARGET_NVIDIA_GPU_SM_50__)                           \
			            || /* NVIDIA Maxwell architecture (compute capability 5.2) */                                             \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_52__) && __SYCL_TARGET_NVIDIA_GPU_SM_52__)                           \
			            || /* NVIDIA Jetson TX1 / Nano (compute capability 5.3) */                                                \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_53__) && __SYCL_TARGET_NVIDIA_GPU_SM_53__)                           \
			            || /* NVIDIA Pascal architecture (compute capability 6.0) */                                              \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_60__) && __SYCL_TARGET_NVIDIA_GPU_SM_60__)                           \
			            || /* NVIDIA Pascal architecture (compute capability 6.1) */                                              \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_61__) && __SYCL_TARGET_NVIDIA_GPU_SM_61__)                           \
			            || /* NVIDIA Jetson TX2 (compute capability 6.2) */                                                       \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_62__) && __SYCL_TARGET_NVIDIA_GPU_SM_62__)                           \
			            || /* NVIDIA Volta architecture (compute capability 7.0) */                                               \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_70__) && __SYCL_TARGET_NVIDIA_GPU_SM_70__)                           \
			            || /* NVIDIA Jetson AGX (compute capability 7.2) */                                                       \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_72__) && __SYCL_TARGET_NVIDIA_GPU_SM_72__)                           \
			            || /* NVIDIA Turing architecture (compute capability 7.5) */                                              \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_75__) && __SYCL_TARGET_NVIDIA_GPU_SM_75__)                           \
			            || /* NVIDIA Ampere architecture (compute capability 8.0) */                                              \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_80__) && __SYCL_TARGET_NVIDIA_GPU_SM_80__)                           \
			            || /* NVIDIA Ampere architecture (compute capability 8.6) */                                              \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_86__) && __SYCL_TARGET_NVIDIA_GPU_SM_86__)                           \
			            || /* NVIDIA Jetson/Drive AGX Orin (compute capability 8.7) */                                            \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_87__) && __SYCL_TARGET_NVIDIA_GPU_SM_87__)                           \
			            || /* NVIDIA Ada Lovelace arch. (compute capability 8.9) */                                               \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_89__) && __SYCL_TARGET_NVIDIA_GPU_SM_89__)                           \
			            || /* NVIDIA Hopper architecture (compute capability 9.0) */                                              \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_90__) && __SYCL_TARGET_NVIDIA_GPU_SM_90__)                           \
			            || /*NVIDIA Hopper architecture variant(compute capability 9.0a) */                                       \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_90a__) && __SYCL_TARGET_NVIDIA_GPU_SM_90a__)                         \
			            || /* NVIDIA Blackwell architecture (compute capability 10.0) */                                          \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_100__) && __SYCL_TARGET_NVIDIA_GPU_SM_100__)                         \
			            || /* NVIDIA Blackwell architecture variant (compute capability 10.0a) */                                 \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_100a__) && __SYCL_TARGET_NVIDIA_GPU_SM_100a__)                       \
			            || /* NVIDIA Blackwell Next architecture (compute capability 10.1) */                                     \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_101__) && __SYCL_TARGET_NVIDIA_GPU_SM_101__)                         \
			            || /* NVIDIA Blackwell Next architecture variant (compute capability 10.1a) */                            \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_101a__) && __SYCL_TARGET_NVIDIA_GPU_SM_101a__)                       \
			            || /* NVIDIA Next-generation architecture (compute capability 10.3) */                                    \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_103__) && __SYCL_TARGET_NVIDIA_GPU_SM_103__)                         \
			            || /* NVIDIA Next-generation architecture variant (compute capability 10.3a) */                           \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_103a__) && __SYCL_TARGET_NVIDIA_GPU_SM_103a__)                       \
			            || /* NVIDIA Future architecture (compute capability 12.0) */                                             \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_120__) && __SYCL_TARGET_NVIDIA_GPU_SM_120__)                         \
			            || /* NVIDIA Future architecture variant (compute capability 12.0a) */                                    \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_120a__) && __SYCL_TARGET_NVIDIA_GPU_SM_120a__)                       \
			            || /* NVIDIA Future architecture (compute capability 12.1) */                                             \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_121__) && __SYCL_TARGET_NVIDIA_GPU_SM_121__)                         \
			            || /* NVIDIA Future architecture variant (compute capability 12.1a) */                                    \
			            (defined(__SYCL_TARGET_NVIDIA_GPU_SM_121a__) && __SYCL_TARGET_NVIDIA_GPU_SM_121a__)

			#            define ALPAKA_SYCL_SUBGROUP_SIZE (32) /* CUDA supports warp size 32 */

			#        elif /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */                                                    \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX700__) && __SYCL_TARGET_AMD_GPU_GFX700__)                               \
			            || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */                                                   \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX701__) && __SYCL_TARGET_AMD_GPU_GFX701__)                               \
			            || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */                                                   \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX702__) && __SYCL_TARGET_AMD_GPU_GFX702__)                               \
			            || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */                                              \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX801__) && __SYCL_TARGET_AMD_GPU_GFX801__)                               \
			            || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */                                              \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX802__) && __SYCL_TARGET_AMD_GPU_GFX802__)                               \
			            || /* AMD GCN 4.0 Arctic Islands architecture (gfx 8.0) */                                                \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX803__) && __SYCL_TARGET_AMD_GPU_GFX803__)                               \
			            || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */                                              \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX805__) && __SYCL_TARGET_AMD_GPU_GFX805__)                               \
			            || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.1) */                                              \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX810__) && __SYCL_TARGET_AMD_GPU_GFX810__)                               \
			            || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                                                          \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX900__) && __SYCL_TARGET_AMD_GPU_GFX900__)                               \
			            || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                                                          \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX902__) && __SYCL_TARGET_AMD_GPU_GFX902__)                               \
			            || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                                                          \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX904__) && __SYCL_TARGET_AMD_GPU_GFX904__)                               \
			            || /* AMD GCN 5.1 Vega II architecture (gfx 9.0) */                                                       \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX906__) && __SYCL_TARGET_AMD_GPU_GFX906__)                               \
			            || /* AMD CDNA 1.0 Arcturus architecture (gfx 9.0) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX908__) && __SYCL_TARGET_AMD_GPU_GFX908__)                               \
			            || /* AMD GCN 5.0 Raven 2 architecture (gfx 9.0) */                                                       \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX909__) && __SYCL_TARGET_AMD_GPU_GFX909__)                               \
			            || /* AMD CDNA 2.0 Aldebaran architecture (gfx 9.0) */                                                    \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX90A__) && __SYCL_TARGET_AMD_GPU_GFX90A__)                               \
			            || /* AMD GCN 5.1 Renoir architecture (gfx 9.0) */                                                        \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX90C__) && __SYCL_TARGET_AMD_GPU_GFX90C__)                               \
			            || /* AMD CDNA 3.x generic architecture (gfx 9.4) */                                                      \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX9_4_GENERIC__) && __SYCL_TARGET_AMD_GPU_GFX9_4_GENERIC__)               \
			            || /* AMD CDNA 3.0 Aqua Vanjaram architecture (gfx 9.4) */                                                \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX940__) && __SYCL_TARGET_AMD_GPU_GFX940__)                               \
			            || /* AMD CDNA 3.0 Aqua Vanjaram architecture (gfx 9.4) */                                                \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX941__) && __SYCL_TARGET_AMD_GPU_GFX941__)                               \
			            || /* AMD CDNA 3.0 Aqua Vanjaram architecture (gfx 9.4) */                                                \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX942__) && __SYCL_TARGET_AMD_GPU_GFX942__)                               \
			            || /* AMD CDNA 3.5 derivative architecture (gfx 9.5) */                                                   \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX950__) && __SYCL_TARGET_AMD_GPU_GFX950__)                               \
			            || /* AMD GCN 5.x generic architecture (gfx 9.x) */                                                       \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX9_GENERIC__) && __SYCL_TARGET_AMD_GPU_GFX9_GENERIC__)

			#            define ALPAKA_SYCL_SUBGROUP_SIZE (64) /* up to gfx9, HIP supports wavefront size 64 */

			#        elif /* AMD RDNA 1.0 Navi 10 architecture (gfx 10.1) */                                                      \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1010__) && __SYCL_TARGET_AMD_GPU_GFX1010__)                             \
			            || /* AMD RDNA 1.0 Navi 12 architecture (gfx 10.1) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1011__) && __SYCL_TARGET_AMD_GPU_GFX1011__)                             \
			            || /* AMD RDNA 1.0 Navi 14 architecture (gfx 10.1) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1012__) && __SYCL_TARGET_AMD_GPU_GFX1012__)                             \
			            || /* AMD RDNA 2.0 Oberon architecture (gfx 10.1) */                                                      \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1013__) && __SYCL_TARGET_AMD_GPU_GFX1013__)                             \
			            || /* AMD RDNA 1.x generic architecture (gfx 10.1) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX10_1_GENERIC__) && __SYCL_TARGET_AMD_GPU_GFX10_1_GENERIC__)             \
			            || /* AMD RDNA 2.0 Navi 21 architecture (gfx 10.3) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1030__) && __SYCL_TARGET_AMD_GPU_GFX1030__)                             \
			            || /* AMD RDNA 2.0 Navi 22 architecture (gfx 10.3) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1031__) && __SYCL_TARGET_AMD_GPU_GFX1031__)                             \
			            || /* AMD RDNA 2.0 Navi 23 architecture (gfx 10.3) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1032__) && __SYCL_TARGET_AMD_GPU_GFX1032__)                             \
			            || /* AMD RDNA 2.0 Van Gogh architecture (gfx 10.3) */                                                    \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1033__) && __SYCL_TARGET_AMD_GPU_GFX1033__)                             \
			            || /* AMD RDNA 2.0 Navi 24 architecture (gfx 10.3) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1034__) && __SYCL_TARGET_AMD_GPU_GFX1034__)                             \
			            || /* AMD RDNA 2.0 Rembrandt Mobile architecture (gfx 10.3) */                                            \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1035__) && __SYCL_TARGET_AMD_GPU_GFX1035__)                             \
			            || /* AMD RDNA 2.0 Raphael architecture (gfx 10.3) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1036__) && __SYCL_TARGET_AMD_GPU_GFX1036__)                             \
			            || /* AMD RDNA 2.x generic architecture (gfx 10.3) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX10_3_GENERIC__) && __SYCL_TARGET_AMD_GPU_GFX10_3_GENERIC__)             \
			            || /* AMD RDNA 3.0 Navi 31 architecture (gfx 11.0) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1100__) && __SYCL_TARGET_AMD_GPU_GFX1100__)                             \
			            || /* AMD RDNA 3.0 Navi 32 architecture (gfx 11.0) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1101__) && __SYCL_TARGET_AMD_GPU_GFX1101__)                             \
			            || /* AMD RDNA 3.0 Navi 33 architecture (gfx 11.0) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1102__) && __SYCL_TARGET_AMD_GPU_GFX1102__)                             \
			            || /* AMD RDNA 3.0 Phoenix mobile architecture (gfx 11.0) */                                              \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1103__) && __SYCL_TARGET_AMD_GPU_GFX1103__)                             \
			            || /* AMD RDNA 3.x generic architecture (gfx 11.x) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX11_GENERIC__) && __SYCL_TARGET_AMD_GPU_GFX11_GENERIC__)                 \
			            || /* AMD RDNA 3.5 Strix Point architecture (gfx 11.5) */                                                 \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1150__) && __SYCL_TARGET_AMD_GPU_GFX1150__)                             \
			            || /* AMD RDNA 3.5 Strix Halo architecture (gfx 11.5) */                                                  \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1151__) && __SYCL_TARGET_AMD_GPU_GFX1151__)                             \
			            || /* AMD RDNA 4.0 Navi 44 architecture (gfx 12.0) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1200__) && __SYCL_TARGET_AMD_GPU_GFX1200__)                             \
			            || /* AMD RDNA 4.0 Navi 48 architecture (gfx 12.0) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1201__) && __SYCL_TARGET_AMD_GPU_GFX1201__)                             \
			            || /* AMD RDNA 4.x generic architecture (gfx 12.x) */                                                     \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX12_GENERIC__) && __SYCL_TARGET_AMD_GPU_GFX12_GENERIC__)                 \
			            || /* AMD RDNA 4.5 derivative architecture (gfx 12.5) */                                                  \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1250__) && __SYCL_TARGET_AMD_GPU_GFX1250__)                             \
			            || /* AMD RDNA 4.5 derivative architecture (gfx 12.5) */                                                  \
			            (defined(__SYCL_TARGET_AMD_GPU_GFX1251__) && __SYCL_TARGET_AMD_GPU_GFX1251__)

			#            define ALPAKA_SYCL_SUBGROUP_SIZE (32) /* starting from gfx10, HIP supports wavefront size 32 */

			#        else // __SYCL_TARGET_*

			// if we do not compile ahead of time for a device and use e.g. -fsycl-targets=spir64 we need to accept all possible
			// variants
			#            define ALPAKA_SYCL_SUBGROUP_SIZE (0xFFFF'FFFF) /* unknown target */

			#        endif // __SYCL_TARGET_*

			#    else

			// On the host side we need to allow all possible variants of a subgroup size, otherwise the kernel will not be built.
			#        define ALPAKA_SYCL_SUBGROUP_SIZE (0xFFFF'FFFF) /* host compilation */

			#    endif // __SYCL_DEVICE_ONLY__

			#endif
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/syclConfig.hpp ==
			// ============================================================================

		// #include "alpaka/onAcc/internal/globalMem.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/internal/interface.hpp"    // amalgamate: file already inlined

		#if ALPAKA_LANG_ONEAPI

		#    ifndef ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS
		#        define ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS 32u
		#    endif

		#    ifndef SYCL_EXT_ONEAPI_MEMCPY2D
		#        error                                                                                                        \
		            "SYCL_EXT_ONEAPI_MEMCPY2D is not defined. Extension https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_oneapi_memcpy2d.asciidoc is required!"
		#    endif

		namespace alpaka::onHost::internal
		{
		    /** Byte-wise memset of a memory region with more than one dimension on a SYCL queue.
		     *
		     * The region is forwarded to `sycl::queue::ext_oneapi_memset2d`. The width in bytes is the `x()` extent
		     * times the size of the destination value type. The number of rows is the `y()` extent for two
		     * dimensions, otherwise the product of all extents except the last one.
		     */
		    template<typename T_Device, typename T_Dest, typename T_Extents>
		    requires(alpaka::trait::getDim_v<T_Extents> > 1u)
		    struct Memset::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Extents>
		    {
		        /** Enqueue the memset and register the resulting event with the queue.
		         *
		         * @param queue queue the operation is submitted to
		         * @param dest destination object providing `getPitches()` and a data pointer
		         * @param byteValue value every byte is set to
		         * @param extents number of elements per dimension to set
		         */
		        void operator()(syclGeneric::Queue<T_Device>& queue, auto&& dest, uint8_t byteValue, T_Extents const& extents)
		            const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
		        {
		            constexpr auto numDims = alpaka::trait::getDim_v<T_Extents>;
		            using ValueType = alpaka::trait::GetValueType_t<T_Dest>;

		            sycl::queue nativeQueue = queue.getNativeHandle();

		            auto extentsMd = pCast<size_t>(extents);
		            auto const rowPitches = dest.getPitches().eraseBack();
		            auto* dstPtr = data(dest);

		            // number of rows handed to the 2D primitive
		            size_t numRows = 0u;
		            if constexpr(numDims == 2u)
		                numRows = extentsMd.y();
		            else
		                numRows = extentsMd.eraseBack().product();

		            size_t const rowWidthBytes = extentsMd.x() * sizeof(ValueType);

		            sycl::event ev
		                = nativeQueue.ext_oneapi_memset2d(dstPtr, rowPitches.back(), byteValue, rowWidthBytes, numRows);

		            queue.setLastEvent(ev);
		            // a blocking queue finishes the operation before returning
		            if(queue.isBlocking())
		            {
		                ev.wait_and_throw();
		            }
		        }
		    };

		    /** Copy of a memory region with more than one dimension on a SYCL queue.
		     *
		     * The copy is forwarded to `sycl::queue::ext_oneapi_memcpy2d`. The width in bytes is the `x()` extent
		     * times the size of the destination value type. The number of rows is the `y()` extent for two
		     * dimensions, otherwise the product of all extents except the last one.
		     */
		    template<typename T_Device, typename T_Dest, typename T_Source, typename T_Extents>
		    requires(alpaka::trait::getDim_v<T_Extents> > 1u)
		    struct internal::Memcpy::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Source, T_Extents>
		    {
		        /** Enqueue the copy and register the resulting event with the queue.
		         *
		         * @param queue queue the operation is submitted to
		         * @param dest destination object providing `getPitches()` and a data pointer
		         * @param source source object providing `getPitches()` and a data pointer
		         * @param extents number of elements per dimension to copy
		         */
		        void operator()(
		            syclGeneric::Queue<T_Device>& queue,
		            auto&& dest,
		            T_Source const& source,
		            T_Extents const& extents) const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
		        {
		            constexpr auto numDims = alpaka::trait::getDim_v<T_Extents>;
		            using ValueType = alpaka::trait::GetValueType_t<T_Dest>;

		            sycl::queue nativeQueue = queue.getNativeHandle();

		            auto extentsMd = pCast<size_t>(extents);
		            auto const dstRowPitches = dest.getPitches().eraseBack();
		            auto* dstPtr = data(dest);
		            auto const srcRowPitches = source.getPitches().eraseBack();
		            auto* srcPtr = data(source);

		            // number of rows handed to the 2D primitive
		            size_t numRows = 0u;
		            if constexpr(numDims == 2u)
		                numRows = extentsMd.y();
		            else
		                numRows = extentsMd.eraseBack().product();

		            size_t const rowWidthBytes = extentsMd.x() * sizeof(ValueType);

		            sycl::event ev = nativeQueue.ext_oneapi_memcpy2d(
		                dstPtr,
		                dstRowPitches.back(),
		                srcPtr,
		                srcRowPitches.back(),
		                rowWidthBytes,
		                numRows);

		            queue.setLastEvent(ev);
		            // a blocking queue finishes the operation before returning
		            if(queue.isBlocking())
		            {
		                ev.wait_and_throw();
		            }
		        }
		    };

		    /** Copy from a source object or raw pointer into device global memory.
		     *
		     * The source is reduced to an untyped pointer: a raw pointer is used as is, anything else goes through
		     * `alpaka::onHost::data()`. No byte count is passed to `sycl::queue::memcpy`.
		     */
		    template<typename T_Device, typename T_Source, typename T_Storage, typename T>
		    struct internal::MemcpyDeviceGlobal::
		        Op<syclGeneric::Queue<T_Device>, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>, T_Source>
		    {
		        void operator()(
		            syclGeneric::Queue<T_Device>& queue,
		            onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> dest,
		            auto&& source) const
		        {
		            ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
		            sycl::queue nativeQueue = queue.getNativeHandle();

		            // resolve the address the data is read from
		            void const* const rawSource = [&]() -> void const*
		            {
		                if constexpr(std::is_pointer_v<ALPAKA_TYPEOF(source)>)
		                    return source;
		                else
		                    return toVoidPtr(alpaka::onHost::data(source));
		            }();

		            sycl::event ev = nativeQueue.memcpy(dest.getHandle(alpaka::api::oneApi), rawSource);
		            queue.setLastEvent(ev);
		            // a blocking queue finishes the operation before returning
		            if(queue.isBlocking())
		            {
		                ev.wait_and_throw();
		            }
		        }
		    };

		    /** Copy from device global memory into a destination object or raw pointer.
		     *
		     * The destination is reduced to an untyped pointer: a raw pointer is used as is, anything else goes
		     * through `alpaka::onHost::data()`. No byte count is passed to `sycl::queue::memcpy`.
		     */
		    template<typename T_Device, typename T_Dest, typename T_Storage, typename T>
		    struct internal::MemcpyDeviceGlobal::
		        Op<syclGeneric::Queue<T_Device>, T_Dest, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>>
		    {
		        void operator()(
		            syclGeneric::Queue<T_Device>& queue,
		            auto&& dest,
		            onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> source) const
		        {
		            ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
		            sycl::queue nativeQueue = queue.getNativeHandle();

		            // resolve the address the data is written to
		            void* const rawDest = [&]() -> void*
		            {
		                if constexpr(std::is_pointer_v<ALPAKA_TYPEOF(dest)>)
		                    return dest;
		                else
		                    return toVoidPtr(alpaka::onHost::data(dest));
		            }();

		            sycl::event ev = nativeQueue.memcpy(rawDest, source.getHandle(alpaka::api::oneApi));
		            queue.setLastEvent(ev);
		            // a blocking queue finishes the operation before returning
		            if(queue.isBlocking())
		            {
		                ev.wait_and_throw();
		            }
		        }
		    };

		    /** Element-wise fill of a memory region with more than one dimension on a SYCL queue.
		     *
		     * Delegates to `alpaka::internal::generic::fill`, using the default executor of the queue's device and
		     * the sub view of the destination described by `extents`.
		     */
		    template<typename T_Device, typename T_Dest, typename T_Value, typename T_Extents>
		    requires(alpaka::trait::getDim_v<T_Extents> > 1u)
		    struct internal::Fill::Op<syclGeneric::Queue<T_Device>, T_Dest, T_Value, T_Extents>
		    {
		        /** @param queue queue the fill is enqueued into
		         *  @param dest destination whose value type must be `T_Value`
		         *  @param elementValue value written to every element
		         *  @param extents extents of the sub view to fill
		         */
		        void operator()(
		            syclGeneric::Queue<T_Device>& queue,
		            auto&& dest,
		            T_Value elementValue,
		            T_Extents const& extents) const
		            requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
		                     && std::same_as<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(dest)>, T_Value>
		        {
		            // avoid that we pass a SharedBuffer and convert non alpaka data views
		            auto destView = makeView(dest);

		            alpaka::internal::generic::
		                fill(queue, defaultExecutor(getDevice(queue)), destView.getSubView(extents), elementValue);
		        }
		    };

		    namespace detail
		    {
		        /** Convert an alpaka vector into a `sycl::range` of the same dimensionality.
		         *
		         * Component `i` of the vector becomes constructor argument `i` of the range.
		         */
		        template<alpaka::concepts::Vector TVec>
		        inline constexpr auto vecToSyclRange(TVec vec)
		        {
		            constexpr auto numDims = std::decay_t<TVec>::dim();

		            // TODO: check if this is the correct order
		            auto const expandToRange = [&vec]<auto... I>(std::index_sequence<I...>)
		            { return sycl::range<numDims>(vec[I]...); };

		            return expandToRange(std::make_index_sequence<numDims>{});
		        }

		        /** Stateless stand-in for a thread specification.
		         *
		         * The constructor discards both arguments, so an instance carries only type information: the vector
		         * type aliases and the compile-time dimensionality.
		         */
		        template<alpaka::concepts::Vector T_NumBlocks, alpaka::concepts::Vector T_NumThreads>
		        struct OptimizedThreadSpec
		        {
		            using NumThreadsVecType = T_NumThreads;
		            using NumBlocksVecType = typename T_NumBlocks::UniVec;

		            //! Accepts block and thread counts but stores neither of them.
		            constexpr OptimizedThreadSpec(T_NumBlocks const& /*numBlocks*/, T_NumThreads const& /*numThreads*/)
		            {
		            }

		            //! @return dimensionality of the thread vector type
		            static consteval uint32_t dim()
		            {
		                return NumThreadsVecType::dim();
		            }
		        };

		        /** Provides the SYCL worker description for a thread specification.
		         *
		         * For up to three dimensions the nd-range has the dimensionality of the thread specification: the
		         * global range is `numBlocks * numThreads` and the local range is `numThreads`. For four or more
		         * dimensions the nd-range is one-dimensional and built from the products of those vectors.
		         *
		         * The nd-range is constructed directly from its global and local range, because SYCL 2020 does not
		         * specify a default constructor for `sycl::nd_range`.
		         *
		         * @param threadSpec thread specification providing `getNumBlocks()` and `getNumThreads()`
		         * @return A pair of the SYCL nd-range and a thread spec. For fewer than four dimensions the thread
		         * spec is a `detail::OptimizedThreadSpec`, otherwise it has the type of `threadSpec`.
		         */
		        template<onHost::concepts::ThreadSpec T_ThreadSpec>
		        inline constexpr auto getWorkerDescription(T_ThreadSpec const& threadSpec)
		        {
		            constexpr uint32_t dim = T_ThreadSpec::dim();
		            // dimension of the sycl nd range
		            constexpr uint32_t syclDim = dim >= 4u ? 1u : dim;

		            auto gridRange = [&]() -> sycl::nd_range<syclDim>
		            {
		                if constexpr(dim >= 4u)
		                {
		                    // linearize: total number of work items and work items per group
		                    return sycl::nd_range<syclDim>{
		                        (threadSpec.getNumBlocks() * threadSpec.getNumThreads()).product(),
		                        threadSpec.getNumThreads().product()};
		                }
		                else
		                {
		                    return sycl::nd_range<syclDim>{
		                        detail::vecToSyclRange(threadSpec.getNumBlocks() * threadSpec.getNumThreads()),
		                        detail::vecToSyclRange(threadSpec.getNumThreads())};
		                }
		            }();

		            using ThreadSpecType = std::conditional_t<
		                dim >= 4u,
		                ALPAKA_TYPEOF(threadSpec),
		                detail::OptimizedThreadSpec<
		                    typename ALPAKA_TYPEOF(threadSpec)::NumBlocksVecType,
		                    typename ALPAKA_TYPEOF(threadSpec)::NumThreadsVecType>>;
		            // thread spec which is only holding data if the dimension is larger than 3u
		            auto optimizedThreadSpec = ThreadSpecType(threadSpec.getNumBlocks(), threadSpec.getNumThreads());
		            return std::make_pair(gridRange, optimizedThreadSpec);
		        }

		        /** Submits the kernel as a SYCL `parallel_for` that requires the sub-group size `T_warpSize`.
		         *
		         * @tparam T_dim number of dimensions of the SYCL nd-item passed to the kernel
		         * @tparam T_warpSize sub-group size requested via `[[sycl::reqd_sub_group_size]]`
		         * @tparam T_isValid `0u` selects the specialization that generates no kernel; any other value
		         * selects this implementation
		         */
		        template<uint32_t T_dim, uint32_t T_warpSize, uint32_t T_isValid>
		        struct EnqueueKernelWithWarpSize
		        {
		            /** Register the kernel launch with the command group handler.
		             *
		             * Everything except the handler is copied into the kernel lambda. `args` are appended as
		             * additional entries to the accelerator dictionary.
		             */
		            static void call(
		                sycl::handler& cgh,
		                auto gridRange,
		                auto const& kernelBundle,
		                auto const& st_shared_accessor,
		                auto const& dyn_shared_accessor,
		                auto const& optimizedThreadSpec,
		                auto... args)
		            {
		                cgh.parallel_for(
		                    gridRange,
		                    [kernelBundle, st_shared_accessor, dyn_shared_accessor, optimizedThreadSpec, args...](
		                        sycl::nd_item<T_dim> item) [[sycl::reqd_sub_group_size(T_warpSize)]]
		                    {
		                        static_assert(T_dim > 0);
		                        static_assert(T_dim <= 3, "more the 3 dimensions are not supported");

		                        // shared memory objects built from the two local accessors
		                        onAcc::oneApi::StaticSharedMemory staticSharedMem(st_shared_accessor);
		                        onAcc::syclGeneric::DynamicSharedMemory dynSharedMem(dyn_shared_accessor);

		                        auto acc = onAcc::Acc{Dict{
		                            DictEntry(layer::block, onAcc::syclGeneric::BlockLayer{item, optimizedThreadSpec}),
		                            DictEntry(layer::thread, onAcc::syclGeneric::ThreadLayer{item, optimizedThreadSpec}),
		                            DictEntry(action::threadBlockSync, onAcc::syclGeneric::Sync{item}),
		                            DictEntry(layer::shared, std::ref(staticSharedMem)),
		                            DictEntry(layer::dynShared, std::ref(dynSharedMem)),
		                            DictEntry(object::dynSharedMemBytes, dynSharedMem.byte_size()),
		                            args...}};

		                        kernelBundle(acc);
		                    });
		            }
		        };

		        /** Specialization for an invalid warp size (`T_isValid == 0u`): no kernel is generated.
		         *
		         * Calling it reports the mismatch between the warp size evaluated on the host and the compile-time
		         * `ALPAKA_SYCL_SUBGROUP_SIZE`, then aborts the program. The message is written to `stderr` and
		         * ends with a newline, because `abort()` is not required to flush buffered output, so a message
		         * on `stdout` could be lost.
		         */
		        template<uint32_t T_dim, uint32_t T_warpSize>
		        struct EnqueueKernelWithWarpSize<T_dim, T_warpSize, 0u>
		        {
		            static void call(
		                [[maybe_unused]] sycl::handler& cgh,
		                [[maybe_unused]] auto gridRange,
		                [[maybe_unused]] auto const& kernelBundle,
		                [[maybe_unused]] auto const& st_shared_accessor,
		                [[maybe_unused]] auto const& dyn_shared_accessor,
		                [[maybe_unused]] auto const& optimizedThreadSpec,
		                [[maybe_unused]] auto... args)
		            {
		                fprintf(
		                    stderr,
		                    "Dynamic evaluated warp size on host does not match the compile time warp size ( macro "
		                    "ALPAKA_SYCL_SUBGROUP_SIZE) evaluated in the "
		                    "kernel. Update the definition of ALPAKA_SYCL_SUBGROUP_SIZE section and check the trait "
		                    "Warpsize::Dispatch<>.\n");
		                abort();
		            }
		        };
		    } // namespace detail

		    /** Enqueue a kernel described by a `ThreadSpec` into a SYCL queue.
		     *
		     * The kernel is submitted through `queue.dispatchWarpSize`, which supplies the warp size as an
		     * `std::integral_constant`. A kernel is only generated if that warp size matches a bit of
		     * `ALPAKA_SYCL_SUBGROUP_SIZE`; otherwise the aborting `EnqueueKernelWithWarpSize` specialization is
		     * selected. The resulting event is registered as the queue's last event.
		     */
		    template<
		        typename T_Device,
		        alpaka::concepts::Executor T_Executor,
		        alpaka::concepts::Vector T_NumBlocks,
		        alpaka::concepts::Vector T_NumThreads,
		        alpaka::concepts::KernelBundle T_KernelBundle>
		    struct Enqueue::
		        Kernel<syclGeneric::Queue<T_Device>, ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor>, T_KernelBundle>
		    {
		        /** @param queue queue the kernel is submitted to
		         *  @param threadSpec number of blocks and threads per block; its executor must not be
		         *  `exec::anyExecutor`
		         *  @param kernelBundle kernel functor together with its arguments
		         */
		        void operator()(
		            syclGeneric::Queue<T_Device>& queue,
		            ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor> const& threadSpec,
		            T_KernelBundle const& kernelBundle) const
		        {
		            static_assert(
		                ALPAKA_TYPEOF(threadSpec)::getExecutor() != exec::anyExecutor,
		                "'exec::anyExecutor' can not be used to enqueue an kernel.");
		            ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::queue);

		            constexpr auto st_shared_mem_bytes = onAcc::oneApi::StaticSharedMemory::sizeLookupBufferInBytes(
		                ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS);
		            // allocate dynamic shared memory -- needs at least 1 byte to make the Xilinx Runtime happy
		            // (uint32_t instead of the non-standard u_int32_t typedef)
		            uint32_t blockDynSharedMemBytes
		                = std::max(uint32_t(1), onHost::getDynSharedMemBytes(threadSpec, kernelBundle));
		            // static plus dynamic shared memory must fit into the device's local memory
		            assert(
		                st_shared_mem_bytes + blockDynSharedMemBytes
		                <= queue.m_device->getNativeHandle().first.template get_info<sycl::info::device::local_mem_size>());

		            sycl::event ev = queue.dispatchWarpSize(
		                // only callable with an std::integral_constant as warp size
		                [&](auto warpSize) requires std::same_as<
		                    std::integral_constant<
		                        typename ALPAKA_TYPEOF(warpSize)::value_type,
		                        ALPAKA_TYPEOF(warpSize)::value>,
		                    ALPAKA_TYPEOF(warpSize)>
		                {
		                    return queue.m_queue.submit(
		                        [warpSize, threadSpec, kernelBundle, blockDynSharedMemBytes](sycl::handler& cgh)
		                        {
		                            // queue is only used in unevaluated operands, therefore it is not captured
		                            using ApiType = decltype(getApi(queue));
		                            using DeviceKindType = ALPAKA_TYPEOF(getDeviceKind(queue));

		                            auto st_shared_accessor
		                                = sycl::local_accessor<std::byte>{sycl::range<1>{st_shared_mem_bytes}, cgh};

		                            auto dyn_shared_accessor
		                                = sycl::local_accessor<std::byte>{sycl::range<1>{blockDynSharedMemBytes}, cgh};

		                            auto workerDesc = detail::getWorkerDescription(threadSpec);
		                            auto optimizedThreadSpec = workerDesc.second;
		                            constexpr uint32_t syclDim = workerDesc.first.dimensions;

		                            constexpr uint32_t w = ALPAKA_TYPEOF(warpSize)::value;
		                            // the third template argument is zero if w is not part of the compile time mask
		                            detail::EnqueueKernelWithWarpSize<syclDim, w, ALPAKA_SYCL_SUBGROUP_SIZE & w>::call(
		                                cgh,
		                                workerDesc.first,
		                                kernelBundle,
		                                st_shared_accessor,
		                                dyn_shared_accessor,
		                                optimizedThreadSpec,
		                                DictEntry(object::api, ApiType{}),
		                                DictEntry(object::deviceKind, DeviceKindType{}),
		                                DictEntry(object::exec, T_Executor{}),
		                                DictEntry(object::launchedWidthFrameSpec, std::bool_constant<false>{}),
		                                DictEntry(object::warpSize, warpSize));
		                        });
		                });

		            queue.setLastEvent(ev);
		            if(queue.isBlocking())
		                ev.wait_and_throw();
		        }
		    };

		    /** Enqueue a kernel described by a `FrameSpec` into a SYCL queue.
		     *
		     * The frame specification is first converted into a thread blocking via `internal::adjustThreadSpec`.
		     * The kernel is then submitted through `queue.dispatchWarpSize`, which supplies the warp size as an
		     * `std::integral_constant`. A kernel is only generated if that warp size matches a bit of
		     * `ALPAKA_SYCL_SUBGROUP_SIZE`; otherwise the aborting `EnqueueKernelWithWarpSize` specialization is
		     * selected. The resulting event is registered as the queue's last event, consistent with the other
		     * queue operations.
		     */
		    template<
		        typename T_Device,
		        alpaka::concepts::Executor T_Executor,
		        alpaka::concepts::Vector T_NumFrames,
		        alpaka::concepts::Vector T_FrameExtents,
		        alpaka::concepts::KernelBundle T_KernelBundle>
		    struct Enqueue::
		        Kernel<syclGeneric::Queue<T_Device>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
		    {
		        /** @param queue queue the kernel is submitted to
		         *  @param frameSpec number of frames and frame extents; its executor must not be
		         *  `exec::anyExecutor`
		         *  @param kernelBundle kernel functor together with its arguments
		         */
		        void operator()(
		            syclGeneric::Queue<T_Device>& queue,
		            FrameSpec<T_NumFrames, T_FrameExtents, T_Executor> const& frameSpec,
		            T_KernelBundle const& kernelBundle) const
		        {
		            static_assert(
		                ALPAKA_TYPEOF(frameSpec)::getExecutor() != exec::anyExecutor,
		                "'exec::anyExecutor' can not be used to enqueue an kernel.");
		            ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::queue);

		            auto const threadBlocking = internal::adjustThreadSpec(*queue.m_device.get(), frameSpec, kernelBundle);

		            constexpr auto st_shared_mem_bytes = onAcc::oneApi::StaticSharedMemory::sizeLookupBufferInBytes(
		                ALPAKA_SYCL_NUM_MAX_SHARED_MEMORY_ALLOCATIONS);

		            // allocate dynamic shared memory -- needs at least 1 byte to make the Xilinx Runtime happy
		            // (uint32_t instead of the non-standard u_int32_t typedef)
		            uint32_t blockDynSharedMemBytes
		                = std::max(uint32_t(1), onHost::getDynSharedMemBytes(threadBlocking, kernelBundle));

		            // static plus dynamic shared memory must fit into the device's local memory
		            assert(
		                st_shared_mem_bytes + blockDynSharedMemBytes
		                <= queue.m_device->getNativeHandle().first.template get_info<sycl::info::device::local_mem_size>());

		            sycl::event ev = queue.dispatchWarpSize(
		                // only callable with an std::integral_constant as warp size
		                [&](auto warpSize) requires std::same_as<
		                    std::integral_constant<
		                        typename ALPAKA_TYPEOF(warpSize)::value_type,
		                        ALPAKA_TYPEOF(warpSize)::value>,
		                    ALPAKA_TYPEOF(warpSize)>
		                {
		                    return queue.m_queue.submit(
		                        [warpSize, threadBlocking, kernelBundle, blockDynSharedMemBytes](sycl::handler& cgh)
		                        {
		                            // queue is only used in unevaluated operands, therefore it is not captured
		                            using ApiType = decltype(getApi(queue));
		                            using DeviceKindType = ALPAKA_TYPEOF(getDeviceKind(queue));
		                            auto st_shared_accessor
		                                = sycl::local_accessor<std::byte>{sycl::range<1>{st_shared_mem_bytes}, cgh};
		                            auto dyn_shared_accessor
		                                = sycl::local_accessor<std::byte>{sycl::range<1>{blockDynSharedMemBytes}, cgh};

		                            auto workerDesc = detail::getWorkerDescription(threadBlocking);
		                            auto optimizedThreadSpec = workerDesc.second;
		                            constexpr uint32_t syclDim = workerDesc.first.dimensions;

		                            constexpr uint32_t w = ALPAKA_TYPEOF(warpSize)::value;

		                            // the third template argument is zero if w is not part of the compile time mask
		                            detail::EnqueueKernelWithWarpSize<syclDim, w, ALPAKA_SYCL_SUBGROUP_SIZE & w>::call(
		                                cgh,
		                                workerDesc.first,
		                                kernelBundle,
		                                st_shared_accessor,
		                                dyn_shared_accessor,
		                                optimizedThreadSpec,
		                                DictEntry(object::api, ApiType{}),
		                                DictEntry(object::deviceKind, DeviceKindType{}),
		                                DictEntry(object::exec, T_Executor{}),
		                                DictEntry(object::launchedWidthFrameSpec, std::bool_constant<true>{}),
		                                DictEntry(object::warpSize, warpSize));
		                        });
		                });

		            // register the event like every other queue operation does
		            queue.setLastEvent(ev);
		            if(queue.isBlocking())
		                ev.wait_and_throw();
		        }
		    };

		} // namespace alpaka::onHost::internal

		#endif
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/Queue.hpp ==
		// ============================================================================

	// #include "alpaka/api/oneApi/executor.hpp"    // amalgamate: file already inlined
	// #include "alpaka/api/syclGeneric/Event.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/atomic.hpp ==
		// ==
		/* Copyright 2025 Jan Stephan, Andrea Bocci, Luca Ferragina
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/api/syclGeneric/tag.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/internal/interface.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/scope.hpp"    // amalgamate: file already inlined
		// #include "alpaka/operation.hpp"    // amalgamate: file already inlined

		#if ALPAKA_LANG_SYCL

		// #    include <sycl/sycl.hpp>    // amalgamate: file already included

		// #    include <cstdint>    // amalgamate: file already included
		#    include <type_traits>

		namespace alpaka::detail
		{
		    //! Maps an alpaka scope tag to a `sycl::memory_scope`. The primary template provides no `value`.
		    template<typename T_Scope>
		    struct SyclMemoryScope
		    {
		    };

		    //! `scope::System` maps to `sycl::memory_scope::system`.
		    template<>
		    struct SyclMemoryScope<alpaka::onAcc::scope::System>
		    {
		        static constexpr sycl::memory_scope value = sycl::memory_scope::system;
		    };

		    //! `scope::Device` maps to `sycl::memory_scope::device`.
		    template<>
		    struct SyclMemoryScope<alpaka::onAcc::scope::Device>
		    {
		        static constexpr sycl::memory_scope value = sycl::memory_scope::device;
		    };

		    //! `scope::Block` maps to `sycl::memory_scope::work_group`.
		    template<>
		    struct SyclMemoryScope<alpaka::onAcc::scope::Block>
		    {
		        static constexpr sycl::memory_scope value = sycl::memory_scope::work_group;
		    };

		    template<typename T, typename T_Scope>
		    using sycl_atomic_ref = sycl::atomic_ref<T, sycl::memory_order::relaxed, SyclMemoryScope<T_Scope>::value>;

		    template<typename T_Scope, typename T, typename TOp>
		    inline auto callAtomicOp(T* const addr, TOp&& op)
		    {
		        auto ref = sycl_atomic_ref<T, T_Scope>{*addr};
		        return op(ref);
		    }

		    /** Compare-and-swap loop that replaces `*addr` with `eval(current value)`.
		     *
		     * @tparam TRef atomic reference type constructible from `*addr`
		     * @param addr address of the value to update
		     * @param eval callable that computes the new value from the value last observed
		     * @return the value observed when the exchange succeeded
		     */
		    template<typename TRef, typename T, typename TEval>
		    inline auto casWithCondition(T* const addr, TEval&& eval)
		    {
		        TRef atomicRef{*addr};
		        auto observed = atomicRef.load();

		        // prefer compare_exchange_weak when in a loop, assuming that eval is not expensive
		        for(;;)
		        {
		            // on failure `observed` is refreshed and the new value is recomputed
		            if(atomicRef.compare_exchange_weak(observed, eval(observed)))
		                break;
		        }

		        return observed;
		    }
		} // namespace alpaka::detail

		namespace alpaka::onAcc::internalCompute
		{
		    // Add.
		    //! SYCL atomic addition: applies `fetch_add(value)` to `*addr` and returns its result.
		    template<typename T, typename T_Scope>
		    struct Atomic::Op<alpaka::operation::Add, onAcc::internal::SyclAtomic, T, T_Scope>
		    {
		        static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");

		        static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
		        {
		            auto fetchAdd = [&value](auto& atomicRef) { return atomicRef.fetch_add(value); };
		            return alpaka::detail::callAtomicOp<T_Scope>(addr, fetchAdd);
		        }
		    };

		    // Sub.
		    //! SYCL atomic subtraction: applies `fetch_sub(value)` to `*addr` and returns its result.
		    template<typename T, typename T_Scope>
		    struct Atomic::Op<alpaka::operation::Sub, onAcc::internal::SyclAtomic, T, T_Scope>
		    {
		        static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");

		        static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
		        {
		            auto fetchSub = [&value](auto& atomicRef) { return atomicRef.fetch_sub(value); };
		            return alpaka::detail::callAtomicOp<T_Scope>(addr, fetchSub);
		        }
		    };

		    // Min.
		    //! SYCL atomic minimum: applies `fetch_min(value)` to `*addr` and returns its result.
		    template<typename T, typename T_Scope>
		    struct Atomic::Op<alpaka::operation::Min, onAcc::internal::SyclAtomic, T, T_Scope>
		    {
		        static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");

		        static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
		        {
		            auto fetchMin = [&value](auto& atomicRef) { return atomicRef.fetch_min(value); };
		            return alpaka::detail::callAtomicOp<T_Scope>(addr, fetchMin);
		        }
		    };

		    // Max.
		    //! SYCL atomic maximum: applies `fetch_max(value)` to `*addr` and returns its result.
		    template<typename T, typename T_Scope>
		    struct Atomic::Op<alpaka::operation::Max, onAcc::internal::SyclAtomic, T, T_Scope>
		    {
		        static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");

		        static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
		        {
		            auto fetchMax = [&value](auto& atomicRef) { return atomicRef.fetch_max(value); };
		            return alpaka::detail::callAtomicOp<T_Scope>(addr, fetchMax);
		        }
		    };

		    // Exch.
		    //! SYCL atomic exchange: applies `exchange(value)` to `*addr` and returns its result.
		    //! Restricted to arithmetic types of 4 or 8 bytes.
		    template<typename T, typename T_Scope>
		    struct Atomic::Op<alpaka::operation::Exch, onAcc::internal::SyclAtomic, T, T_Scope>
		    {
		        static_assert(
		            (std::is_integral_v<T> || std::is_floating_point_v<T>) && (sizeof(T) == 4 || sizeof(T) == 8),
		            "SYCL atomics do not support this type");

		        static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
		        {
		            auto swapIn = [&value](auto& atomicRef) { return atomicRef.exchange(value); };
		            return alpaka::detail::callAtomicOp<T_Scope>(addr, swapIn);
		        }
		    };

		    // Inc.
		    //! SYCL atomic wrapping increment, implemented as a compare-and-swap loop.
		    //! The stored value becomes 0 if it is greater or equal to `value`, otherwise it is incremented by one.
		    //! The value observed before the update is returned.
		    template<typename T, typename T_Scope>
		    struct Atomic::Op<alpaka::operation::Inc, onAcc::internal::SyclAtomic, T, T_Scope>
		    {
		        static_assert(
		            std::is_unsigned_v<T> && (sizeof(T) == 4 || sizeof(T) == 8),
		            "SYCL atomics support only 32- and 64-bits unsigned integral types");

		        static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
		        {
		            using AtomicRef = alpaka::detail::sycl_atomic_ref<T, T_Scope>;

		            // new value as a function of the currently stored one
		            auto nextValue = [&value](auto current)
		            { return (current >= value) ? static_cast<T>(0) : (current + static_cast<T>(1)); };

		            return alpaka::detail::casWithCondition<AtomicRef>(addr, nextValue);
		        }
		    };

		    // Dec.
		    //! SYCL atomic wrapping decrement, implemented as a compare-and-swap loop.
		    //! The stored value becomes `value` if it is 0 or greater than `value`, otherwise it is decremented by
		    //! one. The value observed before the update is returned.
		    template<typename T, typename T_Scope>
		    struct Atomic::Op<alpaka::operation::Dec, onAcc::internal::SyclAtomic, T, T_Scope>
		    {
		        static_assert(
		            std::is_unsigned_v<T> && (sizeof(T) == 4 || sizeof(T) == 8),
		            "SYCL atomics support only 32- and 64-bits unsigned integral types");

		        static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
		        {
		            using AtomicRef = alpaka::detail::sycl_atomic_ref<T, T_Scope>;

		            // new value as a function of the currently stored one
		            auto nextValue = [&value](auto& current)
		            { return ((current == 0) || (current > value)) ? value : (current - static_cast<T>(1)); };

		            return alpaka::detail::casWithCondition<AtomicRef>(addr, nextValue);
		        }
		    };

		    // And.
		    //! SYCL atomic bitwise AND: applies `fetch_and(value)` to `*addr` and returns its result.
		    template<typename T, typename T_Scope>
		    struct Atomic::Op<alpaka::operation::And, onAcc::internal::SyclAtomic, T, T_Scope>
		    {
		        static_assert(std::is_integral_v<T>, "Bitwise operations only supported for integral types.");

		        static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
		        {
		            auto fetchAnd = [&value](auto& atomicRef) { return atomicRef.fetch_and(value); };
		            return alpaka::detail::callAtomicOp<T_Scope>(addr, fetchAnd);
		        }
		    };

		    // Or.
		    //! SYCL atomic bitwise OR: applies `fetch_or(value)` to `*addr` and returns its result.
		    template<typename T, typename T_Scope>
		    struct Atomic::Op<alpaka::operation::Or, onAcc::internal::SyclAtomic, T, T_Scope>
		    {
		        static_assert(std::is_integral_v<T>, "Bitwise operations only supported for integral types.");

		        static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
		        {
		            auto fetchOr = [&value](auto& atomicRef) { return atomicRef.fetch_or(value); };
		            return alpaka::detail::callAtomicOp<T_Scope>(addr, fetchOr);
		        }
		    };

		    // Xor.
		    //! SYCL atomic bitwise XOR: applies `fetch_xor(value)` to `*addr` and returns its result.
		    template<typename T, typename T_Scope>
		    struct Atomic::Op<alpaka::operation::Xor, onAcc::internal::SyclAtomic, T, T_Scope>
		    {
		        static_assert(std::is_integral_v<T>, "Bitwise operations only supported for integral types.");

		        static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& value) -> T
		        {
		            auto fetchXor = [&value](auto& atomicRef) { return atomicRef.fetch_xor(value); };
		            return alpaka::detail::callAtomicOp<T_Scope>(addr, fetchXor);
		        }
		    };

		    // Cas.
		    //! SYCL atomic compare-and-swap: stores `desired` into `*addr` if it currently holds `expected`.
		    //! Returns the local copy of `expected` after the `compare_exchange_strong` call.
		    template<typename T, typename T_Scope>
		    struct Atomic::Op<alpaka::operation::Cas, onAcc::internal::SyclAtomic, T, T_Scope>
		    {
		        static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");

		        static auto atomicOp(onAcc::internal::SyclAtomic const&, T* const addr, T const& expected, T const& desired)
		            -> T
		        {
		            auto compareAndSwap = [&expected, &desired](auto& atomicRef)
		            {
		                // Work on a copy: `expected` is const and compare_exchange_strong takes its first
		                // argument by mutable reference.
		                auto observed = expected;

		                // The boolean result (exchange happened or not) is ignored; the alpaka API does not use it.
		                atomicRef.compare_exchange_strong(observed, desired);

		                // The copy of `expected`, which compare_exchange_strong may have updated, is handed back.
		                return observed;
		            };

		            return alpaka::detail::callAtomicOp<T_Scope>(addr, compareAndSwap);
		        }
		    };
		} // namespace alpaka::onAcc::internalCompute

		#endif
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/atomic.hpp ==
		// ============================================================================

		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/math.hpp ==
		// ==
		/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bert Wesarg, Valentin Gehrke, René Widera,
		 * Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Jeffrey Kelling, Sergei Bastrakov, Mehmet Yusufoglu
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
		// #include "alpaka/api/syclGeneric/tag.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/Complex.hpp ==
			// ==
			/* Copyright 2024 Sergei Bastrakov, Aurora Perego
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/internal/Complex.hpp ==
				// ==
				/* Copyright 2024 Sergei Bastrakov, Aurora Perego
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math.hpp ==
					// ==
					/* Copyright 2024 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
					// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/math.hpp ==
						// ==
						/* Copyright 2025 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/math.hpp ==
							// ==
							/* Copyright 2025 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							// #include "alpaka/api/cuda/Api.hpp"    // amalgamate: file already inlined
							// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
							// #include "alpaka/api/unifiedCudaHip/tag.hpp"    // amalgamate: file already inlined

							namespace alpaka::trait
							{
							    //! Specialization of GetMathImpl::Op for alpaka::api::Cuda.
							    //!
							    //! Calling the functor returns alpaka::math::internal::cudaHipMath.
							    template<>
							    struct GetMathImpl::Op<alpaka::api::Cuda>
							    {
							        //! The unnamed API argument is not read; only its type takes part in the call.
							        constexpr decltype(auto) operator()(alpaka::api::Cuda const) const
							        {
							            // NOTE(review): with decltype(auto) and an unparenthesized name the return type is the
							            // declared type of cudaHipMath - confirm against its declaration before restyling.
							            return alpaka::math::internal::cudaHipMath;
							        }
							    };
							} // namespace alpaka::trait
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/math.hpp ==
							// ============================================================================

							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/math.hpp ==
							// ==
							/* Copyright 2025 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							// #include "alpaka/api/hip/Api.hpp"    // amalgamate: file already inlined
							// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
							// #include "alpaka/api/unifiedCudaHip/tag.hpp"    // amalgamate: file already inlined

							namespace alpaka::trait
							{
							    //! Specialization of GetMathImpl::Op for alpaka::api::Hip.
							    //!
							    //! Calling the functor returns alpaka::math::internal::cudaHipMath.
							    template<>
							    struct GetMathImpl::Op<alpaka::api::Hip>
							    {
							        //! The unnamed API argument is not read; only its type takes part in the call.
							        constexpr decltype(auto) operator()(alpaka::api::Hip const) const
							        {
							            // NOTE(review): with decltype(auto) and an unparenthesized name the return type is the
							            // declared type of cudaHipMath - confirm against its declaration before restyling.
							            return alpaka::math::internal::cudaHipMath;
							        }
							    };
							} // namespace alpaka::trait
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/math.hpp ==
							// ============================================================================

							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/math.hpp ==
							// ==
							/* Copyright 2025 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							// #include "alpaka/api/oneApi/Api.hpp"    // amalgamate: file already inlined
							// #include "alpaka/api/syclGeneric/tag.hpp"    // amalgamate: file already inlined
							// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined

							namespace alpaka::trait
							{
							    //! Specialization of GetMathImpl::Op for alpaka::api::OneApi.
							    //!
							    //! Calling the functor returns alpaka::math::internal::syclMath.
							    template<>
							    struct GetMathImpl::Op<alpaka::api::OneApi>
							    {
							        //! The unnamed API argument is not read; only its type takes part in the call.
							        constexpr decltype(auto) operator()(alpaka::api::OneApi const) const
							        {
							            // NOTE(review): with decltype(auto) and an unparenthesized name the return type is the
							            // declared type of syclMath - confirm against its declaration before restyling.
							            return alpaka::math::internal::syclMath;
							        }
							    };
							} // namespace alpaka::trait
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/math.hpp ==
							// ============================================================================

						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/math.hpp ==
						// ============================================================================

					// #include "alpaka/math/internal/math.hpp"    // amalgamate: file already inlined

					// #include <cmath>    // amalgamate: file already included

					namespace alpaka::math
					{
					    constexpr auto abs(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Abs::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto sin(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Sin::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto acosh(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Acosh::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto asinh(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Asinh::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto sinh(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Sinh::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto atan(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Atan::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto trunc(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Trunc::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto isinf(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Isinf::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto isfinite(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Isfinite::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto atanh(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Atanh::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto tanh(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Tanh::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto cbrt(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Cbrt::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto ceil(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Ceil::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    /** Computes the nearest integer value to arg (in floating-point format), rounding halfway cases away from zero,
					     * regardless of the current rounding mode.
					     */
					    constexpr auto round(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Round::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    /** Computes the nearest integer value to arg (in in integer format), rounding halfway cases away from zero,
					     * regardless of the current rounding mode.
					     */
					    constexpr auto lround(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Lround::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    /** Computes the nearest integer value to arg (in in integer format), rounding halfway cases away from zero,
					     * regardless of the current rounding mode.
					     */
					    constexpr auto llround(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Llround::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    /** Creates a value with the magnitude of mag and the sign of sgn. */
					    constexpr auto copysign(auto const& mag, auto const& sgn)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Copysign::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(mag), ALPAKA_TYPEOF(sgn)>{}(
					            mathImpl,
					            mag,
					            sgn);
					    }

					    constexpr auto sincos(auto const& arg, auto& result_sin, auto& result_cos)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::SinCos::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(
					            mathImpl,
					            arg,
					            result_sin,
					            result_cos);
					    }

					    constexpr auto exp(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Exp::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto arg(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Arg::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto atan2(auto const& y, auto const& x)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Atan2::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(y), ALPAKA_TYPEOF(x)>{}(mathImpl, y, x);
					    }

					    // Square root function
					    constexpr auto sqrt(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Sqrt::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    /* Computes the rsqrt.
					     *
					     * Valid real arguments are positive. For other values the result
					     * may depend on the backend and compilation options, will likely
					     * be NaN.
					     */
					    constexpr auto rsqrt(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Rsqrt::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    // Cosine function
					    constexpr auto cos(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Cos::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto cosh(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Cosh::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto erf(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Erf::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto floor(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Floor::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    /** Computes the natural (base e) logarithm of arg.
					     *
					     * Valid real arguments are non-negative. For other values the result
					     * may depend on the backend and compilation options, will likely
					     * be NaN.
					     */
					    constexpr auto log(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Log::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    /** Computes the natural (base 2) logarithm of arg.
					     *
					     * Valid real arguments are non-negative. For other values the result
					     * may depend on the backend and compilation options, will likely
					     * be NaN.
					     */
					    constexpr auto log2(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Log2::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    /* Computes the natural (base 10) logarithm of arg.
					     *
					     * Valid real arguments are non-negative. For other values the result
					     * may depend on the backend and compilation options, will likely
					     * be NaN.
					     */
					    constexpr auto log10(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Log10::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    // Tangent function
					    constexpr auto tan(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Tan::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    // Arc cosine function
					    constexpr auto acos(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Acos::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    // Arc sine function
					    constexpr auto asin(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Asin::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto isnan(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Isnan::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    //! Computes the complex conjugate of arg.
					    constexpr auto conj(auto const& arg)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Conj::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(arg)>{}(mathImpl, arg);
					    }

					    constexpr auto min(auto const& a, auto const& b)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Min::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(a), ALPAKA_TYPEOF(b)>{}(mathImpl, a, b);
					    }

					    constexpr auto max(auto const& a, auto const& b)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Max::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(a), ALPAKA_TYPEOF(b)>{}(mathImpl, a, b);
					    }

					    constexpr auto pow(auto const& base, auto const& exp)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Pow::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(base), ALPAKA_TYPEOF(exp)>{}(
					            mathImpl,
					            base,
					            exp);
					    }

					    constexpr auto fmod(auto const& x, auto const& y)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Fmod::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(x), ALPAKA_TYPEOF(y)>{}(mathImpl, x, y);
					    }

					    constexpr auto remainder(auto const& x, auto const& y)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Remainder::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(x), ALPAKA_TYPEOF(y)>{}(mathImpl, x, y);
					    }

					    constexpr auto fma(auto const& x, auto const& y, auto const& z)
					    {
					        auto const mathImpl = trait::getMathImpl(thisApi());
					        return internal::Fma::Op<ALPAKA_TYPEOF(mathImpl), ALPAKA_TYPEOF(x), ALPAKA_TYPEOF(y), ALPAKA_TYPEOF(z)>{}(
					            mathImpl,
					            x,
					            y,
					            z);
					    }

					} // namespace alpaka::math
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math.hpp ==
					// ============================================================================

					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/floatEqualExact.hpp ==
					// ==
					/* Copyright 2021 Jiri Vyskocil
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
					// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

					#include <type_traits>

					namespace alpaka
					{
					    namespace math
					    {
					        /** Compare two floating point numbers for exact equivalence. Use only when necessary, and be aware of the
					         * implications. Most codes should not use this function and instead implement a correct epsilon-based
					         * comparison. If you are unfamiliar with the topic, check out
					         * https://www.geeksforgeeks.org/problem-in-comparing-floating-point-numbers-and-how-to-compare-them-correctly/
					         * or Goldberg 1991: "What every computer scientist should know about floating-point arithmetic",
					         * https://dl.acm.org/doi/10.1145/103162.103163
					         *
					         * This function calls the == operator for floating point types, but disables the warning issued by the
					         * compiler when compiling with the float equality warning checks enabled. This warning is valid and
					         * valuable in most codes and should be generally enabled, but there are specific instances where a piece
					         * of code might need to do an exact comparison (e.g. @a CudaVectorArrayWrapperTest.cpp). The verbose name
					         * for the function is intentional as it should raise a red flag if used while not absolutely needed.
					         * Users are advised to add a justification whenever they use this function.
					         *
					         * @tparam T both operands have to be the same type and conform to std::is_floating_point
					         * @param a first operand
					         * @param b second operand
					         * @return a == b
					         */
					        template<typename T>
					        ALPAKA_FN_INLINE constexpr auto floatEqualExactNoWarning(T a, T b) -> bool
					        {
					            // Reject non-floating-point instantiations at compile time.
					            static_assert(std::is_floating_point_v<T>, "floatEqualExactNoWarning is for floating point values only!");

					            // So far only GCC and Clang check for float comparison and both accept the GCC pragmas.
					            // The push/pop pair brackets only the comparison below, so -Wfloat-equal is restored right after it.
					#ifdef __GNUC__
					#    pragma GCC diagnostic push
					#    pragma GCC diagnostic ignored "-Wfloat-equal"
					#endif
					            return a == b;
					#ifdef __GNUC__
					#    pragma GCC diagnostic pop
					#endif
					        }
					    } // namespace math
					} // namespace alpaka
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/floatEqualExact.hpp ==
					// ============================================================================

				// #include "alpaka/trait.hpp"    // amalgamate: file already inlined
				// #include "math.hpp"    // amalgamate: file already inlined

				// #include <cmath>    // amalgamate: file already included
				// #include <complex>    // amalgamate: file already included
				// #include <iostream>    // amalgamate: file already included
				#include <type_traits>

				namespace alpaka::math
				{
				    namespace internal
				    {
				        //! Implementation of a complex number useable on host and device.
				        //!
				        //! It follows the layout of std::complex and so array-oriented access.
				        //! The class template implements all methods and operators as std::complex<T>.
				        //! Additionally, it provides an implicit conversion to and from std::complex<T>.
				        //! All methods besides operators << and >> are host-device.
				        //! It does not provide non-member functions of std::complex besides the operators.
				        //! Those are provided the same way as alpaka math functions for real numbers.
				        //!
				        //! Note that unlike most of alpaka, this is a concrete type template, and not merely a concept.
				        //!
				        //! Naming and order of the methods match https://en.cppreference.com/w/cpp/numeric/complex in C++17.
				        //! Implementation chose to not extend it e.g. by adding constexpr to some places that would get it in C++20.
				        //! The motivation is that with internal conversion to std::complex<T> for CPU backends, it would define the
				        //! common interface for generic code anyways. So it is more clear to have alpaka's interface exactly matching
				        //! when possible, and not "improving".
				        //!
				        //! @tparam T type of the real and imaginary part: float, double, or long double.
				        template<typename T>
				        class Complex
				        {
				        public:
				            // Only floating-point component types are accepted.
				            static_assert(std::is_floating_point_v<T>);

				            //! Scalar type shared by the real and the imaginary component.
				            using value_type = T;

				            //! Builds a complex number from its two components; each defaults to T{}.
				            constexpr Complex(T const& real = T{}, T const& imag = T{}) : m_real{real}, m_imag{imag}
				            {
				            }

				            //! Copy constructor.
				            constexpr Complex(Complex const& other) = default;

				            //! Converting constructor: both components of a Complex<U> are cast to T.
				            template<typename U>
				            constexpr Complex(Complex<U> const& other)
				                : m_real{static_cast<T>(other.real())}
				                , m_imag{static_cast<T>(other.imag())}
				            {
				            }

				            //! Implicit construction from std::complex<T>.
				            constexpr Complex(std::complex<T> const& other) : m_real{other.real()}, m_imag{other.imag()}
				            {
				            }

				            //! Implicit conversion to std::complex<T>.
				            constexpr operator std::complex<T>() const
				            {
				                return std::complex<T>(m_real, m_imag);
				            }

				            //! Copy assignment.
				            Complex& operator=(Complex const&) = default;

				            //! @return the real part
				            constexpr T real() const
				            {
				                return m_real;
				            }

				            //! Overwrites the real part with `value`.
				            constexpr void real(T value)
				            {
				                m_real = value;
				            }

				            //! @return the imaginary part
				            constexpr T imag() const
				            {
				                return m_imag;
				            }

				            //! Overwrites the imaginary part with `value`.
				            constexpr void imag(T value)
				            {
				                m_imag = value;
				            }

				            //! Adds a real number; only the real part changes.
				            constexpr Complex& operator+=(T const& other)
				            {
				                m_real += other;
				                return *this;
				            }

				            //! Adds a complex number component-wise; the operand's components are cast to T first.
				            template<typename U>
				            constexpr Complex& operator+=(Complex<U> const& other)
				            {
				                auto const otherReal = static_cast<T>(other.real());
				                auto const otherImag = static_cast<T>(other.imag());
				                m_real += otherReal;
				                m_imag += otherImag;
				                return *this;
				            }

				            //! Subtracts a real number; only the real part changes.
				            constexpr Complex& operator-=(T const& other)
				            {
				                m_real -= other;
				                return *this;
				            }

				            //! Subtracts a complex number component-wise; the operand's components are cast to T first.
				            template<typename U>
				            constexpr Complex& operator-=(Complex<U> const& other)
				            {
				                auto const otherReal = static_cast<T>(other.real());
				                auto const otherImag = static_cast<T>(other.imag());
				                m_real -= otherReal;
				                m_imag -= otherImag;
				                return *this;
				            }

				            //! Scales both components by a real number.
				            constexpr Complex& operator*=(T const& other)
				            {
				                m_real *= other;
				                m_imag *= other;
				                return *this;
				            }

				            //! Multiplies by a complex number: (a + bi)(c + di) = (ac - bd) + (bc + ad)i.
				            template<typename U>
				            constexpr Complex& operator*=(Complex<U> const& other)
				            {
				                auto const otherReal = static_cast<T>(other.real());
				                auto const otherImag = static_cast<T>(other.imag());
				                // Both results are computed before either member is overwritten.
				                auto const productReal = m_real * otherReal - m_imag * otherImag;
				                auto const productImag = m_imag * otherReal + m_real * otherImag;
				                m_real = productReal;
				                m_imag = productImag;
				                return *this;
				            }

				            //! Divides both components by a real number.
				            constexpr Complex& operator/=(T const& other)
				            {
				                m_real /= other;
				                m_imag /= other;
				                return *this;
				            }

				            //! Divides by a complex number by multiplying with its reciprocal.
				            //!
				            //! The reciprocal is computed in U and only its two components are cast to T.
				            template<typename U>
				            constexpr Complex& operator/=(Complex<U> const& other)
				            {
				                auto const otherReal = other.real();
				                auto const otherImag = other.imag();
				                auto const normSquared = otherReal * otherReal + otherImag * otherImag;
				                Complex const reciprocal{
				                    static_cast<T>(otherReal / normSquared),
				                    static_cast<T>(-otherImag / normSquared)};
				                return *this *= reciprocal;
				            }

				        private:
				            //! Real part; declared first so that storage order is {real, imag} (array-oriented access).
				            T m_real;
				            //! Imaginary part.
				            T m_imag;
				        };

				        //! Host-device arithmetic operations matching std::complex<T>.
				        //!
				        //! They take and return alpaka::math::Complex.
				        //!
				        //! @{
				        //!

				        //! Unary plus: returns a copy of the operand (provided for compatibility with std::complex).
				        template<typename T>
				        constexpr Complex<T> operator+(Complex<T> const& val)
				        {
				            return Complex<T>{val};
				        }

				        //! Unary minus: negates both the real and the imaginary part.
				        template<typename T>
				        constexpr Complex<T> operator-(Complex<T> const& val)
				        {
				            auto const negatedReal = -val.real();
				            auto const negatedImag = -val.imag();
				            return Complex<T>{negatedReal, negatedImag};
				        }

				        //! Sum of two complex numbers, formed component-wise.
				        template<typename T>
				        constexpr Complex<T> operator+(Complex<T> const& lhs, Complex<T> const& rhs)
				        {
				            auto const sumReal = lhs.real() + rhs.real();
				            auto const sumImag = lhs.imag() + rhs.imag();
				            return Complex<T>{sumReal, sumImag};
				        }

				        //! Sum of a complex and a real number; the imaginary part of `lhs` is carried over unchanged.
				        template<typename T>
				        constexpr Complex<T> operator+(Complex<T> const& lhs, T const& rhs)
				        {
				            auto const sumReal = lhs.real() + rhs;
				            return Complex<T>{sumReal, lhs.imag()};
				        }

				        //! Sum of a real and a complex number; the imaginary part of `rhs` is carried over unchanged.
				        template<typename T>
				        constexpr Complex<T> operator+(T const& lhs, Complex<T> const& rhs)
				        {
				            auto const sumReal = lhs + rhs.real();
				            return Complex<T>{sumReal, rhs.imag()};
				        }

				        //! Difference of two complex numbers, formed component-wise.
				        template<typename T>
				        constexpr Complex<T> operator-(Complex<T> const& lhs, Complex<T> const& rhs)
				        {
				            auto const diffReal = lhs.real() - rhs.real();
				            auto const diffImag = lhs.imag() - rhs.imag();
				            return Complex<T>{diffReal, diffImag};
				        }

				        //! Difference of a complex and a real number; the imaginary part of `lhs` is carried over unchanged.
				        template<typename T>
				        constexpr Complex<T> operator-(Complex<T> const& lhs, T const& rhs)
				        {
				            auto const diffReal = lhs.real() - rhs;
				            return Complex<T>{diffReal, lhs.imag()};
				        }

				        //! Difference of a real and a complex number; the imaginary part of the result is -rhs.imag().
				        template<typename T>
				        constexpr Complex<T> operator-(T const& lhs, Complex<T> const& rhs)
				        {
				            auto const diffReal = lhs - rhs.real();
				            auto const diffImag = -rhs.imag();
				            return Complex<T>{diffReal, diffImag};
				        }

				        //! Multiplication of two complex numbers: (a + ib)(c + id) = (ac - bd) + i(bc + ad)
				        template<typename T>
				        constexpr Complex<T> operator*(Complex<T> const& lhs, Complex<T> const& rhs)
				        {
				            auto const realPart = lhs.real() * rhs.real() - lhs.imag() * rhs.imag();
				            auto const imagPart = lhs.imag() * rhs.real() + lhs.real() * rhs.imag();
				            return Complex<T>{realPart, imagPart};
				        }

				        //! Multiplication of a complex number by a real number (scales both parts)
				        template<typename T>
				        constexpr Complex<T> operator*(Complex<T> const& lhs, T const& rhs)
				        {
				            auto const scaledReal = lhs.real() * rhs;
				            auto const scaledImag = lhs.imag() * rhs;
				            return Complex<T>{scaledReal, scaledImag};
				        }

				        //! Multiplication of a real number by a complex number (scales both parts)
				        template<typename T>
				        constexpr Complex<T> operator*(T const& lhs, Complex<T> const& rhs)
				        {
				            auto const scaledReal = lhs * rhs.real();
				            auto const scaledImag = lhs * rhs.imag();
				            return Complex<T>{scaledReal, scaledImag};
				        }

				        //! Division of two complex numbers: the numerator is multiplied by the conjugate of rhs and the
				        //! result is divided by the squared magnitude of rhs
				        template<typename T>
				        constexpr Complex<T> operator/(Complex<T> const& lhs, Complex<T> const& rhs)
				        {
				            auto const realNumerator = lhs.real() * rhs.real() + lhs.imag() * rhs.imag();
				            auto const imagNumerator = lhs.imag() * rhs.real() - lhs.real() * rhs.imag();
				            auto const rhsNormSquared = rhs.real() * rhs.real() + rhs.imag() * rhs.imag();
				            return Complex<T>{realNumerator / rhsNormSquared, imagNumerator / rhsNormSquared};
				        }

				        //! Division of a complex number by a real number (divides both parts)
				        template<typename T>
				        constexpr Complex<T> operator/(Complex<T> const& lhs, T const& rhs)
				        {
				            auto const realQuotient = lhs.real() / rhs;
				            auto const imagQuotient = lhs.imag() / rhs;
				            return Complex<T>{realQuotient, imagQuotient};
				        }

				        //! Division of a real number by a complex number: lhs * conj(rhs) / |rhs|^2
				        template<typename T>
				        constexpr Complex<T> operator/(T const& lhs, Complex<T> const& rhs)
				        {
				            auto const rhsNormSquared = rhs.real() * rhs.real() + rhs.imag() * rhs.imag();
				            auto const realPart = lhs * rhs.real() / rhsNormSquared;
				            auto const imagPart = -lhs * rhs.imag() / rhsNormSquared;
				            return Complex<T>{realPart, imagPart};
				        }

				        //! Equality of two complex numbers: both parts must compare exactly equal
				        template<typename T>
				        constexpr bool operator==(Complex<T> const& lhs, Complex<T> const& rhs)
				        {
				            if(!math::floatEqualExactNoWarning(lhs.real(), rhs.real()))
				                return false;
				            return math::floatEqualExactNoWarning(lhs.imag(), rhs.imag());
				        }

				        //! Equality of a complex and a real number: the real parts must match and the imaginary part
				        //! must be exactly zero
				        template<typename T>
				        constexpr bool operator==(Complex<T> const& lhs, T const& rhs)
				        {
				            if(!math::floatEqualExactNoWarning(lhs.real(), rhs))
				                return false;
				            return math::floatEqualExactNoWarning(lhs.imag(), static_cast<T>(0));
				        }

				        //! Equality of a real and a complex number: the real parts must match and the imaginary part
				        //! must be exactly zero
				        template<typename T>
				        constexpr bool operator==(T const& lhs, Complex<T> const& rhs)
				        {
				            if(!math::floatEqualExactNoWarning(lhs, rhs.real()))
				                return false;
				            return math::floatEqualExactNoWarning(static_cast<T>(0), rhs.imag());
				        }

				        //! Inequality of two complex numbers, defined as the negation of operator==.
				        //!
				        //! @note This and the other operator!= overloads should be removed from C++20 on, as was done for
				        //! std::complex.
				        template<typename T>
				        constexpr bool operator!=(Complex<T> const& lhs, Complex<T> const& rhs)
				        {
				            bool const areEqual = (lhs == rhs);
				            return !areEqual;
				        }

				        //! Inequality of a complex and a real number: true unless the real parts match and the
				        //! imaginary part is exactly zero
				        template<typename T>
				        constexpr bool operator!=(Complex<T> const& lhs, T const& rhs)
				        {
				            return !(
				                math::floatEqualExactNoWarning(lhs.real(), rhs)
				                && math::floatEqualExactNoWarning(lhs.imag(), static_cast<T>(0)));
				        }

				        //! Inequality of a real and a complex number: true unless the real parts match and the
				        //! imaginary part is exactly zero
				        template<typename T>
				        constexpr bool operator!=(T const& lhs, Complex<T> const& rhs)
				        {
				            return !(
				                math::floatEqualExactNoWarning(lhs, rhs.real())
				                && math::floatEqualExactNoWarning(static_cast<T>(0), rhs.imag()));
				        }

				        //! @}

				        //! Host-only output of a complex number.
				        //!
				        //! The value is converted to std::complex<T> and written with the standard stream inserter.
				        template<typename T, typename TChar, typename TTraits>
				        std::basic_ostream<TChar, TTraits>& operator<<(std::basic_ostream<TChar, TTraits>& os, Complex<T> const& x)
				        {
				            std::complex<T> const stdValue = x.operator std::complex<T>();
				            os << stdValue;
				            return os;
				        }

				        //! Host-only input of a complex number.
				        //!
				        //! Reads a std::complex<T> with the standard stream extractor and assigns it to @p x.
				        //!
				        //! @param is input stream to read from
				        //! @param x  destination; taken by non-const reference because it is assigned to
				        //! @return the stream @p is
				        template<typename T, typename TChar, typename TTraits>
				        std::basic_istream<TChar, TTraits>& operator>>(std::basic_istream<TChar, TTraits>& is, Complex<T>& x)
				        {
				            std::complex<T> z;
				            is >> z;
				            x = z;
				            return is;
				        }

				        //! Host-only math functions matching std::complex<T>.
				        //!
				        //! Due to issue #1688, these functions are technically marked host-device and suppress related warnings.
				        //! However, they must be called for host only.
				        //!
				        //! They take and return alpaka::math::Complex (or a real number when appropriate).
				        //! Internally cast, fall back to std::complex implementation and cast back.
				        //! These functions can be used directly on the host side.
				        //! They are also picked up by ADL in math traits for CPU backends.
				        //!
				        //! On the device side, alpaka math traits must be used instead.
				        //! Note that the set of the traits is currently a bit smaller.
				        //!
				        //! @{
				        //!

				        //! Absolute value, computed by std::abs on the std::complex equivalent
				        template<typename T>
				        constexpr T abs(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::abs(stdValue);
				        }

				        //! Arc cosine, computed by std::acos on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> acos(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::acos(stdValue);
				        }

				        //! Arc hyperbolic cosine, computed by std::acosh on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> acosh(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::acosh(stdValue);
				        }

				        //! Argument (phase angle), computed by std::arg on the std::complex equivalent
				        template<typename T>
				        constexpr T arg(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::arg(stdValue);
				        }

				        //! Arc sine, computed by std::asin on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> asin(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::asin(stdValue);
				        }

				        //! Arc hyperbolic sine, computed by std::asinh on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> asinh(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::asinh(stdValue);
				        }

				        //! Arc tangent, computed by std::atan on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> atan(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::atan(stdValue);
				        }

				        //! Arc hyperbolic tangent, computed by std::atanh on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> atanh(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::atanh(stdValue);
				        }

				        //! Complex conjugate, computed by std::conj on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> conj(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::conj(stdValue);
				        }

				        //! Cosine, computed by std::cos on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> cos(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::cos(stdValue);
				        }

				        //! Hyperbolic cosine, computed by std::cosh on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> cosh(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::cosh(stdValue);
				        }

				        //! Exponential, computed by std::exp on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> exp(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::exp(stdValue);
				        }

				        //! Natural logarithm, computed by std::log on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> log(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::log(stdValue);
				        }

				        //! Base 10 logarithm, computed by std::log10 on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> log10(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::log10(stdValue);
				        }

				        //! Squared magnitude, computed by std::norm on the std::complex equivalent
				        template<typename T>
				        constexpr T norm(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::norm(stdValue);
				        }

				        //! Build a complex number from magnitude @p r and phase angle @p theta via std::polar.
				        //! The phase defaults to a value-initialized T.
				        template<typename T>
				        constexpr Complex<T> polar(T const& r, T const& theta = T())
				        {
				            auto const stdResult = std::polar(r, theta);
				            return stdResult;
				        }

				        //! Complex power of a complex number.
				        //!
				        //! The value type of the result is whatever std::pow yields for std::complex<T> and std::complex<U>.
				        template<typename T, typename U>
				        constexpr auto pow(Complex<T> const& x, Complex<U> const& y)
				        {
				            auto const stdBase = static_cast<std::complex<T>>(x);
				            auto const stdExponent = static_cast<std::complex<U>>(y);
				            // Let std::pow decide the type promotion
				            auto const stdPower = std::pow(stdBase, stdExponent);
				            using PromotedValue = typename decltype(stdPower)::value_type;
				            return Complex<PromotedValue>(stdPower);
				        }

				        //! Real power of a complex number: the exponent is wrapped into Complex<U> and the call is
				        //! forwarded to pow again
				        template<typename T, typename U>
				        constexpr auto pow(Complex<T> const& x, U const& y)
				        {
				            Complex<U> const complexExponent(y);
				            return pow(x, complexExponent);
				        }

				        //! Complex power of a real number
				        template<typename T, typename U>
				        constexpr auto pow(T const& x, Complex<U> const& y)
				        {
				            return pow(Complex<T>(x), y);
				        }

				        //! Projection onto the Riemann sphere, computed by std::proj on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> proj(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::proj(stdValue);
				        }

				        //! Sine, computed by std::sin on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> sin(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::sin(stdValue);
				        }

				        //! Hyperbolic sine, computed by std::sinh on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> sinh(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::sinh(stdValue);
				        }

				        //! Square root, computed by std::sqrt on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> sqrt(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::sqrt(stdValue);
				        }

				        //! Tangent, computed by std::tan on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> tan(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::tan(stdValue);
				        }

				        //! Hyperbolic tangent, computed by std::tanh on the std::complex equivalent
				        template<typename T>
				        constexpr Complex<T> tanh(Complex<T> const& x)
				        {
				            auto const stdValue = static_cast<std::complex<T>>(x);
				            return std::tanh(stdValue);
				        }

				        //! @}
				    } // namespace internal

				    // Make internal::Complex available directly in the enclosing namespace.
				    using internal::Complex;

				#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP || ALPAKA_LANG_SYCL

				    namespace internal
				    {
				        //! The abs trait specialization for complex types.
				        template<typename T_MathImpl, typename T_Arg>
				        struct Abs::Op<T_MathImpl, Complex<T_Arg>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T_Arg> const& arg) const
				            {
				                // |z| = sqrt(re^2 + im^2)
				                auto const re = arg.real();
				                auto const im = arg.imag();
				                return math::sqrt(re * re + im * im);
				            }
				        };

				        //! The acos trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Acos::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // This holds everywhere, including the branch cuts: acos(z) = -i * ln(z + i * sqrt(1 - z^2))
				                Complex<T> const minusI{static_cast<T>(0.0), static_cast<T>(-1.0)};
				                Complex<T> const plusI{static_cast<T>(0.0), static_cast<T>(1.0)};
				                auto const root = math::sqrt(static_cast<T>(1.0) - arg * arg);
				                return minusI * math::log(arg + plusI * root);
				            }
				        };

				        //! The acosh trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Acosh::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // acosh(z) = ln(z + sqrt(z-1) * sqrt(z+1))
				                auto const rootMinus = math::sqrt(arg - static_cast<T>(1.0));
				                auto const rootPlus = math::sqrt(arg + static_cast<T>(1.0));
				                return math::log(arg + rootMinus * rootPlus);
				            }
				        };

				        //! The arg trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Arg::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& argument) const
				            {
				                // arg(z) = atan2(im, re)
				                return math::atan2(argument.imag(), argument.real());
				            }
				        };

				        //! The asin trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Asin::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // This holds everywhere, including the branch cuts: asin(z) = i * ln(sqrt(1 - z^2) - i * z)
				                Complex<T> const plusI{static_cast<T>(0.0), static_cast<T>(1.0)};
				                auto const root = math::sqrt(static_cast<T>(1.0) - arg * arg);
				                return plusI * math::log(root - plusI * arg);
				            }
				        };

				        //! The asinh trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Asinh::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // asinh(z) = ln(z + sqrt(z^2 + 1))
				                auto const root = math::sqrt(arg * arg + static_cast<T>(1.0));
				                return math::log(arg + root);
				            }
				        };

				        //! The atan trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Atan::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // This holds everywhere, including the branch cuts: atan(z) = -i/2 * ln((i - z) / (i + z))
				                Complex<T> const plusI{static_cast<T>(0.0), static_cast<T>(1.0)};
				                Complex<T> const minusHalfI{static_cast<T>(0.0), static_cast<T>(-0.5)};
				                auto const ratio = (plusI - arg) / (plusI + arg);
				                return minusHalfI * math::log(ratio);
				            }
				        };

				        //! The atanh trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Atanh::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // atanh(z) = 0.5 * (ln(1 + z) - ln(1 - z))
				                auto const logOnePlus = math::log(static_cast<T>(1.0) + arg);
				                auto const logOneMinus = math::log(static_cast<T>(1.0) - arg);
				                return static_cast<T>(0.5) * (logOnePlus - logOneMinus);
				            }
				        };

				        //! The conj trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Conj::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl const& /* conj_ctx */, Complex<T> const& arg) const
				            {
				                // conj(re + i * im) = re - i * im
				                auto const re = arg.real();
				                auto const negatedIm = -arg.imag();
				                return Complex<T>{re, negatedIm};
				            }
				        };

				        //! The cos trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Cos::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // cos(z) = 0.5 * (exp(i * z) + exp(-i * z))
				                Complex<T> const plusI{static_cast<T>(0.0), static_cast<T>(1.0)};
				                Complex<T> const minusI{static_cast<T>(0.0), static_cast<T>(-1.0)};
				                auto const expPlus = math::exp(plusI * arg);
				                auto const expMinus = math::exp(minusI * arg);
				                return T(0.5) * (expPlus + expMinus);
				            }
				        };

				        //! The cosh trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Cosh::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // cosh(z) = 0.5 * (exp(z) + exp(-z))
				                auto const expPlus = math::exp(arg);
				                auto const expMinus = math::exp(static_cast<T>(-1.0) * arg);
				                return T(0.5) * (expPlus + expMinus);
				            }
				        };

				        //! The exp trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Exp::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // exp(z) = exp(x + iy) = exp(x) * (cos(y) + i * sin(y))
				                auto cosIm = T{};
				                auto sinIm = T{};
				                math::sincos(arg.imag(), sinIm, cosIm);
				                auto const magnitude = math::exp(arg.real());
				                return magnitude * Complex<T>{cosIm, sinIm};
				            }
				        };

				        //! The log trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Log::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& argument) const
				            {
				                // Principal value ln(z) = ln(|z|) + i * arg(z); the branch cut runs along the negative
				                // real axis (same as for std::complex).
				                auto const logModulus = math::log(math::abs(argument));
				                auto const phase = math::arg(argument);
				                return logModulus + Complex<T>{static_cast<T>(0.0), static_cast<T>(1.0)} * phase;
				            }
				        };

				        //! The log2 trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Log2::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& argument) const
				            {
				                // log2(z) = ln(z) / ln(2)
				                auto const naturalLog = math::log(argument);
				                auto const lnBase = math::log(static_cast<T>(2));
				                return naturalLog / lnBase;
				            }
				        };

				        //! The log10 trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Log10::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& argument) const
				            {
				                // log10(z) = ln(z) / ln(10)
				                auto const naturalLog = math::log(argument);
				                auto const lnBase = math::log(static_cast<T>(10));
				                return naturalLog / lnBase;
				            }
				        };

				        //! The pow trait specialization for complex types.
				        template<typename T_MathImpl, typename T, typename U>
				        struct Pow::Op<T_MathImpl, Complex<T>, Complex<U>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& base, Complex<U> const& exponent) const
				            {
				                // Simplified version of the complex std::pow promotion rules: our math only supports float
				                // and double (no long double), so the result is float only if both inputs are float.
				                using PromotedValue
				                    = std::conditional_t<is_decayed_v<T, float> && is_decayed_v<U, float>, float, double>;
				                using Promoted = Complex<PromotedValue>;
				                // pow(z1, z2) = e^(z2 * log(z1))
				                auto const logBase = math::log(Promoted{base});
				                return math::exp(Promoted{exponent} * logBase);
				            }
				        };

				        //! The pow trait specialization for a complex base and a real exponent.
				        template<typename T_MathImpl, typename T, typename U>
				        struct Pow::Op<T_MathImpl, Complex<T>, U>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& base, U const& exponent) const
				            {
				                // Wrap the real exponent into a complex number and reuse math::pow
				                Complex<U> const complexExponent{exponent};
				                return math::pow(base, complexExponent);
				            }
				        };

				        //! The pow trait specialization for a real base and a complex exponent.
				        template<typename T_MathImpl, typename T, typename U>
				        struct Pow::Op<T_MathImpl, T, Complex<U>>
				        {
				            constexpr auto operator()(T_MathImpl, T const& base, Complex<U> const& exponent) const
				            {
				                // Wrap the real base into a complex number and reuse math::pow
				                Complex<T> const complexBase{base};
				                return math::pow(complexBase, exponent);
				            }
				        };

				        //! The rsqrt trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Rsqrt::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // rsqrt(z) = 1 / sqrt(z)
				                auto const root = math::sqrt(arg);
				                return static_cast<T>(1.0) / root;
				            }
				        };

				        //! The sin trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Sin::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // sin(z) = (exp(i * z) - exp(-i * z)) / 2i
				                Complex<T> const plusI{static_cast<T>(0.0), static_cast<T>(1.0)};
				                Complex<T> const minusI{static_cast<T>(0.0), static_cast<T>(-1.0)};
				                auto const difference = math::exp(plusI * arg) - math::exp(minusI * arg);
				                return difference / Complex<T>{static_cast<T>(0.0), static_cast<T>(2.0)};
				            }
				        };

				        //! The sinh trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Sinh::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // sinh(z) = (exp(z) - exp(-z)) / 2
				                auto const expPlus = math::exp(arg);
				                auto const expMinus = math::exp(static_cast<T>(-1.0) * arg);
				                return (expPlus - expMinus) / static_cast<T>(2.0);
				            }
				        };

				        //! The sincos trait specialization for complex types.
				        //!
				        //! Writes math::sin(arg) to result_sin and then math::cos(arg) to result_cos.
				        template<typename T_MathImpl, typename T>
				        struct SinCos::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(
				                T_MathImpl,
				                Complex<T> const& arg,
				                Complex<T>& result_sin,
				                Complex<T>& result_cos) const -> void
				            {
				                // No fused evaluation here: sine and cosine are computed by two independent calls.
				                result_sin = math::sin(arg);
				                result_cos = math::cos(arg);
				            }
				        };

				        //! The sqrt trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Sqrt::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& argument) const
				            {
				                // Principal value sqrt(z) = sqrt(|z|) * e^(i * arg(z) / 2); the branch cut runs along the
				                // negative real axis (same as for std::complex).
				                auto const halfPhase = static_cast<T>(0.5) * math::arg(argument);
				                auto cosHalf = T{};
				                auto sinHalf = T{};
				                math::sincos(halfPhase, sinHalf, cosHalf);
				                auto const rootOfModulus = math::sqrt(math::abs(argument));
				                return rootOfModulus * Complex<T>(cosHalf, sinHalf);
				            }
				        };

				        //! The tan trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Tan::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // tan(z) = i * (e^-iz - e^iz) / (e^-iz + e^iz) = i * (1 - e^2iz) / (1 + e^2iz)
				                // Warning: this straightforward implementation can easily result in NaN as 0/0 or inf/inf.
				                Complex<T> const plusI{static_cast<T>(0.0), static_cast<T>(1.0)};
				                auto const expValue = math::exp(Complex<T>{static_cast<T>(0.0), static_cast<T>(2.0)} * arg);
				                auto const numerator = plusI * (static_cast<T>(1.0) - expValue);
				                auto const denominator = static_cast<T>(1.0) + expValue;
				                return numerator / denominator;
				            }
				        };

				        //! The tanh trait specialization for complex types.
				        template<typename T_MathImpl, typename T>
				        struct Tanh::Op<T_MathImpl, Complex<T>>
				        {
				            constexpr auto operator()(T_MathImpl, Complex<T> const& arg) const
				            {
				                // tanh(z) = (e^z - e^-z)/(e^z+e^-z)
				                auto const expPlus = math::exp(arg);
				                auto const expMinus = math::exp(static_cast<T>(-1.0) * arg);
				                return (expPlus - expMinus) / (expPlus + expMinus);
				            }
				        };

				    } // namespace internal

				#endif

				} // namespace alpaka::math

				namespace alpaka::trait
				{
				    //! Value type of math::internal::Complex<T_Value> is the component type T_Value.
				    template<typename T_Value>
				    struct GetValueType<math::internal::Complex<T_Value>>
				    {
				        using type = T_Value;
				    };
				} // namespace alpaka::trait
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/internal/Complex.hpp ==
				// ============================================================================


			namespace alpaka::math
			{
			    //! Makes internal::Complex usable as alpaka::math::Complex.
			    using internal::Complex;
			} // namespace alpaka::math
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/Complex.hpp ==
			// ============================================================================

		// #include "alpaka/math/internal/ieee754.hpp"    // amalgamate: file already inlined
		// #include "alpaka/math/internal/math.hpp"    // amalgamate: file already inlined

		#if ALPAKA_LANG_SYCL

		// #    include <sycl/sycl.hpp>    // amalgamate: file already included

		// #    include <concepts>    // amalgamate: file already included

		namespace alpaka::math::internal
		{
		    //! SYCL abs for arithmetic types: sycl::fabs for floating-point, sycl::abs for integral arguments.
		    template<typename T_Arg>
		    requires(std::is_arithmetic_v<T_Arg>)
		    struct Abs::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            if constexpr(std::is_floating_point_v<T_Arg>)
		            {
		                return sycl::fabs(arg);
		            }
		            else if constexpr(std::is_integral_v<T_Arg>)
		            {
		                return sycl::abs(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }
		        }
		    };

		    //! SYCL sin for floating-point arguments, forwarding to sycl::sin.
		    template<std::floating_point T_Arg>
		    struct Sin::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::sin(arg);
		            return result;
		        }
		    };

		    //! SYCL acosh for floating-point arguments, forwarding to sycl::acosh.
		    template<std::floating_point T_Arg>
		    struct Acosh::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::acosh(arg);
		            return result;
		        }
		    };

		    //! SYCL asinh for floating-point arguments, forwarding to sycl::asinh.
		    template<std::floating_point T_Arg>
		    struct Asinh::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::asinh(arg);
		            return result;
		        }
		    };

		    //! SYCL sinh for floating-point arguments, forwarding to sycl::sinh.
		    template<std::floating_point T_Arg>
		    struct Sinh::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::sinh(arg);
		            return result;
		        }
		    };

		    //! SYCL atan for floating-point arguments, forwarding to sycl::atan.
		    template<std::floating_point T_Arg>
		    struct Atan::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::atan(arg);
		            return result;
		        }
		    };

		    //! SYCL atanh for floating-point arguments, forwarding to sycl::atanh.
		    template<std::floating_point T_Arg>
		    struct Atanh::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::atanh(arg);
		            return result;
		        }
		    };

		    //! SYCL tanh for floating-point arguments, forwarding to sycl::tanh.
		    template<std::floating_point T_Arg>
		    struct Tanh::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::tanh(arg);
		            return result;
		        }
		    };

		    //! SYCL cbrt for arithmetic types; integral arguments are converted to double first.
		    template<typename T_Arg>
		    requires(std::is_arithmetic_v<T_Arg>)
		    struct Cbrt::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            if constexpr(std::is_floating_point_v<T_Arg>)
		            {
		                return sycl::cbrt(arg);
		            }
		            else if constexpr(std::is_integral_v<T_Arg>)
		            {
		                // Mirror CUDA back-end and use double for ints
		                return sycl::cbrt(static_cast<double>(arg));
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }
		        }
		    };

		    //! SYCL ceil for floating-point arguments, forwarding to sycl::ceil.
		    template<std::floating_point T_Arg>
		    struct Ceil::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::ceil(arg);
		            return result;
		        }
		    };

		    //! SYCL round for floating-point arguments, forwarding to sycl::round.
		    template<std::floating_point T_Arg>
		    struct Round::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::round(arg);
		            return result;
		        }
		    };

		    //! SYCL lround for floating-point arguments: sycl::round followed by a cast to long.
		    template<std::floating_point T_Arg>
		    struct Lround::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const rounded = sycl::round(arg);
		            return static_cast<long>(rounded);
		        }
		    };

		    //! SYCL llround for floating-point arguments: sycl::round followed by a cast to long long.
		    template<std::floating_point T_Arg>
		    struct Llround::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const rounded = sycl::round(arg);
		            return static_cast<long long>(rounded);
		        }
		    };

		    //! SYCL sincos for floating-point arguments.
		    //!
		    //! sycl::sincos returns the sine and stores the cosine through the pointer argument.
		    template<std::floating_point T_Arg>
		    struct SinCos::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg, T_Arg& result_sin, T_Arg& result_cos) const
		            -> void
		        {
		            result_sin = sycl::sincos(arg, &result_cos);
		        }
		    };

		    //! SYCL arg for real arithmetic types, computed as atan2(0, arg).
		    //! Integral arguments are converted to double first.
		    template<typename T_Arg>
		    requires(std::is_arithmetic_v<T_Arg>)
		    struct Arg::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            if constexpr(std::is_floating_point_v<T_Arg>)
		            {
		                return sycl::atan2(static_cast<T_Arg>(0.0), arg);
		            }
		            else if constexpr(std::is_integral_v<T_Arg>)
		            {
		                return sycl::atan2(0.0, static_cast<double>(arg));
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }
		        }
		    };

		    //! SYCL atan2 for floating-point arguments.
		    //!
		    //! Both arguments are converted to their common type before sycl::atan2 is called, so mixed
		    //! float/double calls are accepted.
		    template<std::floating_point T_Y, std::floating_point T_X>
		    struct Atan2::Op<SyclMath, T_Y, T_X>
		    {
		        using CommonT_Bpe = std::common_type_t<T_Y, T_X>;

		        // constexpr added for consistency with the other SyclMath specializations in this file
		        constexpr auto operator()(SyclMath, T_Y const& y, T_X const& x) const
		        {
		            return sycl::atan2(static_cast<CommonT_Bpe>(y), static_cast<CommonT_Bpe>(x));
		        }
		    };

		    //! SYCL exp for floating-point arguments, forwarding to sycl::exp.
		    template<std::floating_point T_Arg>
		    struct Exp::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::exp(arg);
		            return result;
		        }
		    };

		    /// SYCL implementation of sqrt for floating-point arguments; forwards to sycl::sqrt.
		    template<std::floating_point T_Arg>
		    struct Sqrt::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::sqrt(arg);
		            return result;
		        }
		    };

		    /** SYCL implementation of the reciprocal square root for arithmetic values.
		     *
		     * Floating-point inputs are forwarded unchanged to sycl::rsqrt; integral inputs are converted to
		     * double first (mirrors the CUDA back-end).
		     */
		    template<typename T_Arg>
		    requires(std::is_arithmetic_v<T_Arg>)
		    struct Rsqrt::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            if constexpr(std::is_integral_v<T_Arg>)
		            {
		                // mirror CUDA back-end and use double for ints
		                auto const asDouble = static_cast<double>(arg);
		                return sycl::rsqrt(asDouble);
		            }
		            else if constexpr(std::is_floating_point_v<T_Arg>)
		            {
		                return sycl::rsqrt(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }
		        }
		    };

		    /// SYCL implementation of trunc for floating-point arguments; forwards to sycl::trunc.
		    template<std::floating_point T_Arg>
		    struct Trunc::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::trunc(arg);
		            return result;
		        }
		    };

		    /// SYCL implementation of cos for floating-point arguments; forwards to sycl::cos.
		    template<std::floating_point T_Arg>
		    struct Cos::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::cos(arg);
		            return result;
		        }
		    };

		    /// SYCL implementation of cosh for floating-point arguments; forwards to sycl::cosh.
		    template<std::floating_point T_Arg>
		    struct Cosh::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::cosh(arg);
		            return result;
		        }
		    };

		    /// SYCL implementation of floor for floating-point arguments; forwards to sycl::floor.
		    template<std::floating_point T_Arg>
		    struct Floor::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::floor(arg);
		            return result;
		        }
		    };

		    /// SYCL implementation of erf for floating-point arguments; forwards to sycl::erf.
		    template<std::floating_point T_Arg>
		    struct Erf::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::erf(arg);
		            return result;
		        }
		    };

		    /// SYCL implementation of log for floating-point arguments; forwards to sycl::log.
		    template<std::floating_point T_Arg>
		    struct Log::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::log(arg);
		            return result;
		        }
		    };

		    /// SYCL implementation of log2 for floating-point arguments; forwards to sycl::log2.
		    template<std::floating_point T_Arg>
		    struct Log2::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::log2(arg);
		            return result;
		        }
		    };

		    /// SYCL implementation of log10 for floating-point arguments; forwards to sycl::log10.
		    template<std::floating_point T_Arg>
		    struct Log10::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::log10(arg);
		            return result;
		        }
		    };

		    /// SYCL implementation of tan for floating-point arguments; forwards to sycl::tan.
		    template<std::floating_point T_Arg>
		    struct Tan::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::tan(arg);
		            return result;
		        }
		    };

		    /// SYCL implementation of asin for floating-point arguments; forwards to sycl::asin.
		    template<std::floating_point T_Arg>
		    struct Asin::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::asin(arg);
		            return result;
		        }
		    };

		    /// SYCL implementation of acos for floating-point arguments; forwards to sycl::acos.
		    template<std::floating_point T_Arg>
		    struct Acos::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const result = sycl::acos(arg);
		            return result;
		        }
		    };

		    // Route SYCL predicates through shared helper to match host/CUDA semantics exactly.
		    /// SYCL implementation of isnan for floating-point arguments; delegates to ieeeIsnan.
		    template<std::floating_point T_Arg>
		    struct Isnan::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const isNan = ieeeIsnan(arg);
		            return isNan;
		        }
		    };

		    /// SYCL implementation of isinf for floating-point arguments; delegates to ieeeIsinf.
		    template<std::floating_point T_Arg>
		    struct Isinf::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const isInf = ieeeIsinf(arg);
		            return isInf;
		        }
		    };

		    /// SYCL implementation of isfinite for floating-point arguments; delegates to ieeeIsfinite.
		    template<std::floating_point T_Arg>
		    struct Isfinite::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            auto const isFinite = ieeeIsfinite(arg);
		            return isFinite;
		        }
		    };

		    /** SYCL implementation of conj for real floating-point arguments.
		     *
		     * Returns a Complex<T_Arg> built from the argument as first component and zero as second component.
		     */
		    template<std::floating_point T_Arg>
		    struct Conj::Op<SyclMath, T_Arg>
		    {
		        constexpr auto operator()(SyclMath, T_Arg const& arg) const
		        {
		            using Result = Complex<T_Arg>;
		            return Result{arg, T_Arg{0.0}};
		        }
		    };

		    /** SYCL implementation of copysign for two floating-point arguments.
		     *
		     * Both operands are converted to their common type before sycl::copysign is called.
		     */
		    template<std::floating_point TMag, std::floating_point TSgn>
		    struct Copysign::Op<SyclMath, TMag, TSgn>
		    {
		        using TCommon = std::common_type_t<TMag, TSgn>;

		        constexpr auto operator()(SyclMath, TMag const& mag, TSgn const& sgn) const
		        {
		            auto const magnitude = static_cast<TCommon>(mag);
		            auto const signSource = static_cast<TCommon>(sgn);
		            return sycl::copysign(magnitude, signSource);
		        }
		    };

		    /** SYCL implementation of min for arithmetic operands.
		     *
		     * Dispatch:
		     *  - both integral: sycl::min on the operands as given
		     *  - both floating point: sycl::fmin on the operands as given
		     *  - one integral, one floating point: both operands are converted to double and passed to
		     *    sycl::fmin (mirrors the CUDA back-end)
		     */
		    template<typename T_A, typename T_B>
		    requires(std::is_arithmetic_v<T_A> && std::is_arithmetic_v<T_B>)
		    struct Min::Op<SyclMath, T_A, T_B>
		    {
		        constexpr auto operator()(SyclMath, T_A const& a, T_B const& b) const
		        {
		            if constexpr(std::is_integral_v<T_A> && std::is_integral_v<T_B>)
		                return sycl::min(a, b);
		            // Must be '&&': with '||' the mixed integral/floating-point case was captured here and the
		            // conversion branch below could never be selected.
		            else if constexpr(std::is_floating_point_v<T_A> && std::is_floating_point_v<T_B>)
		                return sycl::fmin(a, b);
		            else if constexpr(
		                (std::is_floating_point_v<T_A> && std::is_integral_v<T_B>)
		                || (std::is_integral_v<T_A> && std::is_floating_point_v<T_B>) )
		                return sycl::fmin(static_cast<double>(a), static_cast<double>(b)); // mirror CUDA back-end
		            else
		                static_assert(!sizeof(T_A), "Unsupported data types");
		        }
		    };

		    /** SYCL implementation of max for arithmetic operands.
		     *
		     * Dispatch:
		     *  - both integral: sycl::max on the operands as given
		     *  - both floating point: sycl::fmax on the operands as given
		     *  - one integral, one floating point: both operands are converted to double and passed to
		     *    sycl::fmax (mirrors the CUDA back-end)
		     */
		    template<typename T_A, typename T_B>
		    requires(std::is_arithmetic_v<T_A> && std::is_arithmetic_v<T_B>)
		    struct Max::Op<SyclMath, T_A, T_B>
		    {
		        constexpr auto operator()(SyclMath, T_A const& a, T_B const& b) const
		        {
		            if constexpr(std::is_integral_v<T_A> && std::is_integral_v<T_B>)
		                return sycl::max(a, b);
		            // Must be '&&': with '||' the mixed integral/floating-point case was captured here and the
		            // conversion branch below could never be selected.
		            else if constexpr(std::is_floating_point_v<T_A> && std::is_floating_point_v<T_B>)
		                return sycl::fmax(a, b);
		            else if constexpr(
		                (std::is_floating_point_v<T_A> && std::is_integral_v<T_B>)
		                || (std::is_integral_v<T_A> && std::is_floating_point_v<T_B>) )
		                return sycl::fmax(static_cast<double>(a), static_cast<double>(b)); // mirror CUDA back-end
		            else
		                static_assert(!sizeof(T_A), "Unsupported data types");
		        }
		    };

		    /** SYCL implementation of pow for two floating-point arguments.
		     *
		     * Base and exponent are converted to their common type before sycl::pow is called.
		     */
		    template<std::floating_point T_Base, std::floating_point T_Exp>
		    struct Pow::Op<SyclMath, T_Base, T_Exp>
		    {
		        using TCommon = std::common_type_t<T_Base, T_Exp>;

		        constexpr auto operator()(SyclMath, T_Base const& base, T_Exp const& exp) const
		        {
		            auto const commonBase = static_cast<TCommon>(base);
		            auto const commonExp = static_cast<TCommon>(exp);
		            return sycl::pow(commonBase, commonExp);
		        }
		    };

		    /** SYCL implementation of fmod for two floating-point arguments.
		     *
		     * Both operands are converted to their common type before sycl::fmod is called.
		     */
		    template<std::floating_point T_X, std::floating_point T_Y>
		    struct Fmod::Op<SyclMath, T_X, T_Y>
		    {
		        using TCommon = std::common_type_t<T_X, T_Y>;

		        constexpr auto operator()(SyclMath, T_X const& x, T_Y const& y) const
		        {
		            auto const dividend = static_cast<TCommon>(x);
		            auto const divisor = static_cast<TCommon>(y);
		            return sycl::fmod(dividend, divisor);
		        }
		    };

		    /** SYCL implementation of remainder for two floating-point arguments.
		     *
		     * Both operands are converted to their common type before sycl::remainder is called.
		     */
		    template<std::floating_point T_X, std::floating_point T_Y>
		    struct Remainder::Op<SyclMath, T_X, T_Y>
		    {
		        using TCommon = std::common_type_t<T_X, T_Y>;

		        constexpr auto operator()(SyclMath, T_X const& x, T_Y const& y) const
		        {
		            auto const dividend = static_cast<TCommon>(x);
		            auto const divisor = static_cast<TCommon>(y);
		            return sycl::remainder(dividend, divisor);
		        }
		    };

		    /** SYCL implementation of fma (x * y + z) for floating-point arguments.
		     *
		     * The three operands may have different floating-point types. They are converted to their common
		     * type before sycl::fma is called, consistent with the other multi-argument operations of this
		     * back-end (Pow, Fmod, Remainder, Copysign). For identical types the conversion is a no-op.
		     */
		    template<std::floating_point T_X, std::floating_point T_Y, std::floating_point T_Z>
		    struct Fma::Op<SyclMath, T_X, T_Y, T_Z>
		    {
		        using TCommon = std::common_type_t<T_X, T_Y, T_Z>;

		        constexpr auto operator()(SyclMath, T_X const& x, T_Y const& y, T_Z const& z) const
		        {
		            return sycl::fma(static_cast<TCommon>(x), static_cast<TCommon>(y), static_cast<TCommon>(z));
		        }
		    };

		} // namespace alpaka::math::internal

		#endif
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/math.hpp ==
		// ============================================================================

		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/memFence.hpp ==
		// ==
		/* Copyright 2025 Mehmet Yusufoglu, René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once		// #include "alpaka/api/concepts/api.hpp"    // amalgamate: file already inlined
		// #include "alpaka/api/oneApi/executor.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/memoryOrder.hpp ==
			// ==
			/* Copyright 2025 Mehmet Yusufoglu, René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onAcc/memoryOrder.hpp"    // amalgamate: file already inlined

			#include <type_traits>

			// Top-level guard needed because including sycl headers is needed
			#if ALPAKA_LANG_SYCL
			// #    include <sycl/sycl.hpp>    // amalgamate: file already included

			namespace alpaka::onAcc::internalCompute
			{
			    /** Translates an alpaka memory order tag into the corresponding sycl::memory_order value. */
			    struct MemOrderSycl
			    {
			        /** Select the sycl::memory_order enumerator matching the tag type TMemOrder.
			         *
			         * Handles order::SeqCst, order::AcqRel, order::Release, order::Acquire and order::Relaxed.
			         * The selection is done at compile time; the tag value itself is not inspected.
			         */
			        template<concepts::MemoryOrder TMemOrder>
			        static constexpr auto get(TMemOrder const)
			        {
			            if constexpr(std::same_as<TMemOrder, order::SeqCst>)
			                return sycl::memory_order::seq_cst;
			            else if constexpr(std::same_as<TMemOrder, order::AcqRel>)
			                return sycl::memory_order::acq_rel;
			            else if constexpr(std::same_as<TMemOrder, order::Release>)
			                return sycl::memory_order::release;
			            else if constexpr(std::same_as<TMemOrder, order::Acquire>)
			                return sycl::memory_order::acquire;
			            else if constexpr(std::same_as<TMemOrder, order::Relaxed>)
			                return sycl::memory_order::relaxed;
			        }
			    };
			} // namespace alpaka::onAcc::internalCompute

			#endif // ALPAKA_LANG_SYCL
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/memoryOrder.hpp ==
			// ============================================================================

		// #include "alpaka/api/syclGeneric/tag.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/memoryOrder.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/scope.hpp"    // amalgamate: file already inlined

		// Top-level guard needed because including sycl headers is needed
		#if ALPAKA_LANG_SYCL
		// #    include <sycl/sycl.hpp>    // amalgamate: file already included

		#    include <type_traits>

		namespace alpaka::onAcc::internalCompute
		{
		    /** Memory fence for all APIs deriving from api::GenericSycl.
		     *
		     * Issues a sycl::atomic_fence with the memory order obtained from MemOrderSycl::get and a SYCL
		     * memory scope chosen from the alpaka scope tag:
		     *  - scope::Block  -> sycl::memory_scope::work_group
		     *  - scope::Device -> sycl::memory_scope::device
		     *  - scope::System -> sycl::memory_scope::system
		     * Any other scope type matches no branch, so no fence is issued for it.
		     */
		    template<alpaka::concepts::Api T_Api, concepts::Scope T_Scope, concepts::MemoryOrder T_Order>
		    requires(std::is_base_of_v<api::GenericSycl<T_Api>, T_Api>)
		    struct MemoryFence::Op<T_Api, T_Scope, T_Order>
		    {
		        constexpr void operator()(onAcc::concepts::Acc auto const&, T_Scope const, T_Order const order) const
		        {
		            if constexpr(std::is_same_v<T_Scope, scope::Block>)
		            {
		                sycl::atomic_fence(MemOrderSycl::get(order), sycl::memory_scope::work_group);
		            }
		            else if constexpr(std::is_same_v<T_Scope, scope::Device>)
		            {
		                sycl::atomic_fence(MemOrderSycl::get(order), sycl::memory_scope::device);
		            }
		            else if constexpr(std::is_same_v<T_Scope, scope::System>)
		            {
		                // NOTE(review): this comment previously said "System fences map to device scope for SYCL
		                // backends", but the call requests sycl::memory_scope::system. Confirm which scope is
		                // intended.
		                sycl::atomic_fence(MemOrderSycl::get(order), sycl::memory_scope::system);
		            }
		        }
		    };
		} // namespace alpaka::onAcc::internalCompute
		#endif // ALPAKA_LANG_SYCL
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/memFence.hpp ==
		// ============================================================================

	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/warp.hpp ==
	// ==
	/* Copyright 2025 Mehmet Yusufoglu, René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/api/oneApi/Api.hpp"    // amalgamate: file already inlined
	// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined
	// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onAcc/internal/warp.hpp"    // amalgamate: file already inlined

	// #include <algorithm>    // amalgamate: file already included
	// #include <cstdint>    // amalgamate: file already included

	#if ALPAKA_LANG_ONEAPI
	// #    include <sycl/sycl.hpp>    // amalgamate: file already included

	namespace alpaka::onAcc::warp::internal
	{
	    // GPU back-ends use native SYCL subgroup operations.
	    /** oneAPI implementation of the active lane mask query.
	     *
	     * Every calling work-item votes `true` in a sub-group ballot; the resulting bits are returned as an
	     * unsigned integer mask.
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct Activemask::Op<T_Acc, api::OneApi>
	    {
	        auto operator()(T_Acc const&, api::OneApi) const
	        {
	            sycl::sub_group currentSubGroup = sycl::ext::oneapi::this_work_item::get_sub_group();

	            return getMask(currentSubGroup);
	        }

	        /** Ballot with a constant `true` predicate on the given sub-group.
	         *
	         * @return the ballot bits starting at position 0, as uint32_t for warp sizes up to 32 and as
	         *         uint64_t otherwise
	         */
	        static auto getMask(auto const subGroup)
	        {
	            constexpr auto const warpSize = T_Acc::getWarpSize();
	            using ReturnType = std::conditional_t<warpSize <= 32, uint32_t, uint64_t>;

	            auto ballotResult = sycl::ext::oneapi::group_ballot(subGroup, true);

	            ReturnType activeBits;
	            ballotResult.extract_bits(activeBits, 0u);
	            return activeBits;
	        }
	    };

	    /// oneAPI implementation of the lane index query: local id of the work-item inside its sub-group.
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct GetLaneIdx::Op<T_Acc, api::OneApi>
	    {
	        constexpr auto operator()(T_Acc const&, api::OneApi) const
	        {
	            sycl::sub_group currentSubGroup = sycl::ext::oneapi::this_work_item::get_sub_group();
	            // first (and only) component of the sub-group local id
	            auto const laneIdx = currentSubGroup.get_local_id()[0];
	            return laneIdx;
	        }
	    };

	    /// oneAPI implementation of the warp index query: linear id of the calling work-item's sub-group.
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct GetWarpIdx::Op<T_Acc, api::OneApi>
	    {
	        constexpr auto operator()(T_Acc const&, api::OneApi) const
	        {
	            sycl::sub_group currentSubGroup = sycl::ext::oneapi::this_work_item::get_sub_group();
	            // linear id of the sub-group itself (not the lane id of the work-item inside it)
	            auto const warpIdx = currentSubGroup.get_group_linear_id();
	            return warpIdx;
	        }
	    };

	    /** oneAPI implementation of the warp-wide "all" vote.
	     *
	     * @param acc accelerator, only used to query the device kind at compile time
	     * @param predicate vote of the calling work-item, any non-zero value counts as true
	     * @return true if the predicate is non-zero for every participating work-item of the sub-group
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct All::Op<T_Acc, api::OneApi>
	    {
	        bool operator()(T_Acc const& acc, api::OneApi, int32_t predicate) const
	        {
	            using DeviceKind = ALPAKA_TYPEOF(acc[object::deviceKind]);
	            if constexpr(DeviceKind{} == alpaka::deviceKind::amdGpu)
	            {
	                /* Workaround for AMD GPUs: Sycl is taking the results of the non active threads into account
	                 * and therefore even if all participating threads have a true predicate the result will be false.
	                 * We vote with ballot and mask the result with the active thread mask.
	                 */
	                sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
	                auto activeMask = Activemask::Op<T_Acc, api::OneApi>::getMask(sg);
	                auto sgMask = sycl::ext::oneapi::group_ballot(sg, predicate != 0);

	                constexpr auto const warpSize = T_Acc::getWarpSize();
	                using ReturnType = std::conditional_t<warpSize <= 32, uint32_t, uint64_t>;
	                ReturnType predicateMask;
	                sgMask.extract_bits(predicateMask, 0u);
	                // The parentheses are required: '==' binds tighter than '&', so without them the expression
	                // is evaluated as activeMask & (predicateMask == activeMask).
	                return (activeMask & predicateMask) == activeMask;
	            }
	            else
	            {
	                sycl::sub_group sg = sycl::ext::oneapi::this_work_item::get_sub_group();
	                return sycl::all_of_group(sg, predicate != 0);
	            }
	        }
	    };

	    /** oneAPI implementation of the warp-wide "any" vote.
	     *
	     * @param acc accelerator, only used to query the device kind at compile time
	     * @param predicate vote of the calling work-item, any non-zero value counts as true
	     * @return true if the predicate is non-zero for at least one participating work-item of the sub-group
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct Any::Op<T_Acc, api::OneApi>
	    {
	        bool operator()(T_Acc const& acc, api::OneApi, int32_t predicate) const
	        {
	            using DeviceKind = ALPAKA_TYPEOF(acc[object::deviceKind]);
	            sycl::sub_group currentSubGroup = sycl::ext::oneapi::this_work_item::get_sub_group();
	            bool const vote = predicate != 0;

	            if constexpr(DeviceKind{} == alpaka::deviceKind::amdGpu)
	            {
	                /* Workaround for AMD GPUs: Sycl is taking the results of non active threads into account
	                 * and therefore even if all participating threads have a false predicate the result will be true.
	                 * We vote with ballot and mask the result with the active thread mask.
	                 */
	                constexpr auto const warpSize = T_Acc::getWarpSize();
	                using ReturnType = std::conditional_t<warpSize <= 32, uint32_t, uint64_t>;

	                auto activeMask = Activemask::Op<T_Acc, api::OneApi>::getMask(currentSubGroup);
	                auto ballotResult = sycl::ext::oneapi::group_ballot(currentSubGroup, vote);

	                ReturnType predicateMask;
	                ballotResult.extract_bits(predicateMask, 0u);
	                // true as soon as one active lane voted true
	                return (activeMask & predicateMask) != 0;
	            }
	            else
	            {
	                return sycl::any_of_group(currentSubGroup, vote);
	            }
	        }
	    };

	    /** oneAPI implementation of the warp ballot.
	     *
	     * @param predicate vote of the calling work-item, any non-zero value counts as true
	     * @return the ballot bits starting at position 0, as uint32_t for warp sizes up to 32 and as uint64_t
	     *         otherwise
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    struct Ballot::Op<T_Acc, api::OneApi>
	    {
	        auto operator()(T_Acc const&, api::OneApi, int32_t predicate) const
	        {
	            constexpr auto const warpSize = T_Acc::getWarpSize();
	            using ReturnType = std::conditional_t<warpSize <= 32, uint32_t, uint64_t>;

	            sycl::sub_group currentSubGroup = sycl::ext::oneapi::this_work_item::get_sub_group();
	            auto ballotResult = sycl::ext::oneapi::group_ballot(currentSubGroup, predicate != 0);

	            ReturnType voteBits;
	            ballotResult.extract_bits(voteBits, 0u);
	            return voteBits;
	        }
	    };

	    /** oneAPI implementation of the indexed warp shuffle.
	     *
	     * The sub-group is treated as consecutive partitions of `width` lanes. Each work-item reads `value`
	     * from lane `srcLane % width` of its own partition.
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct Shfl::Op<T_Acc, api::OneApi, T>
	    {
	        constexpr T operator()(T_Acc const&, api::OneApi, T const& value, uint32_t srcLane, uint32_t width) const
	        {
	            sycl::sub_group currentSubGroup = sycl::ext::oneapi::this_work_item::get_sub_group();
	            uint32_t const lane = currentSubGroup.get_local_id()[0];
	            // first lane of the partition the calling lane belongs to
	            uint32_t const partitionStart = (lane / width) * width;
	            uint32_t const sourceLane = partitionStart + (srcLane % width);

	            return sycl::select_from_group(currentSubGroup, value, sourceLane);
	        }
	    };

	    /** oneAPI implementation of the warp shuffle towards lower lanes.
	     *
	     * Each work-item obtains the result of sycl::shift_group_left by `delta`. If the source lane
	     * would lie at or beyond the end of the caller's `width`-sized partition, the caller's own `value`
	     * is returned instead.
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct ShflDown::Op<T_Acc, api::OneApi, T>
	    {
	        constexpr T operator()(T_Acc const&, api::OneApi, T const& value, uint32_t delta, uint32_t width) const
	        {
	            sycl::sub_group currentSubGroup = sycl::ext::oneapi::this_work_item::get_sub_group();

	            uint32_t const lane = currentSubGroup.get_local_id()[0];
	            // one past the last lane of the caller's partition
	            uint32_t const partitionEnd = (lane / width + 1) * width;

	            // the shift is executed unconditionally, the partition check only selects the result
	            T const shifted = sycl::shift_group_left(currentSubGroup, value, delta);
	            bool const sourceOutsidePartition = lane + delta >= partitionEnd;
	            return sourceOutsidePartition ? value : shifted;
	        }
	    };

	    /** oneAPI implementation of the warp shuffle towards higher lanes.
	     *
	     * Each work-item obtains the result of sycl::shift_group_right by `delta`. If the caller's position
	     * inside its `width`-sized partition is smaller than `delta`, the caller's own `value` is returned
	     * instead.
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct ShflUp::Op<T_Acc, api::OneApi, T>
	    {
	        constexpr T operator()(T_Acc const&, api::OneApi, T const& value, uint32_t delta, uint32_t width) const
	        {
	            sycl::sub_group currentSubGroup = sycl::ext::oneapi::this_work_item::get_sub_group();

	            uint32_t const lane = currentSubGroup.get_local_id()[0];
	            // first lane of the caller's partition
	            uint32_t const partitionStart = (lane / width) * width;

	            // the shift is executed unconditionally, the partition check only selects the result
	            T const shifted = sycl::shift_group_right(currentSubGroup, value, delta);
	            bool const sourceOutsidePartition = lane - partitionStart < delta;
	            return sourceOutsidePartition ? value : shifted;
	        }
	    };

	    /** oneAPI implementation of the XOR (butterfly) warp shuffle.
	     *
	     * The source lane is the caller's position inside its `width`-sized partition XOR `laneMask`. If
	     * that position falls outside the partition the caller reads from its own lane.
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc, typename T>
	    struct ShflXor::Op<T_Acc, api::OneApi, T>
	    {
	        constexpr T operator()(T_Acc const&, api::OneApi, T const& value, uint32_t laneMask, uint32_t width) const
	        {
	            sycl::sub_group currentSubGroup = sycl::ext::oneapi::this_work_item::get_sub_group();
	            uint32_t const lane = currentSubGroup.get_local_id()[0];
	            uint32_t const partitionStart = (lane / width) * width;
	            uint32_t const positionInPartition = lane - partitionStart;
	            uint32_t const xoredPosition = (positionInPartition % width) ^ laneMask;
	            // fall back to the caller's own lane when the partner lies outside of the partition
	            uint32_t const sourceLane = xoredPosition < width ? partitionStart + xoredPosition : lane;
	            return sycl::select_from_group(currentSubGroup, value, sourceLane);
	        }
	    };
	} // namespace alpaka::onAcc::warp::internal
	#endif
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/warp.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip.hpp ==
	// ==
	/* Copyright 2024 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda.hpp ==
		// ==
		/* Copyright 2024 René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/api/cuda/Api.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/Device.hpp ==
			// ==
			/* Copyright 2024 René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/Device.hpp ==
				// ==
				/* Copyright 2024 René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/Event.hpp ==
					// ==
					/* Copyright 2025 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */


					// #pragma once
					// #pragma once
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/IdxLayer.hpp ==
						// ==
						/* Copyright 2024 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
						// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

						#if ALPAKA_LANG_CUDA

						namespace alpaka::onAcc
						{
						    namespace unifiedCudaHip
						    {
						        /** Block index and block count provider based on the CUDA built-in variables.
						         *
						         * For up to three dimensions the components of ::blockIdx / ::gridDim are used directly and
						         * reduced to `dim` components with rshrink. For more than three dimensions idx() passes
						         * ::blockIdx.x together with the block extents of the thread specification to mapToND, and
						         * count() returns the block extents of the thread specification.
						         *
						         * Only a reference to the thread specification is stored.
						         */
						        template<typename T_OptimizedThreadSpec>
						        struct BlockLayer
						        {
						            T_OptimizedThreadSpec const& m_optimizedThreadSpec;
						            static constexpr uint32_t dim = T_OptimizedThreadSpec::dim();
						            using IdxType = typename T_OptimizedThreadSpec::NumBlocksVecType::type;

						            constexpr BlockLayer(T_OptimizedThreadSpec const& optimizedThreadSpec)
						                : m_optimizedThreadSpec(optimizedThreadSpec)
						            {
						            }

						            /// Index of the current block.
						            constexpr auto idx() const
						            {
						                if constexpr(dim > 3u)
						                {
						                    return mapToND(m_optimizedThreadSpec.getNumBlocks(), static_cast<IdxType>(::blockIdx.x));
						                }
						                else
						                {
						                    return Vec<IdxType, 3u>{::blockIdx.z, ::blockIdx.y, ::blockIdx.x}.template rshrink<dim>();
						                }
						            }

						            /// Number of blocks.
						            constexpr auto count() const
						            {
						                if constexpr(dim > 3u)
						                {
						                    return m_optimizedThreadSpec.getNumBlocks();
						                }
						                else
						                {
						                    return Vec<IdxType, 3u>{::gridDim.z, ::gridDim.y, ::gridDim.x}.template rshrink<dim>();
						                }
						            }
						        };

						        /** Thread index and thread count provider based on the CUDA built-in variables.
						         *
						         * For up to three dimensions the components of ::threadIdx / ::blockDim are used directly and
						         * reduced to `dim` components with rshrink. For more than three dimensions idx() passes
						         * ::threadIdx.x together with the thread extents of the thread specification to mapToND, and
						         * count() returns the thread extents of the thread specification.
						         *
						         * Only a reference to the thread specification is stored.
						         */
						        template<typename T_OptimizedThreadSpec>
						        struct ThreadLayer
						        {
						            T_OptimizedThreadSpec const& m_optimizedThreadSpec;
						            static constexpr uint32_t dim = T_OptimizedThreadSpec::dim();
						            using IdxType = typename T_OptimizedThreadSpec::NumThreadsVecType::type;

						            constexpr ThreadLayer(T_OptimizedThreadSpec const& optimizedThreadSpec)
						                : m_optimizedThreadSpec(optimizedThreadSpec)
						            {
						            }

						            /// Index of the current thread.
						            constexpr auto idx() const
						            {
						                if constexpr(dim > 3u)
						                {
						                    return mapToND(m_optimizedThreadSpec.getNumThreads(), static_cast<IdxType>(::threadIdx.x));
						                }
						                else
						                {
						                    return Vec<IdxType, 3u>{::threadIdx.z, ::threadIdx.y, ::threadIdx.x}.template rshrink<dim>();
						                }
						            }

						            /// Number of threads.
						            constexpr auto count() const
						            {
						                if constexpr(dim > 3u)
						                {
						                    return m_optimizedThreadSpec.getNumThreads();
						                }
						                else
						                {
						                    return Vec<IdxType, 3u>{::blockDim.z, ::blockDim.y, ::blockDim.x}.template rshrink<dim>();
						                }
						            }

						            /// Constrained overload: when the thread extent type models CVector, a value-initialized
						            /// instance of that type is returned.
						            constexpr auto count() const
						                requires alpaka::concepts::CVector<typename T_OptimizedThreadSpec::NumThreadsVecType>
						            {
						                using NumThreadsVec = typename T_OptimizedThreadSpec::NumThreadsVecType;
						                return NumThreadsVec{};
						            }
						        };
						    } // namespace unifiedCudaHip
						} // namespace alpaka::onAcc

						#endif
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/IdxLayer.hpp ==
						// ============================================================================

					// #include "alpaka/api/generic.hpp"    // amalgamate: file already inlined
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/IdxLayer.hpp ==
						// ==
						/* Copyright 2024 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
						// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

						#if ALPAKA_LANG_HIP

						namespace alpaka::onAcc
						{
						    namespace unifiedCudaHip
						    {
						        /** Block index and block count provider based on the HIP built-in variables.
						         *
						         * For up to three dimensions hipBlockIdx_* / hipGridDim_* are used directly and reduced to
						         * `dim` components with rshrink. For more than three dimensions idx() passes hipBlockIdx_x
						         * together with the block extents of the thread specification to mapToND, and count() returns
						         * the block extents of the thread specification.
						         *
						         * Only a reference to the thread specification is stored.
						         */
						        template<typename T_OptimizedThreadSpec>
						        struct BlockLayer
						        {
						            T_OptimizedThreadSpec const& m_optimizedThreadSpec;
						            static constexpr uint32_t dim = T_OptimizedThreadSpec::dim();
						            using IdxType = typename T_OptimizedThreadSpec::NumBlocksVecType::type;

						            constexpr BlockLayer(T_OptimizedThreadSpec const& optimizedThreadSpec)
						                : m_optimizedThreadSpec(optimizedThreadSpec)
						            {
						            }

						            /// Index of the current block.
						            constexpr auto idx() const
						            {
						                if constexpr(dim > 3u)
						                {
						                    return mapToND(m_optimizedThreadSpec.getNumBlocks(), static_cast<IdxType>(hipBlockIdx_x));
						                }
						                else
						                {
						                    return Vec<IdxType, 3u>{hipBlockIdx_z, hipBlockIdx_y, hipBlockIdx_x}.template rshrink<dim>();
						                }
						            }

						            /// Number of blocks.
						            constexpr auto count() const
						            {
						                if constexpr(dim > 3u)
						                {
						                    return m_optimizedThreadSpec.getNumBlocks();
						                }
						                else
						                {
						                    return Vec<IdxType, 3u>{hipGridDim_z, hipGridDim_y, hipGridDim_x}.template rshrink<dim>();
						                }
						            }
						        };

						        /** Thread index and thread count provider based on the HIP built-in variables.
						         *
						         * For up to three dimensions hipThreadIdx_* / hipBlockDim_* are used directly and reduced to
						         * `dim` components with rshrink. For more than three dimensions idx() passes hipThreadIdx_x
						         * together with the thread extents of the thread specification to mapToND, and count() returns
						         * the thread extents of the thread specification.
						         *
						         * Only a reference to the thread specification is stored.
						         */
						        template<typename T_OptimizedThreadSpec>
						        struct ThreadLayer
						        {
						            T_OptimizedThreadSpec const& m_optimizedThreadSpec;
						            static constexpr uint32_t dim = T_OptimizedThreadSpec::dim();
						            using IdxType = typename T_OptimizedThreadSpec::NumThreadsVecType::type;

						            constexpr ThreadLayer(T_OptimizedThreadSpec const& optimizedThreadSpec)
						                : m_optimizedThreadSpec(optimizedThreadSpec)
						            {
						            }

						            /// Index of the current thread.
						            constexpr auto idx() const
						            {
						                if constexpr(dim > 3u)
						                {
						                    return mapToND(m_optimizedThreadSpec.getNumThreads(), static_cast<IdxType>(hipThreadIdx_x));
						                }
						                else
						                {
						                    return Vec<IdxType, 3u>{hipThreadIdx_z, hipThreadIdx_y, hipThreadIdx_x}.template rshrink<dim>();
						                }
						            }

						            /// Number of threads.
						            constexpr auto count() const
						            {
						                if constexpr(dim > 3u)
						                {
						                    return m_optimizedThreadSpec.getNumThreads();
						                }
						                else
						                {
						                    return Vec<IdxType, 3u>{hipBlockDim_z, hipBlockDim_y, hipBlockDim_x}.template rshrink<dim>();
						                }
						            }

						            /// Constrained overload: when the thread extent type models CVector, a value-initialized
						            /// instance of that type is returned.
						            constexpr auto count() const
						                requires alpaka::concepts::CVector<typename T_OptimizedThreadSpec::NumThreadsVecType>
						            {
						                using NumThreadsVec = typename T_OptimizedThreadSpec::NumThreadsVecType;
						                return NumThreadsVec{};
						            }
						        };
						    } // namespace unifiedCudaHip
						} // namespace alpaka::onAcc

						#endif
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/IdxLayer.hpp ==
						// ============================================================================

						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/ComputeApi.hpp ==
						// ==
						/* Copyright 2024 Jeffrey Kelling, Rene Widera, Bernhard Manfred Gruber, René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/concepts.hpp ==
							// ==
							/* Copyright 2024 René Widera
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
							// #include "alpaka/api/unifiedCudaHip/trait.hpp"    // amalgamate: file already inlined

							// #include <concepts>    // amalgamate: file already included

							namespace alpaka::concepts
							{
							    /** Satisfied by executor types for which
							     *  alpaka::unifiedCudaHip::trait::IsUnifiedExecutor<T>::value is true.
							     */
							    template<typename T>
							    concept UnifiedCudaHipExecutor = alpaka::unifiedCudaHip::trait::IsUnifiedExecutor<T>::value;

							    /** Satisfied by API types for which
							     *  alpaka::unifiedCudaHip::trait::IsUnifiedApi<T>::value is true.
							     */
							    template<typename T>
							    concept UnifiedCudaHipApi = alpaka::unifiedCudaHip::trait::IsUnifiedApi<T>::value;
							} // namespace alpaka::concepts
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/concepts.hpp ==
							// ============================================================================

						// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
						// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
						// #include "alpaka/onAcc/internal/interface.hpp"    // amalgamate: file already inlined
						// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

						// #include <cstddef>    // amalgamate: file already included

						#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP

						namespace alpaka::onAcc
						{
						    namespace unifiedCudaHip
						    {

						        /** Device-side functor that synchronizes the calling thread block.
						         *
						         * Invoking the functor forwards to the `__syncthreads()` intrinsic.
						         */
						        struct Sync
						        {
						            //! Device only: calls `__syncthreads()`.
						            __device__ void operator()() const
						            {
						                __syncthreads();
						            }
						        };

						        namespace internal
						        {
						            /** Trait used only for the unified CUDA/HIP warp size abstraction.
						             *
						             * Use onAcc::internal::GetWarpSize to query the warp size independent of the API.
						             * The warp size must be a std::integral_constant<uint32_t,X>.
						             *
						             * `Get` is only declared here; definitions for concrete device kinds are not part of this
						             * struct.
						             */
						            struct WarpSize
						            {
						                //! Customization point selected by device kind (declaration only).
						                template<alpaka::concepts::DeviceKind T_DeviceKind>
						                struct Get;
						            };
						        } // namespace internal
						    } // namespace unifiedCudaHip
						} // namespace alpaka::onAcc

						namespace alpaka::onAcc::internalCompute
						{
						    /** Access to the dynamically sized shared memory for unified CUDA/HIP executors.
						     *
						     * @tparam T element type the shared memory is interpreted as
						     * @tparam T_Acc accelerator type; the specialization is selected when the type of `acc[object::exec]`
						     *         satisfies alpaka::concepts::UnifiedCudaHipExecutor
						     */
						    template<typename T, typename T_Acc>
						    requires alpaka::concepts::UnifiedCudaHipExecutor<ALPAKA_TYPEOF(std::declval<T_Acc>()[object::exec])>
						    struct SharedMemory::Dynamic<T, T_Acc>
						    {
						        /** @param acc accelerator, not used
						         *  @return pointer to the beginning of the `extern __shared__` buffer, reinterpreted as `T*`
						         */
						        __device__ decltype(auto) operator()(auto const& acc) const
						        {
						            alpaka::unused(acc);
						            // Because unaligned access to variables is not allowed in device code,
						            // the buffer is declared with the alignment of std::max_align_t so that
						            // it can be reinterpreted as any T without violating T's alignment.
						            // See:
						            //   - http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared
						            //   - http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vector-types
						            extern __shared__ std::byte shMem alignas(std::max_align_t)[];
						            return reinterpret_cast<T*>(shMem);
						        }
						    };

						    /** Access to a statically sized shared memory object for unified CUDA/HIP executors.
						     *
						     * @tparam T type of the object placed in shared memory
						     * @tparam T_uniqueId id that is part of the specialization, so different ids instantiate different
						     *         `operator()` functions and therefore different `shMem` arrays
						     * @tparam T_Acc accelerator type; the specialization is selected when the type of `acc[object::exec]`
						     *         satisfies alpaka::concepts::UnifiedCudaHipExecutor
						     */
						    template<typename T, size_t T_uniqueId, typename T_Acc>
						    requires alpaka::concepts::UnifiedCudaHipExecutor<ALPAKA_TYPEOF(std::declval<T_Acc>()[object::exec])>
						    struct SharedMemory::Static<T, T_uniqueId, T_Acc>
						    {
						        /** @param acc accelerator, not used
						         *  @return reference to the shared memory storage, interpreted as `T`
						         *
						         * No constructor of `T` is invoked here: the storage is a raw byte array that is only reinterpreted.
						         */
						        __device__ decltype(auto) operator()(auto const& acc) const
						        {
						            alpaka::unused(acc);
						            // raw storage with exactly the size and alignment of T
						            __shared__ uint8_t shMem alignas(alignof(T))[sizeof(T)];
						            // dereferencing yields an lvalue, so decltype(auto) deduces T&
						            return *(reinterpret_cast<T*>(shMem));
						        }
						    };
						} // namespace alpaka::onAcc::internalCompute

						#endif
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/ComputeApi.hpp ==
						// ============================================================================

						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/MemcpyKind.hpp ==
						// ==
						/* Copyright 2024 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/api/host/Api.hpp"    // amalgamate: file already inlined
						// #include "alpaka/api/unifiedCudaHip/concepts.hpp"    // amalgamate: file already inlined
						// #include "alpaka/api/unifiedCudaHip/trait.hpp"    // amalgamate: file already inlined
						// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

						// #include <cstdint>    // amalgamate: file already included

						namespace alpaka::onHost
						{
						    namespace unifiedCudaHip
						    {
						        template<typename T_ApiInterface, typename T_Dest, typename T_Source>
						        struct MemcpyKind
						        {
						            static_assert(sizeof(T_Dest) && false, "Not supported memcpy kind.");
						        };

						        template<typename T_ApiInterface, alpaka::concepts::UnifiedCudaHipApi T_Source>
						        struct MemcpyKind<T_ApiInterface, api::Host, T_Source>
						        {
						            static constexpr auto kind = T_ApiInterface::memcpyDeviceToHost;
						        };

						        template<typename T_ApiInterface, alpaka::concepts::UnifiedCudaHipApi T_SourceDestApi>
						        struct MemcpyKind<T_ApiInterface, T_SourceDestApi, T_SourceDestApi>
						        {
						            static constexpr auto kind = T_ApiInterface::memcpyDeviceToDevice;
						        };

						        template<typename T_ApiInterface>
						        struct MemcpyKind<T_ApiInterface, api::Host, api::Host>
						        {
						            static constexpr auto kind = T_ApiInterface::memcpyHostToHost;
						        };

						        template<typename T_ApiInterface, alpaka::concepts::UnifiedCudaHipApi T_Dest>
						        struct MemcpyKind<T_ApiInterface, T_Dest, api::Host>
						        {
						            static constexpr auto kind = T_ApiInterface::memcpyHostToDevice;
						        };
						    } // namespace unifiedCudaHip
						} // namespace alpaka::onHost
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/MemcpyKind.hpp ==
						// ============================================================================

					// #include "alpaka/api/unifiedCudaHip/concepts.hpp"    // amalgamate: file already inlined
					// #include "alpaka/api/util.hpp"    // amalgamate: file already inlined
					// #include "alpaka/core/CallbackThread.hpp"    // amalgamate: file already inlined
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/UniformCudaHip.hpp ==
						// ==
						/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Andrea Bocci, Bernhard
						 * Manfred Gruber
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
						// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

						// #include <algorithm>    // amalgamate: file already included
						#include <initializer_list>
						// #include <stdexcept>    // amalgamate: file already included
						// #include <string>    // amalgamate: file already included
						// #include <tuple>    // amalgamate: file already included
						#include <type_traits>

						#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP

						namespace alpaka::uniform_cuda_hip::detail
						{
						    /** CUDA/HIP runtime API error check.
						     *
						     * Does nothing when `error` equals `TApi::success`. Otherwise the last-error state of the runtime is
						     * reset and, if `TThrow` is true, a std::runtime_error describing the failure is thrown. With
						     * `TThrow == false` the error is not reported any further.
						     *
						     * @tparam TApi native API wrapper (provides `Error_t`, `success`, `getLastError`, `getErrorName`,
						     *         `getErrorString`)
						     * @tparam TThrow whether a failure is turned into an exception
						     * @param error error code to inspect
						     * @param desc description placed in the exception message
						     * @param file source file placed in the exception message
						     * @param line source line placed in the exception message
						     * @throws std::runtime_error if `TThrow` is true and `error` is not `TApi::success`
						     */
						    template<typename TApi, bool TThrow>
						    ALPAKA_FN_HOST inline void rtCheck(
						        typename TApi::Error_t const& error,
						        char const* desc,
						        char const* file,
						        int const& line) noexcept(!TThrow)
						    {
						        bool const failed = error != TApi::success;
						        if(!failed)
						        {
						            return;
						        }

						        // Reset the last error to allow user side error handling. Assigning to std::ignore is the way
						        // suggested by the C++ core guidelines to discard an unneeded return value.
						        std::ignore = TApi::getLastError();

						        if constexpr(TThrow)
						        {
						            // message layout: <file>(<line>) <desc> : '<error name>': '<error string>'!
						            std::string message{file};
						            message += "(";
						            message += std::to_string(line);
						            message += ") ";
						            message += desc;
						            message += " : '";
						            message += TApi::getErrorName(error);
						            message += "': '";
						            message += TApi::getErrorString(error);
						            message += "'!";

						            throw std::runtime_error(message);
						        }
						    }

						    /** CUDA/HIP runtime API error check that tolerates a given set of error codes.
						     *
						     * Does nothing when `error` equals `TApi::success`. If `error` is contained in `ignoredErrorCodes` only
						     * the last-error state of the runtime is reset. Every other error is forwarded to rtCheck with the
						     * description "'<cmd>' returned error ".
						     *
						     * @tparam TApi native API wrapper
						     * @tparam TThrow forwarded to rtCheck: whether a non-ignored failure throws
						     * @param error error code to inspect
						     * @param cmd textual form of the checked command
						     * @param file source file forwarded to rtCheck
						     * @param line source line forwarded to rtCheck
						     * @param ignoredErrorCodes error codes that are not treated as a failure
						     */
						    template<typename TApi, bool TThrow>
						    ALPAKA_FN_HOST inline void rtCheckIgnore(
						        typename TApi::Error_t const& error,
						        char const* cmd,
						        char const* file,
						        int const& line,
						        std::initializer_list<typename TApi::Error_t> ignoredErrorCodes) noexcept(!TThrow)
						    {
						        bool const failed = error != TApi::success;
						        if(!failed)
						        {
						            return;
						        }

						        // Search the list of tolerated error codes.
						        bool isIgnored = false;
						        for(auto const& ignoredCode : ignoredErrorCodes)
						        {
						            if(ignoredCode == error)
						            {
						                isIgnored = true;
						                break;
						            }
						        }

						        if(isIgnored)
						        {
						            // Reset the last error to avoid propagation to the next CUDA/HIP API call. Assigning to
						            // std::ignore is the way recommended by the C++ core guidelines to discard the return value.
						            std::ignore = TApi::getLastError();
						            return;
						        }

						        // The description must stay alive until rtCheck returns.
						        std::string description{"'"};
						        description += cmd;
						        description += "' returned error ";
						        rtCheck<TApi, TThrow>(error, description.c_str(), file, line);
						    }

						    /** Checks the last error recorded by the CUDA/HIP runtime.
						     *
						     * Queries `TApi::getLastError()` and hands the result to rtCheck.
						     *
						     * @tparam TApi native API wrapper
						     * @tparam TThrow forwarded to rtCheck: whether a failure throws
						     * @param desc description forwarded to rtCheck
						     * @param file source file forwarded to rtCheck
						     * @param line source line forwarded to rtCheck
						     */
						    template<typename TApi, bool TThrow>
						    ALPAKA_FN_HOST inline void rtCheckLastError(char const* desc, char const* file, int const& line) noexcept(!TThrow)
						    {
						        using ErrorCode = typename TApi::Error_t;
						        ErrorCode const lastError(TApi::getLastError());
						        rtCheck<TApi, TThrow>(lastError, desc, file, line);
						    }
						} // namespace alpaka::uniform_cuda_hip::detail

						//! Common implementation of the CUDA/HIP runtime check macros.
						//!
						//! Step 1: rtCheckLastError inspects the last error of the runtime, so a failure left behind by an earlier
						//!         API call is reported before `cmd` is evaluated.
						//! Step 2: `cmd` is evaluated and its result is passed to rtCheckIgnore together with the error codes
						//!         listed in the variadic arguments (these are tolerated).
						//!
						//! ApiInterfaceType: native API wrapper type, used as `TApi`
						//! cmd:              expression yielding an error code
						//! throw:            `true` or `false`, used as `TThrow` (whether a failure throws)
						//! ...:              optional list of error codes to ignore
						#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(ApiInterfaceType, cmd, throw, ...)                                  \
						        do                                                                                                            \
						        {                                                                                                             \
						            ::alpaka::uniform_cuda_hip::detail::rtCheckLastError<ApiInterfaceType, throw>(                            \
						                "'" #cmd "' A previous API call (not this one) set the error ",                                       \
						                __FILE__,                                                                                             \
						                __LINE__);                                                                                            \
						            ::alpaka::uniform_cuda_hip::detail::rtCheckIgnore<ApiInterfaceType, throw>(                               \
						                cmd,                                                                                                  \
						                #cmd,                                                                                                 \
						                __FILE__,                                                                                             \
						                __LINE__,                                                                                             \
						                {__VA_ARGS__});                                                                                       \
						        } while(0)

						//! CUDA/HIP runtime error checking that throws on failure, ignoring the listed error values
						#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(ApiInterfaceType, cmd, ...)                                       \
						        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(ApiInterfaceType, cmd, true, __VA_ARGS__)

						//! CUDA/HIP runtime error checking that throws on failure.
						#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterfaceType, cmd)                                                   \
						        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(ApiInterfaceType, cmd, true, )

						//! CUDA/HIP runtime error checking that never throws, ignoring the listed error values.
						//! A failure only resets the last-error state of the runtime.
						#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE_NOEXCEPT(ApiInterfaceType, cmd, ...)                              \
						        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(ApiInterfaceType, cmd, false, __VA_ARGS__)

						//! CUDA/HIP runtime error checking that never throws.
						//! A failure only resets the last-error state of the runtime.
						#    define ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterfaceType, cmd)                                          \
						        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IMPL(ApiInterfaceType, cmd, false, )
						#endif
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/UniformCudaHip.hpp ==
						// ============================================================================

					// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
					// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/FrameSpec.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/Handle.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/internal/interface.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/logger/logger.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/mem/SharedBuffer.hpp"    // amalgamate: file already inlined

					#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP

						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/ApiCudaRt.hpp ==
						// ==
						/* Copyright 2022 Andrea Bocci
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

						#if ALPAKA_LANG_CUDA
						#    include <cuda_runtime_api.h>

						namespace alpaka
						{
						    struct ApiCudaRt
						    {
						        // Names
						        static constexpr char name[] = "Cuda";
						        static constexpr auto version = ALPAKA_LANG_CUDA;

						        // Types
						        using DeviceAttr_t = ::cudaDeviceAttr;
						        using PointerAttr_t = ::cudaPointerAttributes;
						        using Memory_t = ::cudaMemoryType;
						        using DeviceProp_t = ::cudaDeviceProp;
						        using Error_t = ::cudaError_t;
						        using Event_t = ::cudaEvent_t;
						        using Extent_t = ::cudaExtent;
						        using Flag_t = unsigned int;
						        using FuncAttributes_t = ::cudaFuncAttributes;
						        using HostFn_t = void (*)(void* data); // same as cudaHostFn_t, without the CUDART_CB calling convention
						        using Limit_t = ::cudaLimit;
						        using Memcpy3DParms_t = ::cudaMemcpy3DParms;
						        using MemcpyKind_t = ::cudaMemcpyKind;
						        using PitchedPtr_t = ::cudaPitchedPtr;
						        using Pos_t = ::cudaPos;
						        using Stream_t = ::cudaStream_t;

						        // Constants
						        static constexpr Error_t success = ::cudaSuccess;
						        static constexpr Error_t errorNotReady = ::cudaErrorNotReady;
						        static constexpr Error_t errorHostMemoryAlreadyRegistered = ::cudaErrorHostMemoryAlreadyRegistered;
						        static constexpr Error_t errorHostMemoryNotRegistered = ::cudaErrorHostMemoryNotRegistered;
						        static constexpr Error_t errorUnsupportedLimit = ::cudaErrorUnsupportedLimit;
						        static constexpr Error_t errorUnknown = ::cudaErrorUnknown;

						        static constexpr Flag_t eventDefault = cudaEventDefault;
						        static constexpr Flag_t eventBlockingSync = cudaEventBlockingSync;
						        static constexpr Flag_t eventDisableTiming = cudaEventDisableTiming;
						        static constexpr Flag_t eventInterprocess = cudaEventInterprocess;

						        static constexpr Flag_t hostMallocDefault = cudaHostAllocDefault;
						        static constexpr Flag_t hostMallocMapped = cudaHostAllocMapped;
						        static constexpr Flag_t hostMallocPortable = cudaHostAllocPortable;
						        static constexpr Flag_t hostMallocWriteCombined = cudaHostAllocWriteCombined;
						        static constexpr Flag_t hostMallocCoherent = cudaHostAllocDefault; // Not supported.
						        static constexpr Flag_t hostMallocNonCoherent = cudaHostAllocDefault; // Not supported.

						        static constexpr Flag_t hostRegisterDefault = cudaHostRegisterDefault;
						        static constexpr Flag_t hostRegisterPortable = cudaHostRegisterPortable;
						        static constexpr Flag_t hostRegisterMapped = cudaHostRegisterMapped;
						        static constexpr Flag_t hostRegisterIoMemory = cudaHostRegisterIoMemory;

						        static constexpr MemcpyKind_t memcpyDefault = ::cudaMemcpyDefault;
						        static constexpr MemcpyKind_t memcpyDeviceToDevice = ::cudaMemcpyDeviceToDevice;
						        static constexpr MemcpyKind_t memcpyDeviceToHost = ::cudaMemcpyDeviceToHost;
						        static constexpr MemcpyKind_t memcpyHostToDevice = ::cudaMemcpyHostToDevice;
						        static constexpr MemcpyKind_t memcpyHostToHost = ::cudaMemcpyHostToHost;

						        static constexpr Flag_t streamDefault = cudaStreamDefault;
						        static constexpr Flag_t streamNonBlocking = cudaStreamNonBlocking;

						        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimX = ::cudaDevAttrMaxBlockDimX;
						        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimY = ::cudaDevAttrMaxBlockDimY;
						        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimZ = ::cudaDevAttrMaxBlockDimZ;
						        static constexpr DeviceAttr_t deviceAttributeMaxGridDimX = ::cudaDevAttrMaxGridDimX;
						        static constexpr DeviceAttr_t deviceAttributeMaxGridDimY = ::cudaDevAttrMaxGridDimY;
						        static constexpr DeviceAttr_t deviceAttributeMaxGridDimZ = ::cudaDevAttrMaxGridDimZ;
						        static constexpr DeviceAttr_t deviceAttributeMaxSharedMemoryPerBlock = ::cudaDevAttrMaxSharedMemoryPerBlock;
						        static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::cudaDevAttrMaxThreadsPerBlock;
						        static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::cudaDevAttrMultiProcessorCount;
						        static constexpr DeviceAttr_t deviceAttributeWarpSize = ::cudaDevAttrWarpSize;

						        static constexpr Memory_t memoryTypeUnregistered = ::cudaMemoryTypeUnregistered;
						        static constexpr Memory_t memoryTypeHost = ::cudaMemoryTypeHost;
						        static constexpr Memory_t memoryTypeDevice = ::cudaMemoryTypeDevice;
						        static constexpr Memory_t memoryTypeManaged = ::cudaMemoryTypeManaged;

						        static constexpr Limit_t limitPrintfFifoSize = ::cudaLimitPrintfFifoSize;
						        static constexpr Limit_t limitMallocHeapSize = ::cudaLimitMallocHeapSize;

						        // Host function helper
						        // Encapsulates the different function signatures used by cudaStreamAddCallback and cudaLaunchHostFn, and the
						        // different calling conventions used by CUDA (__stdcall on Win32) and HIP (standard).
						        struct HostFnAdaptor
						        {
						            HostFn_t func_;
						            void* data_;

						            static void CUDART_CB hostFunction(void* data)
						            {
						                auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
						                ptr->func_(ptr->data_);
						                delete ptr;
						            }

						            static void CUDART_CB streamCallback(Stream_t, Error_t, void* data)
						            {
						                auto ptr = reinterpret_cast<HostFnAdaptor*>(data);
						                ptr->func_(ptr->data_);
						                delete ptr;
						            }
						        };

						        // Runtime API
						        static inline Error_t deviceGetAttribute(int* value, DeviceAttr_t attr, int device)
						        {
						            return ::cudaDeviceGetAttribute(value, attr, device);
						        }

						        static inline Error_t pointerGetAttributes(PointerAttr_t* attr, void const* ptr)
						        {
						            return ::cudaPointerGetAttributes(attr, ptr);
						        }

						        static inline Error_t deviceGetLimit(size_t* pValue, Limit_t limit)
						        {
						            return ::cudaDeviceGetLimit(pValue, limit);
						        }

						        static inline Error_t deviceReset()
						        {
						            return ::cudaDeviceReset();
						        }

						        static inline Error_t deviceSetLimit(Limit_t limit, size_t value)
						        {
						            return ::cudaDeviceSetLimit(limit, value);
						        }

						        static inline Error_t deviceSynchronize()
						        {
						            return ::cudaDeviceSynchronize();
						        }

						        static inline Error_t eventCreate(Event_t* event)
						        {
						            return ::cudaEventCreate(event);
						        }

						        static inline Error_t eventCreateWithFlags(Event_t* event, Flag_t flags)
						        {
						            return ::cudaEventCreateWithFlags(event, flags);
						        }

						        static inline Error_t eventDestroy(Event_t event)
						        {
						            return ::cudaEventDestroy(event);
						        }

						        static inline Error_t eventQuery(Event_t event)
						        {
						            return ::cudaEventQuery(event);
						        }

						        static inline Error_t eventRecord(Event_t event, Stream_t stream)
						        {
						            return ::cudaEventRecord(event, stream);
						        }

						        static inline Error_t eventSynchronize(Event_t event)
						        {
						            return ::cudaEventSynchronize(event);
						        }

						        static inline Error_t free(void* devPtr)
						        {
						            return ::cudaFree(devPtr);
						        }

						        static inline Error_t freeAsync(void* devPtr, Stream_t stream)
						        {
						            return ::cudaFreeAsync(devPtr, stream);
						        }

						        static inline Error_t funcGetAttributes(FuncAttributes_t* attr, void const* func)
						        {
						            return ::cudaFuncGetAttributes(attr, func);
						        }

						        template<typename T>
						        static inline Error_t funcGetAttributes(FuncAttributes_t* attr, T* func)
						        {
						#    if ALPAKA_COMP_GNUC
						#        pragma GCC diagnostic push
						#        pragma GCC diagnostic ignored "-Wconditionally-supported"
						#    endif
						            return ::cudaFuncGetAttributes(attr, reinterpret_cast<void const*>(func));
						#    if ALPAKA_COMP_GNUC
						#        pragma GCC diagnostic pop
						#    endif
						        }

						        static inline Error_t getDeviceCount(int* count)
						        {
						            return ::cudaGetDeviceCount(count);
						        }

						        static inline Error_t getDeviceProperties(DeviceProp_t* prop, int device)
						        {
						            return ::cudaGetDeviceProperties(prop, device);
						        }

						        static inline char const* getErrorName(Error_t error)
						        {
						            return ::cudaGetErrorName(error);
						        }

						        static inline char const* getErrorString(Error_t error)
						        {
						            return ::cudaGetErrorString(error);
						        }

						        static inline Error_t getLastError()
						        {
						            return ::cudaGetLastError();
						        }

						        static inline Error_t getSymbolAddress(void** devPtr, void const* symbol)
						        {
						            return ::cudaGetSymbolAddress(devPtr, symbol);
						        }

						        template<class T>
						        static inline Error_t getSymbolAddress(void** devPtr, T const& symbol)
						        {
						            return ::cudaGetSymbolAddress(devPtr, symbol);
						        }

						        static inline Error_t hostGetDevicePointer(void** pDevice, void* pHost, Flag_t flags)
						        {
						            return ::cudaHostGetDevicePointer(pDevice, pHost, flags);
						        }

						        static inline Error_t hostFree(void* ptr)
						        {
						            return ::cudaFreeHost(ptr);
						        }

						        static inline Error_t hostMalloc(void** ptr, size_t size, Flag_t flags)
						        {
						            return ::cudaHostAlloc(ptr, size, flags);
						        }

						        static inline Error_t hostRegister(void* ptr, size_t size, Flag_t flags)
						        {
						            return ::cudaHostRegister(ptr, size, flags);
						        }

						        static inline Error_t hostUnregister(void* ptr)
						        {
						            return ::cudaHostUnregister(ptr);
						        }

						        static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData)
						        {
						#    if CUDART_VERSION >= 10000
						            // Wrap the host function using the proper calling convention
						            return ::cudaLaunchHostFunc(stream, HostFnAdaptor::hostFunction, new HostFnAdaptor{fn, userData});
						#    else
						            // Emulate cudaLaunchHostFunc using cudaStreamAddCallback with a callback adaptor.
						            return ::cudaStreamAddCallback(stream, HostFnAdaptor::streamCallback, new HostFnAdaptor{fn, userData}, 0);
						#    endif
						        }

						        static inline Error_t malloc(void** devPtr, size_t size)
						        {
						            return ::cudaMalloc(devPtr, size);
						        }

						        static inline Error_t mallocManaged(void** devPtr, size_t size)
						        {
						            return ::cudaMallocManaged(devPtr, size);
						        }

						        static inline Error_t malloc3D(PitchedPtr_t* pitchedDevPtr, Extent_t extent)
						        {
						            return ::cudaMalloc3D(pitchedDevPtr, extent);
						        }

						        static inline Error_t mallocAsync(
						            [[maybe_unused]] void** devPtr,
						            [[maybe_unused]] size_t size,
						            [[maybe_unused]] Stream_t stream)
						        {
						#    if CUDART_VERSION >= 11020
						            return ::cudaMallocAsync(devPtr, size, stream);
						#    else
						            // Not implemented.
						            return errorUnknown;
						#    endif
						        }

						        static inline Error_t mallocPitch(void** devPtr, size_t* pitch, size_t width, size_t height)
						        {
						            return ::cudaMallocPitch(devPtr, pitch, width, height);
						        }

						        static inline Error_t memGetInfo(size_t* free, size_t* total)
						        {
						            return ::cudaMemGetInfo(free, total);
						        }

						        static inline Error_t memcpy(void* dst, void const* src, size_t count, MemcpyKind_t kind)
						        {
						            return ::cudaMemcpy(dst, src, count, kind);
						        }

						        static inline Error_t memcpy2DAsync(
						            void* dst,
						            size_t dpitch,
						            void const* src,
						            size_t spitch,
						            size_t width,
						            size_t height,
						            MemcpyKind_t kind,
						            Stream_t stream)
						        {
						            return ::cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
						        }

						        static inline Error_t memcpy3DAsync(Memcpy3DParms_t const* p, Stream_t stream)
						        {
						            return ::cudaMemcpy3DAsync(p, stream);
						        }

						        static inline Error_t memcpyAsync(void* dst, void const* src, size_t count, MemcpyKind_t kind, Stream_t stream)
						        {
						            return ::cudaMemcpyAsync(dst, src, count, kind, stream);
						        }

						        static inline Error_t memset2DAsync(
						            void* devPtr,
						            size_t pitch,
						            int value,
						            size_t width,
						            size_t height,
						            Stream_t stream)
						        {
						            return ::cudaMemset2DAsync(devPtr, pitch, value, width, height, stream);
						        }

						        static inline Error_t memset3DAsync(PitchedPtr_t pitchedDevPtr, int value, Extent_t extent, Stream_t stream)
						        {
						            return ::cudaMemset3DAsync(pitchedDevPtr, value, extent, stream);
						        }

						        static inline Error_t memsetAsync(void* devPtr, int value, size_t count, Stream_t stream)
						        {
						            return ::cudaMemsetAsync(devPtr, value, count, stream);
						        }

						        static inline Error_t setDevice(int device)
						        {
						            return ::cudaSetDevice(device);
						        }

						        static inline Error_t streamCreate(Stream_t* pStream)
						        {
						            return ::cudaStreamCreate(pStream);
						        }

						        static inline Error_t streamCreateWithFlags(Stream_t* pStream, Flag_t flags)
						        {
						            return ::cudaStreamCreateWithFlags(pStream, flags);
						        }

						        static inline Error_t streamDestroy(Stream_t stream)
						        {
						            return ::cudaStreamDestroy(stream);
						        }

						        static inline Error_t streamQuery(Stream_t stream)
						        {
						            return ::cudaStreamQuery(stream);
						        }

						        /// Forwards the stream handle to ::cudaStreamSynchronize and returns its status code.
						        static inline Error_t streamSynchronize(Stream_t stream)
						        {
						            Error_t const status = ::cudaStreamSynchronize(stream);
						            return status;
						        }

						        /// Forwards stream, event and flags to ::cudaStreamWaitEvent and returns its status code.
						        static inline Error_t streamWaitEvent(Stream_t stream, Event_t event, Flag_t flags)
						        {
						            Error_t const status = ::cudaStreamWaitEvent(stream, event, flags);
						            return status;
						        }

						        /// Builds a pitched pointer descriptor by forwarding all arguments to ::make_cudaPitchedPtr.
						        static inline PitchedPtr_t makePitchedPtr(void* d, size_t p, size_t xsz, size_t ysz)
						        {
						            PitchedPtr_t pitchedPtr = ::make_cudaPitchedPtr(d, p, xsz, ysz);
						            return pitchedPtr;
						        }

						        /// Builds a position descriptor by forwarding all arguments to ::make_cudaPos.
						        static inline Pos_t makePos(size_t x, size_t y, size_t z)
						        {
						            Pos_t position = ::make_cudaPos(x, y, z);
						            return position;
						        }

						        /// Builds an extent descriptor by forwarding all arguments to ::make_cudaExtent.
						        static inline Extent_t makeExtent(size_t w, size_t h, size_t d)
						        {
						            Extent_t extent = ::make_cudaExtent(w, h, d);
						            return extent;
						        }
						    };

						} // namespace alpaka

						#endif
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/ApiCudaRt.hpp ==
						// ============================================================================


					// #    include <cstdint>    // amalgamate: file already included
					// #    include <sstream>    // amalgamate: file already included

					namespace alpaka::onHost
					{
					    namespace unifiedCudaHip
					    {
					        /** Event of the unified CUDA/HIP backend.
					         *
					         * Owns one native event which is created in the constructor and destroyed in the destructor. The
					         * object is neither copyable nor movable. Most member functions are private and are reached through
					         * the befriended interface structs.
					         *
					         * @tparam T_Device device type; it must provide the nested type `ApiInterface` used for all runtime
					         * calls
					         */
					        template<typename T_Device>
					        struct Event : std::enable_shared_from_this<Event<T_Device>>
					        {
					            using ApiInterface = typename T_Device::ApiInterface;

					        public:
					            /** Creates the native event on the given device.
					             *
					             * @param device handle of the device the event belongs to, the handle is stored in the event
					             * @param idx index of the event, used for the name and for comparisons
					             */
					            Event(internal::concepts::DeviceHandle auto device, uint32_t const idx)
					                : m_device(std::move(device))
					                , m_idx(idx)
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::event);
					                // Set the current device.
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::setDevice(internal::getNativeHandle(*m_device.get())));

					                // Create the event on the current device with the specified flags. Valid flags include:
					                // - cuda/hip-EventDefault: Default event creation flag.
					                // - cuda/hip-EventBlockingSync: Specifies that event should use blocking synchronization.
					                //   A host thread that uses cuda/hip-EventSynchronize() to wait on an event created with this flag
					                //   will block until the event actually completes. (Currently not used, @todo: check if this is
					                //   required, in mainline alpaka this is configurable in the constructor.)
					                // - cuda/hip-EventDisableTiming: Specifies that the created event does not need to record timing
					                //   data.
					                //   Events created with this flag specified and the cuda/hip-EventBlockingSync flag not specified
					                //   will provide the best performance when used with cudaStreamWaitEvent() and cudaEventQuery().
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::eventCreateWithFlags(
					                        &m_nativeEvent,
					                        ApiInterface::eventDefault | ApiInterface::eventDisableTiming));
					            }

					            /** Waits for the event and destroys the native event afterwards.
					             *
					             * NOTE(review): the wait goes through the throwing runtime check; presumably a failure at this point
					             * terminates the program because destructors are noexcept by default - confirm if this is intended.
					             */
					            ~Event()
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::event);
					                onHost::internal::wait(*this);
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterface, ApiInterface::eventDestroy(getNativeHandle()));
					            }

					            Event(Event const&) = delete;
					            Event& operator=(Event const&) = delete;

					            Event(Event&&) = delete;
					            Event& operator=(Event&&) = delete;

					            /// Two events are equal if their index and their device handle compare equal.
					            bool operator==(Event const& other) const
					            {
					                return m_idx == other.m_idx && m_device == other.m_device;
					            }

					            bool operator!=(Event const& other) const
					            {
					                return !(*this == other);
					            }

					        private:
					            Handle<T_Device> m_device;
					            uint32_t m_idx = 0u;
					            typename ApiInterface::Event_t m_nativeEvent;

					            friend struct alpaka::internal::GetName;

					            /// @return human readable name containing the event index
					            std::string getName() const
					            {
					                return std::string("unifiedCudaHip::Event id=") + std::to_string(m_idx);
					            }

					            friend struct onHost::internal::GetNativeHandle;

					            /// @return copy of the native event handle
					            [[nodiscard]] auto getNativeHandle() const noexcept
					            {
					                return m_nativeEvent;
					            }

					            friend struct onHost::internal::Enqueue;

					            friend struct onHost::internal::Wait;

					            /// Synchronizes the calling host thread with the native event.
					            void wait() const
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::event);
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::eventSynchronize(getNativeHandle()));
					            }

					            friend struct alpaka::internal::GetDeviceType;

					            /// @return device kind of the device this event belongs to
					            auto getDeviceKind() const
					            {
					                return alpaka::internal::getDeviceKind(*m_device.get());
					            }

					            /// @return copy of the stored device handle
					            auto getDevice() const
					            {
					                return m_device;
					            }

					            /// @return shared pointer to this event obtained via shared_from_this()
					            std::shared_ptr<Event> getSharedPtr()
					            {
					                return this->shared_from_this();
					            }

					            friend struct onHost::internal::IsEventComplete;

					            /** Queries the state of the native event without blocking.
					             *
					             * The result `errorNotReady` of the query is ignored by the runtime check and reported as `false`.
					             *
					             * The function is intentionally not noexcept: the runtime check is expected to report any other
					             * error by throwing (a dedicated *_NOEXCEPT check variant is used in the destructor), and a noexcept
					             * specifier would turn such an error into std::terminate. It is const because the query does not
					             * modify the event object, which matches Queue::isQueueEmpty().
					             *
					             * @return true if the query returned `success`, false otherwise
					             */
					            bool isEventComplete() const
					            {
					                typename ApiInterface::Error_t ret = ApiInterface::success;
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
					                    ApiInterface,
					                    ret = ApiInterface::eventQuery(m_nativeEvent),
					                    ApiInterface::errorNotReady);
					                return (ret == ApiInterface::success);
					            }

					            friend struct onHost::internal::GetDevice;

					            friend struct alpaka::internal::GetApi;
					        };

					    } // namespace unifiedCudaHip
					} // namespace alpaka::onHost

					namespace alpaka::internal
					{
					    /// Resolves the API of a unified CUDA/HIP event by delegating to the device handle stored in the event.
					    template<typename T_Device>
					    struct GetApi::Op<onHost::unifiedCudaHip::Event<T_Device>>
					    {
					        inline constexpr auto operator()(auto&& event) const
					        {
					            // the event itself carries no API information, its device does
					            return getApi(event.m_device);
					        }
					    };
					} // namespace alpaka::internal
					#endif
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/Event.hpp ==
					// ============================================================================

					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/Queue.hpp ==
					// ==
					/* Copyright 2024 René Widera
					 * SPDX-License-Identifier: MPL-2.0
					 */
					// #pragma once
					// #include "alpaka/api/concepts/api.hpp"    // amalgamate: file already inlined
					// #include "alpaka/api/cuda/IdxLayer.hpp"    // amalgamate: file already inlined
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/computeApi.hpp ==
						// ==
						/* Copyright 2025 René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/api/unifiedCudaHip/ComputeApi.hpp"    // amalgamate: file already inlined
						// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

						#include <type_traits>

						#if ALPAKA_LANG_CUDA

						namespace alpaka::onAcc::unifiedCudaHip::internal
						{
						    /// Warp size for the device kind NvidiaGpu, provided as a compile-time integral constant of value 32.
						    template<>
						    struct WarpSize::Get<alpaka::deviceKind::NvidiaGpu>
						    {
						        constexpr auto operator()() const
						        {
						            using WarpSizeConstant = std::integral_constant<uint32_t, 32u>;
						            return WarpSizeConstant{};
						        }
						    };
						} // namespace alpaka::onAcc::unifiedCudaHip::internal

						#endif
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/computeApi.hpp ==
						// ============================================================================

					// #include "alpaka/api/generic.hpp"    // amalgamate: file already inlined
					// #include "alpaka/api/hip/IdxLayer.hpp"    // amalgamate: file already inlined
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/computeApi.hpp ==
						// ==
						/* Copyright 2025 Andrea Bocci, René Widera
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include "alpaka/api/unifiedCudaHip/ComputeApi.hpp"    // amalgamate: file already inlined
						// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

						#include <type_traits>

						#if ALPAKA_LANG_HIP

						namespace alpaka::onAcc::unifiedCudaHip::internal
						{
						    /** Wavefront size for the device kind AmdGpu, provided as a compile-time integral constant.
						     *
						     * The value is selected by the preprocessor:
						     *   - host compilation pass: 1
						     *   - __GFX9__: 64
						     *   - __GFX10__, __GFX11__ or __GFX12__: 32
						     *   - any other architecture: ALPAKA_DEFAULT_HIP_WAVEFRONT_SIZE if defined, else a compile error
						     */
						    template<>
						    struct WarpSize::Get<alpaka::deviceKind::AmdGpu>
						    {
						        constexpr auto operator()() const
						        {
						#    if !defined(__HIP_DEVICE_COMPILE__)
						            // Host pass: return one to avoid division by zero warnings when the host path is parsed.
						            return std::integral_constant<uint32_t, 1u>{};
						#    elif defined(__GFX9__)
						            // HIP/ROCm may have a wavefront of 32 or 64 depending on the target device.
						            // GCN 5.0 and CDNA GPUs have a wavefront size of 64.
						            return std::integral_constant<uint32_t, 64u>{};
						#    elif defined(__GFX10__) || defined(__GFX11__) || defined(__GFX12__)
						            // RDNA GPUs have a wavefront size of 32.
						            return std::integral_constant<uint32_t, 32u>{};
						#    elif defined(ALPAKA_DEFAULT_HIP_WAVEFRONT_SIZE)
						            // Unknown AMD GPU architecture: use the user provided default.
						            return std::integral_constant<uint32_t, ALPAKA_DEFAULT_HIP_WAVEFRONT_SIZE>{};
						#    else
						            // Unknown AMD GPU architecture and no user provided default.
						#        error The current AMD GPU architucture is not supported by this version of alpaka. You can define a default wavefront size setting the preprocessor macro ALPAKA_DEFAULT_HIP_WAVEFRONT_SIZE
						            // Return 32 instead of zero to avoid follow-up errors due to a possible division by zero. The
						            // compilation fails at the #error above anyway, therefore the value itself is irrelevant.
						            return std::integral_constant<uint32_t, 32u>{};
						#    endif
						        }
						    };
						} // namespace alpaka::onAcc::unifiedCudaHip::internal

						#endif
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/computeApi.hpp ==
						// ============================================================================

					// #include "alpaka/api/unifiedCudaHip/ComputeApi.hpp"    // amalgamate: file already inlined
					// #include "alpaka/api/unifiedCudaHip/Event.hpp"    // amalgamate: file already inlined
					// #include "alpaka/api/unifiedCudaHip/MemcpyKind.hpp"    // amalgamate: file already inlined
					// #include "alpaka/api/unifiedCudaHip/concepts.hpp"    // amalgamate: file already inlined
					// #include "alpaka/api/util.hpp"    // amalgamate: file already inlined
					// #include "alpaka/core/CallbackThread.hpp"    // amalgamate: file already inlined
					// #include "alpaka/core/UniformCudaHip.hpp"    // amalgamate: file already inlined
					// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
					// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onAcc/internal/globalMem.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/FrameSpec.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/Handle.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/internal/interface.hpp"    // amalgamate: file already inlined
					// #include "alpaka/onHost/mem/SharedBuffer.hpp"    // amalgamate: file already inlined

					#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP

					// #    include "alpaka/core/ApiCudaRt.hpp"    // amalgamate: file already inlined

					// #    include <cstdint>    // amalgamate: file already included
					// #    include <sstream>    // amalgamate: file already included

					namespace alpaka::onHost
					{
					    namespace unifiedCudaHip
					    {
					        struct CallKernel;

					        /** Queue of the unified CUDA/HIP backend.
					         *
					         * Owns one native stream which is created in the constructor and destroyed in the destructor. The
					         * object is neither copyable nor movable. Most member functions are private and are reached through
					         * the befriended interface structs.
					         *
					         * @tparam T_Device device type; it must provide the nested type `ApiInterface` used for all runtime
					         * calls
					         */
					        template<typename T_Device>
					        struct Queue : std::enable_shared_from_this<Queue<T_Device>>
					        {
					            using ApiInterface = typename T_Device::ApiInterface;

					        public:
					            /** Creates the native stream on the given device.
					             *
					             * The native stream is always created with the flag `streamNonBlocking`. Within this class
					             * `isBlocking` only controls whether conditionalWait() synchronizes with the stream.
					             *
					             * @param device handle of the device the queue belongs to, the handle is stored in the queue
					             * @param idx index of the queue, used for the name and for comparisons
					             * @param isBlocking true if operations should wait for the stream after enqueuing
					             */
					            Queue(internal::concepts::DeviceHandle auto device, uint32_t const idx, bool isBlocking)
					                : m_device(std::move(device))
					                , m_idx(idx)
					                , m_isBlocking(isBlocking)
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::setDevice(onHost::getNativeHandle(m_device)));
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::streamCreateWithFlags(&m_UniformCudaHipQueue, ApiInterface::streamNonBlocking));
					            }

					            /** Waits for the queue and destroys the native stream afterwards.
					             *
					             * NOTE(review): the wait goes through the throwing runtime check; presumably a failure at this point
					             * terminates the program because destructors are noexcept by default - confirm if this is intended.
					             */
					            ~Queue()
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
					                onHost::internal::wait(*this);
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(
					                    ApiInterface,
					                    ApiInterface::streamDestroy(getNativeHandle()));
					            }

					            Queue(Queue const&) = delete;
					            Queue& operator=(Queue const&) = delete;

					            Queue(Queue&&) = delete;
					            Queue& operator=(Queue&&) = delete;

					            /// Two queues are equal if their index and their device handle compare equal.
					            bool operator==(Queue const& other) const
					            {
					                return m_idx == other.m_idx && m_device == other.m_device;
					            }

					            bool operator!=(Queue const& other) const
					            {
					                return !(*this == other);
					            }

					        private:
					            // Never called; exists only to validate the queue concept once the class is complete.
					            void _()
					            {
					                static_assert(internal::concepts::Queue<Queue>);
					            }

					            Handle<T_Device> m_device;
					            uint32_t m_idx = 0u;
					            typename ApiInterface::Stream_t m_UniformCudaHipQueue;
					            core::CallbackThread m_callBackThread;
					            bool m_isBlocking{false};

					            /** Waits until all operations are finished if the queue is a blocking queue.
					             *
					             * If the queue is a blocking queue the control flow is blocked and the method does not return until
					             * all work in the queue is processed. For a non-blocking queue the method returns immediately.
					             * This method should be called after the task is enqueued into the native CUDA/HIP queue. There is no
					             * need to call this method before enqueuing because the queues are in-order queues: even if another
					             * thread has enqueued something before, the order is guaranteed.
					             *
					             * The method is intentionally not noexcept: the runtime check is expected to report failures by
					             * throwing (a dedicated *_NOEXCEPT check variant is used in the destructor), and a noexcept
					             * specifier would turn such an error into std::terminate instead of an exception the caller can
					             * handle.
					             */
					            void conditionalWait() const
					            {
					                if(m_isBlocking)
					                {
					                    // braces keep the macro expansion inside the branch independent of how the macro is defined
					                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::streamSynchronize(getNativeHandle()));
					                }
					            }

					            friend struct alpaka::internal::GetName;

					            /// @return human readable name containing the queue index
					            std::string getName() const
					            {
					                return std::string("unifiedCudaHip::Queue id=") + std::to_string(m_idx);
					            }

					            friend struct onHost::internal::GetNativeHandle;

					            /// @return copy of the native stream handle
					            [[nodiscard]] auto getNativeHandle() const noexcept
					            {
					                return m_UniformCudaHipQueue;
					            }

					            friend struct onHost::internal::Enqueue;
					            friend struct onHost::internal::Wait;

					            /// Synchronizes the calling host thread with the native stream, independent of the blocking mode.
					            void wait() const
					            {
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::streamSynchronize(getNativeHandle()));
					            }

					            friend struct alpaka::internal::GetDeviceType;

					            /// @return device kind of the device this queue belongs to
					            auto getDeviceKind() const
					            {
					                return alpaka::internal::getDeviceKind(*m_device.get());
					            }

					            /// @return copy of the stored device handle
					            auto getDevice() const
					            {
					                return m_device;
					            }

					            /// @return shared pointer to this queue obtained via shared_from_this()
					            std::shared_ptr<Queue> getSharedPtr()
					            {
					                return this->shared_from_this();
					            }

					            friend struct alpaka::onHost::internal::WaitFor;

					            /** Lets all work enqueued after this call wait for the given event.
					             *
					             * The dependency is registered in the native stream with flags `0`; afterwards conditionalWait()
					             * is called.
					             *
					             * @param event event the stream has to wait for
					             */
					            void waitFor(unifiedCudaHip::Event<T_Device>& event)
					            {
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::streamWaitEvent(getNativeHandle(), internal::getNativeHandle(event), 0));

					                conditionalWait();
					            }

					            friend struct internal::IsQueueEmpty;

					            /** Queries the state of the native stream without blocking.
					             *
					             * The result `errorNotReady` of the query is ignored by the runtime check and reported as `false`.
					             *
					             * @return true if the query returned `success`, false otherwise
					             */
					            bool isQueueEmpty() const
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::queue);

					                typename ApiInterface::Error_t ret = ApiInterface::success;
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_IGNORE(
					                    ApiInterface,
					                    ret = ApiInterface::streamQuery(getNativeHandle()),
					                    ApiInterface::errorNotReady);
					                return (ret == ApiInterface::success);
					            }

					            friend struct onHost::internal::GetDevice;

					            friend struct alpaka::internal::GetApi;
					            friend struct onHost::internal::Memcpy;
					            friend struct onHost::internal::MemcpyDeviceGlobal;
					            friend struct onHost::internal::Memset;
					            friend struct onHost::internal::AllocDeferred;
					            friend struct CallKernel;
					        };

					        /** Kernel entry point used for all CUDA/HIP kernel launches.
					         *
					         * Assembles the accelerator object from the block/thread layers, the synchronization action and the
					         * compile-time properties (api, device kind, executor, warp size) and invokes the kernel bundle with
					         * it.
					         *
					         * @param kernelBundle callable which is invoked with the accelerator
					         * @param optimizedThreadSpec thread specification forwarded to the block and thread layer
					         */
					        template<
					            alpaka::concepts::Api T_Api,
					            alpaka::concepts::DeviceKind T_DeviceKind,
					            alpaka::concepts::Executor T_Executor,
					            bool launchedWidthFrameSpec,
					            typename TKernelBundle,
					            typename T_OptimizedThreadSpec>
					        __global__ void gpuKernel(TKernelBundle const kernelBundle, T_OptimizedThreadSpec const optimizedThreadSpec)
					        {
					            // warp size of the device kind, evaluated at compile time
					            constexpr auto deviceWarpSize = alpaka::onAcc::unifiedCudaHip::internal::WarpSize::Get<T_DeviceKind>{}();

					            auto acc = onAcc::Acc{
					                Dict{
					                    DictEntry(layer::block, onAcc::unifiedCudaHip::BlockLayer{optimizedThreadSpec}),
					                    DictEntry(layer::thread, onAcc::unifiedCudaHip::ThreadLayer{optimizedThreadSpec}),
					                    DictEntry(object::launchedWidthFrameSpec, std::bool_constant<launchedWidthFrameSpec>{}),
					                    DictEntry(action::threadBlockSync, onAcc::unifiedCudaHip::Sync{}),
					                    DictEntry(object::api, T_Api{}),
					                    DictEntry(object::deviceKind, T_DeviceKind{}),
					                    DictEntry(object::exec, T_Executor{}),
					                    DictEntry(object::warpSize, deviceWarpSize)},
					            };

					            kernelBundle(acc);
					        }

					        /** Converts a vector into a native dim3.
					         *
					         * The last vector component becomes x, the second to last y and the third to last z. Components which
					         * do not exist in the vector stay 1; further leading components of the vector are not used.
					         *
					         * @param vec vector to convert, each used component is cast to unsigned
					         * @return dim3 holding the converted components
					         */
					        ALPAKA_FN_HOST auto convertVecToUniformCudaHipDim(alpaka::concepts::Vector auto const& vec) -> dim3
					        {
					            constexpr auto numComponents = ALPAKA_TYPEOF(vec)::dim();

					            dim3 result(1, 1, 1);
					            if constexpr(numComponents >= 3u)
					            {
					                result.z = static_cast<unsigned>(vec[numComponents - 3u]);
					            }
					            if constexpr(numComponents >= 2u)
					            {
					                result.y = static_cast<unsigned>(vec[numComponents - 2u]);
					            }
					            if constexpr(numComponents >= 1u)
					            {
					                result.x = static_cast<unsigned>(vec[numComponents - 1u]);
					            }
					            return result;
					        }

					        struct CallKernel
					        {
					            template<alpaka::concepts::Vector T_NumBlocks, alpaka::concepts::Vector T_NumThreads>
					            struct OptimizedThreadSpec
					            {
					                using NumBlocksVecType = typename T_NumBlocks::UniVec;
					                using NumThreadsVecType = T_NumThreads;

					                static consteval uint32_t dim()
					                {
					                    return T_NumThreads::dim();
					                }

					                constexpr OptimizedThreadSpec(T_NumBlocks const&, T_NumThreads const&)
					                {
					                }
					            };

					            template<
					                bool launchedWidthFrameSpec,
					                typename T_Device,
					                alpaka::concepts::Vector T_NumBlocks,
					                alpaka::concepts::Vector T_NumThreads,
					                alpaka::concepts::Executor T_Executor,
					                typename T_KernelBundle>
					            void operator()(
					                unifiedCudaHip::Queue<T_Device>& queue,
					                ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor> const& threadSpec,
					                T_KernelBundle const& kernelBundle) const
					            {
					                static_assert(
					                    ALPAKA_TYPEOF(threadSpec)::getExecutor() != exec::anyExecutor,
					                    "'exec::anyExecutor' can not be used to enqueue an kernel.");
					                ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::queue);

					                using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::setDevice(onHost::getNativeHandle(queue.m_device)));

					                constexpr uint32_t dim = T_NumBlocks::dim();
					                // dimension of the cuda/hip layer
					                constexpr uint32_t layerDim = dim >= 4u ? 1u : dim;
					                using IdxType = typename T_NumBlocks::type;

					                Vec<IdxType, layerDim> numBlocks;
					                Vec<IdxType, layerDim> numThreadsPerBlock;

					                if constexpr(dim >= 4u)
					                {
					                    numBlocks = threadSpec.getNumBlocks().product();
					                    numThreadsPerBlock = threadSpec.getNumThreads().product();
					                }
					                else
					                {
					                    numBlocks = threadSpec.getNumBlocks();
					                    numThreadsPerBlock = threadSpec.getNumThreads();
					                }

					                using ThreadSpecType = std::conditional_t<
					                    dim >= 4u,
					                    ALPAKA_TYPEOF(threadSpec),
					                    OptimizedThreadSpec<
					                        typename ALPAKA_TYPEOF(threadSpec)::NumBlocksVecType,
					                        typename ALPAKA_TYPEOF(threadSpec)::NumThreadsVecType>>;
					                // thread spec which is only holding data if the dimension is larger than 3u
					                auto optimizedThreadSpec = ThreadSpecType(threadSpec.getNumBlocks(), threadSpec.getNumThreads());

					                auto kernelName = gpuKernel<
					                    ALPAKA_TYPEOF(getApi(queue)),
					                    ALPAKA_TYPEOF(getDeviceKind(queue)),
					                    T_Executor,
					                    launchedWidthFrameSpec,
					                    T_KernelBundle,
					                    ALPAKA_TYPEOF(optimizedThreadSpec)>;

					                uint32_t blockDynSharedMemBytes = onHost::getDynSharedMemBytes(threadSpec, kernelBundle);

					                kernelName<<<
					                    convertVecToUniformCudaHipDim(numBlocks),
					                    convertVecToUniformCudaHipDim(numThreadsPerBlock),
					                    static_cast<std::size_t>(blockDynSharedMemBytes),
					                    queue.getNativeHandle()>>>(kernelBundle, optimizedThreadSpec);

					                queue.conditionalWait();
					            }
					        };
					    } // namespace unifiedCudaHip
					} // namespace alpaka::onHost

					namespace alpaka::internal
					{
					    /// Resolves the API of a unified CUDA/HIP queue by delegating to the device handle stored in the queue.
					    template<typename T_Device>
					    struct GetApi::Op<onHost::unifiedCudaHip::Queue<T_Device>>
					    {
					        inline constexpr auto operator()(auto&& cudaHipQueue) const
					        {
					            // the queue itself carries no API information, its device does
					            return getApi(cudaHipQueue.m_device);
					        }
					    };
					} // namespace alpaka::internal

					namespace alpaka::onHost
					{
					    namespace internal
					    {
					        /** Enqueues a host task into a CUDA/HIP queue.
					         *
					         * The task is registered as host function in the native stream. The host function hands the task to
					         * the callback thread of the queue and waits until the task is finished.
					         */
					        template<typename T_Device, typename T_Task>
					        struct Enqueue::HostTask<unifiedCudaHip::Queue<T_Device>, T_Task>
					        {
					            /// Payload handed through the `void*` argument of the native host function.
					            struct HostFuncData
					            {
					                // The queue is referenced and not kept alive here: its destructor synchronizes with the CUDA/HIP
					                // stream and therefore waits until all host functions and the CallbackThread are done.
					                // Copying the queue into the host function is actually an error:
					                //   - destroying the copy here would call CUDA/HIP APIs from within the host function
					                //   - passing it on to the CallbackThread would make that thread hold a task which contains the
					                //     queue and therefore the CallbackThread itself; destroying the task when no other queue
					                //     instance exists would make the CallbackThread join itself and crash
					                unifiedCudaHip::Queue<T_Device>& q;
					                T_Task t;
					            };

					            /// Host function executed by the runtime; takes ownership of the payload passed in `arg`.
					            static void uniformCudaHipRtHostFunc(void* arg)
					            {
					                auto payload = std::unique_ptr<HostFuncData>(reinterpret_cast<HostFuncData*>(arg));
					                // fetch the queue before the payload is moved into the task
					                auto& owningQueue = payload->q;
					                auto done = owningQueue.m_callBackThread.submit([p = std::move(payload)] { p->t(); });
					                done.wait();
					            }

					            /** Registers the task in the native stream of the queue.
					             *
					             * NOTE(review): if the runtime check reports a failed registration by throwing, the payload
					             * allocated here is presumably never released - confirm against the runtime semantics.
					             */
					            void operator()(unifiedCudaHip::Queue<T_Device>& queue, T_Task const& task) const
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
					                using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;

					                // ownership is taken over by uniformCudaHipRtHostFunc
					                auto* payload = new HostFuncData{queue, task};
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::launchHostFunc(queue.getNativeHandle(), uniformCudaHipRtHostFunc, payload));

					                queue.conditionalWait();
					            }
					        };

					        /** Enqueues a deferred host task into a CUDA/HIP queue.
					         *
					         * Works like Enqueue::HostTask, but the host function does not wait for the task to finish after
					         * handing it to the callback thread of the queue.
					         */
					        template<typename T_Device, typename T_Task>
					        struct Enqueue::HostTaskDeferred<unifiedCudaHip::Queue<T_Device>, T_Task>
					        {
					            /// Payload handed through the `void*` argument of the native host function.
					            struct HostFuncData
					            {
					                unifiedCudaHip::Queue<T_Device>& q;
					                T_Task t;
					            };

					            /// Host function executed by the runtime; takes ownership of the payload passed in `arg`.
					            static void uniformCudaHipRtHostFuncAsync(void* arg)
					            {
					                auto payload = std::unique_ptr<HostFuncData>(reinterpret_cast<HostFuncData*>(arg));
					                // fetch the queue before the payload is moved into the task
					                auto& owningQueue = payload->q;
					                // the submitted task holds a shared pointer to the queue
					                auto queueOwner = owningQueue.getSharedPtr();
					                owningQueue.m_callBackThread.submit([p = std::move(payload), queueOwner] { p->t(); });
					                // no wait on the result: the task is executed asynchronously
					            }

					            /** Registers the task in the native stream of the queue.
					             *
					             * NOTE(review): if the runtime check reports a failed registration by throwing, the payload
					             * allocated here is presumably never released - confirm against the runtime semantics.
					             */
					            void operator()(unifiedCudaHip::Queue<T_Device>& queue, T_Task const& task) const
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
					                using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;

					                // ownership is taken over by uniformCudaHipRtHostFuncAsync
					                auto* payload = new HostFuncData{queue, task};
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::launchHostFunc(queue.getNativeHandle(), uniformCudaHipRtHostFuncAsync, payload));

					                queue.conditionalWait();
					            }
					        };

					        /// Records an event in the native stream of a CUDA/HIP queue.
					        template<typename T_Device, typename T_Event>
					        struct internal::Enqueue::Event<unifiedCudaHip::Queue<T_Device>, T_Event>
					        {
					            /** Records `event` in the stream of `queue` and calls conditionalWait() of the queue afterwards.
					             *
					             * @param queue queue the event is recorded in
					             * @param event event to record
					             */
					            void operator()(unifiedCudaHip::Queue<T_Device>& queue, T_Event& event) const
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::event + onHost::logger::queue);
					                using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;

					                auto const nativeEvent = event.getNativeHandle();
					                auto const nativeStream = queue.getNativeHandle();
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::eventRecord(nativeEvent, nativeStream));

					                queue.conditionalWait();
					            }
					        };

					        /// Enqueues a kernel described by a thread specification into a CUDA/HIP queue.
					        template<
					            typename T_Device,
					            alpaka::concepts::UnifiedCudaHipExecutor T_Executor,
					            alpaka::concepts::Vector T_NumBlocks,
					            alpaka::concepts::Vector T_NumThreads,
					            typename T_KernelBundle>
					        struct Enqueue::
					            Kernel<unifiedCudaHip::Queue<T_Device>, ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor>, T_KernelBundle>
					        {
					            /** Forwards the launch to unifiedCudaHip::CallKernel.
					             *
					             * The kernel is launched with the compile-time property `launchedWidthFrameSpec` set to false.
					             */
					            void operator()(
					                unifiedCudaHip::Queue<T_Device>& queue,
					                ThreadSpec<T_NumBlocks, T_NumThreads, T_Executor> const& threadSpec,
					                T_KernelBundle const& kernelBundle) const
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::queue);

					                constexpr bool usesFrameSpec = false;
					                unifiedCudaHip::CallKernel{}.template operator()<usesFrameSpec>(queue, threadSpec, kernelBundle);
					            }
					        };

					        /// Enqueues a kernel described by a frame specification into a CUDA/HIP queue.
					        template<
					            typename T_Device,
					            alpaka::concepts::UnifiedCudaHipExecutor T_Executor,
					            typename T_NumFrames,
					            typename T_FrameExtents,
					            typename T_KernelBundle>
					        struct Enqueue::
					            Kernel<unifiedCudaHip::Queue<T_Device>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
					        {
					            /** Derives a thread specification from the frame specification and launches the kernel with it.
					             *
					             * The thread specification is obtained from internal::adjustThreadSpec() for the device of the
					             * queue. The kernel is launched with the compile-time property `launchedWidthFrameSpec` set to true.
					             */
					            void operator()(
					                unifiedCudaHip::Queue<T_Device>& queue,
					                FrameSpec<T_NumFrames, T_FrameExtents, T_Executor> const& frameSpec,
					                T_KernelBundle const& kernelBundle) const
					            {
					                using FrameSpecType = ALPAKA_TYPEOF(frameSpec);
					                static_assert(
					                    FrameSpecType::getExecutor() != exec::anyExecutor,
					                    "'exec::anyExecutor' can not be used to enqueue an kernel.");
					                ALPAKA_LOG_FUNCTION(onHost::logger::kernel + onHost::logger::queue);

					                auto adjustedThreadSpec = internal::adjustThreadSpec(*queue.m_device.get(), frameSpec, kernelBundle);

					                constexpr bool usesFrameSpec = true;
					                unifiedCudaHip::CallKernel{}.template operator()<usesFrameSpec>(
					                    queue,
					                    adjustedThreadSpec,
					                    kernelBundle);
					            }
					        };

					        /** Copy between two views, issued on the native stream of a unified CUDA/HIP queue.
					         *
					         * The dimensionality of T_Extents selects the runtime call: memcpyAsync (1D), memcpy2DAsync (2D) or
					         * memcpy3DAsync (3D and higher, where all dimensions except the last two are folded into the depth).
					         * The copy direction is derived from the APIs of destination and source.
					         */
					        template<typename T_Device, typename T_Dest, typename T_Source, typename T_Extents>
					        struct Memcpy::Op<unifiedCudaHip::Queue<T_Device>, T_Dest, T_Source, T_Extents>
					        {
					            /** @param queue queue providing the device and the native stream handle
					             *  @param dest destination view, its decayed type must be T_Dest
					             *  @param source source view
					             *  @param extents number of elements to copy per dimension
					             */
					            void operator()(
					                unifiedCudaHip::Queue<T_Device>& queue,
					                auto&& dest,
					                T_Source const& source,
					                T_Extents const& extents) const requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
					                using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
					                using ValueType = alpaka::trait::GetValueType_t<T_Dest>;

					                auto const copyExtents = pCast<size_t>(extents);

					                // make the queue's device the current one before talking to the runtime
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::setDevice(onHost::getNativeHandle(queue.m_device)));

					                void* const dstRaw = toVoidPtr(onHost::data(dest));
					                void const* const srcRaw = toVoidPtr(onHost::data(source));

					                auto const direction = unifiedCudaHip::MemcpyKind<
					                    ApiInterface,
					                    ALPAKA_TYPEOF(alpaka::internal::getApi(dest)),
					                    ALPAKA_TYPEOF(alpaka::internal::getApi(source))>::kind;

					                auto const stream = internal::getNativeHandle(queue);

					                constexpr auto numDims = alpaka::trait::getDim_v<T_Extents>;
					                if constexpr(numDims == 1u)
					                {
					                    // contiguous copy, size given in bytes
					                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                        ApiInterface,
					                        ApiInterface::memcpyAsync(
					                            dstRaw,
					                            srcRaw,
					                            copyExtents.x() * sizeof(ValueType),
					                            direction,
					                            stream));
					                }
					                else if constexpr(numDims == 2u)
					                {
					                    // pitched copy: row width in bytes, number of rows in elements
					                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                        ApiInterface,
					                        ApiInterface::memcpy2DAsync(
					                            dstRaw,
					                            dest.getPitches().y(),
					                            srcRaw,
					                            source.getPitches().y(),
					                            copyExtents.x() * sizeof(ValueType),
					                            copyExtents.y(),
					                            direction,
					                            stream));
					                }
					                else if constexpr(numDims >= 3u)
					                {
					                    // everything except the two innermost dimensions is folded into the depth
					                    auto const outerExtents = copyExtents.eraseBack().eraseBack();

					                    // zero-init required per CUDA documentation
					                    typename ApiInterface::Memcpy3DParms_t parms{};

					                    parms.srcPtr = ApiInterface::makePitchedPtr(
					                        // CUDA/HIP does not support const for pitched pointer
					                        const_cast<void*>(srcRaw),
					                        source.getPitches().y(),
					                        source.getExtents().x(),
					                        source.getExtents().y());
					                    parms.dstPtr = ApiInterface::makePitchedPtr(
					                        dstRaw,
					                        dest.getPitches().y(),
					                        dest.getExtents().x(),
					                        dest.getExtents().y());
					                    parms.extent = ApiInterface::makeExtent(
					                        copyExtents.x() * sizeof(ValueType),
					                        copyExtents.y(),
					                        outerExtents.product());
					                    parms.kind = direction;

					                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::memcpy3DAsync(&parms, stream));
					                }

					                queue.conditionalWait();
					            }
					        };

					        /** Copy to device global memory.
					         *
					         * The device address of the global variable is resolved with getSymbolAddress(); afterwards
					         * sizeof(T) bytes are copied from the source to that address on the queue's native stream.
					         */
					        template<typename T_Device, typename T_Source, typename T_Storage, typename T>
					        struct internal::MemcpyDeviceGlobal::
					            Op<unifiedCudaHip::Queue<T_Device>, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>, T_Source>
					        {
					            /** @param queue queue the copy is enqueued into
					             *  @param dest wrapper of the device global variable to write
					             *  @param source raw pointer, or an object onHost::data() can be applied to
					             */
					            void operator()(
					                unifiedCudaHip::Queue<T_Device>& queue,
					                onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> dest,
					                auto&& source) const
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);

					                using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
					                auto deviceApi = alpaka::internal::getApi(queue);
					                // destination uses the queue's API, source is treated as host memory
					                auto const direction = unifiedCudaHip::
					                    MemcpyKind<ApiInterface, ALPAKA_TYPEOF(deviceApi), ALPAKA_TYPEOF(api::host)>::kind;

					                void const* hostSrc{nullptr};
					                if constexpr(std::is_pointer_v<ALPAKA_TYPEOF(source)>)
					                {
					                    hostSrc = source;
					                }
					                else
					                {
					                    hostSrc = toVoidPtr(alpaka::onHost::data(ALPAKA_FORWARD(source)));
					                }

					                void* symbolAddr{nullptr};
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::getSymbolAddress(&symbolAddr, dest.getHandle(deviceApi)));

					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::memcpyAsync(
					                        symbolAddr,
					                        hostSrc,
					                        sizeof(T),
					                        direction,
					                        internal::getNativeHandle(queue)));

					                queue.conditionalWait();
					            }
					        };

					        /** Copy from device global memory.
					         *
					         * The device address of the global variable is resolved with getSymbolAddress(); afterwards
					         * sizeof(T) bytes are copied from that address to the destination on the queue's native stream.
					         */
					        template<typename T_Device, typename T_Dest, typename T_Storage, typename T>
					        struct internal::MemcpyDeviceGlobal::
					            Op<unifiedCudaHip::Queue<T_Device>, T_Dest, onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T>>
					        {
					            /** @param queue queue the copy is enqueued into
					             *  @param dest raw pointer, or an object onHost::data() can be applied to
					             *  @param source wrapper of the device global variable to read
					             */
					            void operator()(
					                unifiedCudaHip::Queue<T_Device>& queue,
					                auto&& dest,
					                onAcc::internal::GlobalDeviceMemoryWrapper<T_Storage, T> source) const
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);

					                using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
					                auto deviceApi = alpaka::internal::getApi(queue);
					                // destination is treated as host memory, source uses the queue's API
					                auto const direction = unifiedCudaHip::
					                    MemcpyKind<ApiInterface, ALPAKA_TYPEOF(api::host), ALPAKA_TYPEOF(deviceApi)>::kind;

					                void* hostDst{nullptr};
					                if constexpr(std::is_pointer_v<ALPAKA_TYPEOF(dest)>)
					                {
					                    hostDst = dest;
					                }
					                else
					                {
					                    hostDst = toVoidPtr(alpaka::onHost::data(ALPAKA_FORWARD(dest)));
					                }

					                void* symbolAddr{nullptr};
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::getSymbolAddress(&symbolAddr, source.getHandle(deviceApi)));

					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::memcpyAsync(
					                        hostDst,
					                        symbolAddr,
					                        sizeof(T),
					                        direction,
					                        internal::getNativeHandle(queue)));

					                queue.conditionalWait();
					            }
					        };

					        /** Byte-wise memset of a view, issued on the native stream of a unified CUDA/HIP queue.
					         *
					         * The dimensionality of T_Extents selects the runtime call: memsetAsync (1D), memset2DAsync (2D) or
					         * memset3DAsync (3D and higher, where all dimensions except the last two are folded into the depth).
					         */
					        template<typename T_Device, typename T_Dest, typename T_Extents>
					        struct Memset::Op<unifiedCudaHip::Queue<T_Device>, T_Dest, T_Extents>
					        {
					            /** @param queue queue providing the device and the native stream handle
					             *  @param dest destination view, its decayed type must be T_Dest
					             *  @param byteValue value every byte is set to
					             *  @param extents number of elements to set per dimension
					             *
					             * @attention Do not use `requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>` here else gcc 11.X
					             * (tested 11.4 and 11.3) will run into an internal compiler segfault during the evaluation of the
					             * constraints */
					            void operator()(
					                unifiedCudaHip::Queue<T_Device>& queue,
					                auto&& dest,
					                uint8_t byteValue,
					                T_Extents const& extents) const requires(std::is_same_v<ALPAKA_TYPEOF(dest), T_Dest>)
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
					                using ApiInterface = typename unifiedCudaHip::Queue<T_Device>::ApiInterface;
					                using ValueType = alpaka::trait::GetValueType_t<T_Dest>;

					                auto const setExtents = pCast<size_t>(extents);

					                // make the queue's device the current one before talking to the runtime
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::setDevice(onHost::getNativeHandle(queue.m_device)));

					                void* const dstRaw = toVoidPtr(onHost::data(dest));
					                // the runtime API takes the fill byte as int
					                auto const fillByte = static_cast<int>(byteValue);
					                auto const stream = internal::getNativeHandle(queue);

					                constexpr auto numDims = alpaka::trait::getDim_v<T_Extents>;
					                if constexpr(numDims == 1u)
					                {
					                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                        ApiInterface,
					                        ApiInterface::memsetAsync(dstRaw, fillByte, setExtents.x() * sizeof(ValueType), stream));
					                }
					                else if constexpr(numDims == 2u)
					                {
					                    // pitched memset: row width in bytes, number of rows in elements
					                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                        ApiInterface,
					                        ApiInterface::memset2DAsync(
					                            dstRaw,
					                            dest.getPitches().y(),
					                            fillByte,
					                            setExtents.x() * sizeof(ValueType),
					                            setExtents.y(),
					                            stream));
					                }
					                else if constexpr(numDims >= 3u)
					                {
					                    typename ApiInterface::PitchedPtr_t const dstPitched = ApiInterface::makePitchedPtr(
					                        dstRaw,
					                        dest.getPitches().y(),
					                        dest.getExtents().x(),
					                        dest.getExtents().y());

					                    // everything except the two innermost dimensions is folded into the depth
					                    auto const outerExtents = setExtents.eraseBack().eraseBack();
					                    typename ApiInterface::Extent_t const region = ApiInterface::makeExtent(
					                        setExtents.x() * sizeof(ValueType),
					                        setExtents.y(),
					                        outerExtents.product());

					                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                        ApiInterface,
					                        ApiInterface::memset3DAsync(dstPitched, fillByte, region, stream));
					                }

					                queue.conditionalWait();
					            }
					        };

					        /** Fills a view element-wise with a value by delegating to alpaka::internal::generic::fill.
					         *
					         * The generic fill is invoked with the default executor of the queue's device and operates on the
					         * sub-view of the destination selected by `extents`.
					         */
					        template<typename T_Device, typename T_Dest, typename T_Value, typename T_Extents>
					        struct Fill::Op<unifiedCudaHip::Queue<T_Device>, T_Dest, T_Value, T_Extents>
					        {
					            /** @param queue queue the fill is enqueued into
					             *  @param dest destination, its decayed type must be T_Dest and its value type must be T_Value
					             *  @param elementValue value passed to the generic fill
					             *  @param extents extents used to select the sub-view of the destination
					             */
					            void operator()(
					                unifiedCudaHip::Queue<T_Device>& queue,
					                auto&& dest,
					                T_Value elementValue,
					                T_Extents const& extents) const
					                requires std::same_as<ALPAKA_TYPEOF(dest), T_Dest>
					                         && std::same_as<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(dest)>, T_Value>
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
					                // avoid that we pass a SharedBuffer and convert non alpaka data views
					                auto dataView = makeView(dest);

					                // the arguments are built inline, so the generic fill receives them as temporaries
					                alpaka::internal::generic::fill(
					                    queue,
					                    defaultExecutor(getDevice(queue)),
					                    dataView.getSubView(extents),
					                    elementValue);
					            }
					        };

					        /** Stream-ordered counterpart of Alloc::Op: the memory is allocated with mallocAsync and released
					         * with freeAsync, both on the native handle of the queue.
					         */
					        template<typename T_Type, typename T_Device, alpaka::concepts::Vector T_Extents>
					        struct AllocDeferred::Op<T_Type, unifiedCudaHip::Queue<T_Device>, T_Extents>
					        {
					            /** @param queue queue whose native handle is used for allocation and release
					             *  @param extents number of elements per dimension
					             *  @return onHost::SharedBuffer describing the allocation
					             */
					            auto operator()(unifiedCudaHip::Queue<T_Device>& queue, T_Extents const& extents) const
					            {
					                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::queue);
					                using ApiInterface = typename T_Device::ApiInterface;

					                /** Each CUDA/HIP allocation is aligned to at least 128 byte but typically to 256byte
					                 *
					                 * @todo check if this value can be derived from the device properties
					                 * @todo validate if memory is always aligned to 256 byte
					                 */
					                constexpr uint32_t alignment = 128u;
					                // total size in bytes and pitches for the requested extents
					                auto [numBytes, rowPitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);

					                T_Type* ptr = nullptr;
					                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
					                    ApiInterface,
					                    ApiInterface::mallocAsync((void**) &ptr, numBytes, queue.getNativeHandle()));

					                queue.conditionalWait();

					                auto deviceDependency = onHost::Device{queue.getDevice()->getSharedPtr()};
					                // it is the shared pointer to the internal queue, NOT onHost::Queue
					                auto queueDependency = queue.getSharedPtr();

					                // the closure owns a copy of the queue pointer and frees on that queue's native handle
					                auto releaseMemory = [ptr, queueDependency]()
					                {
					                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(
					                        ApiInterface,
					                        ApiInterface::freeAsync(toVoidPtr(ptr), queueDependency->getNativeHandle()));
					                };

					                return onHost::SharedBuffer{
					                    deviceDependency,
					                    ptr,
					                    extents,
					                    rowPitches,
					                    std::move(releaseMemory),
					                    Alignment<alignment>{}};
					            }
					        };
					    } // namespace internal
					} // namespace alpaka::onHost
					#endif
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/Queue.hpp ==
					// ============================================================================

				// #include "alpaka/api/util.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/UniformCudaHip.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/mem/SharedBuffer.hpp"    // amalgamate: file already inlined

				#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP


				// #    include <cstdint>    // amalgamate: file already included
				// #    include <memory>    // amalgamate: file already included
				// #    include <mutex>    // amalgamate: file already included
				// #    include <sstream>    // amalgamate: file already included
				#    include <vector>

				namespace alpaka::onHost
				{
				    namespace unifiedCudaHip
				    {
				        /** Device of the unified CUDA/HIP backend.
				         *
				         * A device is identified by its index within the platform. It hands out queues and events, which are
				         * tracked through weak pointers, and caches the device properties queried at construction.
				         * Instances can neither be copied nor moved and are expected to be owned by a std::shared_ptr
				         * (getSharedPtr() relies on shared_from_this()).
				         */
				        template<typename T_Platform>
				        struct Device : std::enable_shared_from_this<Device<T_Platform>>
				        {
				            using ApiInterface = typename T_Platform::ApiInterface;

				        public:
				            /** Creates the device and selects it as the current device of the calling thread.
				             *
				             * @param platform handle of the platform the device belongs to
				             * @param idx index of the device, also used as native handle
				             */
				            Device(internal::concepts::PlatformHandle auto platform, uint32_t const idx)
				                : m_platform(std::move(platform))
				                , m_idx(idx)
				                , m_properties{internal::getDeviceProperties(*m_platform.get(), m_idx)}
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::device);
				                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::setDevice(idx));
				            }

				            /** Selects the device, synchronizes it and resets it.
				             *
				             * The _NOEXCEPT check variant is used, as in the buffer deleters of this file, so that a failing
				             * runtime call cannot propagate an exception out of the implicitly noexcept destructor.
				             */
				            ~Device()
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::device);
				                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterface, ApiInterface::setDevice(getNativeHandle()));
				                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterface, ApiInterface::deviceSynchronize());
				                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterface, ApiInterface::deviceReset());
				            }

				            Device(Device const&) = delete;
				            Device& operator=(Device const&) = delete;

				            Device(Device&&) = delete;
				            Device& operator=(Device&&) = delete;

				            /** Two devices compare equal if they have the same device index. */
				            bool operator==(Device const& other) const
				            {
				                return m_idx == other.m_idx;
				            }

				            bool operator!=(Device const& other) const
				            {
				                return !(*this == other);
				            }

				            /** Blocks until deviceSynchronize() returns for this device. */
				            void wait()
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::device);
				                // Make sure this device is the current thread device (getNativeHandle returns device index)
				                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::setDevice(getNativeHandle()));
				                // Wait for all work queued on this device to finish
				                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::deviceSynchronize());
				            }

				        private:
				            /** Never called; only hosts the compile time check that this type models the Device concept. */
				            void _()
				            {
				                static_assert(internal::concepts::Device<Device>);
				            }

				            Handle<T_Platform> m_platform;
				            uint32_t m_idx = 0u;
				            DeviceProperties m_properties;
				            // non-owning bookkeeping of everything created through makeQueue()/makeEvent()
				            std::vector<std::weak_ptr<unifiedCudaHip::Queue<Device>>> queues;
				            std::vector<std::weak_ptr<unifiedCudaHip::Event<Device>>> events;
				            // guards the two vectors above
				            std::mutex m_writeGuard;

				            std::shared_ptr<Device> getSharedPtr()
				            {
				                return this->shared_from_this();
				            }

				            friend struct alpaka::internal::GetName;

				            std::string getName() const
				            {
				                return m_properties.name;
				            }

				            friend struct onHost::internal::GetNativeHandle;

				            /** @return the device index, which is the native handle passed to setDevice() */
				            [[nodiscard]] int getNativeHandle() const noexcept
				            {
				                return m_idx;
				            }

				            friend struct onHost::internal::MakeQueue;

				            /** Creates a queue on this device and records a weak reference to it.
				             *
				             * @param kind queueKind::blocking or queueKind::nonBlocking, anything else fails to compile
				             * @return shared handle of the new queue
				             */
				            Handle<unifiedCudaHip::Queue<Device>> makeQueue(alpaka::concepts::QueueKind auto kind)
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::queue);
				                static_assert(
				                    kind == queueKind::blocking || kind == queueKind::nonBlocking,
				                    "Unsupported queue kind.");
				                auto thisHandle = this->getSharedPtr();
				                std::lock_guard<std::mutex> lk{m_writeGuard};

				                constexpr bool isBlocking = kind == queueKind::blocking;
				                // the current number of tracked queues is passed as index of the new queue
				                auto newQueue = std::make_shared<unifiedCudaHip::Queue<Device>>(
				                    std::move(thisHandle),
				                    queues.size(),
				                    isBlocking);

				                queues.emplace_back(newQueue);
				                return newQueue;
				            }

				            friend struct onHost::internal::MakeEvent;

				            /** Creates an event on this device and records a weak reference to it.
				             *
				             * @return shared handle of the new event
				             */
				            Handle<unifiedCudaHip::Event<Device>> makeEvent()
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::event);
				                auto thisHandle = this->getSharedPtr();
				                std::lock_guard<std::mutex> lk{m_writeGuard};
				                // the current number of tracked events is passed as index of the new event
				                auto newEvent = std::make_shared<unifiedCudaHip::Event<Device>>(std::move(thisHandle), events.size());

				                events.emplace_back(newEvent);
				                return newEvent;
				            }

				            friend struct alpaka::internal::GetDeviceType;

				            auto getDeviceKind() const
				            {
				                return alpaka::internal::getDeviceKind(*m_platform.get());
				            }

				            /** @return free global memory of this device in bytes, as reported by memGetInfo() */
				            auto getFreeGlobalMemBytes() const
				            {
				                std::size_t freeGlobalMemBytes(0u);
				                std::size_t globalMemCapacityBytes(0u);
				                // memGetInfo reports on the calling thread's current device, therefore select this device
				                // first (same pattern as in wait())
				                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::setDevice(getNativeHandle()));
				                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
				                    ApiInterface,
				                    ApiInterface::memGetInfo(&freeGlobalMemBytes, &globalMemCapacityBytes));
				                return freeGlobalMemBytes;
				            }

				            friend struct onHost::internal::Alloc;
				            friend struct onHost::internal::AllocDeferred;
				            friend struct onHost::internal::AllocUnified;
				            friend struct onHost::internal::AllocMapped;
				            friend struct alpaka::internal::GetApi;
				            friend struct internal::GetDeviceProperties;
				            friend struct internal::GetFreeGlobalMemBytes;
				            friend struct internal::AdjustThreadSpec;
				            friend struct onHost::internal::IsDataAccessible;
				        };
				    } // namespace unifiedCudaHip
				} // namespace alpaka::onHost

				namespace alpaka::internal
				{
				    /** API lookup for a unified CUDA/HIP device: forwards to getApi() of the device's platform handle. */
				    template<typename T_Platform>
				    struct GetApi::Op<onHost::unifiedCudaHip::Device<T_Platform>>
				    {
				        /** @param device device whose platform handle is queried
				         *  @return whatever getApi() returns for the platform handle
				         */
				        inline constexpr auto operator()(auto&& device) const
				        {
				            return getApi(device.m_platform);
				        }
				    };
				} // namespace alpaka::internal

				namespace alpaka::onHost
				{
				    namespace internal
				    {
				        /** Allocates device memory for the given extents.
				         *
				         * Depending on the dimensionality the runtime call is malloc (1D), mallocPitch (2D) or malloc3D
				         * (3D and higher, where all dimensions except the last two are folded into the depth). For 2D and
				         * above the pitches are derived from the row pitch reported by the runtime.
				         */
				        template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
				        struct Alloc::Op<T_Type, unifiedCudaHip::Device<T_Platform>, T_Extents>
				        {
				            /** @param device device the memory is allocated on
				             *  @param extents number of elements per dimension
				             *  @return onHost::SharedBuffer whose deleter releases the memory with ApiInterface::free
				             */
				            auto operator()(unifiedCudaHip::Device<T_Platform>& device, T_Extents const& extents) const
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
				                using ApiInterface = typename T_Platform::ApiInterface;

				                // malloc/mallocPitch/malloc3D allocate on the calling thread's current device, which may be
				                // a different one if several devices are in use; therefore select this device first
				                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::setDevice(device.getNativeHandle()));

				                T_Type* ptr = nullptr;
				                // default pitches, replaced below for 2D and higher dimensions
				                auto pitches = typename T_Extents::UniVec{sizeof(T_Type)};

				                using Idx = typename T_Extents::type;

				                constexpr auto dim = T_Extents::dim();
				                if constexpr(dim == 1u)
				                {
				                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
				                        ApiInterface,
				                        ApiInterface::malloc((void**) &ptr, static_cast<std::size_t>(extents.x()) * sizeof(T_Type)));
				                }
				                else if constexpr(dim == 2u)
				                {
				                    size_t rowPitchInBytes = 0u;
				                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
				                        ApiInterface,
				                        ApiInterface::mallocPitch(
				                            (void**) &ptr,
				                            &rowPitchInBytes,
				                            static_cast<std::size_t>(extents.x()) * sizeof(T_Type),
				                            static_cast<std::size_t>(extents.y())));

				                    pitches = alpaka::calculatePitches<T_Type>(extents, static_cast<Idx>(rowPitchInBytes));
				                }
				                else if constexpr(dim >= 3u)
				                {
				                    // everything except the two innermost dimensions is folded into the depth
				                    auto const extentsNoXY = pCast<size_t>(extents.eraseBack().eraseBack());
				                    typename ApiInterface::Extent_t const extentVal = ApiInterface::makeExtent(
				                        static_cast<std::size_t>(extents.x()) * sizeof(T_Type),
				                        static_cast<std::size_t>(extents.y()),
				                        extentsNoXY.product());
				                    // value-initialized so no field is left indeterminate before the runtime fills it
				                    typename ApiInterface::PitchedPtr_t pitchedPtrVal{};
				                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::malloc3D(&pitchedPtrVal, extentVal));

				                    ptr = reinterpret_cast<T_Type*>(pitchedPtrVal.ptr);
				                    pitches = alpaka::calculatePitches<T_Type>(extents, static_cast<Idx>(pitchedPtrVal.pitch));
				                }

				                auto deviceDependency = onHost::Device{device.getSharedPtr()};

				                // the closure holds a copy of the device handle in addition to the raw pointer
				                auto deleter = [ptr, deviceDependency]()
				                { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterface, ApiInterface::free(toVoidPtr(ptr))); };

				                /** Each CUDA/HIP allocation is aligned to at least 128 byte but typically to 256byte
				                 *
				                 * @todo check if this value can be derived from the device properties
				                 * @todo validate if memory is always aligned to 256 byte
				                 */
				                constexpr uint32_t alignment = 128u;

				                auto buffer = onHost::SharedBuffer{
				                    deviceDependency,
				                    ptr,
				                    extents,
				                    pitches,
				                    std::move(deleter),
				                    Alignment<alignment>{}};
				                return buffer;
				            }
				        };

				        /** Allocates managed (unified) memory with mallocManaged.
				         *
				         * Size and pitches come from api::util::emulatedAlignedMemDescription. A zero byte request on HIP
				         * skips the allocation call, in which case the buffer wraps a null pointer.
				         */
				        template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
				        struct AllocUnified::Op<T_Type, unifiedCudaHip::Device<T_Platform>, T_Extents>
				        {
				            /** @param device device the buffer is associated with
				             *  @param extents number of elements per dimension
				             *  @return onHost::SharedBuffer whose deleter releases the memory with ApiInterface::free
				             */
				            auto operator()(unifiedCudaHip::Device<T_Platform>& device, T_Extents const& extents) const
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
				                using ApiInterface = typename T_Platform::ApiInterface;

				                /** Each CUDA/HIP allocation is aligned to at least 128 byte but typically to 256byte
				                 *
				                 * @todo check if this value can be derived from the device properties
				                 * @todo validate if memory is always aligned to 256 byte
				                 */
				                constexpr uint32_t alignment = 128u;
				                auto [numBytes, rowPitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);

				                auto deviceDependency = onHost::Device{device.getSharedPtr()};

				                T_Type* ptr = nullptr;
				                // HIP is failing if zero byte unified memory is allocated, therefore we do not call the allocation
				                // method for HIP
				                bool const mustAllocate = !(numBytes == 0 && getApi(device) == api::hip);
				                if(mustAllocate)
				                {
				                    ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::mallocManaged((void**) &ptr, numBytes));
				                }

				                // the closure holds a copy of the device handle in addition to the raw pointer
				                auto releaseMemory = [ptr, deviceDependency]()
				                { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterface, ApiInterface::free(toVoidPtr(ptr))); };

				                return onHost::SharedBuffer{
				                    deviceDependency,
				                    ptr,
				                    extents,
				                    rowPitches,
				                    std::move(releaseMemory),
				                    Alignment<alignment>{}};
				            }
				        };

				        /** Allocates host memory with hostMalloc using the flags hostMallocMapped and hostMallocPortable.
				         *
				         * Size and pitches come from api::util::emulatedAlignedMemDescription.
				         */
				        template<typename T_Type, typename T_Platform, alpaka::concepts::Vector T_Extents>
				        struct AllocMapped::Op<T_Type, unifiedCudaHip::Device<T_Platform>, T_Extents>
				        {
				            /** @param device device the buffer is associated with
				             *  @param extents number of elements per dimension
				             *  @return onHost::SharedBuffer whose deleter releases the memory with ApiInterface::hostFree
				             */
				            auto operator()(unifiedCudaHip::Device<T_Platform>& device, T_Extents const& extents) const
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
				                using ApiInterface = typename T_Platform::ApiInterface;

				                /** Each CUDA/HIP allocation is aligned to at least 128 byte but typically to 256byte
				                 *
				                 * @todo check if this value can be derived from the device properties
				                 * @todo validate if memory is always aligned to 256 byte
				                 */
				                constexpr uint32_t alignment = 128u;
				                auto [numBytes, rowPitches] = api::util::emulatedAlignedMemDescription<T_Type>(alignment, extents);

				                auto deviceDependency = onHost::Device{device.getSharedPtr()};

				                T_Type* ptr = nullptr;
				                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
				                    ApiInterface,
				                    ApiInterface::hostMalloc(
				                        (void**) &ptr,
				                        numBytes,
				                        ApiInterface::hostMallocMapped | ApiInterface::hostMallocPortable));

				                // the closure holds a copy of the device handle in addition to the raw pointer
				                auto releaseMemory = [ptr, deviceDependency]()
				                { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(ApiInterface, ApiInterface::hostFree(toVoidPtr(ptr))); };

				                return onHost::SharedBuffer{
				                    deviceDependency,
				                    ptr,
				                    extents,
				                    rowPitches,
				                    std::move(releaseMemory),
				                    Alignment<alignment>{}};
				            }
				        };

				        /** Decides via pointerGetAttributes() whether the data of a view is accessible from a device.
				         *
				         * The data counts as accessible if the pointer attributes name this device or if the memory type is
				         * managed.
				         */
				        template<typename T_Platform, typename T_Any>
				        struct IsDataAccessible::FirstPath<unifiedCudaHip::Device<T_Platform>, T_Any>
				        {
				            /** @param device device the accessibility is checked for
				             *  @param view object whose onHost::data() pointer is inspected
				             *  @return true if the pointer belongs to the device or refers to managed memory
				             */
				            bool operator()(unifiedCudaHip::Device<T_Platform>& device, T_Any const& view) const
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::memory + onHost::logger::device);
				                using ApiInterface = typename T_Platform::ApiInterface;

				                typename ApiInterface::PointerAttr_t attributes;
				                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
				                    ApiInterface,
				                    ApiInterface::pointerGetAttributes(&attributes, onHost::data(view)));

				                // pointer is owned by the device itself
				                bool const ownedByDevice = device.getNativeHandle() == attributes.device;

				                return ownedByDevice || attributes.type == ApiInterface::memoryTypeManaged;
				            }
				        };

				        /** Returns a copy of the properties stored in the device object (`m_properties`). */
				        template<typename T_Platform>
				        struct GetDeviceProperties::Op<unifiedCudaHip::Device<T_Platform>>
				        {
				            DeviceProperties operator()(unifiedCudaHip::Device<T_Platform> const& device) const
				            {
				                auto const& storedProperties = device.m_properties;
				                return storedProperties;
				            }
				        };

				        /** Converts a frame specification into a thread specification for a unified CUDA/HIP device.
				         *
				         * The frame extents are passed through `api::util::adjustToLimit` so that the resulting number of
				         * threads per block respects the maximum number of threads per block. The number of frames and the
				         * executor are taken over unchanged from the frame specification.
				         */
				        template<
				            typename T_Platform,
				            alpaka::concepts::UnifiedCudaHipExecutor T_Executor,
				            alpaka::concepts::Vector T_NumFrames,
				            alpaka::concepts::Vector T_FrameExtents,
				            alpaka::concepts::KernelBundle T_KernelBundle>
				        struct AdjustThreadSpec::
				            Op<unifiedCudaHip::Device<T_Platform>, FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>, T_KernelBundle>
				        {
				            using FrameSpecType = FrameSpec<T_NumFrames, T_FrameExtents, T_Executor>;

				            /** Overload for frame extents that satisfy `alpaka::concepts::CVector`.
				             *
				             * The limit is obtained at compile time from `getMaxThreadsPerBlock` for the API, device kind
				             * and executor; the adjusted extents are computed in a constant expression.
				             */
				            auto operator()(
				                unifiedCudaHip::Device<T_Platform> const& device,
				                FrameSpecType const& frameSpec,
				                T_KernelBundle const&) const requires alpaka::concepts::CVector<T_FrameExtents>
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::device);
				                auto frameExtents = frameSpec.getFrameExtents();

				                // `device` is only inspected for its types here
				                using ApiType = ALPAKA_TYPEOF(getApi(device));
				                using DeviceKindType = ALPAKA_TYPEOF(getDeviceKind(device));

				                constexpr auto compileTimeThreadLimit
				                    = alpaka::onHost::getMaxThreadsPerBlock(ApiType{}, DeviceKindType{}, T_Executor{});
				                constexpr auto threadsPerBlock
				                    = api::util::adjustToLimit<compileTimeThreadLimit, 0u, 1u>(frameExtents);

				                return ThreadSpec{frameSpec.getNumFrames(), threadsPerBlock, frameSpec.getExecutor()};
				            }

				            /** Overload for all other frame extents.
				             *
				             * The limit is read at runtime from the device properties (`maxThreadsPerBlock`).
				             */
				            auto operator()(
				                unifiedCudaHip::Device<T_Platform> const& device,
				                FrameSpecType const& frameSpec,
				                T_KernelBundle const&) const
				            {
				                ALPAKA_LOG_FUNCTION(onHost::logger::device);
				                auto frameExtents = frameSpec.getFrameExtents();
				                auto const runtimeThreadLimit = device.m_properties.maxThreadsPerBlock;

				                auto threadsPerBlock = api::util::adjustToLimit(frameExtents, runtimeThreadLimit);
				                return ThreadSpec{frameSpec.getNumFrames(), threadsPerBlock, frameSpec.getExecutor()};
				            }
				        };
				    } // namespace internal
				} // namespace alpaka::onHost

				#endif
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/Device.hpp ==
				// ============================================================================

			// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onHost/trait.hpp"    // amalgamate: file already inlined

			#if ALPAKA_LANG_CUDA

			// #    include "alpaka/core/ApiCudaRt.hpp"    // amalgamate: file already inlined

			#    include <type_traits>

			namespace alpaka::onHost
			{
			    namespace trait
			    {
			        /** Marks the `exec::GpuCuda` executor as supported by `unifiedCudaHip::Device` for any platform type.
			         *
			         * Only compiled when `ALPAKA_LANG_CUDA` is enabled.
			         */
			        template<typename T_Platform>
			        struct IsExecutorSupportedBy::Op<exec::GpuCuda, unifiedCudaHip::Device<T_Platform>> : std::true_type
			        {
			        };
			    } // namespace trait
			} // namespace alpaka::onHost

			namespace alpaka::onHost::internal
			{
			    /** Checks, for the CUDA API, whether the memory behind a view is accessible from a given device kind.
			     *
			     * The CUDA runtime is asked for the pointer attributes of the view's data pointer. The data is reported
			     * as accessible if the memory type is `memoryTypeManaged`, or if the memory type is `memoryTypeHost` and
			     * the device kind is `deviceKind::cpu` or `deviceKind::numaCpu`.
			     */
			    template<alpaka::concepts::DeviceKind T_DeviceKind, typename T_Any>
			    struct IsDataAccessible::SecondPath<api::Cuda, T_DeviceKind, T_Any>
			    {
			        bool operator()(api::Cuda usedApi, T_DeviceKind deviceKind, T_Any const& view) const
			        {
			            alpaka::unused(usedApi);
			            using ApiInterface = ApiCudaRt;

			            typename ApiInterface::PointerAttr_t attributes;
			            ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
			                ApiInterface,
			                ApiInterface::pointerGetAttributes(&attributes, onHost::data(view)));

			            // managed memory is accepted for every device kind
			            if(attributes.type == ApiInterface::memoryTypeManaged)
			            {
			                return true;
			            }

			            // host memory is only accepted for the CPU device kinds
			            return attributes.type == ApiInterface::memoryTypeHost
			                   && (deviceKind == deviceKind::cpu || deviceKind == deviceKind::numaCpu);
			        }
			    };
			} // namespace alpaka::onHost::internal

			#endif
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/Device.hpp ==
			// ============================================================================

		// #include "alpaka/api/cuda/IdxLayer.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/Platform.hpp ==
			// ==
			/* Copyright 2024 René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */


			// #pragma once
			// #include "alpaka/api/cuda/Api.hpp"    // amalgamate: file already inlined
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/Platform.hpp ==
				// ==
				/* Copyright 2024 René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */


				// #pragma once
				// #include "alpaka/api/unifiedCudaHip/Device.hpp"    // amalgamate: file already inlined
				// #include "alpaka/api/unifiedCudaHip/Platform.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/UniformCudaHip.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
				// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/Handle.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined

				#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP


				// #    include <memory>    // amalgamate: file already included
				// #    include <mutex>    // amalgamate: file already included
				// #    include <sstream>    // amalgamate: file already included
				// #    include <vector>    // amalgamate: file already included

				namespace alpaka::onHost
				{
				    namespace unifiedCudaHip
				    {
				        /** Platform implementation shared by the CUDA and HIP backends.
				         *
				         * The platform keeps one weak reference per device index. `makeDevice()` hands out the already
				         * existing device object for an index as long as it is still alive and creates a new one otherwise.
				         * Every access to the device cache is serialized with `deviceGuard`.
				         *
				         * The object must be owned by a `std::shared_ptr`, because devices keep a shared handle to their
				         * platform (obtained through `shared_from_this()`).
				         *
				         * @tparam T_ApiInterface runtime API wrapper; must provide `Error_t`, `success` and `getDeviceCount`
				         * @tparam T_DeviceKind device kind served by this platform
				         */
				        template<typename T_ApiInterface, alpaka::concepts::DeviceKind T_DeviceKind>
				        struct Platform : std::enable_shared_from_this<Platform<T_ApiInterface, T_DeviceKind>>
				        {
				            using ApiInterface = T_ApiInterface;

				        public:
				            Platform() = default;

				            // The platform is neither copyable nor movable.
				            Platform(Platform const&) = delete;
				            Platform& operator=(Platform const&) = delete;

				            Platform(Platform&&) = delete;
				            Platform& operator=(Platform&&) = delete;

				        private:
				            // Never called; only hosts the concept check, which needs the complete type.
				            void _()
				            {
				                static_assert(internal::concepts::Platform<Platform>);
				            }

				            /// Cache of created devices, indexed by device index. Guarded by `deviceGuard`.
				            std::vector<std::weak_ptr<unifiedCudaHip::Device<Platform>>> devices;
				            /// Protects every read and write of `devices`.
				            std::mutex deviceGuard;

				            std::shared_ptr<Platform> getSharedPtr()
				            {
				                return this->shared_from_this();
				            }

				            friend struct alpaka::internal::GetName;

				            std::string getName() const
				            {
				                return "unifiedCudaHip::Platform";
				            }

				            friend struct onHost::internal::GetDeviceCount;

				            /** Queries the number of devices from the runtime and grows the device cache accordingly.
				             *
				             * @return number of devices; 0 if the device kind is not supported by the API of this
				             *         platform or if the runtime reports an error
				             */
				            uint32_t getDeviceCount()
				            {
				                ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
				                constexpr bool isSupportedDev = trait::IsDeviceSupportedBy::
				                    Op<T_DeviceKind, ALPAKA_TYPEOF(getApi(std::declval<Platform>()))>::value;
				                if constexpr(isSupportedDev)
				                {
				                    int numDevices{0};
				                    typename ApiInterface::Error_t error = ApiInterface::getDeviceCount(&numDevices);
				                    // treat a failed query (or a nonsensical negative count) as "no devices"
				                    if(error != ApiInterface::success || numDevices < 0)
				                        numDevices = 0;

				                    auto const requiredSize = static_cast<size_t>(numDevices);
				                    {
				                        /* The lock must be held before the size is inspected: another thread can resize
				                         * or assign elements of `devices` at the same time, and an unsynchronized read
				                         * would be a data race.
				                         */
				                        std::lock_guard<std::mutex> lk{deviceGuard};
				                        if(devices.size() < requiredSize)
				                            devices.resize(requiredSize);
				                    }
				                    return static_cast<uint32_t>(numDevices);
				                }

				                return 0;
				            }

				            friend struct onHost::internal::MakeDevice;

				            /** Returns a handle to the device with the given index.
				             *
				             * An already existing, still alive device object is reused; otherwise a new device is created
				             * and remembered in the cache.
				             *
				             * @param idx index of the device, must be smaller than the device count
				             * @throws std::runtime_error if `idx` is not smaller than the number of available devices
				             */
				            Handle<unifiedCudaHip::Device<Platform>> makeDevice(uint32_t const& idx)
				            {
				                ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
				                // must be called before `deviceGuard` is taken, getDeviceCount() locks it itself
				                uint32_t const numDevices = getDeviceCount();
				                if(idx >= numDevices)
				                {
				                    std::stringstream ssErr;
				                    ssErr << "Unable to return device handle for GPU (" << T_DeviceKind{}.getName()
				                          << ") device with index " << idx << " because there are only " << numDevices << " devices!";
				                    throw std::runtime_error(ssErr.str());
				                }
				                std::lock_guard<std::mutex> lk{deviceGuard};

				                if(auto sharedPtr = devices[idx].lock())
				                {
				                    return sharedPtr;
				                }
				                auto thisHandle = getSharedPtr();
				                auto newDevice = std::make_shared<unifiedCudaHip::Device<Platform>>(std::move(thisHandle), idx);
				                devices[idx] = newDevice;
				                return newDevice;
				            }

				            friend struct internal::GetDeviceProperties;
				        };
				    } // namespace unifiedCudaHip

				    namespace internal
				    {
				        /** Collects the properties of the device with index `deviceIdx` from the CUDA/HIP runtime.
				         *
				         * All values are taken from the device property structure returned by
				         * `ApiInterface::getDeviceProperties` for the requested index, so the result does not depend on
				         * which device is currently active in the calling thread.
				         *
				         * `fnMaxThreadsPerBlock` and `fnMaxBlocksPerGrid` fill a caller-provided array with `numDims`
				         * entries:
				         *  - for up to 3 dimensions the native per-dimension limits are written in reversed order
				         *    (native index `d` goes to `data[numDims - 1 - d]`),
				         *  - for more than 3 dimensions every entry receives the total limit.
				         */
				        template<typename T_ApiInterface, alpaka::concepts::DeviceKind T_DeviceKind>
				        struct GetDeviceProperties::Op<unifiedCudaHip::Platform<T_ApiInterface, T_DeviceKind>>
				        {
				            DeviceProperties operator()(
				                unifiedCudaHip::Platform<T_ApiInterface, T_DeviceKind> const&,
				                uint32_t deviceIdx) const
				            {
				                ALPAKA_LOG_FUNCTION(alpaka::onHost::logger::device);
				                using ApiInterface = typename unifiedCudaHip::Platform<T_ApiInterface, T_DeviceKind>::ApiInterface;
				                typename ApiInterface::DeviceProp_t devProp;
				                ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(ApiInterface, ApiInterface::getDeviceProperties(&devProp, deviceIdx));

				                auto prop = DeviceProperties{};
				                prop.name = devProp.name;
				                prop.warpSize = devProp.warpSize;
				                prop.multiProcessorCount = devProp.multiProcessorCount;
				                /* The capacity is read from the properties of `deviceIdx`. A memGetInfo() query would report
				                 * the values of the currently active device, which is not necessarily `deviceIdx`.
				                 */
				                prop.globalMemCapacityBytes = devProp.totalGlobalMem;
				                prop.sharedMemPerBlockBytes = devProp.sharedMemPerBlock;

				                prop.maxThreadsPerBlock = devProp.maxThreadsPerBlock;
				                // will be copied into the lambda and follows cuda index order
				                Vec<uint32_t, 3u> cudaMaxThreadsPerBlock{
				                    devProp.maxThreadsDim[0u],
				                    devProp.maxThreadsDim[1u],
				                    devProp.maxThreadsDim[2u]};
				                prop.fnMaxThreadsPerBlock = [maxThreadsPerBlock = prop.maxThreadsPerBlock,
				                                             cudaMaxThreadsPerBlock](uint32_t* data, uint32_t numDims)
				                {
				                    if(numDims <= 3u)
				                    {
				                        // reverse the native index order while copying
				                        for(uint32_t d = 0u; d < numDims; ++d)
				                            data[numDims - 1u - d] = cudaMaxThreadsPerBlock[d];
				                    }
				                    else
				                    {
				                        /* For more than 3 dimensions alpaka is linearizing to one dimension, therefore we use the
				                         * maximum for each dimension. */
				                        for(uint32_t d = 0u; d < numDims; ++d)
				                            data[d] = maxThreadsPerBlock;
				                    }
				                };

				                prop.maxBlocksPerGrid = std::numeric_limits<uint32_t>::max();
				                // will be copied into the lambda and follows cuda index order
				                Vec<uint32_t, 3u> cudaMaxBlocksPerGrid{
				                    devProp.maxGridSize[0u],
				                    devProp.maxGridSize[1u],
				                    devProp.maxGridSize[2u]};
				                prop.fnMaxBlocksPerGrid =
				                    [maxBlocksPerGrid = prop.maxBlocksPerGrid, cudaMaxBlocksPerGrid](uint32_t* data, uint32_t numDims)
				                {
				                    if(numDims <= 3u)
				                    {
				                        // reverse the native index order while copying
				                        for(uint32_t d = 0u; d < numDims; ++d)
				                            data[numDims - 1u - d] = cudaMaxBlocksPerGrid[d];
				                    }
				                    else
				                    {
				                        /* For more than 3 dimensions alpaka is linearizing to one dimension, therefore we use the
				                         * maximum for each dimension. */
				                        for(uint32_t d = 0u; d < numDims; ++d)
				                            data[d] = maxBlocksPerGrid;
				                    }
				                };

				                return prop;
				            }
				        };
				    } // namespace internal
				} // namespace alpaka::onHost
				#endif
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/Platform.hpp ==
				// ============================================================================

			// #include "alpaka/core/UniformCudaHip.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined

			#if ALPAKA_LANG_CUDA

			// #    include "alpaka/core/ApiCudaRt.hpp"    // amalgamate: file already inlined

			namespace alpaka::onHost
			{
			    namespace internal
			    {
			        /** Creates the platform object for the CUDA API and the given device kind.
			         *
			         * The platform type is `unifiedCudaHip::Platform<ApiCudaRt, T_DeviceKind>`; the object is obtained
			         * through `onHost::make_sharedSingleton`.
			         */
			        template<alpaka::concepts::DeviceKind T_DeviceKind>
			        struct MakePlatform::Op<api::Cuda, T_DeviceKind>
			        {
			            auto operator()(api::Cuda, T_DeviceKind) const
			            {
			                using CudaPlatform = unifiedCudaHip::Platform<ApiCudaRt, T_DeviceKind>;
			                return onHost::make_sharedSingleton<CudaPlatform>();
			            }
			        };
			    } // namespace internal
			} // namespace alpaka::onHost

			namespace alpaka::internal
			{
			    /** Reports `api::Cuda` as the API of a platform that is built on `ApiCudaRt`.
			     *
			     * The platform argument is not inspected; the result only depends on the type.
			     */
			    template<alpaka::concepts::DeviceKind T_DeviceKind>
			    struct GetApi::Op<onHost::unifiedCudaHip::Platform<ApiCudaRt, T_DeviceKind>>
			    {
			        constexpr auto operator()([[maybe_unused]] auto&& platform) const
			        {
			            return api::Cuda{};
			        }
			    };

			    /** Reports the device kind of a platform that is built on `ApiCudaRt`.
			     *
			     * The platform argument is not inspected; a default constructed `T_DeviceKind` is returned.
			     */
			    template<alpaka::concepts::DeviceKind T_DeviceKind>
			    struct GetDeviceType::Op<onHost::unifiedCudaHip::Platform<ApiCudaRt, T_DeviceKind>>
			    {
			        decltype(auto) operator()([[maybe_unused]] auto&& platform) const
			        {
			            return T_DeviceKind{};
			        }
			    };
			} // namespace alpaka::internal
			#endif
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/Platform.hpp ==
			// ============================================================================

		// #include "alpaka/api/cuda/executor.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/memFence.hpp ==
			// ==
			/* Copyright 2025 Mehmet Yusufoglu, René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/memoryOrder.hpp ==
				// ==
				/* Copyright 2025 Mehmet Yusufoglu, René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onAcc/memoryOrder.hpp"    // amalgamate: file already inlined

				#include <type_traits>

				#if ALPAKA_LANG_CUDA

				namespace alpaka::onAcc::internalCompute
				{
				    /** Translates alpaka memory order tags into the corresponding `::cuda::memory_order_*` values.
				     *
				     * The translation is only available if `ALPAKA_CUDA_ATOMIC` is defined. Without it, or for a tag that
				     * matches none of the listed orders, `get()` returns `void`.
				     */
				    struct MemOrderCuda
				    {
				        template<concepts::MemoryOrder TMemOrder>
				        static constexpr auto get(TMemOrder)
				        {
				#    ifdef ALPAKA_CUDA_ATOMIC
				            if constexpr(std::same_as<TMemOrder, order::SeqCst>)
				                return ::cuda::memory_order_seq_cst;
				            else if constexpr(std::same_as<TMemOrder, order::AcqRel>)
				                return ::cuda::memory_order_acq_rel;
				            else if constexpr(std::same_as<TMemOrder, order::Release>)
				                return ::cuda::memory_order_release;
				            else if constexpr(std::same_as<TMemOrder, order::Acquire>)
				                return ::cuda::memory_order_acquire;
				            else if constexpr(std::same_as<TMemOrder, order::Relaxed>)
				                return ::cuda::memory_order_relaxed;
				#    endif
				        }
				    };
				} // namespace alpaka::onAcc::internalCompute

				#endif // ALPAKA_LANG_CUDA
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/memoryOrder.hpp ==
				// ============================================================================

			// #include "alpaka/api/unifiedCudaHip/tag.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onAcc/memoryOrder.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onAcc/scope.hpp"    // amalgamate: file already inlined

			#include <type_traits>

			#if ALPAKA_LANG_CUDA

			namespace alpaka::onAcc::internalCompute
			{

			    namespace detail
			    {
			        /** Emits a PTX `fence` instruction with system (`.sys`) scope for the given memory order.
			         *
			         * - `order::Relaxed` never emits an instruction.
			         * - For `ALPAKA_ARCH_PTX >= 9.0.0` the acquire, release, acq_rel and sc variants are used.
			         * - For `ALPAKA_ARCH_PTX >= 7.0.0` only acq_rel and sc are used; acquire and release are mapped to
			         *   acq_rel.
			         * - For older or non PTX targets the function is empty.
			         *
			         * Every asm statement carries a "memory" clobber. `asm volatile` alone does not stop the compiler
			         * from moving ordinary loads and stores across the statement, which would defeat the fence.
			         */
			        template<concepts::MemoryOrder TMemOrder>
			        [[maybe_unused]] static constexpr __device__ void cuda_ptx_fence_system([[maybe_unused]] TMemOrder const)
			        {
			#    if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(9, 0, 0)
			            // full acquire/release semantics support
			            if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
			            { // Relaxed ordering requires no fence
			            }
			            else if constexpr(std::is_same_v<TMemOrder, order::Acquire>)
			            {
			                asm volatile("fence.acquire.sys;" ::: "memory");
			            }
			            else if constexpr(std::is_same_v<TMemOrder, order::Release>)
			            {
			                asm volatile("fence.release.sys;" ::: "memory");
			            }
			            else if constexpr(std::is_same_v<TMemOrder, order::AcqRel>)
			            {
			                asm volatile("fence.acq_rel.sys;" ::: "memory");
			            }
			            else
			            { // Sequential consistency
			                asm volatile("fence.sc.sys;" ::: "memory");
			            }
			#    elif ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
			            // only acq_rel and sc available
			            if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
			            { // Relaxed ordering requires no fence
			            }
			            else if constexpr(
			                std::is_same_v<TMemOrder, order::Acquire> || std::is_same_v<TMemOrder, order::Release>
			                || std::is_same_v<TMemOrder, order::AcqRel>)
			            { // acquire and release are strengthened to acq_rel
			                asm volatile("fence.acq_rel.sys;" ::: "memory");
			            }
			            else
			            {
			                // Sequential consistency
			                asm volatile("fence.sc.sys;" ::: "memory");
			            }
			#    endif
			        }

			        /** Emits a PTX `fence` instruction with device (`.gpu`) scope for the given memory order.
			         *
			         * - `order::Relaxed` never emits an instruction.
			         * - For `ALPAKA_ARCH_PTX >= 9.0.0` the acquire, release, acq_rel and sc variants are used.
			         * - For `ALPAKA_ARCH_PTX >= 7.0.0` only acq_rel and sc are used; acquire and release are mapped to
			         *   acq_rel.
			         * - For older or non PTX targets the function is empty.
			         *
			         * Every asm statement carries a "memory" clobber. `asm volatile` alone does not stop the compiler
			         * from moving ordinary loads and stores across the statement, which would defeat the fence.
			         */
			        template<concepts::MemoryOrder TMemOrder>
			        [[maybe_unused]] static constexpr __device__ void cuda_ptx_fence_device([[maybe_unused]] TMemOrder const)
			        {
			#    if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(9, 0, 0)
			            // full acquire/release semantics support
			            if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
			            { // Relaxed ordering requires no fence
			            }
			            else if constexpr(std::is_same_v<TMemOrder, order::Acquire>)
			            {
			                asm volatile("fence.acquire.gpu;" ::: "memory");
			            }
			            else if constexpr(std::is_same_v<TMemOrder, order::Release>)
			            {
			                asm volatile("fence.release.gpu;" ::: "memory");
			            }
			            else if constexpr(std::is_same_v<TMemOrder, order::AcqRel>)
			            {
			                asm volatile("fence.acq_rel.gpu;" ::: "memory");
			            }
			            else
			            { // Sequential consistency
			                asm volatile("fence.sc.gpu;" ::: "memory");
			            }
			#    elif ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
			            // only acq_rel and sc available
			            if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
			            { // Relaxed ordering requires no fence
			            }
			            else if constexpr(
			                std::is_same_v<TMemOrder, order::Acquire> || std::is_same_v<TMemOrder, order::Release>
			                || std::is_same_v<TMemOrder, order::AcqRel>)
			            { // acquire and release are strengthened to acq_rel
			                asm volatile("fence.acq_rel.gpu;" ::: "memory");
			            }
			            else
			            {
			                // Sequential consistency
			                asm volatile("fence.sc.gpu;" ::: "memory");
			            }
			#    endif
			        }

			        /** Emits a PTX `fence` instruction with block (`.cta`) scope for the given memory order.
			         *
			         * - `order::Relaxed` never emits an instruction.
			         * - For `ALPAKA_ARCH_PTX >= 9.0.0` the acquire, release, acq_rel and sc variants are used.
			         * - For `ALPAKA_ARCH_PTX >= 7.0.0` only acq_rel and sc are used; acquire and release are mapped to
			         *   acq_rel.
			         * - For older or non PTX targets the function is empty.
			         *
			         * Every asm statement carries a "memory" clobber. `asm volatile` alone does not stop the compiler
			         * from moving ordinary loads and stores across the statement, which would defeat the fence.
			         */
			        template<concepts::MemoryOrder TMemOrder>
			        [[maybe_unused]] static constexpr __device__ void cuda_ptx_fence_block([[maybe_unused]] TMemOrder const)
			        {
			#    if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(9, 0, 0)
			            // full acquire/release semantics support
			            if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
			            { // Relaxed ordering requires no fence
			            }
			            else if constexpr(std::is_same_v<TMemOrder, order::Acquire>)
			            {
			                asm volatile("fence.acquire.cta;" ::: "memory");
			            }
			            else if constexpr(std::is_same_v<TMemOrder, order::Release>)
			            {
			                asm volatile("fence.release.cta;" ::: "memory");
			            }
			            else if constexpr(std::is_same_v<TMemOrder, order::AcqRel>)
			            {
			                asm volatile("fence.acq_rel.cta;" ::: "memory");
			            }
			            else
			            { // Sequential consistency
			                asm volatile("fence.sc.cta;" ::: "memory");
			            }
			#    elif ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
			            // only acq_rel and sc available
			            if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
			            { // Relaxed ordering requires no fence
			            }
			            else if constexpr(
			                std::is_same_v<TMemOrder, order::Acquire> || std::is_same_v<TMemOrder, order::Release>
			                || std::is_same_v<TMemOrder, order::AcqRel>)
			            { // acquire and release are strengthened to acq_rel
			                asm volatile("fence.acq_rel.cta;" ::: "memory");
			            }
			            else
			            { // Sequential consistency
			                asm volatile("fence.sc.cta;" ::: "memory");
			            }
			#    endif
			        }

			        template<concepts::MemoryOrder TMemOrder>
			        [[maybe_unused]] static constexpr __device__ void cuda_mem_fence_block([[maybe_unused]] TMemOrder const order)
			        {
			            if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
			            { // Relaxed ordering requires no fence
			                return;
			            }
			#    ifdef ALPAKA_CUDA_ATOMIC
			            ::cuda::atomic_thread_fence(MemOrderCuda::get(order), ::cuda::thread_scope_block);
			#    else
			#        if ALPAKA_ARCH_PTX
			#            if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
			            cuda_ptx_fence_block(order);
			#            else
			            __threadfence_block();
			#            endif
			#        endif
			#    endif
			        }

			        template<concepts::MemoryOrder TMemOrder>
			        [[maybe_unused]] static constexpr __device__ void cuda_mem_fence_device([[maybe_unused]] TMemOrder const order)
			        {
			            if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
			            { // Relaxed ordering requires no fence
			                return;
			            }
			#    ifdef ALPAKA_CUDA_ATOMIC
			            ::cuda::atomic_thread_fence(MemOrderCuda::get(order), ::cuda::thread_scope_device);
			#    else
			#        if ALPAKA_ARCH_PTX
			#            if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
			            cuda_ptx_fence_device(order);
			#            else
			            __threadfence();
			#            endif
			#        endif
			#    endif
			        }

			        template<concepts::MemoryOrder TMemOrder>
			        [[maybe_unused]] static constexpr __device__ void cuda_mem_fence_system([[maybe_unused]] TMemOrder const order)
			        {
			            if constexpr(std::is_same_v<TMemOrder, order::Relaxed>)
			            { // Relaxed ordering requires no fence
			                return;
			            }
			#    ifdef ALPAKA_CUDA_ATOMIC
			            ::cuda::atomic_thread_fence(MemOrderCuda::get(order), ::cuda::thread_scope_system);
			#    else
			#        if ALPAKA_ARCH_PTX
			#            if ALPAKA_ARCH_PTX >= ALPAKA_VERSION_NUMBER(7, 0, 0)
			            cuda_ptx_fence_system(order);
			#            else
			            __threadfence_system();
			#            endif
			#        endif
			#    endif
			        }

			    } // namespace detail

			    /** Specializations should not have to be enabled for backend combinations without CUDA
			     * Removing this top level guard will not cause issues if intrinsics like __threadfence_block are protected
			     * inside the specialization.
			     */
			    /** Memory fence for the CUDA API.
			     *
			     * Forwards to the block, device or system fence helper depending on the scope type. The body is only
			     * compiled when `ALPAKA_ARCH_PTX` is non-zero; otherwise the call operator is empty. Scope types other
			     * than `scope::Block`, `scope::Device` and `scope::System` result in no fence.
			     */
			    template<typename T_Api, typename T_Scope, concepts::MemoryOrder T_Order>
			    requires std::same_as<T_Api, api::Cuda>
			    struct MemoryFence::Op<T_Api, T_Scope, T_Order>
			    {
			        ALPAKA_FN_ACC constexpr void operator()(onAcc::concepts::Acc auto const&, T_Scope const, T_Order const order)
			            const
			        {
			            // Host pass is not allowed.
			#    if ALPAKA_ARCH_PTX
			            // the scope types are distinct, therefore at most one of the branches is compiled
			            if constexpr(std::same_as<T_Scope, scope::Block>)
			                detail::cuda_mem_fence_block(order);
			            if constexpr(std::same_as<T_Scope, scope::Device>)
			                detail::cuda_mem_fence_device(order);
			            if constexpr(std::same_as<T_Scope, scope::System>)
			                detail::cuda_mem_fence_system(order);
			#    endif
			        }
			    };
			} // namespace alpaka::onAcc::internalCompute

			#endif // ALPAKA_LANG_CUDA
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/memFence.hpp ==
			// ============================================================================

		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda.hpp ==
		// ============================================================================

		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip.hpp ==
		// ==
		/* Copyright 2024 René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/api/hip/Api.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/Device.hpp ==
			// ==
			/* Copyright 2024 René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			// #include "alpaka/api/unifiedCudaHip/Device.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onHost/trait.hpp"    // amalgamate: file already inlined

			#if ALPAKA_LANG_HIP
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/ApiHipRt.hpp ==
				// ==
				/* Copyright 2022 Andrea Bocci
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined

				#if ALPAKA_LANG_HIP

				#    include <hip/hip_runtime_api.h>
				#    include <hip/hip_version.h>

				namespace alpaka
				{
				    /** Static facade over the HIP runtime API.
				     *
				     * Every type alias, constant and function of this struct forwards to the HIP entity of the corresponding
				     * name (`::hip*`). The struct carries no state; all members are static.
				     * A few functions add version dependent behaviour (see deviceGetLimit, launchHostFunc, mallocAsync) or
				     * guard against arguments the wrapped call is documented here to reject (see freeAsync).
				     */
				    struct ApiHipRt
				    {
				        // Names
				        static constexpr char name[] = "Hip";
				        static constexpr auto version = ALPAKA_VERSION_NUMBER(HIP_VERSION_MAJOR, HIP_VERSION_MINOR, 0);

				        // Types
				        using DeviceAttr_t = ::hipDeviceAttribute_t;
				        using PointerAttr_t = ::hipPointerAttribute_t;
				        using Memory_t = ::hipMemoryType;
				        using DeviceProp_t = ::hipDeviceProp_t;
				        using Error_t = ::hipError_t;
				        using Event_t = ::hipEvent_t;
				        using Extent_t = ::hipExtent;
				        using Flag_t = unsigned int;
				        using FuncAttributes_t = ::hipFuncAttributes;
				        using HostFn_t = void (*)(void* data); // same as hipHostFn_t
				        using Limit_t = ::hipLimit_t;
				        using Memcpy3DParms_t = ::hipMemcpy3DParms;
				        using MemcpyKind_t = ::hipMemcpyKind;
				        using PitchedPtr_t = ::hipPitchedPtr;
				        using Pos_t = ::hipPos;
				        using Stream_t = ::hipStream_t;

				        // Constants
				        static constexpr Error_t success = ::hipSuccess;
				        static constexpr Error_t errorNotReady = ::hipErrorNotReady;
				        static constexpr Error_t errorHostMemoryAlreadyRegistered = ::hipErrorHostMemoryAlreadyRegistered;
				        static constexpr Error_t errorHostMemoryNotRegistered = ::hipErrorHostMemoryNotRegistered;
				        static constexpr Error_t errorUnsupportedLimit = ::hipErrorUnsupportedLimit;
				        static constexpr Error_t errorUnknown = ::hipErrorUnknown;

				        static constexpr Flag_t eventDefault = hipEventDefault;
				        static constexpr Flag_t eventBlockingSync = hipEventBlockingSync;
				        static constexpr Flag_t eventDisableTiming = hipEventDisableTiming;
				        static constexpr Flag_t eventInterprocess = hipEventInterprocess;

				        static constexpr Flag_t hostMallocDefault = hipHostMallocDefault;
				        static constexpr Flag_t hostMallocMapped = hipHostMallocMapped;
				        static constexpr Flag_t hostMallocPortable = hipHostMallocPortable;
				        static constexpr Flag_t hostMallocWriteCombined = hipHostMallocWriteCombined;
				        static constexpr Flag_t hostMallocCoherent = hipHostMallocCoherent;
				        static constexpr Flag_t hostMallocNonCoherent = hipHostMallocNonCoherent;

				        static constexpr Flag_t hostRegisterDefault = hipHostRegisterDefault;
				        static constexpr Flag_t hostRegisterPortable = hipHostRegisterPortable;
				        static constexpr Flag_t hostRegisterMapped = hipHostRegisterMapped;
				        static constexpr Flag_t hostRegisterIoMemory = hipHostRegisterIoMemory;

				        static constexpr MemcpyKind_t memcpyDefault = ::hipMemcpyDefault;
				        static constexpr MemcpyKind_t memcpyDeviceToDevice = ::hipMemcpyDeviceToDevice;
				        static constexpr MemcpyKind_t memcpyDeviceToHost = ::hipMemcpyDeviceToHost;
				        static constexpr MemcpyKind_t memcpyHostToDevice = ::hipMemcpyHostToDevice;
				        static constexpr MemcpyKind_t memcpyHostToHost = ::hipMemcpyHostToHost;

				        static constexpr Flag_t streamDefault = hipStreamDefault;
				        static constexpr Flag_t streamNonBlocking = hipStreamNonBlocking;

				        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimX = ::hipDeviceAttributeMaxBlockDimX;
				        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimY = ::hipDeviceAttributeMaxBlockDimY;
				        static constexpr DeviceAttr_t deviceAttributeMaxBlockDimZ = ::hipDeviceAttributeMaxBlockDimZ;
				        static constexpr DeviceAttr_t deviceAttributeMaxGridDimX = ::hipDeviceAttributeMaxGridDimX;
				        static constexpr DeviceAttr_t deviceAttributeMaxGridDimY = ::hipDeviceAttributeMaxGridDimY;
				        static constexpr DeviceAttr_t deviceAttributeMaxGridDimZ = ::hipDeviceAttributeMaxGridDimZ;
				        static constexpr DeviceAttr_t deviceAttributeMaxSharedMemoryPerBlock
				            = ::hipDeviceAttributeMaxSharedMemoryPerBlock;
				        static constexpr DeviceAttr_t deviceAttributeMaxThreadsPerBlock = ::hipDeviceAttributeMaxThreadsPerBlock;
				        static constexpr DeviceAttr_t deviceAttributeMultiprocessorCount = ::hipDeviceAttributeMultiprocessorCount;
				        static constexpr DeviceAttr_t deviceAttributeWarpSize = ::hipDeviceAttributeWarpSize;

				        static constexpr Memory_t memoryTypeUnregistered = ::hipMemoryTypeUnregistered;
				        static constexpr Memory_t memoryTypeHost = ::hipMemoryTypeHost;
				        static constexpr Memory_t memoryTypeDevice = ::hipMemoryTypeDevice;
				        static constexpr Memory_t memoryTypeManaged = ::hipMemoryTypeManaged;

				#    if HIP_VERSION >= 40'500'000
				        static constexpr Limit_t limitPrintfFifoSize = ::hipLimitPrintfFifoSize;
				#    else
				        static constexpr Limit_t limitPrintfFifoSize
				            = static_cast<Limit_t>(0x01); // Implemented only in ROCm 4.5.0 and later.
				#    endif
				        static constexpr Limit_t limitMallocHeapSize = ::hipLimitMallocHeapSize;

				        // Host function helper
				        // Encapsulates the different function signatures used by hipStreamAddCallback and hipLaunchHostFunc, and
				        // the different calling conventions used by CUDA (__stdcall on Win32) and HIP (standard).
				        //
				        // Instances are heap allocated by launchHostFunc; the callback that runs deletes its instance after
				        // the user function returned.
				        struct HostFnAdaptor
				        {
				            HostFn_t func_;
				            void* data_;

				            //! Callback with the hipLaunchHostFunc signature: runs the user function, then frees the adaptor.
				            static void hostFunction(void* data)
				            {
				                auto* const adaptor = static_cast<HostFnAdaptor*>(data);
				                adaptor->func_(adaptor->data_);
				                delete adaptor;
				            }

				            //! Callback with the hipStreamAddCallback signature; stream and status are ignored.
				            static void streamCallback(Stream_t, Error_t, void* data)
				            {
				                hostFunction(data);
				            }
				        };

				        // Runtime API
				        static inline Error_t deviceGetAttribute(int* value, DeviceAttr_t attr, int device)
				        {
				            return ::hipDeviceGetAttribute(value, attr, device);
				        }

				        static inline Error_t pointerGetAttributes(PointerAttr_t* attr, void const* ptr)
				        {
				            return ::hipPointerGetAttributes(attr, ptr);
				        }

				        //! Forwards to hipDeviceGetLimit; before ROCm 4.5.0 the printf fifo limit is reported as unsupported.
				        static inline Error_t deviceGetLimit(size_t* pValue, Limit_t limit)
				        {
				#    if HIP_VERSION < 40'500'000
				            if(limit == limitPrintfFifoSize)
				            {
				                // Implemented only in ROCm 4.5.0 and later.
				                return errorUnsupportedLimit;
				            }
				#    endif
				            return ::hipDeviceGetLimit(pValue, limit);
				        }

				        static inline Error_t deviceReset()
				        {
				            return ::hipDeviceReset();
				        }

				        //! Always returns errorUnsupportedLimit; no HIP call is made.
				        static inline Error_t deviceSetLimit(Limit_t /* limit */, size_t /* value */)
				        {
				            // Not implemented.
				            return errorUnsupportedLimit;
				        }

				        static inline Error_t deviceSynchronize()
				        {
				            return ::hipDeviceSynchronize();
				        }

				        static inline Error_t eventCreate(Event_t* event)
				        {
				            return ::hipEventCreate(event);
				        }

				        static inline Error_t eventCreateWithFlags(Event_t* event, Flag_t flags)
				        {
				            return ::hipEventCreateWithFlags(event, flags);
				        }

				        static inline Error_t eventDestroy(Event_t event)
				        {
				            return ::hipEventDestroy(event);
				        }

				        static inline Error_t eventQuery(Event_t event)
				        {
				            return ::hipEventQuery(event);
				        }

				        static inline Error_t eventRecord(Event_t event, Stream_t stream)
				        {
				            return ::hipEventRecord(event, stream);
				        }

				        static inline Error_t eventSynchronize(Event_t event)
				        {
				            return ::hipEventSynchronize(event);
				        }

				        static inline Error_t free(void* devPtr)
				        {
				            return ::hipFree(devPtr);
				        }

				        //! Forwards to hipFreeAsync; a null pointer is accepted and reported as success without a HIP call.
				        static inline Error_t freeAsync(void* devPtr, Stream_t stream)
				        {
				            // hipFreeAsync fails on a null pointer deallocation
				            if(devPtr)
				            {
				                return ::hipFreeAsync(devPtr, stream);
				            }
				            else
				            {
				                return ::hipSuccess;
				            }
				        }

				        static inline Error_t funcGetAttributes(FuncAttributes_t* attr, void const* func)
				        {
				            return ::hipFuncGetAttributes(attr, func);
				        }

				        //! Overload for typed (function) pointers; the pointer is reinterpreted as `void const*`.
				        template<typename T>
				        static inline Error_t funcGetAttributes(FuncAttributes_t* attr, T* func)
				        {
				            // #    if ALPAKA_COMP_GNUC
				            // #        pragma GCC diagnostic push
				            // #        pragma GCC diagnostic ignored "-Wconditionally-supported"
				            // #    endif
				            return ::hipFuncGetAttributes(attr, reinterpret_cast<void const*>(func));
				            // #    if ALPAKA_COMP_GNUC
				            // #        pragma GCC diagnostic pop
				            // #    endif
				        }

				        static inline Error_t getDeviceCount(int* count)
				        {
				            return ::hipGetDeviceCount(count);
				        }

				        static inline Error_t getDeviceProperties(DeviceProp_t* prop, int device)
				        {
				            return ::hipGetDeviceProperties(prop, device);
				        }

				        static inline char const* getErrorName(Error_t error)
				        {
				            return ::hipGetErrorName(error);
				        }

				        static inline char const* getErrorString(Error_t error)
				        {
				            return ::hipGetErrorString(error);
				        }

				        static inline Error_t getLastError()
				        {
				            return ::hipGetLastError();
				        }

				        static inline Error_t getSymbolAddress(void** devPtr, void const* symbol)
				        {
				            return ::hipGetSymbolAddress(devPtr, symbol);
				        }

				        template<class T>
				        static inline Error_t getSymbolAddress(void** devPtr, T const& symbol)
				        {
				            return ::hipGetSymbolAddress(devPtr, symbol);
				        }

				        static inline Error_t hostGetDevicePointer(void** pDevice, void* pHost, Flag_t flags)
				        {
				            return ::hipHostGetDevicePointer(pDevice, pHost, flags);
				        }

				        static inline Error_t hostFree(void* ptr)
				        {
				            return ::hipHostFree(ptr);
				        }

				        static inline Error_t hostMalloc(void** ptr, size_t size, Flag_t flags)
				        {
				            return ::hipHostMalloc(ptr, size, flags);
				        }

				        static inline Error_t hostRegister(void* ptr, size_t size, Flag_t flags)
				        {
				            return ::hipHostRegister(ptr, size, flags);
				        }

				        static inline Error_t hostUnregister(void* ptr)
				        {
				            return ::hipHostUnregister(ptr);
				        }

				        /** Enqueue the host function `fn(userData)` into `stream`.
				         *
				         * The function and its argument are wrapped into a heap allocated HostFnAdaptor which is released by
				         * the callback after `fn` returned. If the enqueue call reports an error the adaptor is released
				         * here instead.
				         *
				         * @return the error code of hipLaunchHostFunc (ROCm >= 5.4.0) or hipStreamAddCallback (older).
				         */
				        static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData)
				        {
				            auto* const adaptor = new HostFnAdaptor{fn, userData};
				            // hipLaunchHostFunc is implemented only in ROCm 5.4.0 and later.
				#    if HIP_VERSION >= 50'400'000
				            // Wrap the host function using the proper calling convention.
				            Error_t const error = ::hipLaunchHostFunc(stream, HostFnAdaptor::hostFunction, adaptor);
				#    else
				            // Emulate hipLaunchHostFunc using hipStreamAddCallback with a callback adaptor.
				            Error_t const error = ::hipStreamAddCallback(stream, HostFnAdaptor::streamCallback, adaptor, 0);
				#    endif
				            if(error != success)
				            {
				                // NOTE(review): assumes a failed enqueue never invokes the callback, so nobody else frees
				                // the adaptor — confirm against the HIP documentation.
				                delete adaptor;
				            }
				            return error;
				        }

				        static inline Error_t malloc(void** devPtr, size_t size)
				        {
				            return ::hipMalloc(devPtr, size);
				        }

				        static inline Error_t mallocManaged(void** devPtr, size_t size)
				        {
				            return ::hipMallocManaged(devPtr, size);
				        }

				        static inline Error_t malloc3D(PitchedPtr_t* pitchedDevPtr, Extent_t extent)
				        {
				            return ::hipMalloc3D(pitchedDevPtr, extent);
				        }

				        /** Stream-ordered allocation.
				         *
				         * - ROCm >= 5.6.0: forwards to hipMallocAsync.
				         * - ROCm >= 5.3.0: forwards to hipMallocAsync for `size > 0`; a zero sized request sets `*devPtr` to
				         *   nullptr and reports success.
				         * - older: returns errorUnknown without touching the arguments.
				         */
				        static inline Error_t mallocAsync(
				            [[maybe_unused]] void** devPtr,
				            [[maybe_unused]] size_t size,
				            [[maybe_unused]] Stream_t stream)
				        {
				            // stream-ordered memory operations are fully implemented only in ROCm 5.3.0 and later.
				#    if HIP_VERSION >= 50'600'000
				            return ::hipMallocAsync(devPtr, size, stream);
				#    elif HIP_VERSION >= 50'300'000
				            // before ROCm 5.6.0, hipMallocAsync fails for an allocation of 0 bytes
				            if(size > 0)
				            {
				                return ::hipMallocAsync(devPtr, size, stream);
				            }
				            else
				            {
				                // make sure the pointer can safely be passed to hipFreeAsync
				                *devPtr = nullptr;
				                return ::hipSuccess;
				            }
				#    else
				            // Not implemented.
				            return errorUnknown;
				#    endif
				        }

				        static inline Error_t mallocPitch(void** devPtr, size_t* pitch, size_t width, size_t height)
				        {
				            return ::hipMallocPitch(devPtr, pitch, width, height);
				        }

				        static inline Error_t memGetInfo(size_t* free, size_t* total)
				        {
				            return ::hipMemGetInfo(free, total);
				        }

				        static inline Error_t memcpy(void* dst, void const* src, size_t count, MemcpyKind_t kind)
				        {
				            return ::hipMemcpy(dst, src, count, kind);
				        }

				        static inline Error_t memcpy2DAsync(
				            void* dst,
				            size_t dpitch,
				            void const* src,
				            size_t spitch,
				            size_t width,
				            size_t height,
				            MemcpyKind_t kind,
				            Stream_t stream)
				        {
				            return ::hipMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
				        }

				        static inline Error_t memcpy3DAsync(Memcpy3DParms_t const* p, Stream_t stream)
				        {
				            return ::hipMemcpy3DAsync(p, stream);
				        }

				        static inline Error_t memcpyAsync(void* dst, void const* src, size_t count, MemcpyKind_t kind, Stream_t stream)
				        {
				            return ::hipMemcpyAsync(dst, src, count, kind, stream);
				        }

				        static inline Error_t memset2DAsync(
				            void* devPtr,
				            size_t pitch,
				            int value,
				            size_t width,
				            size_t height,
				            Stream_t stream)
				        {
				            return ::hipMemset2DAsync(devPtr, pitch, value, width, height, stream);
				        }

				        static inline Error_t memset3DAsync(PitchedPtr_t pitchedDevPtr, int value, Extent_t extent, Stream_t stream)
				        {
				            return ::hipMemset3DAsync(pitchedDevPtr, value, extent, stream);
				        }

				        static inline Error_t memsetAsync(void* devPtr, int value, size_t count, Stream_t stream)
				        {
				            return ::hipMemsetAsync(devPtr, value, count, stream);
				        }

				        static inline Error_t setDevice(int device)
				        {
				            return ::hipSetDevice(device);
				        }

				        static inline Error_t streamCreate(Stream_t* pStream)
				        {
				            return ::hipStreamCreate(pStream);
				        }

				        static inline Error_t streamCreateWithFlags(Stream_t* pStream, Flag_t flags)
				        {
				            return ::hipStreamCreateWithFlags(pStream, flags);
				        }

				        static inline Error_t streamDestroy(Stream_t stream)
				        {
				            return ::hipStreamDestroy(stream);
				        }

				        static inline Error_t streamQuery(Stream_t stream)
				        {
				            return ::hipStreamQuery(stream);
				        }

				        static inline Error_t streamSynchronize(Stream_t stream)
				        {
				            return ::hipStreamSynchronize(stream);
				        }

				        static inline Error_t streamWaitEvent(Stream_t stream, Event_t event, Flag_t flags)
				        {
				            return ::hipStreamWaitEvent(stream, event, flags);
				        }

				        static inline PitchedPtr_t makePitchedPtr(void* d, size_t p, size_t xsz, size_t ysz)
				        {
				            return ::make_hipPitchedPtr(d, p, xsz, ysz);
				        }

				        static inline Pos_t makePos(size_t x, size_t y, size_t z)
				        {
				            return ::make_hipPos(x, y, z);
				        }

				        static inline Extent_t makeExtent(size_t w, size_t h, size_t d)
				        {
				            return ::make_hipExtent(w, h, d);
				        }
				    };

				} // namespace alpaka

				#endif
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/core/ApiHipRt.hpp ==
				// ============================================================================


			#    include <type_traits>

			namespace alpaka::onHost::trait
			{
			    //! Marks the `exec::GpuHip` executor as supported by every `unifiedCudaHip::Device<T_Platform>`.
			    template<typename T_Platform>
			    struct IsExecutorSupportedBy::Op<exec::GpuHip, unifiedCudaHip::Device<T_Platform>> : std::true_type
			    {
			    };
			} // namespace alpaka::onHost::trait

			namespace alpaka::onHost::internal
			{
			    /** Decides, for the HIP api, whether the memory behind `view` is accessible from a device of kind
			     * `deviceKind`.
			     *
			     * The pointer attributes of `onHost::data(view)` are queried through ApiHipRt::pointerGetAttributes
			     * (wrapped in ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK). The result is
			     * - true for memory of type managed,
			     * - true for memory of type host if the device kind is cpu or numaCpu,
			     * - false otherwise.
			     */
			    template<alpaka::concepts::DeviceKind T_DeviceKind, typename T_Any>
			    struct IsDataAccessible::SecondPath<api::Hip, T_DeviceKind, T_Any>
			    {
			        bool operator()(api::Hip usedApi, T_DeviceKind deviceKind, T_Any const& view) const
			        {
			            using ApiInterface = ApiHipRt;
			            alpaka::unused(usedApi);

			            ApiInterface::PointerAttr_t ptrAttributes;
			            ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
			                ApiInterface,
			                ApiInterface::pointerGetAttributes(&ptrAttributes, onHost::data(view)));

			            auto const memType = ptrAttributes.type;
			            if(memType == ApiInterface::memoryTypeManaged)
			            {
			                return true;
			            }

			            // host memory is only reported as accessible for the cpu-like device kinds
			            return memType == ApiInterface::memoryTypeHost
			                   && (deviceKind == deviceKind::cpu || deviceKind == deviceKind::numaCpu);
			        }
			    };
			} // namespace alpaka::onHost::internal

			#endif
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/Device.hpp ==
			// ============================================================================

		// #include "alpaka/api/hip/IdxLayer.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/Platform.hpp ==
			// ==
			/* Copyright 2024 René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */


			// #pragma once
			// #include "alpaka/api/hip/Api.hpp"    // amalgamate: file already inlined
			// #include "alpaka/api/unifiedCudaHip/Platform.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/UniformCudaHip.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined

			#if ALPAKA_LANG_HIP

			// #    include "alpaka/core/ApiHipRt.hpp"    // amalgamate: file already inlined

			namespace alpaka::onHost::internal
			{
			    /** Platform factory for the HIP api.
			     *
			     * Returns the result of `onHost::make_sharedSingleton` for
			     * `unifiedCudaHip::Platform<ApiHipRt, T_DeviceKind>`; both call arguments are only used for dispatch.
			     */
			    template<alpaka::concepts::DeviceKind T_DeviceKind>
			    struct MakePlatform::Op<api::Hip, T_DeviceKind>
			    {
			        auto operator()(api::Hip, T_DeviceKind) const
			        {
			            using HipPlatform = unifiedCudaHip::Platform<ApiHipRt, T_DeviceKind>;
			            return onHost::make_sharedSingleton<HipPlatform>();
			        }
			    };
			} // namespace alpaka::onHost::internal

			namespace alpaka::internal
			{
			    //! Reports `api::Hip` as the api of every HIP platform; the platform argument itself is not inspected.
			    template<alpaka::concepts::DeviceKind T_DeviceKind>
			    struct GetApi::Op<onHost::unifiedCudaHip::Platform<ApiHipRt, T_DeviceKind>>
			    {
			        constexpr api::Hip operator()(auto&& platform) const
			        {
			            alpaka::unused(platform);
			            return {};
			        }
			    };

			    //! Reports the device kind of a HIP platform as a value-initialized `T_DeviceKind`; the platform argument
			    //! itself is not inspected.
			    template<alpaka::concepts::DeviceKind T_DeviceKind>
			    struct GetDeviceType::Op<onHost::unifiedCudaHip::Platform<ApiHipRt, T_DeviceKind>>
			    {
			        T_DeviceKind operator()(auto&& platform) const
			        {
			            alpaka::unused(platform);
			            return T_DeviceKind{};
			        }
			    };
			} // namespace alpaka::internal
			#endif
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/Platform.hpp ==
			// ============================================================================

		// #include "alpaka/api/hip/executor.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/memFence.hpp ==
			// ==
			/* Copyright 2025 Mehmet Yusufoglu, René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/memoryOrder.hpp ==
				// ==
				/* Copyright 2025 Mehmet Yusufoglu, René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
				// #include "alpaka/onAcc/memoryOrder.hpp"    // amalgamate: file already inlined

				#include <type_traits>

				#if ALPAKA_LANG_HIP

				namespace alpaka::onAcc::internalCompute
				{
				    //! Translates alpaka memory order tag types into the compiler's `__ATOMIC_*` constants.
				    struct MemOrderHip
				    {
				        /** Map a memory order tag to its `__ATOMIC_*` value.
				         *
				         * The argument is only used for its type. The mapping is resolved at compile time; exactly the
				         * branch matching `TMemOrder` is instantiated.
				         */
				        template<concepts::MemoryOrder TMemOrder>
				        static constexpr auto get(TMemOrder)
				        {
				            if constexpr(std::same_as<TMemOrder, order::SeqCst>)
				                return __ATOMIC_SEQ_CST;
				            else if constexpr(std::same_as<TMemOrder, order::AcqRel>)
				                return __ATOMIC_ACQ_REL;
				            else if constexpr(std::same_as<TMemOrder, order::Release>)
				                return __ATOMIC_RELEASE;
				            else if constexpr(std::same_as<TMemOrder, order::Acquire>)
				                return __ATOMIC_ACQUIRE;
				            else if constexpr(std::same_as<TMemOrder, order::Relaxed>)
				                return __ATOMIC_RELAXED;
				        }
				    };
				} // namespace alpaka::onAcc::internalCompute

				#endif // ALPAKA_LANG_HIP
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/memoryOrder.hpp ==
				// ============================================================================

			// #include "alpaka/api/unifiedCudaHip/tag.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
			// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onAcc/memoryOrder.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onAcc/scope.hpp"    // amalgamate: file already inlined

			#include <type_traits>

			#if ALPAKA_LANG_HIP

			namespace alpaka::onAcc::internalCompute
			{
			    /** Memory fence implementation for the HIP api.
			     *
			     * Specializations should not have to be enabled for backend combinations without HIP.
			     * Removing this top level guard will not cause issues if intrinsics like __builtin_amdgcn_fence are protected
			     * inside the specialization.
			     *
			     * The call operator issues `__builtin_amdgcn_fence` with the memory order obtained from
			     * `MemOrderHip::get(order)` and a synchronization scope string selected by `T_Scope`:
			     * `scope::Block` -> "workgroup", `scope::Device` -> "agent", `scope::System` -> "".
			     * For any other `T_Scope`, or if ALPAKA_ARCH_AMD is not set, the operator body is empty.
			     */
			    template<typename T_Api, typename T_Scope, concepts::MemoryOrder T_Order>
			    requires std::same_as<T_Api, api::Hip>
			    struct MemoryFence::Op<T_Api, T_Scope, T_Order>
			    {
			        ALPAKA_FN_ACC constexpr void operator()(
			            onAcc::concepts::Acc auto const&,
			            T_Scope const,
			            [[maybe_unused]] T_Order const order) const
			        {
			            // Host pass is not allowed.
			            // The intrinsic is only compiled when ALPAKA_ARCH_AMD is set; otherwise `order` is unused, hence
			            // the [[maybe_unused]] attribute.
			#    if ALPAKA_ARCH_AMD
			            if constexpr(std::is_same_v<T_Scope, scope::Block>)
			            {
			                __builtin_amdgcn_fence(MemOrderHip::get(order), "workgroup");
			            }
			            else if constexpr(std::is_same_v<T_Scope, scope::Device>)
			            {
			                __builtin_amdgcn_fence(MemOrderHip::get(order), "agent");
			            }
			            else if constexpr(std::is_same_v<T_Scope, scope::System>)
			            {
			                // empty string refers to system
			                __builtin_amdgcn_fence(MemOrderHip::get(order), "");
			            }
			#    endif
			        }
			    };
			} // namespace alpaka::onAcc::internalCompute

			#endif // ALPAKA_LANG_HIP
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/memFence.hpp ==
			// ============================================================================

		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip.hpp ==
		// ============================================================================

	// #include "alpaka/api/unifiedCudaHip/Device.hpp"    // amalgamate: file already inlined
	// #include "alpaka/api/unifiedCudaHip/Event.hpp"    // amalgamate: file already inlined
	// #include "alpaka/api/unifiedCudaHip/Platform.hpp"    // amalgamate: file already inlined
	// #include "alpaka/api/unifiedCudaHip/Queue.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/atomic.hpp ==
		// ==
		/* Copyright 2022 Benjamin Worpitz, René Widera, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Antonio Di Pilato
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/atomicBuiltIn.hpp ==
			// ==
			/* Copyright 2022 René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
			// #include "alpaka/onAcc/scope.hpp"    // amalgamate: file already inlined
			// #include "alpaka/operation.hpp"    // amalgamate: file already inlined
			// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

			#include <type_traits>

			#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP

			namespace alpaka::onAcc
			{
			    //! The GPU CUDA/HIP accelerator atomic ops.
			    //
			    //  Atomics can be used in the hierarchy levels grids, blocks and threads.
			    //  Atomics are not guaranteed to be safe between devices.
			    //
			    //  The class itself is empty (no members).
			    class AtomicUniformCudaHipBuiltIn
			    {
			    };
			} // namespace alpaka::onAcc

			//! These types must be in the global namespace for checking existence of respective functions in global namespace via
			//! SFINAE, so we use inline namespace.
			inline namespace alpakaGlobal
			{
			    //! Provide an interface to builtin atomic functions.
			    //
			    // Used to check for the existence of builtin functions located directly in the global namespace ::.
			    // This would not be possible without having these types in the global namespace.
			    // If the functor inherits from std::false_type, a signature is explicitly not available. This can be used to
			    // explicitly disable a builtin function in case the builtin is broken.
			    // If the functor inherits from std::true_type, a specialization must implement one of the following
			    // interfaces.
			    // \code{.cpp}
			    //    // interface for all atomics except atomicCas
			    //    __device__ static T atomic( T* add, T value);
			    //    // interface for atomicCas only
			    //    __device__ static T atomic( T* add, T compare, T value);
			    // \endcode
			    //
			    // Primary template: no builtin available (std::false_type). TSfinae defaults to void and is used by the
			    // partial specializations to detect the builtin via std::void_t.
			    template<typename TOp, typename T, typename T_Scope, typename TSfinae = void>
			    struct AlpakaBuiltInAtomic : std::false_type
			    {
			    };

			    // Cas.
			    //! Compare-and-swap for any scope: enabled (std::true_type) if the expression `atomicCAS(T*, T, T)` is
			    //! well-formed; `atomic` forwards to `atomicCAS`.
			    template<typename T, typename T_Scope>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Cas,
			        T,
			        T_Scope,
			        typename std::void_t<
			            decltype(atomicCAS(alpaka::core::declval<T*>(), alpaka::core::declval<T>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        static __device__ T atomic(T* add, T compare, T value)
			        {
			            return atomicCAS(add, compare, value);
			        }
			    };

			    //! Compare-and-swap for scope::Block: enabled if the expression `atomicCAS_block(T*, T, T)` is well-formed;
			    //! `atomic` forwards to `atomicCAS_block`.
			    template<typename T>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Cas,
			        T,
			        alpaka::onAcc::scope::Block,
			        typename std::void_t<decltype(atomicCAS_block(
			            alpaka::core::declval<T*>(),
			            alpaka::core::declval<T>(),
			            alpaka::core::declval<T>()))>> : std::true_type
			    {
			        static __device__ T atomic(T* add, T compare, T value)
			        {
			            return atomicCAS_block(add, compare, value);
			        }
			    };

			    // Add.
			    //! Addition for any scope: enabled if the expression `atomicAdd(T*, T)` is well-formed; `atomic` forwards
			    //! to `atomicAdd`.
			    template<typename T, typename T_Scope>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Add,
			        T,
			        T_Scope,
			        typename std::void_t<decltype(atomicAdd(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        static __device__ T atomic(T* add, T value)
			        {
			            return atomicAdd(add, value);
			        }
			    };

			    //! Addition for scope::Block: enabled if the expression `atomicAdd_block(T*, T)` is well-formed; `atomic`
			    //! forwards to `atomicAdd_block`.
			    template<typename T>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Add,
			        T,
			        alpaka::onAcc::scope::Block,
			        typename std::void_t<decltype(atomicAdd_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        static __device__ T atomic(T* add, T value)
			        {
			            return atomicAdd_block(add, value);
			        }
			    };

			#    if (ALPAKA_LANG_HIP)
			    // HIP shows bad performance with the builtin atomicAdd(float*,float) for the hierarchy threads (specialized
			    // here as alpaka::onAcc::scope::Block), therefore we do not call the builtin method and instead use the
			    // atomicCAS emulation. For details see:
			    // https://github.com/alpaka-group/alpaka/issues/1657
			    // Inheriting from std::false_type marks the builtin as explicitly not available for this combination.
			    template<>
			    struct AlpakaBuiltInAtomic<alpaka::operation::Add, float, alpaka::onAcc::scope::Block> : std::false_type
			    {
			    };
			#    endif

			    // Sub.

			    //! Subtraction for any scope: enabled if the expression `atomicSub(T*, T)` is well-formed; `atomic`
			    //! forwards to `atomicSub`.
			    template<typename T, typename T_Scope>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Sub,
			        T,
			        T_Scope,
			        typename std::void_t<decltype(atomicSub(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        static __device__ T atomic(T* add, T value)
			        {
			            return atomicSub(add, value);
			        }
			    };

			    //! Subtraction for scope::Block: enabled if the expression `atomicSub_block(T*, T)` is well-formed; `atomic`
			    //! forwards to `atomicSub_block`.
			    template<typename T>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Sub,
			        T,
			        alpaka::onAcc::scope::Block,
			        typename std::void_t<decltype(atomicSub_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        static __device__ T atomic(T* add, T value)
			        {
			            return atomicSub_block(add, value);
			        }
			    };

			    // Min.
			    //! Minimum for any scope: enabled if the expression `atomicMin(T*, T)` is well-formed; `atomic` forwards to
			    //! `atomicMin`.
			    template<typename T, typename T_Scope>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Min,
			        T,
			        T_Scope,
			        typename std::void_t<decltype(atomicMin(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        static __device__ T atomic(T* add, T value)
			        {
			            return atomicMin(add, value);
			        }
			    };

			    //! Built-in atomic minimum for block scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicMin_block(T*, T)` is well-formed.
			    template<typename T>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Min,
			        T,
			        alpaka::onAcc::scope::Block,
			        std::void_t<decltype(atomicMin_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicMin_block` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicMin_block(addr, value);
			        }
			    };

			// disable HIP atomicMin: see https://github.com/ROCm-Developer-Tools/hipamd/pull/40
			#    if (ALPAKA_LANG_HIP)
			    // float, any scope: report the atomicMin built-in as unavailable (`::value` is false).
			    template<typename T_Scope>
			    struct AlpakaBuiltInAtomic<alpaka::operation::Min, float, T_Scope> : std::false_type
			    {
			    };

			    // float, block scope: explicit counterpart of the specialization above.
			    // NOTE(review): presumably needed so that block scope does not pick the atomicMin_block detection
			    // specialization instead - confirm.
			    template<>
			    struct AlpakaBuiltInAtomic<alpaka::operation::Min, float, alpaka::onAcc::scope::Block> : std::false_type
			    {
			    };

			    // double, any scope: report the atomicMin built-in as unavailable.
			    template<typename T_Scope>
			    struct AlpakaBuiltInAtomic<alpaka::operation::Min, double, T_Scope> : std::false_type
			    {
			    };

			    // double, block scope: explicit counterpart of the specialization above.
			    template<>
			    struct AlpakaBuiltInAtomic<alpaka::operation::Min, double, alpaka::onAcc::scope::Block> : std::false_type
			    {
			    };

			    // Only when the compiler lacks __hip_atomic_compare_exchange_strong: additionally report the
			    // unsigned long long atomicMin built-in as unavailable, for any scope and for block scope.
			#        if !__has_builtin(__hip_atomic_compare_exchange_strong)
			    template<typename T_Scope>
			    struct AlpakaBuiltInAtomic<alpaka::operation::Min, unsigned long long, T_Scope> : std::false_type
			    {
			    };

			    template<>
			    struct AlpakaBuiltInAtomic<alpaka::operation::Min, unsigned long long, alpaka::onAcc::scope::Block>
			        : std::false_type
			    {
			    };
			#        endif
			#    endif

			    // Max.

			    //! Built-in atomic maximum for an arbitrary scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicMax(T*, T)` is well-formed.
			    template<typename T, typename T_Scope>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Max,
			        T,
			        T_Scope,
			        std::void_t<decltype(atomicMax(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicMax` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicMax(addr, value);
			        }
			    };

			    //! Built-in atomic maximum for block scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicMax_block(T*, T)` is well-formed.
			    template<typename T>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Max,
			        T,
			        alpaka::onAcc::scope::Block,
			        std::void_t<decltype(atomicMax_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicMax_block` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicMax_block(addr, value);
			        }
			    };

			    // disable HIP atomicMax: see https://github.com/ROCm-Developer-Tools/hipamd/pull/40
			#    if (ALPAKA_LANG_HIP)
			    // float, any scope: report the atomicMax built-in as unavailable (`::value` is false).
			    template<typename T_Scope>
			    struct AlpakaBuiltInAtomic<alpaka::operation::Max, float, T_Scope> : std::false_type
			    {
			    };

			    // float, block scope: explicit counterpart of the specialization above.
			    // NOTE(review): presumably needed so that block scope does not pick the atomicMax_block detection
			    // specialization instead - confirm.
			    template<>
			    struct AlpakaBuiltInAtomic<alpaka::operation::Max, float, alpaka::onAcc::scope::Block> : std::false_type
			    {
			    };

			    // double, any scope: report the atomicMax built-in as unavailable.
			    template<typename T_Scope>
			    struct AlpakaBuiltInAtomic<alpaka::operation::Max, double, T_Scope> : std::false_type
			    {
			    };

			    // double, block scope: explicit counterpart of the specialization above.
			    template<>
			    struct AlpakaBuiltInAtomic<alpaka::operation::Max, double, alpaka::onAcc::scope::Block> : std::false_type
			    {
			    };

			    // Only when the compiler lacks __hip_atomic_compare_exchange_strong: additionally report the
			    // unsigned long long atomicMax built-in as unavailable, for any scope and for block scope.
			#        if !__has_builtin(__hip_atomic_compare_exchange_strong)
			    template<typename T_Scope>
			    struct AlpakaBuiltInAtomic<alpaka::operation::Max, unsigned long long, T_Scope> : std::false_type
			    {
			    };

			    template<>
			    struct AlpakaBuiltInAtomic<alpaka::operation::Max, unsigned long long, alpaka::onAcc::scope::Block>
			        : std::false_type
			    {
			    };
			#        endif
			#    endif


			    // Exch.

			    //! Built-in atomic exchange for an arbitrary scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicExch(T*, T)` is well-formed.
			    template<typename T, typename T_Scope>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Exch,
			        T,
			        T_Scope,
			        std::void_t<decltype(atomicExch(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicExch` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicExch(addr, value);
			        }
			    };

			    //! Built-in atomic exchange for block scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicExch_block(T*, T)` is well-formed.
			    template<typename T>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Exch,
			        T,
			        alpaka::onAcc::scope::Block,
			        std::void_t<decltype(atomicExch_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicExch_block` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicExch_block(addr, value);
			        }
			    };

			    // Inc.

			    //! Built-in atomic increment for an arbitrary scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicInc(T*, T)` is well-formed.
			    template<typename T, typename T_Scope>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Inc,
			        T,
			        T_Scope,
			        std::void_t<decltype(atomicInc(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicInc` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicInc(addr, value);
			        }
			    };

			    //! Built-in atomic increment for block scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicInc_block(T*, T)` is well-formed.
			    template<typename T>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Inc,
			        T,
			        alpaka::onAcc::scope::Block,
			        std::void_t<decltype(atomicInc_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicInc_block` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicInc_block(addr, value);
			        }
			    };

			    // Dec.

			    //! Built-in atomic decrement for an arbitrary scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicDec(T*, T)` is well-formed.
			    template<typename T, typename T_Scope>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Dec,
			        T,
			        T_Scope,
			        std::void_t<decltype(atomicDec(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicDec` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicDec(addr, value);
			        }
			    };

			    //! Built-in atomic decrement for block scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicDec_block(T*, T)` is well-formed.
			    template<typename T>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Dec,
			        T,
			        alpaka::onAcc::scope::Block,
			        std::void_t<decltype(atomicDec_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicDec_block` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicDec_block(addr, value);
			        }
			    };

			    // And.

			    //! Built-in atomic bitwise AND for an arbitrary scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicAnd(T*, T)` is well-formed.
			    template<typename T, typename T_Scope>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::And,
			        T,
			        T_Scope,
			        std::void_t<decltype(atomicAnd(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicAnd` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicAnd(addr, value);
			        }
			    };

			    //! Built-in atomic bitwise AND for block scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicAnd_block(T*, T)` is well-formed.
			    template<typename T>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::And,
			        T,
			        alpaka::onAcc::scope::Block,
			        std::void_t<decltype(atomicAnd_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicAnd_block` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicAnd_block(addr, value);
			        }
			    };

			    // Or.

			    //! Built-in atomic bitwise OR for an arbitrary scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicOr(T*, T)` is well-formed.
			    template<typename T, typename T_Scope>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Or,
			        T,
			        T_Scope,
			        std::void_t<decltype(atomicOr(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicOr` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicOr(addr, value);
			        }
			    };

			    //! Built-in atomic bitwise OR for block scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicOr_block(T*, T)` is well-formed.
			    template<typename T>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Or,
			        T,
			        alpaka::onAcc::scope::Block,
			        std::void_t<decltype(atomicOr_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicOr_block` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicOr_block(addr, value);
			        }
			    };

			    // Xor.

			    //! Built-in atomic bitwise XOR for an arbitrary scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicXor(T*, T)` is well-formed.
			    template<typename T, typename T_Scope>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Xor,
			        T,
			        T_Scope,
			        std::void_t<decltype(atomicXor(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicXor` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicXor(addr, value);
			        }
			    };

			    //! Built-in atomic bitwise XOR for block scope.
			    //
			    // Enabled through void_t detection whenever the expression `atomicXor_block(T*, T)` is well-formed.
			    template<typename T>
			    struct AlpakaBuiltInAtomic<
			        alpaka::operation::Xor,
			        T,
			        alpaka::onAcc::scope::Block,
			        std::void_t<decltype(atomicXor_block(alpaka::core::declval<T*>(), alpaka::core::declval<T>()))>>
			        : std::true_type
			    {
			        //! Forwards to `atomicXor_block` and hands back the result of that call.
			        static __device__ T atomic(T* addr, T value)
			        {
			            return atomicXor_block(addr, value);
			        }
			    };

			} // namespace alpakaGlobal

			#endif
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/atomicBuiltIn.hpp ==
			// ============================================================================

		// #include "alpaka/api/unifiedCudaHip/tag.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/Unreachable.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/internal/interface.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/scope.hpp"    // amalgamate: file already inlined
		// #include "alpaka/operation.hpp"    // amalgamate: file already inlined

		// #include <bit>    // amalgamate: file already included
		// #include <limits>    // amalgamate: file already included
		#include <type_traits>

		#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP

		namespace alpaka::onAcc::internalCompute
		{
		    namespace detail
		    {
		        //! Helpers shared by the atomicCAS based emulations: map a 4 or 8 byte type onto the unsigned
		        //! integral type of the same size.
		        struct EmulationBase
		        {
		            //! `unsigned long long int` for 8 byte types, `unsigned int` for 4 byte types, `void` otherwise.
		            template<typename T_Type>
		            using AtomicCasType = std::conditional_t<
		                sizeof(T_Type) == 8u,
		                unsigned long long int,
		                std::conditional_t<sizeof(T_Type) == 4u, unsigned int, void>>;

		            //! Reinterpret @p address as a pointer to the equally sized integral type.
		            //! Only available for 4 and 8 byte types.
		            template<typename T_Type>
		            static __device__ auto reinterpretAddress(T_Type* address) -> AtomicCasType<T_Type>*
		                requires(sizeof(T_Type) == 4u || sizeof(T_Type) == 8u)
		            {
		                using CasWord = AtomicCasType<T_Type>;
		                return reinterpret_cast<CasWord*>(address);
		            }

		            //! Bit-copy @p value into the equally sized integral type.
		            //! Only available for 4 and 8 byte types.
		            template<typename T_Type>
		            static __device__ auto reinterpretValue(T_Type value) -> AtomicCasType<T_Type>
		                requires(sizeof(T_Type) == 4u || sizeof(T_Type) == 8u)
		            {
		                using CasWord = AtomicCasType<T_Type>;
		                return std::bit_cast<CasWord>(value);
		            }
		        };

		        //! Emulate atomic
		        //
		        // The default implementation will emulate all atomic functions with atomicCAS.
		        //
		        // The memory behind `addr` is accessed through the equally sized unsigned integral type (see
		        // EmulationBase). TSfinae and TDefer are not used by this primary template.
		        template<
		            typename TOp,
		            typename TAtomic,
		            typename T,
		            typename T_Scope,
		            typename TSfinae = void,
		            typename TDefer = void>
		        struct EmulateAtomic : private EmulationBase
		        {
		        public:
		            //! Apply TOp to `*addr` with `value` using a compare-and-swap retry loop.
		            //!
		            //! @param ctx atomic context, forwarded to the Cas operation
		            //! @param addr location that is updated
		            //! @param value operand handed to TOp
		            //! @return the content of `*addr` observed by the successful compare-and-swap, i.e. the value
		            //!         before the update
		            static __device__ auto atomic(internal::CudaHipAtomic const ctx, T* const addr, T const& value) -> T
		            {
		                auto* const addressAsIntegralType = reinterpretAddress(addr);
		                using EmulatedType = std::decay_t<decltype(*addressAsIntegralType)>;

		                // Emulating atomics with atomicCAS is mentioned in the programming guide too.
		                // http://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions
		                //
		                // Initial snapshot of the stored bits: HIP uses a relaxed atomic load (compiler builtin when
		                // available), every other language reads through the pointer directly.
		#    if ALPAKA_LANG_HIP
		#        if __has_builtin(__hip_atomic_load)
		                EmulatedType old{__hip_atomic_load(addressAsIntegralType, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)};
		#        else
		                EmulatedType old{__atomic_load_n(addressAsIntegralType, __ATOMIC_RELAXED)};
		#        endif
		#    else
		                EmulatedType old{*addressAsIntegralType};
		#    endif
		                EmulatedType assumed;
		                do
		                {
		                    assumed = old;
		                    // Compute the new value on a local copy; memory is only changed by the Cas below.
		                    T v = std::bit_cast<T>(assumed);
		                    TOp{}(&v, value);
		                    using Cas = Atomic::Op<alpaka::operation::Cas, internal::CudaHipAtomic, EmulatedType, T_Scope>;
		                    // Cas returns the bits it found at the address; they differ from `assumed` when the
		                    // location changed since the snapshot, in which case the loop retries with them.
		                    old = Cas::atomicOp(ctx, addressAsIntegralType, assumed, reinterpretValue(v));
		                    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
		                } while(assumed != old);
		                return std::bit_cast<T>(old);
		            }
		        };

		        //! Emulate operation::Cas with the equivalent unsigned integral type
		        //
		        // Address, compare and value are reinterpreted as the equally sized integral type, the integral Cas
		        // operation is executed and its result is bit-cast back to T.
		        template<typename T, typename T_Scope>
		        struct EmulateAtomic<alpaka::operation::Cas, internal::CudaHipAtomic, T, T_Scope> : private EmulationBase
		        {
		            static __device__ auto atomic(
		                internal::CudaHipAtomic const ctx,
		                T* const addr,
		                T const& compare,
		                T const& value) -> T
		            {
		                using CasWord = AtomicCasType<T>;
		                using WordCas = Atomic::Op<alpaka::operation::Cas, internal::CudaHipAtomic, CasWord, T_Scope>;

		                CasWord* const wordAddr = reinterpretAddress(addr);
		                CasWord const expectedBits = reinterpretValue(compare);
		                CasWord const desiredBits = reinterpretValue(value);

		                CasWord const foundBits = WordCas::atomicOp(ctx, wordAddr, expectedBits, desiredBits);
		                return std::bit_cast<T>(foundBits);
		            }
		        };

		        //! Emulate operation::Sub with atomicAdd
		        //
		        // The subtraction is expressed as an addition of the negated operand.
		        template<typename T, typename T_Scope>
		        struct EmulateAtomic<alpaka::operation::Sub, internal::CudaHipAtomic, T, T_Scope>
		        {
		            static __device__ auto atomic(internal::CudaHipAtomic const ctx, T* const addr, T const& value) -> T
		            {
		                using AddOp = Atomic::Op<alpaka::operation::Add, internal::CudaHipAtomic, T, T_Scope>;
		                return AddOp::atomicOp(ctx, addr, -value);
		            }
		        };

		        //! operation::Dec cannot be implemented for floating point types!
		        //
		        // Instantiating `atomic` for a floating point T is a compile-time error.
		        template<typename T, typename T_Scope>
		        struct EmulateAtomic<
		            operation::Dec,
		            internal::CudaHipAtomic,
		            T,
		            T_Scope,
		            std::enable_if_t<std::is_floating_point_v<T>>>
		        {
		            static __device__ auto atomic(internal::CudaHipAtomic const&, T* const, T const&) -> T
		            {
		                // The condition depends on T, so the assertion only fires when this member is instantiated.
		                static_assert(
		                    sizeof(T) == 0u,
		                    "EmulateAtomic<alpaka::operation::Dec> is not supported for floating point data types!");
		                return {};
		            }
		        };

		        //! operation::Inc cannot be implemented for floating point types!
		        //
		        // Instantiating `atomic` for a floating point T is a compile-time error.
		        template<typename T, typename T_Scope>
		        struct EmulateAtomic<
		            operation::Inc,
		            internal::CudaHipAtomic,
		            T,
		            T_Scope,
		            std::enable_if_t<std::is_floating_point_v<T>>>
		        {
		            static __device__ auto atomic(internal::CudaHipAtomic const&, T* const, T const&) -> T
		            {
		                // The condition depends on T, so the assertion only fires when this member is instantiated.
		                static_assert(
		                    sizeof(T) == 0u,
		                    "EmulateAtomic<alpaka::operation::Inc> is not supported for floating point data types!");
		                return {};
		            }
		        };

		        //! operation::And cannot be implemented for floating point types!
		        //
		        // Instantiating `atomic` for a floating point T is a compile-time error.
		        template<typename T, typename T_Scope>
		        struct EmulateAtomic<
		            operation::And,
		            internal::CudaHipAtomic,
		            T,
		            T_Scope,
		            std::enable_if_t<std::is_floating_point_v<T>>>
		        {
		            static __device__ auto atomic(internal::CudaHipAtomic const&, T* const, T const&) -> T
		            {
		                // The condition depends on T, so the assertion only fires when this member is instantiated.
		                static_assert(
		                    sizeof(T) == 0u,
		                    "EmulateAtomic<alpaka::operation::And> is not supported for floating point data types!");
		                return {};
		            }
		        };

		        //! operation::Or cannot be implemented for floating point types!
		        //
		        // Instantiating `atomic` for a floating point T is a compile-time error.
		        template<typename T, typename T_Scope>
		        struct EmulateAtomic<
		            operation::Or,
		            internal::CudaHipAtomic,
		            T,
		            T_Scope,
		            std::enable_if_t<std::is_floating_point_v<T>>>
		        {
		            static __device__ auto atomic(internal::CudaHipAtomic const&, T* const, T const&) -> T
		            {
		                // The condition depends on T, so the assertion only fires when this member is instantiated.
		                static_assert(
		                    sizeof(T) == 0u,
		                    "EmulateAtomic<alpaka::operation::Or> is not supported for floating point data types!");
		                return {};
		            }
		        };

		        //! operation::Xor cannot be implemented for floating point types!
		        //
		        // Instantiating `atomic` for a floating point T is a compile-time error.
		        template<typename T, typename T_Scope>
		        struct EmulateAtomic<
		            operation::Xor,
		            internal::CudaHipAtomic,
		            T,
		            T_Scope,
		            std::enable_if_t<std::is_floating_point_v<T>>>
		        {
		            static __device__ auto atomic(internal::CudaHipAtomic const&, T* const, T const&) -> T
		            {
		                // The condition depends on T, so the assertion only fires when this member is instantiated.
		                static_assert(
		                    sizeof(T) == 0u,
		                    "EmulateAtomic<alpaka::operation::Xor> is not supported for floating point data types!");
		                return {};
		            }
		        };

		    } // namespace detail

		    //! Generic atomic implementation
		    //
		    // Dispatch order:
		    // 1. the built-in atomic for exactly T, if ::AlpakaBuiltInAtomic reports one;
		    // 2. for unsigned long int: the built-in of the equally sized type, i.e. unsigned int (32 bit) or
		    //    unsigned long long int (64 bit), if available;
		    // 3. otherwise the emulation in detail::EmulateAtomic.
		    template<typename TOp, typename T, typename T_Scope>
		    struct Atomic::Op<TOp, internal::CudaHipAtomic, T, T_Scope>
		    {
		        static __device__ auto atomicOp(
		            internal::CudaHipAtomic const ctx,
		            [[maybe_unused]] T* const addr,
		            [[maybe_unused]] T const& value) -> T
		        {
		            static_assert(
		                sizeof(T) == 4u || sizeof(T) == 8u,
		                "atomicOp<TOp, internal::CudaHipAtomic, T>(atomic, addr, value) is not supported! Only 64 and "
		                "32bit atomics are supported.");

		            using Native = ::AlpakaBuiltInAtomic<TOp, T, T_Scope>;

		            if constexpr(Native::value)
		            {
		                return Native::atomic(addr, value);
		            }
		            else if constexpr(std::is_same_v<T, unsigned long int>)
		            {
		                using Native32 = ::AlpakaBuiltInAtomic<TOp, unsigned int, T_Scope>;
		                using Native64 = ::AlpakaBuiltInAtomic<TOp, unsigned long long int, T_Scope>;

		                if constexpr(sizeof(T) == 4u && Native32::value)
		                {
		                    return Native32::atomic(
		                        reinterpret_cast<unsigned int*>(addr),
		                        static_cast<unsigned int>(value));
		                }
		                else if constexpr(sizeof(T) == 8u && Native64::value) // LP64
		                {
		                    return Native64::atomic(
		                        reinterpret_cast<unsigned long long int*>(addr),
		                        static_cast<unsigned long long int>(value));
		                }
		            }

		            // Fallback when none of the branches above returned.
		            using Emulation = detail::EmulateAtomic<TOp, internal::CudaHipAtomic, T, T_Scope>;
		            return Emulation::atomic(ctx, addr, value);
		        }
		    };

		    //! Compare-and-swap implementation
		    //
		    // Same dispatch as the generic atomic: built-in for T, then for unsigned long int the built-in of the
		    // equally sized unsigned int / unsigned long long int, otherwise detail::EmulateAtomic.
		    template<typename T, typename T_Scope>
		    struct Atomic::Op<alpaka::operation::Cas, internal::CudaHipAtomic, T, T_Scope>
		    {
		        static __device__ auto atomicOp(
		            [[maybe_unused]] internal::CudaHipAtomic const ctx,
		            [[maybe_unused]] T* const addr,
		            [[maybe_unused]] T const& compare,
		            [[maybe_unused]] T const& value) -> T
		        {
		            static_assert(
		                sizeof(T) == 4u || sizeof(T) == 8u,
		                "atomicOp<alpaka::operation::Cas, internal::CudaHipAtomic, T>(atomic, addr, compare, value) is not "
		                "supported! Only 64 and "
		                "32bit atomics are supported.");

		            using Native = ::AlpakaBuiltInAtomic<alpaka::operation::Cas, T, T_Scope>;

		            if constexpr(Native::value)
		            {
		                return Native::atomic(addr, compare, value);
		            }
		            else if constexpr(std::is_same_v<T, unsigned long int>)
		            {
		                using Native32 = ::AlpakaBuiltInAtomic<alpaka::operation::Cas, unsigned int, T_Scope>;
		                using Native64 = ::AlpakaBuiltInAtomic<alpaka::operation::Cas, unsigned long long int, T_Scope>;

		                if constexpr(sizeof(T) == 4u && Native32::value)
		                {
		                    return Native32::atomic(
		                        reinterpret_cast<unsigned int*>(addr),
		                        static_cast<unsigned int>(compare),
		                        static_cast<unsigned int>(value));
		                }
		                else if constexpr(sizeof(T) == 8u && Native64::value) // LP64
		                {
		                    return Native64::atomic(
		                        reinterpret_cast<unsigned long long int*>(addr),
		                        static_cast<unsigned long long int>(compare),
		                        static_cast<unsigned long long int>(value));
		                }
		            }

		            // Fallback when none of the branches above returned.
		            using Emulation = detail::EmulateAtomic<alpaka::operation::Cas, internal::CudaHipAtomic, T, T_Scope>;
		            return Emulation::atomic(ctx, addr, compare, value);
		        }
		    };
		} // namespace alpaka::onAcc::internalCompute
		#endif
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/atomic.hpp ==
		// ============================================================================

		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/math.hpp ==
		// ==
		/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bert Wesarg, Valentin Gehrke, René Widera,
		 * Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Jeffrey Kelling, Sergei Bastrakov, Mehmet Yusufoglu
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
		// #include "alpaka/api/unifiedCudaHip/tag.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/Unreachable.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/decay.hpp"    // amalgamate: file already inlined
		// #include "alpaka/math/Complex.hpp"    // amalgamate: file already inlined
		// #include "alpaka/math/internal/ieee754.hpp"    // amalgamate: file already inlined
		// #include "alpaka/math/internal/math.hpp"    // amalgamate: file already inlined

		// #include <cmath>    // amalgamate: file already included
		// #include <concepts>    // amalgamate: file already included

		namespace alpaka::math::internal
		{
		#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP

		    //! Absolute value for signed integral and floating point arguments.
		    //
		    // Dispatches on the argument type to `::abs`, `::labs`, `::llabs`, `::fabsf` or `::fabs`; every other
		    // type triggers the static_assert.
		    template<typename T_Arg>
		    requires std::signed_integral<T_Arg> || std::floating_point<T_Arg>
		    struct Abs::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            if constexpr(is_decayed_v<T_Arg, int>)
		            {
		                return ::abs(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, long int>)
		            {
		                return ::labs(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, long long int>)
		            {
		                return ::llabs(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, float>)
		            {
		                return ::fabsf(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, double>)
		            {
		                return ::fabs(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    //! Sine for floating point arguments.
		    //
		    // double is forwarded to `::sin`, float to `::sinf`; any other type triggers the static_assert.
		    template<std::floating_point T_Arg>
		    struct Sin::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            if constexpr(is_decayed_v<T_Arg, double>)
		            {
		                return ::sin(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, float>)
		            {
		                return ::sinf(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    //! Inverse hyperbolic cosine for floating point arguments.
		    //
		    // double is forwarded to `::acosh`, float to `::acoshf`; any other type triggers the static_assert.
		    template<std::floating_point T_Arg>
		    struct Acosh::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            if constexpr(is_decayed_v<T_Arg, double>)
		            {
		                return ::acosh(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, float>)
		            {
		                return ::acoshf(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    //! Inverse hyperbolic sine for floating point arguments.
		    //
		    // double is forwarded to `::asinh`, float to `::asinhf`; any other type triggers the static_assert.
		    template<std::floating_point T_Arg>
		    struct Asinh::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            if constexpr(is_decayed_v<T_Arg, double>)
		            {
		                return ::asinh(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, float>)
		            {
		                return ::asinhf(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    //! Hyperbolic sine for floating point arguments.
		    //
		    // double is forwarded to `::sinh`, float to `::sinhf`; any other type triggers the static_assert.
		    template<std::floating_point T_Arg>
		    struct Sinh::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            if constexpr(is_decayed_v<T_Arg, double>)
		            {
		                return ::sinh(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, float>)
		            {
		                return ::sinhf(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    //! Arc tangent for floating point arguments.
		    //
		    // double is forwarded to `::atan`, float to `::atanf`; any other type triggers the static_assert.
		    template<std::floating_point T_Arg>
		    struct Atan::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            if constexpr(is_decayed_v<T_Arg, double>)
		            {
		                return ::atan(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, float>)
		            {
		                return ::atanf(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    //! Inverse hyperbolic tangent for floating point arguments.
		    //
		    // double is forwarded to `::atanh`, float to `::atanhf`; any other type triggers the static_assert.
		    template<std::floating_point T_Arg>
		    struct Atanh::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            if constexpr(is_decayed_v<T_Arg, double>)
		            {
		                return ::atanh(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, float>)
		            {
		                return ::atanhf(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    //! Hyperbolic tangent for floating point arguments.
		    //
		    // double is forwarded to `::tanh`, float to `::tanhf`; any other type triggers the static_assert.
		    template<std::floating_point T_Arg>
		    struct Tanh::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            if constexpr(is_decayed_v<T_Arg, double>)
		            {
		                return ::tanh(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, float>)
		            {
		                return ::tanhf(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    //! Cube root for arithmetic arguments.
		    //
		    // float is forwarded to `::cbrtf`. double and all integral types are forwarded to `::cbrt` after an
		    // explicit conversion to double, so integral inputs produce a double result. Any other arithmetic
		    // type triggers the static_assert.
		    template<typename T_Arg>
		    requires(std::is_arithmetic_v<T_Arg>)
		    struct Cbrt::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            if constexpr(is_decayed_v<T_Arg, float>)
		                return ::cbrtf(arg);
		            else if constexpr(is_decayed_v<T_Arg, double> || std::is_integral_v<T_Arg>)
		                // Explicit promotion: integral arguments always select the double overload instead of
		                // relying on overload resolution between the float and double variants.
		                return ::cbrt(static_cast<double>(arg));
		            else
		                static_assert(!sizeof(T_Arg), "Unsupported data type");

		            // The placeholder has the same type as the live return statement above (double for integral
		            // inputs). NOTE(review): this matters if ALPAKA_UNREACHABLE expands to a return statement, since
		            // two differently typed returns would break `auto` return type deduction - confirm against the
		            // macro definition.
		            ALPAKA_UNREACHABLE(std::conditional_t<std::is_integral_v<T_Arg>, double, T_Arg>{});
		        }
		    };

		    //! Round up to the next integral value for floating point arguments.
		    //
		    // double is forwarded to `::ceil`, float to `::ceilf`; any other type triggers the static_assert.
		    template<std::floating_point T_Arg>
		    struct Ceil::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            if constexpr(is_decayed_v<T_Arg, double>)
		            {
		                return ::ceil(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, float>)
		            {
		                return ::ceilf(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    //! Rounding for floating point arguments.
		    //
		    // double is forwarded to `::round`, float to `::roundf`; any other type triggers the static_assert.
		    template<std::floating_point T_Arg>
		    struct Round::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            if constexpr(is_decayed_v<T_Arg, double>)
		            {
		                return ::round(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, float>)
		            {
		                return ::roundf(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    //! Rounding of floating point arguments via the `lround` family.
		    //
		    // double is forwarded to `::lround`, float to `::lroundf`; any other type triggers the static_assert.
		    // The unreachable placeholder is a `long`.
		    template<std::floating_point T_Arg>
		    struct Lround::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            if constexpr(is_decayed_v<T_Arg, double>)
		            {
		                return ::lround(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, float>)
		            {
		                return ::lroundf(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            ALPAKA_UNREACHABLE(long{});
		        }
		    };

		    //! Rounding of floating point arguments via the `llround` family.
		    //
		    // double is forwarded to `::llround`, float to `::llroundf`; any other type triggers the
		    // static_assert. The unreachable placeholder is a `long long`.
		    template<std::floating_point T_Arg>
		    struct Llround::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            if constexpr(is_decayed_v<T_Arg, double>)
		            {
		                return ::llround(arg);
		            }
		            else if constexpr(is_decayed_v<T_Arg, float>)
		            {
		                return ::llroundf(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            ALPAKA_UNREACHABLE(long long{});
		        }
		    };

		    //! Combined sine and cosine for floating point arguments.
		    //
		    // double is forwarded to `::sincos`, float to `::sincosf`; both results are written through the
		    // reference parameters. Any other type triggers the static_assert.
		    template<std::floating_point T_Arg>
		    struct SinCos::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg, T_Arg& result_sin, T_Arg& result_cos) const
		        {
		            if constexpr(is_decayed_v<T_Arg, double>)
		            {
		                ::sincos(arg, &result_sin, &result_cos);
		            }
		            else if constexpr(is_decayed_v<T_Arg, float>)
		            {
		                ::sincosf(arg, &result_sin, &result_cos);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }
		        }
		    };

		    /** @brief CudaHipMath specialization of Exp.
		     *
		     * float arguments are forwarded to ::expf, double arguments to ::exp.
		     * Any other floating-point type fails the static_assert.
		     */
		    template<std::floating_point T_Arg>
		    struct Exp::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDouble = is_decayed_v<T_Arg, double>;

		            if constexpr(isFloat)
		            {
		                return ::expf(arg);
		            }
		            else if constexpr(isDouble)
		            {
		                return ::exp(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Sqrt for arithmetic arguments.
		     *
		     * float arguments are forwarded to ::sqrtf; double and integral arguments are forwarded to ::sqrt.
		     * Any other arithmetic type fails the static_assert.
		     */
		    template<typename T_Arg>
		    requires(std::is_arithmetic_v<T_Arg>)
		    struct Sqrt::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDoubleOrIntegral = is_decayed_v<T_Arg, double> || std::is_integral_v<T_Arg>;

		            if constexpr(isFloat)
		            {
		                return ::sqrtf(arg);
		            }
		            else if constexpr(isDoubleOrIntegral)
		            {
		                return ::sqrt(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Rsqrt for arithmetic arguments.
		     *
		     * float arguments are forwarded to ::rsqrtf; double and integral arguments are forwarded to ::rsqrt.
		     * Any other arithmetic type fails the static_assert.
		     */
		    template<typename T_Arg>
		    requires(std::is_arithmetic_v<T_Arg>)
		    struct Rsqrt::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDoubleOrIntegral = is_decayed_v<T_Arg, double> || std::is_integral_v<T_Arg>;

		            if constexpr(isFloat)
		            {
		                return ::rsqrtf(arg);
		            }
		            else if constexpr(isDoubleOrIntegral)
		            {
		                return ::rsqrt(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Trunc.
		     *
		     * float arguments are forwarded to ::truncf, double arguments to ::trunc.
		     * Any other floating-point type fails the static_assert.
		     */
		    template<std::floating_point T_Arg>
		    struct Trunc::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDouble = is_decayed_v<T_Arg, double>;

		            if constexpr(isFloat)
		            {
		                return ::truncf(arg);
		            }
		            else if constexpr(isDouble)
		            {
		                return ::trunc(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Cos.
		     *
		     * float arguments are forwarded to ::cosf, double arguments to ::cos.
		     * Any other floating-point type fails the static_assert.
		     */
		    template<std::floating_point T_Arg>
		    struct Cos::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDouble = is_decayed_v<T_Arg, double>;

		            if constexpr(isFloat)
		            {
		                return ::cosf(arg);
		            }
		            else if constexpr(isDouble)
		            {
		                return ::cos(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Cosh.
		     *
		     * float arguments are forwarded to ::coshf, double arguments to ::cosh.
		     * Any other floating-point type fails the static_assert.
		     */
		    template<std::floating_point T_Arg>
		    struct Cosh::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDouble = is_decayed_v<T_Arg, double>;

		            if constexpr(isFloat)
		            {
		                return ::coshf(arg);
		            }
		            else if constexpr(isDouble)
		            {
		                return ::cosh(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Erf.
		     *
		     * float arguments are forwarded to ::erff, double arguments to ::erf.
		     * Any other floating-point type fails the static_assert.
		     */
		    template<std::floating_point T_Arg>
		    struct Erf::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDouble = is_decayed_v<T_Arg, double>;

		            if constexpr(isFloat)
		            {
		                return ::erff(arg);
		            }
		            else if constexpr(isDouble)
		            {
		                return ::erf(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Floor.
		     *
		     * float arguments are forwarded to ::floorf, double arguments to ::floor.
		     * Any other floating-point type fails the static_assert.
		     */
		    template<std::floating_point T_Arg>
		    struct Floor::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDouble = is_decayed_v<T_Arg, double>;

		            if constexpr(isFloat)
		            {
		                return ::floorf(arg);
		            }
		            else if constexpr(isDouble)
		            {
		                return ::floor(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Log.
		     *
		     * float arguments are forwarded to ::logf, double arguments to ::log.
		     * Any other floating-point type fails the static_assert.
		     */
		    template<std::floating_point T_Arg>
		    struct Log::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDouble = is_decayed_v<T_Arg, double>;

		            if constexpr(isFloat)
		            {
		                return ::logf(arg);
		            }
		            else if constexpr(isDouble)
		            {
		                return ::log(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Log2.
		     *
		     * float arguments are forwarded to ::log2f, double arguments to ::log2.
		     * Any other floating-point type fails the static_assert.
		     */
		    template<std::floating_point T_Arg>
		    struct Log2::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDouble = is_decayed_v<T_Arg, double>;

		            if constexpr(isFloat)
		            {
		                return ::log2f(arg);
		            }
		            else if constexpr(isDouble)
		            {
		                return ::log2(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Log10.
		     *
		     * float arguments are forwarded to ::log10f, double arguments to ::log10.
		     * Any other floating-point type fails the static_assert.
		     */
		    template<std::floating_point T_Arg>
		    struct Log10::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDouble = is_decayed_v<T_Arg, double>;

		            if constexpr(isFloat)
		            {
		                return ::log10f(arg);
		            }
		            else if constexpr(isDouble)
		            {
		                return ::log10(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Tan.
		     *
		     * float arguments are forwarded to ::tanf, double arguments to ::tan.
		     * Any other floating-point type fails the static_assert.
		     */
		    template<std::floating_point T_Arg>
		    struct Tan::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDouble = is_decayed_v<T_Arg, double>;

		            if constexpr(isFloat)
		            {
		                return ::tanf(arg);
		            }
		            else if constexpr(isDouble)
		            {
		                return ::tan(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Atan2.
		     *
		     * If both arguments are float the call goes to ::atan2f; if at least one argument is double it goes
		     * to ::atan2. Any other combination fails the static_assert.
		     */
		    template<std::floating_point T_Y, std::floating_point T_X>
		    struct Atan2::Op<CudaHipMath, T_Y, T_X>
		    {
		        constexpr auto operator()(CudaHipMath, T_Y const& y, T_X const& x) const
		        {
		            constexpr bool bothFloat = is_decayed_v<T_Y, float> && is_decayed_v<T_X, float>;

		            if constexpr(bothFloat)
		            {
		                return ::atan2f(y, x);
		            }
		            else if constexpr(is_decayed_v<T_Y, double> || is_decayed_v<T_X, double>)
		            {
		                return ::atan2(y, x);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Y), "Unsupported data type");
		            }

		            // The fallback value must have the same type as the returns above. For a float y combined with
		            // a double x the result is double, so T_Y{} would not match the deduced return type.
		            using Ret [[maybe_unused]] = std::conditional_t<bothFloat, float, double>;
		            ALPAKA_UNREACHABLE(Ret{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Arg for real floating-point arguments.
		     *
		     * Returns atan2(0, arg). The unqualified atan2 is resolved from the surrounding scope,
		     * which is not visible in this part of the file.
		     */
		    template<std::floating_point T_Arg>
		    struct Arg::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            // Fall back to atan2 so that boundary cases are resolved consistently
		            return atan2(T_Arg{0.0}, arg);
		        }
		    };

		    /** @brief CudaHipMath specialization of Asin.
		     *
		     * float arguments are forwarded to ::asinf, double arguments to ::asin.
		     * Any other floating-point type fails the static_assert.
		     */
		    template<std::floating_point T_Arg>
		    struct Asin::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDouble = is_decayed_v<T_Arg, double>;

		            if constexpr(isFloat)
		            {
		                return ::asinf(arg);
		            }
		            else if constexpr(isDouble)
		            {
		                return ::asin(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Acos.
		     *
		     * float arguments are forwarded to ::acosf, double arguments to ::acos.
		     * Any other floating-point type fails the static_assert.
		     */
		    template<std::floating_point T_Arg>
		    struct Acos::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            constexpr bool isFloat = is_decayed_v<T_Arg, float>;
		            constexpr bool isDouble = is_decayed_v<T_Arg, double>;

		            if constexpr(isFloat)
		            {
		                return ::acosf(arg);
		            }
		            else if constexpr(isDouble)
		            {
		                return ::acos(arg);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Arg), "Unsupported data type");
		            }

		            // Every supported type has already returned above.
		            ALPAKA_UNREACHABLE(T_Arg{});
		        }
		    };

		    // Shared helper keeps CUDA/HIP IEEE predicates aligned with host behavior under fast-math.
		    /** @brief CudaHipMath specialization of Isnan.
		     *
		     * Returns whatever ieeeIsnan(arg) yields; ieeeIsnan is defined outside this part of the file.
		     */
		    template<std::floating_point T_Arg>
		    struct Isnan::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            return ieeeIsnan(arg);
		        }
		    };

		    /** @brief CudaHipMath specialization of Isinf.
		     *
		     * Returns whatever ieeeIsinf(arg) yields; ieeeIsinf is defined outside this part of the file.
		     */
		    template<std::floating_point T_Arg>
		    struct Isinf::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            return ieeeIsinf(arg);
		        }
		    };

		    /** @brief CudaHipMath specialization of Isfinite.
		     *
		     * Returns whatever ieeeIsfinite(arg) yields; ieeeIsfinite is defined outside this part of the file.
		     */
		    template<std::floating_point T_Arg>
		    struct Isfinite::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            return ieeeIsfinite(arg);
		        }
		    };

		    /** @brief CudaHipMath specialization of Conj for real floating-point arguments.
		     *
		     * Wraps the real argument in a Complex<T_Arg> constructed from {arg, 0}.
		     */
		    template<std::floating_point T_Arg>
		    struct Conj::Op<CudaHipMath, T_Arg>
		    {
		        constexpr auto operator()(CudaHipMath, T_Arg const& arg) const
		        {
		            return Complex<T_Arg>{arg, T_Arg{0.0}};
		        }
		    };

		    /** @brief CudaHipMath specialization of Copysign.
		     *
		     * If both arguments are float the call goes to ::copysignf; if at least one argument is double it
		     * goes to ::copysign. Any other combination fails the static_assert.
		     */
		    template<std::floating_point T_Mag, std::floating_point T_Sgn>
		    struct Copysign::Op<CudaHipMath, T_Mag, T_Sgn>
		    {
		        constexpr auto operator()(CudaHipMath, T_Mag const& mag, T_Sgn const& sgn) const
		        {
		            constexpr bool bothFloat = is_decayed_v<T_Mag, float> && is_decayed_v<T_Sgn, float>;

		            if constexpr(bothFloat)
		            {
		                return ::copysignf(mag, sgn);
		            }
		            else if constexpr(is_decayed_v<T_Mag, double> || is_decayed_v<T_Sgn, double>)
		            {
		                return ::copysign(mag, sgn);
		            }
		            else
		            {
		                static_assert(!sizeof(T_Mag), "Unsupported data type");
		            }

		            // The fallback value must have the same type as the returns above. For a float magnitude
		            // combined with a double sign the result is double, so T_Mag{} would not match.
		            using Ret [[maybe_unused]] = std::conditional_t<bothFloat, float, double>;
		            ALPAKA_UNREACHABLE(Ret{});
		        }
		    };

		    template<std::floating_point T_A, std::floating_point T_B>
		    struct Min::Op<CudaHipMath, T_A, T_B>
		    {
		        constexpr auto operator()(CudaHipMath, T_A const& a, T_B const& b) const
		        {
		            if constexpr(std::is_integral_v<T_A> && std::is_integral_v<T_B>)
		                return ::min(a, b);
		            else if constexpr(is_decayed_v<T_A, float> && is_decayed_v<T_B, float>)
		                return ::fminf(a, b);
		            else if constexpr(
		                is_decayed_v<T_A, double> || is_decayed_v<T_B, double>
		                || (is_decayed_v<T_A, float> && std::is_integral_v<T_B>)
		                || (std::is_integral_v<T_A> && is_decayed_v<T_B, float>) )
		                return ::fmin(a, b);
		            else
		                static_assert(!sizeof(T_A), "Unsupported data type");

		            using Ret [[maybe_unused]] = std::conditional_t<
		                std::is_integral_v<T_A> && std::is_integral_v<T_B>,
		                decltype(::min(a, b)),
		                std::conditional_t<is_decayed_v<T_A, float> && is_decayed_v<T_B, float>, float, double>>;
		            ALPAKA_UNREACHABLE(Ret{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Max for arithmetic arguments.
		     *
		     * Dispatch:
		     * - both arguments integral: ::max
		     * - both arguments float: ::fmaxf
		     * - at least one double, or a float mixed with an integral: ::fmax
		     * Any other combination fails the static_assert.
		     */
		    template<typename T_A, typename T_B>
		    requires(std::is_arithmetic_v<T_A> && std::is_arithmetic_v<T_B>)
		    struct Max::Op<CudaHipMath, T_A, T_B>
		    {
		        constexpr auto operator()(CudaHipMath, T_A const& a, T_B const& b) const
		        {
		            constexpr bool bothIntegral = std::is_integral_v<T_A> && std::is_integral_v<T_B>;
		            constexpr bool bothFloat = is_decayed_v<T_A, float> && is_decayed_v<T_B, float>;
		            constexpr bool useFmax = is_decayed_v<T_A, double> || is_decayed_v<T_B, double>
		                                     || (is_decayed_v<T_A, float> && std::is_integral_v<T_B>)
		                                     || (std::is_integral_v<T_A> && is_decayed_v<T_B, float>);

		            if constexpr(bothIntegral)
		            {
		                return ::max(a, b);
		            }
		            else if constexpr(bothFloat)
		            {
		                return ::fmaxf(a, b);
		            }
		            else if constexpr(useFmax)
		            {
		                return ::fmax(a, b);
		            }
		            else
		            {
		                static_assert(!sizeof(T_A), "Unsupported data type");
		            }

		            // Fallback value type mirrors the return type of the branch selected above.
		            using Ret [[maybe_unused]] = std::
		                conditional_t<bothIntegral, decltype(::max(a, b)), std::conditional_t<bothFloat, float, double>>;
		            ALPAKA_UNREACHABLE(Ret{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Pow.
		     *
		     * If both arguments are float the call goes to ::powf. If at least one argument is double, both are
		     * cast to double and passed to ::pow. Any other combination fails the static_assert.
		     */
		    template<std::floating_point T_Base, std::floating_point T_Exp>
		    struct Pow::Op<CudaHipMath, T_Base, T_Exp>
		    {
		        constexpr auto operator()(CudaHipMath, T_Base const& base, T_Exp const& exp) const
		        {
		            constexpr bool bothFloat = is_decayed_v<T_Base, float> && is_decayed_v<T_Exp, float>;
		            constexpr bool anyDouble = is_decayed_v<T_Base, double> || is_decayed_v<T_Exp, double>;

		            if constexpr(bothFloat)
		            {
		                return ::powf(base, exp);
		            }
		            else if constexpr(anyDouble)
		            {
		                return ::pow(static_cast<double>(base), static_cast<double>(exp));
		            }
		            else
		            {
		                static_assert(!sizeof(T_Base), "Unsupported data type");
		            }

		            // Fallback value type mirrors the return type of the branch selected above.
		            using Ret [[maybe_unused]] = std::conditional_t<bothFloat, float, double>;
		            ALPAKA_UNREACHABLE(Ret{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Fmod.
		     *
		     * If both arguments are float the call goes to ::fmodf; if at least one argument is double it goes
		     * to ::fmod. Any other combination fails the static_assert.
		     */
		    template<std::floating_point T_X, std::floating_point T_Y>
		    struct Fmod::Op<CudaHipMath, T_X, T_Y>
		    {
		        constexpr auto operator()(CudaHipMath, T_X const& x, T_Y const& y) const
		        {
		            constexpr bool bothFloat = is_decayed_v<T_X, float> && is_decayed_v<T_Y, float>;
		            constexpr bool anyDouble = is_decayed_v<T_X, double> || is_decayed_v<T_Y, double>;

		            if constexpr(bothFloat)
		            {
		                return ::fmodf(x, y);
		            }
		            else if constexpr(anyDouble)
		            {
		                return ::fmod(x, y);
		            }
		            else
		            {
		                static_assert(!sizeof(T_X), "Unsupported data type");
		            }

		            // Fallback value type mirrors the return type of the branch selected above.
		            using Ret [[maybe_unused]] = std::conditional_t<bothFloat, float, double>;
		            ALPAKA_UNREACHABLE(Ret{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Remainder.
		     *
		     * If both arguments are float the call goes to ::remainderf; if at least one argument is double it
		     * goes to ::remainder. Any other combination fails the static_assert.
		     */
		    template<std::floating_point T_X, std::floating_point T_Y>
		    struct Remainder::Op<CudaHipMath, T_X, T_Y>
		    {
		        constexpr auto operator()(CudaHipMath, T_X const& x, T_Y const& y) const
		        {
		            constexpr bool bothFloat = is_decayed_v<T_X, float> && is_decayed_v<T_Y, float>;
		            constexpr bool anyDouble = is_decayed_v<T_X, double> || is_decayed_v<T_Y, double>;

		            if constexpr(bothFloat)
		            {
		                return ::remainderf(x, y);
		            }
		            else if constexpr(anyDouble)
		            {
		                return ::remainder(x, y);
		            }
		            else
		            {
		                static_assert(!sizeof(T_X), "Unsupported data type");
		            }

		            // Fallback value type mirrors the return type of the branch selected above.
		            using Ret [[maybe_unused]] = std::conditional_t<bothFloat, float, double>;
		            ALPAKA_UNREACHABLE(Ret{});
		        }
		    };

		    /** @brief CudaHipMath specialization of Fma.
		     *
		     * If all three arguments are float the call goes to ::fmaf; if at least one argument is double it
		     * goes to ::fma. Any other combination fails the static_assert.
		     */
		    template<std::floating_point T_X, std::floating_point T_Y, std::floating_point T_Z>
		    struct Fma::Op<CudaHipMath, T_X, T_Y, T_Z>
		    {
		        constexpr auto operator()(CudaHipMath, T_X const& x, T_Y const& y, T_Z const& z) const
		        {
		            constexpr bool allFloat
		                = is_decayed_v<T_X, float> && is_decayed_v<T_Y, float> && is_decayed_v<T_Z, float>;
		            constexpr bool anyDouble
		                = is_decayed_v<T_X, double> || is_decayed_v<T_Y, double> || is_decayed_v<T_Z, double>;

		            if constexpr(allFloat)
		            {
		                return ::fmaf(x, y, z);
		            }
		            else if constexpr(anyDouble)
		            {
		                return ::fma(x, y, z);
		            }
		            else
		            {
		                static_assert(!sizeof(T_X), "Unsupported data type");
		            }

		            // Fallback value type mirrors the return type of the branch selected above.
		            using Ret [[maybe_unused]] = std::conditional_t<allFloat, float, double>;
		            ALPAKA_UNREACHABLE(Ret{});
		        }
		    };


		#endif

		} // namespace alpaka::math::internal
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/math.hpp ==
		// ============================================================================

	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip.hpp ==
	// ============================================================================

// #include "alpaka/apply.hpp"    // amalgamate: file already inlined
// #include "alpaka/core/Dict.hpp"    // amalgamate: file already inlined
// #include "alpaka/core/Tag.hpp"    // amalgamate: file already inlined
// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
// #include "alpaka/executor.hpp"    // amalgamate: file already inlined
	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/fn.hpp ==
	// ==
	/* Copyright 2026 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once	// #include "alpaka/api/concepts/api.hpp"    // amalgamate: file already inlined
	// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined
	// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
	// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

	#include <type_traits>

	/** @brief alpaka's function interface
	 *
	 * This file defines the interface for registering, dispatching and calling function overloads specialized for
	 * device specifications. A device specification consists of an alpaka API and a device kind. These functions can be
	 * dispatched to third-party libraries (e.g. cuBLAS) and can be used in alpaka onHost or onAcc. The function
	 * interface of alpaka provides a way to work natively with alpaka objects while being able to use third-party
	 * interfaces for functionality not provided in alpaka or in cases where the vendor implementation provides better
	 * performance. For each exposed function you can provide a fallback to an alpaka implementation for a device
	 * specification or a device-specification-independent generic implementation. This keeps your code base portable
	 * even if you cannot dispatch to a third-party/vendor library and avoids preprocessor macros around function calls.
	 * The preprocessor macro ALPAKA_FN_SYMBOL() should be used to declare a function symbol.
	 * A function symbol follows all requirements to be used as a kernel within alpaka.
	 *
	 * The main components of the interface are:
	 * - `alpaka::fn::Fn`: The function symbol baseclass that can be used to register, dispatch and call third party
	 * functions.
	 * - `fnRegister`: A function template that can be specialized to register a function overload for a
	 * device specification. This is optional and only required if Registration::enforced is set to the function
	 * symbol. It allows the usage of isRegistered() function to check if a third party function overload is defined.
	 * - `fnDispatch`: A function template that can be specialized to dispatch a function symbol to a third party function
	 * depending on the device specification.
	 *
	 * For an example of how to use the interface see example/vendorApi.
	 */
	namespace alpaka::fn
	{
	    namespace api
	    {
	        /** @brief Api tag for alpaka.
	         *
	         * @warning This api should be used together with alpaka's function interface, it is not compatible with other
	         * alpaka interfaces where api's are required.
	         */
	        struct Alpaka : detail::ApiBase
	        {
	            using element_type = Alpaka;

	            /** @return pointer to this tag object */
	            auto get() const
	            {
	                return this;
	            }

	            /** Never meant to be called: it only hosts a compile-time check that Alpaka fulfils concepts::Api. */
	            void _()
	            {
	                static_assert(concepts::Api<Alpaka>);
	            }

	            /** @return the name of this api, "Alpaka" */
	            static std::string getName()
	            {
	                return "Alpaka";
	            }
	        };

	        /** Instance of the Alpaka api tag. */
	        constexpr auto alpaka = Alpaka{};
	    } // namespace api

	    /** @brief Fallback policy for function calls.
	     *
	     * This enum defines the fallback policy for function calls. It is used as a template parameter in
	     * `alpaka::fn::Fn` or ALPAKA_FN_SYMBOL to specify the fallback behavior if no vendor function overload is defined
	     * for the given device specification.
	     */
	    enum class Fallback : int
	    {
	        /** The generic implementation is called if no other overload fits.
	         *
	         * Should be used to ensure portability between different heterogeneous APIs.
	         */
	        toGeneric = 1,
	        /** The alpaka implementation is called if no other overload fits.
	         *
	         * Should be used to ensure portability between different heterogeneous APIs.
	         */
	        toAlpaka = 2,
	        /** No fallback is performed if no overload fits.
	         *
	         * Should be used if you want to ensure that a third-party function overload is guaranteed to be called.
	         */
	        none = 3
	    };

	    /** @brief Policy to control if a function symbol must be registered.
	     */
	    enum class Registration : int
	    {
	        /** The isRegistered() function will always return true. This can be used to skip the registration of the
	         * function symbol via fnRegister().
	         */
	        alwaysTrue = 1,
	        /** It is required to define fnRegister() for a function symbol. isRegistered() can be called to check if
	         * a vendor function overload is registered for the given device specification.
	         */
	        enforced = 2,
	        /** The isRegistered() function is not available and no registration of the vendor function overloads is
	         * required. This can be used if you do not want to use the isRegistered() function and do not want to require
	         * the definition of fnRegister() for a function symbol.
	         */
	        none = 3
	    };

	    namespace concepts
	    {
	        /** @brief Concept to check if a function symbol can be called.
	         *
	         * This concept checks if fnDispatch() can be called with the given function symbol (if
	         * Fallback::toGeneric) or function symbol device specification. It is used to check if a function dispatch is
	         * defined for the given device specification or function symbol and if it can be called with the given
	         * arguments.
	         *
	         * fnDispatch is called unqualified, with the specification as first argument followed by the forwarded
	         * arguments.
	         */
	        template<typename T_FnSpec, typename... Args>
	        concept DispatchedFnInvocable
	            = requires(T_FnSpec fnSpec, Args&&... args) { fnDispatch(fnSpec, std::forward<Args>(args)...); };

	        /** @brief Concept to check if a function symbol is registered.
	         *
	         * This concept checks if fnRegister() can be called with the given function symbol or
	         * function symbol device specification. It is used to check if a vendor function overload is defined for the
	         * given device specification without taking any function arguments into account.
	         *
	         * fnRegister is called unqualified, with the specification as its only argument.
	         */
	        template<typename T_FnSpec>
	        concept FnRegistered = requires(T_FnSpec fnSpec) { fnRegister(fnSpec); };
	    } // namespace concepts

	    /** @brief Base class for function symbols.
	     *
	     * @tparam T_FnClass The function symbol to register, dispatch and call. The class should be trivially
	     * constructible: the static call() function default-constructs an instance (T_FnClass{}) and invokes its
	     * operator().
	     * @tparam T_fallbackPolicy The fallback policy if no vendor function overload is defined for the given device
	     * specification. If set to Fallback::toGeneric (default) the overload taking the function symbol itself is
	     * called if no other overload fits. If set to Fallback::toAlpaka the alpaka implementation is called if no other
	     * overload fits. If set to Fallback::none no fallback is performed, so no call operator is available if no
	     * function overload is defined for the given device specification.
	     * @tparam T_registrationPolicy If set to Registration::enforced the isRegistered() can be called, and it is
	     * required to define fnRegister() for T_FnClass. If set to Registration::none (default) the isRegistered()
	     * function is not available and no registration of the vendor function overloads is required. If set to
	     * Registration::alwaysTrue isRegistered() will always return true. This can be used to skip the registration
	     * of the function symbol.
	     */
	    template<
	        typename T_FnClass,
	        Fallback T_fallbackPolicy = Fallback::toGeneric,
	        Registration T_registrationPolicy = Registration::none>
	    struct Fn
	    {
	        /** Get the function specification.
	         *
	         * The first overload extracts the api and device kind from `any` and forwards to the second overload, which
	         * returns a default-constructed `T_FnClass::Spec<T_Api, T_DeviceKind>` tag.
	         *
	         * @return the function specification for the given entity.
	         *
	         * @{
	         */
	        static constexpr auto spec(alpaka::concepts::DeviceSpec auto const& any)
	        {
	            return spec(getApi(any), getDeviceKind(any));
	        }

	        template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>
	        static constexpr auto spec(T_Api api, T_DeviceKind deviceKind)
	        {
	            // only the types are needed, the values are intentionally unused
	            alpaka::unused(api, deviceKind);
	            return typename T_FnClass::template Spec<T_Api, T_DeviceKind>{};
	        }

	        /** @} */

	        /** Checks if a function overload is registered for the given device specification.
	         *
	         * You can use the result to optionally call the function overload and disable at compile time code sections
	         * similar to C++ preprocessor guards.
	         * @code
	         * ALPAKA_FN_SYMBOL(Foo,alpaka::fn::Fallback::none, alpaka::fn::Registration::enforced);
	         *
	         * void fnRegister(Foo::Spec<alpaka::api::Host, alpaka::deviceKind::Cpu>)
	         * {
	         * }
	         *
	         * if constexpr (Foo::isRegistered(queue))
	         * {
	         *     // more code
	         *     Foo::call(queue,args ...);
	         *     // more code
	         * }
	         * @endcode
	         *
	         * Only available if T_registrationPolicy is not Registration::none.
	         *
	         * @param any any type which is usable with alpaka::getApi() and alpaka::getDeviceKind()
	         * @return true if the function overload T_FnClass is registered or T_registrationPolicy is
	         * Registration::alwaysTrue, else false.
	         * It does not try to check if a fallback overload is dispatchable, to check fallback registrations use
	         * hasRegisteredFallback(alpaka::concepts::DeviceSpec auto const& any).
	         */
	        static constexpr bool isRegistered(alpaka::concepts::DeviceSpec auto const& any)
	            requires(T_registrationPolicy != Registration::none)
	        {
	            return T_registrationPolicy == Registration::alwaysTrue
	                   || concepts::FnRegistered<ALPAKA_TYPEOF(spec(any))>;
	        }

	        /** Checks if the function overload fallback is registered.
	         *
	         * Similar to isRegistered(alpaka::concepts::DeviceSpec auto const& any) but it checks for the fallback
	         * overload only. Only available if T_registrationPolicy is not Registration::none.
	         *
	         * @param any any type which is usable with alpaka::getApi() and alpaka::getDeviceKind()
	         * @return true if the function overload fallback for T_FnClass is registered or T_registrationPolicy is
	         * Registration::alwaysTrue, else false.
	         */
	        static constexpr bool hasRegisteredFallback(alpaka::concepts::DeviceSpec auto const& any)
	            requires(T_registrationPolicy != Registration::none)
	        {
	            constexpr bool isFallbackAllowed = T_fallbackPolicy != Fallback::none;
	            // toAlpaka: the registration is looked up for api::Alpaka combined with the device kind of `any`
	            constexpr bool hasAlpakaFallback
	                = ((T_fallbackPolicy == Fallback::toAlpaka)
	                   && concepts::FnRegistered<ALPAKA_TYPEOF(spec(api::Alpaka{}, getDeviceKind(any)))>);
	            // toGeneric: the registration is looked up for the function symbol itself
	            constexpr bool hasGenericFallback
	                = ((T_fallbackPolicy == Fallback::toGeneric) && concepts::FnRegistered<T_FnClass>);
	            return T_registrationPolicy == Registration::alwaysTrue
	                   || (isFallbackAllowed && (hasAlpakaFallback || hasGenericFallback));
	        }

	        /** Call function overload if defined for the given device specification.
	         *
	         * Enabled if fnDispatch() is invocable with the specification of `any`. With Registration::enforced a
	         * missing fnRegister() for that specification triggers the static assert.
	         */
	        template<alpaka::concepts::DeviceSpec T_Any, typename... Args>
	        requires concepts::DispatchedFnInvocable<ALPAKA_TYPEOF(spec(std::declval<T_Any>())), T_Any, Args...>
	        constexpr decltype(auto) operator()(T_Any&& any, Args&&... args) const
	        {
	            static_assert(
	                T_registrationPolicy != Registration::enforced || concepts::FnRegistered<ALPAKA_TYPEOF(spec(any))>,
	                "Function dispatch for the given function symbol, API and device kind is not registered.");
	            return fnDispatch(spec(any), std::forward<T_Any>(any), std::forward<Args>(args)...);
	        }

	        /** Fallback operator() to alpaka implementation if the function is not dispatchable for the given device
	         * specification.
	         *
	         * This operator() is only enabled if T_fallbackPolicy is set to toAlpaka and the function is not
	         * dispatchable for the given device specification. It dispatches with the specification built from
	         * api::Alpaka and the device kind of `any`.
	         */
	        template<alpaka::concepts::DeviceSpec T_Any, typename... Args>
	        requires(
	            !concepts::DispatchedFnInvocable<ALPAKA_TYPEOF(spec(std::declval<T_Any>())), T_Any, Args...>
	            && (T_fallbackPolicy == Fallback::toAlpaka))
	        constexpr decltype(auto) operator()(T_Any&& any, Args&&... args) const
	        {
	            static_assert(
	                T_registrationPolicy != Registration::enforced
	                    || concepts::FnRegistered<ALPAKA_TYPEOF(spec(api::Alpaka{}, getDeviceKind(any)))>,
	                "Function for the given function group, device kind the api fn::api::alpaka is not registered.");
	            return fnDispatch(
	                spec(api::Alpaka{}, getDeviceKind(any)),
	                std::forward<T_Any>(any),
	                std::forward<Args>(args)...);
	        }

	        /** Fallback operator() to generic function if not dispatchable for the given device
	         * specification.
	         *
	         * This operator() is only enabled if T_fallbackPolicy is set to toGeneric, the function is not
	         * dispatchable for the given device specification, and it is dispatchable with the function symbol
	         * itself (without a device specification).
	         */
	        template<alpaka::concepts::DeviceSpec T_Any, typename... Args>
	        requires(
	            // no dispatch with device specification
	            !concepts::DispatchedFnInvocable<ALPAKA_TYPEOF(spec(std::declval<T_Any>())), T_Any, Args...> &&
	            // generic function dispatchable
	            concepts::DispatchedFnInvocable<T_FnClass, T_Any, Args...> && (T_fallbackPolicy == Fallback::toGeneric))
	        constexpr decltype(auto) operator()(T_Any&& any, Args&&... args) const
	        {
	            static_assert(
	                T_registrationPolicy != Registration::enforced || concepts::FnRegistered<T_FnClass>,
	                "Function dispatch for the given function symbol, is not registered.");
	            return fnDispatch(T_FnClass{}, std::forward<T_Any>(any), std::forward<Args>(args)...);
	        }

	        /** Call the function overload for the given device specification.
	         *
	         * See the call operator().
	         * @attention call() is a static function whereas the call operator requires an instance of this class.
	         * call() creates that instance itself via T_FnClass{}.
	         */
	        static constexpr decltype(auto) call(alpaka::concepts::DeviceSpec auto&& any, auto&&... args)
	        {
	            static_assert(
	                std::is_trivially_constructible_v<T_FnClass>,
	                "Function class must be trivially constructible to use call().");
	            return T_FnClass{}(ALPAKA_FORWARD(any), ALPAKA_FORWARD(args)...);
	        }
	    };
	} // namespace alpaka::fn

	/** @brief Define a function symbol class for alpaka's function interface
	 *
	 * The macro defines `struct fnName` deriving from `alpaka::fn::Fn<fnName, ...>` with a nested, empty `Spec` tag
	 * template. The trailing `static_assert(true)` makes the invocation require a terminating semicolon.
	 *
	 * @param fnName Name of the function symbol. This can be used to call the function overloads with the static
	 * call() function without having to create an instance or with the operator().
	 * @param optional_fallback The fallback policy if no vendor function overload is defined for the given device
	 * specification. If set to Fallback::toAlpaka the generic alpaka implementation is called if no other overload is
	 * fitting. If set to Fallback::toGeneric the overload with the function symbol as first argument is called if no other
	 * function device specification is callable. If set to Fallback::none no fallback is performed and a static assert is
	 * triggered if no vendor function overload is defined for the given device specification. Default:
	 * Fallback::toGeneric.
	 * @param optional_registration If set to Registration::enforced the isRegistered() can be called, and it is
	 * required to define fnRegister() for the function symbol. If set to Registration::none the isRegistered()
	 * function is not available and no registration of the vendor function overloads is required. If set to
	 * Registration::alwaysTrue isRegistered() will always return true. This can be used to skip the registration of
	 * the function symbol. Default: Registration::none.
	 *
	 * @code
	 * ALPAKA_FN_SYMBOL(Transform, alpaka::fn::Fallback::toAlpaka, alpaka::fn::Registration::enforced);
	 * ALPAKA_FN_SYMBOL(TransformWithFallback, alpaka::fn::Fallback::toAlpaka);
	 * ALPAKA_FN_SYMBOL(TransformWithFallbackAndRegistration, alpaka::fn::Fallback::toGeneric,
	 * alpaka::fn::Registration::enforced);
	 * @endcode
	 */
	#define ALPAKA_FN_SYMBOL(fnName, ...)                                                                                 \
	    struct fnName : alpaka::fn::Fn<fnName __VA_OPT__(, __VA_ARGS__)>                                                  \
	    {                                                                                                                 \
	        /** Function specification for a given device specification.                                                  \
	         *                                                                                                            \
	         * This struct should be specialized for each device specification combination where a vendor                 \
	         * function overload is defined. The specialization should be empty and only used as a tag to                 \
	         * identify the function overload.                                                                            \
	         */                                                                                                           \
	        template<alpaka::concepts::Api T_Api, alpaka::concepts::DeviceKind T_DeviceKind>                              \
	        struct Spec                                                                                                   \
	        {                                                                                                             \
	        };                                                                                                            \
	    };                                                                                                                \
	    static_assert(true)
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/fn.hpp ==
	// ============================================================================

// #include "alpaka/interface.hpp"    // amalgamate: file already inlined
// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/intrinsic.hpp ==
	// ==
	/* Copyright 2025 Luca Venerando Greco, René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/intrinsic.hpp ==
		// ==
		/* Copyright 2025 Luca Venerando Greco
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/intrinsic.hpp ==
			// ==
			/* Copyright 2025 Luca Venerando Greco
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			// #include "alpaka/api/cuda/Api.hpp"    // amalgamate: file already inlined
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/intrinsic.hpp ==
				// ==
				/* Copyright 2025 Luca Venerando Greco, René Widera
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/api/cuda/executor.hpp"    // amalgamate: file already inlined
				// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
				// #include "alpaka/api/unifiedCudaHip/tag.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/Unreachable.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
				// #include "alpaka/internal/intrinsic.hpp"    // amalgamate: file already inlined

				// #include <bit>    // amalgamate: file already included

				#if (ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP)
				namespace alpaka::internal::intrinsic
				{
				    /** Population count for the shared CUDA/HIP intrinsic implementation.
				     *
				     * The argument is reinterpreted bit by bit as an unsigned integer of the same size and handed to
				     * `__popc` (4 byte types) or `__popcll` (8 byte types). Any other size is rejected at compile time.
				     */
				    template<typename T_Arg>
				    struct Popcount::Op<alpaka::internal::CudaHipIntrinsic, T_Arg>
				    {
				        inline __device__ auto operator()(alpaka::internal::CudaHipIntrinsic const, T_Arg const& val) const
				        {
				            constexpr auto argSize = sizeof(T_Arg);

				            if constexpr(argSize == 4u)
				            {
				                auto const bits32 = std::bit_cast<unsigned int>(val);
				                return __popc(bits32);
				            }
				            else if constexpr(argSize == 8u)
				            {
				                auto const bits64 = std::bit_cast<unsigned long long>(val);
				                return __popcll(bits64);
				            }
				            else
				            {
				                static_assert(!sizeof(T_Arg), "Unsupported data type, sizeof() must be 4 or 8");
				            }

				            // Every supported size has already returned above.
				            ALPAKA_UNREACHABLE(int{});
				        }
				    };

				    /** Find-first-set for the shared CUDA/HIP intrinsic implementation.
				     *
				     * The argument is reinterpreted bit by bit as a signed integer of the same size and handed to
				     * `__ffs` (4 byte types) or `__ffsll` (8 byte types). Any other size is rejected at compile time.
				     */
				    template<typename T_Arg>
				    struct Ffs::Op<alpaka::internal::CudaHipIntrinsic, T_Arg>
				    {
				        inline __device__ auto operator()(alpaka::internal::CudaHipIntrinsic const, T_Arg const& val) const
				        {
				            constexpr auto argSize = sizeof(T_Arg);

				            if constexpr(argSize == 4u)
				            {
				                auto const bits32 = std::bit_cast<int>(val);
				                return __ffs(bits32);
				            }
				            else if constexpr(argSize == 8u)
				            {
				                auto const bits64 = std::bit_cast<long long int>(val);
				                return __ffsll(bits64);
				            }
				            else
				            {
				                static_assert(!sizeof(T_Arg), "Unsupported data type, sizeof() must be 4 or 8");
				            }

				            // Every supported size has already returned above.
				            ALPAKA_UNREACHABLE(int{});
				        }
				    };

				    /** Count-leading-zeros for the shared CUDA/HIP intrinsic implementation.
				     *
				     * The argument is reinterpreted bit by bit as a signed integer of the same size and handed to
				     * `__clz` (4 byte types) or `__clzll` (8 byte types). Any other size is rejected at compile time.
				     */
				    template<typename T_Arg>
				    struct Clz::Op<alpaka::internal::CudaHipIntrinsic, T_Arg>
				    {
				        inline __device__ auto operator()(alpaka::internal::CudaHipIntrinsic const, T_Arg const& val) const
				        {
				            constexpr auto argSize = sizeof(T_Arg);

				            if constexpr(argSize == 4u)
				            {
				                auto const bits32 = std::bit_cast<int>(val);
				                return __clz(bits32);
				            }
				            else if constexpr(argSize == 8u)
				            {
				                auto const bits64 = std::bit_cast<long long int>(val);
				                return __clzll(bits64);
				            }
				            else
				            {
				                static_assert(!sizeof(T_Arg), "Unsupported data type, sizeof() must be 4 or 8");
				            }

				            // Every supported size has already returned above.
				            ALPAKA_UNREACHABLE(int{});
				        }
				    };
				} // namespace alpaka::internal::intrinsic
				#endif
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/unifiedCudaHip/intrinsic.hpp ==
				// ============================================================================

			// #include "alpaka/api/unifiedCudaHip/tag.hpp"    // amalgamate: file already inlined

			namespace alpaka::trait
			{
			    /** Selects the intrinsic implementation for the CUDA API.
			     *
			     * Calling the functor with the `alpaka::api::Cuda` tag yields `alpaka::internal::cudaHipIntrinsic`.
			     */
			    template<>
			    struct GetIntrinsicImpl::Op<alpaka::api::Cuda>
			    {
			        constexpr decltype(auto) operator()(alpaka::api::Cuda const) const
			        {
			            return alpaka::internal::cudaHipIntrinsic;
			        }
			    };
			} // namespace alpaka::trait
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/cuda/intrinsic.hpp ==
			// ============================================================================

			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/intrinsic.hpp ==
			// ==
			/* Copyright 2025 Luca Venerando Greco
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			// #include "alpaka/api/hip/Api.hpp"    // amalgamate: file already inlined
			// #include "alpaka/api/unifiedCudaHip/intrinsic.hpp"    // amalgamate: file already inlined
			// #include "alpaka/api/unifiedCudaHip/tag.hpp"    // amalgamate: file already inlined

			namespace alpaka::trait
			{
			    /** Selects the intrinsic implementation for the HIP API.
			     *
			     * Calling the functor with the `alpaka::api::Hip` tag yields `alpaka::internal::cudaHipIntrinsic`, i.e. the
			     * same object that is returned for the CUDA API.
			     */
			    template<>
			    struct GetIntrinsicImpl::Op<alpaka::api::Hip>
			    {
			        constexpr decltype(auto) operator()(alpaka::api::Hip const) const
			        {
			            return alpaka::internal::cudaHipIntrinsic;
			        }
			    };
			} // namespace alpaka::trait
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/hip/intrinsic.hpp ==
			// ============================================================================

			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/intrinsic.hpp ==
			// ==
			/* Copyright 2025 Luca Venerando Greco, René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			// #include "alpaka/api/host/Api.hpp"    // amalgamate: file already inlined
			// #include "alpaka/api/host/tag.hpp"    // amalgamate: file already inlined
			// #include "alpaka/api/unifiedCudaHip/intrinsic.hpp"    // amalgamate: file already inlined

			namespace alpaka::trait
			{
			    /** Selects the intrinsic implementation for the host API.
			     *
			     * Calling the functor with the `alpaka::api::Host` tag yields `alpaka::internal::stlIntrinsic`.
			     */
			    template<>
			    struct GetIntrinsicImpl::Op<alpaka::api::Host>
			    {
			        constexpr decltype(auto) operator()(alpaka::api::Host const) const
			        {
			            return alpaka::internal::stlIntrinsic;
			        }
			    };
			} // namespace alpaka::trait
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/host/intrinsic.hpp ==
			// ============================================================================

			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/intrinsic.hpp ==
			// ==
			/* Copyright 2025 Luca Venerando Greco, René Widera
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			// #include "alpaka/api/oneApi/Api.hpp"    // amalgamate: file already inlined
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/intrinsic.hpp ==
				// ==
				/* Copyright 2025 Luca Venerando Greco, René Widera, Jan Stephan
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/api/syclGeneric/tag.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/Unreachable.hpp"    // amalgamate: file already inlined
				// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
				// #include "alpaka/internal/intrinsic.hpp"    // amalgamate: file already inlined

				#if ALPAKA_LANG_SYCL

				// #    include <sycl/sycl.hpp>    // amalgamate: file already included

				namespace alpaka::internal::intrinsic
				{
				    /** Population count for the SYCL intrinsic implementation.
				     *
				     * The argument is reinterpreted bit by bit as an unsigned integer of the same size (4 or 8 bytes) and
				     * handed to `sycl::popcount`. Any other size is rejected at compile time.
				     */
				    template<typename T_Arg>
				    struct Popcount::Op<alpaka::internal::SyclIntrinsic, T_Arg>
				    {
				        constexpr auto operator()(alpaka::internal::SyclIntrinsic const, T_Arg const& val) const
				        {
				            constexpr auto argSize = sizeof(T_Arg);

				            if constexpr(argSize == 4u)
				            {
				                auto const bits32 = std::bit_cast<unsigned int>(val);
				                return sycl::popcount(bits32);
				            }
				            else if constexpr(argSize == 8u)
				            {
				                auto const bits64 = std::bit_cast<unsigned long long>(val);
				                return sycl::popcount(bits64);
				            }
				            else
				            {
				                static_assert(!sizeof(T_Arg), "Unsupported data type, sizeof() must be 4 or 8");
				            }

				            // Every supported size has already returned above.
				            ALPAKA_UNREACHABLE(int{});
				        }
				    };

				    /** Find-first-set for the SYCL intrinsic implementation.
				     *
				     * SYCL offers no FFS operation, therefore it is emulated: `v ^ (v - 1)` sets the lowest set bit of `v` and
				     * all bits below it, so the population count of that mask is the 1-based position of the lowest set bit.
				     * A zero input yields zero. Sizes other than 4 or 8 bytes are rejected at compile time.
				     */
				    template<typename T_Arg>
				    struct Ffs::Op<alpaka::internal::SyclIntrinsic, T_Arg>
				    {
				        constexpr auto operator()(alpaka::internal::SyclIntrinsic const, T_Arg const& val) const
				        {
				            constexpr auto argSize = sizeof(T_Arg);

				            if constexpr(argSize == 4u)
				            {
				                auto const bits32 = std::bit_cast<unsigned int>(val);
				                // For unsigned operands `bits - 1` equals `~(-bits)`.
				                // The conditional operator keeps a single (unsigned) result type for both outcomes.
				                return (bits32 == 0u) ? 0 : sycl::popcount(bits32 ^ (bits32 - 1u));
				            }
				            else if constexpr(argSize == 8u)
				            {
				                auto const bits64 = std::bit_cast<unsigned long long>(val);
				                // Same emulation as for the 4 byte case, evaluated on 64 bit.
				                return (bits64 == 0u) ? 0 : sycl::popcount(bits64 ^ (bits64 - 1u));
				            }
				            else
				            {
				                static_assert(!sizeof(T_Arg), "Unsupported data type, sizeof() must be 4 or 8");
				            }

				            // Every supported size has already returned above.
				            ALPAKA_UNREACHABLE(int{});
				        }
				    };

				    /** Count-leading-zeros for the SYCL intrinsic implementation.
				     *
				     * The argument is reinterpreted bit by bit as an unsigned integer of the same size (4 or 8 bytes) and
				     * handed to `sycl::clz`. Any other size is rejected at compile time.
				     */
				    template<typename T_Arg>
				    struct Clz::Op<alpaka::internal::SyclIntrinsic, T_Arg>
				    {
				        constexpr auto operator()(alpaka::internal::SyclIntrinsic const, T_Arg const& val) const
				        {
				            constexpr auto argSize = sizeof(T_Arg);

				            if constexpr(argSize == 4u)
				            {
				                return sycl::clz(std::bit_cast<unsigned int>(val));
				            }
				            else if constexpr(argSize == 8u)
				            {
				                return sycl::clz(std::bit_cast<unsigned long long>(val));
				            }
				            else
				            {
				                static_assert(!sizeof(T_Arg), "Unsupported data type, sizeof() must be 4 or 8");
				            }

				            // Every supported size has already returned above.
				            ALPAKA_UNREACHABLE(int{});
				        }
				    };
				} // namespace alpaka::internal::intrinsic

				#endif
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/syclGeneric/intrinsic.hpp ==
				// ============================================================================

			// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined

			namespace alpaka::trait
			{
			    /** Selects the intrinsic implementation for the oneAPI API.
			     *
			     * Calling the functor with the `alpaka::api::OneApi` tag yields `alpaka::internal::syclIntrinsic`.
			     */
			    template<>
			    struct GetIntrinsicImpl::Op<alpaka::api::OneApi>
			    {
			        constexpr decltype(auto) operator()(alpaka::api::OneApi const) const
			        {
			            return alpaka::internal::syclIntrinsic;
			        }
			    };
			} // namespace alpaka::trait
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/oneApi/intrinsic.hpp ==
			// ============================================================================

		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/api/intrinsic.hpp ==
		// ============================================================================

	// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
	// #include "internal/intrinsic.hpp"    // amalgamate: file already inlined

	// #include <climits>    // amalgamate: file already included

	namespace alpaka
	{
	    /** Returns the number of bits set to 1. */
	    constexpr int32_t popcount(auto const& arg)
	        requires(sizeof(ALPAKA_TYPEOF(arg)) == 4u || sizeof(ALPAKA_TYPEOF(arg)) == 8u)
	    {
	        constexpr auto intrinsicImpl = trait::getIntrinsicImpl(thisApi());
	        return internal::intrinsic::Popcount::Op<ALPAKA_TYPEOF(intrinsicImpl), ALPAKA_TYPEOF(arg)>{}(
	            intrinsicImpl,
	            arg);
	    }

	    /* Position of the least significant bit set to 1.
	     *
	     * @return 1-based position of the first set bit, zero for input value 0.
	     */
	    constexpr int32_t ffs(auto const& arg)
	        requires(sizeof(ALPAKA_TYPEOF(arg)) == 4u || sizeof(ALPAKA_TYPEOF(arg)) == 8u)
	    {
	        constexpr auto intrinsicImpl = trait::getIntrinsicImpl(thisApi());
	        return internal::intrinsic::Ffs::Op<ALPAKA_TYPEOF(intrinsicImpl), ALPAKA_TYPEOF(arg)>{}(intrinsicImpl, arg);
	    }

	    /* Return the number of most significant zero bits
	     *
	     * @return number consecutive most significant zero bits, zero for input value 0.
	     */
	    constexpr int32_t clz(auto const& arg)
	        requires(sizeof(ALPAKA_TYPEOF(arg)) == 4u || sizeof(ALPAKA_TYPEOF(arg)) == 8u)
	    {
	        constexpr auto intrinsicImpl = alpaka::trait::getIntrinsicImpl(thisApi());
	        return internal::intrinsic::Clz::Op<ALPAKA_TYPEOF(intrinsicImpl), ALPAKA_TYPEOF(arg)>{}(intrinsicImpl, arg);
	    }
	} // namespace alpaka
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/intrinsic.hpp ==
	// ============================================================================

// #include "alpaka/math.hpp"    // amalgamate: file already inlined
// #include "alpaka/math/Complex.hpp"    // amalgamate: file already inlined
	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/constants.hpp ==
	// ==
	/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber, Sergei Bastrakov,
	 *                Andrea Bocci, René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

	// #include <cmath>    // amalgamate: file already included
	// #include <complex>    // amalgamate: file already included
	#if __has_include(<version>) // Not part of the C++17 standard but all major standard libraries include this
	#    include <version>
	#endif
	#ifdef __cpp_lib_math_constants
	#    include <numbers>
	#endif

	namespace alpaka::math
	{
	    /** Mathematical constants.
	     *
	     * Every constant exists as a `double` variable and as a variable template with the suffix `_v`.
	     * If the standard library defines `__cpp_lib_math_constants` the values come from `std::numbers`, otherwise
	     * they are built from the `M_*` macros (expected to be provided by `<cmath>` - NOTE(review): confirm for all
	     * supported toolchains).
	     */
	    namespace constants
	    {
	#ifdef __cpp_lib_math_constants
	        // Double precision values taken from std::numbers.
	        inline constexpr double e = std::numbers::e;
	        inline constexpr double log2e = std::numbers::log2e;
	        inline constexpr double log10e = std::numbers::log10e;
	        inline constexpr double pi = std::numbers::pi;
	        inline constexpr double inv_pi = std::numbers::inv_pi;
	        inline constexpr double ln2 = std::numbers::ln2;
	        inline constexpr double ln10 = std::numbers::ln10;
	        inline constexpr double sqrt2 = std::numbers::sqrt2;

	        // Type-generic values, forwarded to the matching std::numbers variable template.
	        template<typename T>
	        inline constexpr T e_v = std::numbers::e_v<T>;

	        template<typename T>
	        inline constexpr T log2e_v = std::numbers::log2e_v<T>;

	        template<typename T>
	        inline constexpr T log10e_v = std::numbers::log10e_v<T>;

	        template<typename T>
	        inline constexpr T pi_v = std::numbers::pi_v<T>;

	        template<typename T>
	        inline constexpr T inv_pi_v = std::numbers::inv_pi_v<T>;

	        template<typename T>
	        inline constexpr T ln2_v = std::numbers::ln2_v<T>;

	        template<typename T>
	        inline constexpr T ln10_v = std::numbers::ln10_v<T>;

	        template<typename T>
	        inline constexpr T sqrt2_v = std::numbers::sqrt2_v<T>;
	#else
	        // Fallback: double precision values built from the M_* macros.
	        inline constexpr double e = M_E;
	        inline constexpr double log2e = M_LOG2E;
	        inline constexpr double log10e = M_LOG10E;
	        inline constexpr double pi = M_PI;
	        inline constexpr double inv_pi = M_1_PI;
	        inline constexpr double ln2 = M_LN2;
	        inline constexpr double ln10 = M_LN10;
	        inline constexpr double sqrt2 = M_SQRT2;

	        // Type-generic values: the double constant above converted to T.
	        template<typename T>
	        inline constexpr T e_v = static_cast<T>(e);

	        template<typename T>
	        inline constexpr T log2e_v = static_cast<T>(log2e);

	        template<typename T>
	        inline constexpr T log10e_v = static_cast<T>(log10e);

	        template<typename T>
	        inline constexpr T pi_v = static_cast<T>(pi);

	        template<typename T>
	        inline constexpr T inv_pi_v = static_cast<T>(inv_pi);

	        template<typename T>
	        inline constexpr T ln2_v = static_cast<T>(ln2);

	        template<typename T>
	        inline constexpr T ln10_v = static_cast<T>(ln10);

	        template<typename T>
	        inline constexpr T sqrt2_v = static_cast<T>(sqrt2);

	        // Use predefined float constants when available
	        // (each specialization below replaces the converted double value for T = float).
	#    if defined(M_Ef)
	        template<>
	        inline constexpr float e_v<float> = M_Ef;
	#    endif

	#    if defined(M_LOG2Ef)
	        template<>
	        inline constexpr float log2e_v<float> = M_LOG2Ef;
	#    endif

	#    if defined(M_LOG10Ef)
	        template<>
	        inline constexpr float log10e_v<float> = M_LOG10Ef;
	#    endif

	#    if defined(M_PIf)
	        template<>
	        inline constexpr float pi_v<float> = M_PIf;
	#    endif

	#    if defined(M_1_PIf)
	        template<>
	        inline constexpr float inv_pi_v<float> = M_1_PIf;
	#    endif

	#    if defined(M_LN2f)
	        template<>
	        inline constexpr float ln2_v<float> = M_LN2f;
	#    endif

	#    if defined(M_LN10f)
	        template<>
	        inline constexpr float ln10_v<float> = M_LN10f;
	#    endif

	#    if defined(M_SQRT2f)
	        template<>
	        inline constexpr float sqrt2_v<float> = M_SQRT2f;
	#    endif

	#endif
	    } // namespace constants

	} // namespace alpaka::math
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/math/constants.hpp ==
	// ============================================================================

// #include "alpaka/mem/BoundaryIter.hpp"    // amalgamate: file already inlined
	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/LinearizedIdxGenerator.hpp ==
	// ==
	/* Copyright 2025 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/Simd.hpp"    // amalgamate: file already inlined
	// #include "alpaka/internal/interface.hpp"    // amalgamate: file already inlined
	// #include "alpaka/mem/DataPitches.hpp"    // amalgamate: file already inlined
	// #include "alpaka/mem/concepts/IndexVec.hpp"    // amalgamate: file already inlined

	namespace alpaka
	{
	    /** Generate a linearized scalar index.
	     *
	     * The generator behaves like an n-dimensional data container, but it is not pointing to any memory.
	     * The index to access the generator is linearized based on the provided extents.
	     *
	     * @tparam T_IndexType type of the generated index, also used for the components of the extents
	     * @tparam T_dim number of dimensions of the index space
	     */
	    template<typename T_IndexType, uint32_t T_dim>
	    struct LinearizedIdxGenerator
	    {
	        /** Create a generator for the given index space.
	         *
	         * @param size extents of the index space, either a vector or a scalar
	         */
	        constexpr LinearizedIdxGenerator(alpaka::concepts::VectorOrScalar auto const& size) : m_extents{size}
	        {
	        }

	        using value_type = T_IndexType;
	        using index_type = value_type;

	        /** Number of dimensions of the index space */
	        static consteval uint32_t dim()
	        {
	            return T_dim;
	        }

	        /** Get alignment of the generators value_type */
	        static constexpr auto getAlignment()
	        {
	            return alpaka::Alignment<alignof(value_type)>{};
	        }

	        /** Get value at the given index
	         *
	         * @param idx n-dimensional offset, range [0, extents)
	         * @return linearized index
	         */
	        constexpr value_type operator[](alpaka::concepts::IndexVec<T_IndexType, T_dim> auto const& idx) const
	        {
	            return linearize(m_extents, idx);
	        }

	        /** Get value at the given index (overload for non-const generators)
	         *
	         * @param idx n-dimensional offset, range [0, extents)
	         * @return linearized index
	         */
	        constexpr value_type operator[](alpaka::concepts::IndexVec<T_IndexType, T_dim> auto const& idx)
	        {
	            return linearize(m_extents, idx);
	        }

	        /** Get value at the given scalar index, only available for one-dimensional generators
	         *
	         * With a single dimension the linearized index is the index itself; no range check is performed.
	         *
	         * @param idx scalar offset, expected range [0, extent)
	         * @return idx converted to value_type
	         */
	        constexpr value_type operator[](std::integral auto const& idx) const requires(dim() == 1u)
	        {
	            // explicit conversion: the integral type of idx is allowed to differ from value_type
	            return static_cast<value_type>(idx);
	        }

	        /** Get value at the given scalar index (overload for non-const generators), only available for
	         * one-dimensional generators
	         *
	         * With a single dimension the linearized index is the index itself; no range check is performed.
	         *
	         * @param idx scalar offset, expected range [0, extent)
	         * @return idx converted to value_type
	         */
	        constexpr value_type operator[](std::integral auto const& idx) requires(dim() == 1u)
	        {
	            // explicit conversion: the integral type of idx is allowed to differ from value_type
	            return static_cast<value_type>(idx);
	        }

	        /** supported index range
	         *
	         * @return virtual extents of the generator
	         */
	        constexpr auto getExtents() const
	        {
	            return m_extents;
	        }

	        /** Pitches of the virtual index space
	         *
	         * @return result of calculatePitchesFromExtents<value_type>() for the extents of the generator
	         */
	        constexpr auto getPitches() const
	        {
	            return alpaka::calculatePitchesFromExtents<value_type>(getExtents());
	        }

	        /** Validity check
	         *
	         * @return always true
	         */
	        [[nodiscard]] constexpr explicit operator bool() const noexcept
	        {
	            return true;
	        }

	    private:
	        /** extents of the virtual index space */
	        alpaka::Vec<T_IndexType, T_dim> m_extents;
	    };

	    /** Deduction guide: derive the index type and the dimension of the generator from the extents type via
	     * trait::GetValueType_t and trait::getDim_v.
	     */
	    template<concepts::VectorOrScalar T_Extents>
	    ALPAKA_FN_HOST_ACC LinearizedIdxGenerator(T_Extents const&)
	        -> LinearizedIdxGenerator<trait::GetValueType_t<T_Extents>, trait::getDim_v<T_Extents>>;

	    namespace internal
	    {
	        /** Add support to use the generator with SimdPtr.
	         *
	         * The SIMD pack is not read from memory, each lane is produced by querying the generator.
	         */
	        template<
	            typename T_IndexType,
	            uint32_t T_dim,
	            alpaka::concepts::Alignment T_MdSpanAlignment,
	            alpaka::concepts::Vector T_Idx>
	        struct LoadAsSimd::Op<LinearizedIdxGenerator<T_IndexType, T_dim>, T_MdSpanAlignment, T_Idx>
	        {
	            /** Create a SIMD pack of generated indices.
	             *
	             * @tparam T_simdWidth width of the SIMD pack
	             * @param linearIdxGenerator generator, its type must be LinearizedIdxGenerator<T_IndexType, T_dim>
	             * @param alignment not used
	             * @param idx index the pack starts at
	             * @return Simd object constructed from a callable that maps a lane index `i` to
	             *         `linearIdxGenerator[idx + T_Idx::fill(0).rAssign(i)]`
	             */
	            template<uint32_t T_simdWidth>
	            constexpr auto load(auto&& linearIdxGenerator, T_MdSpanAlignment alignment, T_Idx const& idx) const
	            {
	                using Generator = LinearizedIdxGenerator<T_IndexType, T_dim>;
	                static_assert(
	                    std::is_same_v<Generator, ALPAKA_TYPEOF(linearIdxGenerator)>,
	                    "Type of linearIdxGenerator must match the class template signature.");
	                alpaka::unused(alignment);

	                // constness of the generated element decides about the constness of the SIMD pack type
	                using Element = std::remove_reference_t<decltype(linearIdxGenerator[idx])>;
	                using Pack = Simd<std::decay_t<Element>, T_simdWidth>;
	                using DstType = std::conditional_t<std::is_const_v<Element>, Pack const, Pack>;

	                return DstType(
	                    [&](auto lane)
	                    {
	                        // rAssign() is used because, SIMD vectors can only be loaded from the fast moving index
	                        return linearIdxGenerator[idx + T_Idx::fill(0).rAssign(lane)];
	                    });
	            }
	        };
	    } // namespace internal
	} // namespace alpaka
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/mem/LinearizedIdxGenerator.hpp ==
	// ============================================================================

// #include "alpaka/mem/concepts/IDataStorage.hpp"    // amalgamate: file already inlined
// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
// #include "alpaka/onAcc/SimdAlgo.hpp"    // amalgamate: file already inlined
// #include "alpaka/onAcc/WorkerGroup.hpp"    // amalgamate: file already inlined
	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/atomic.hpp ==
	// ==
	/* Copyright 2022 Benjamin Worpitz, René Widera, Bernhard Manfred Gruber
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
	// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
	// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onAcc/internal/interface.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onAcc/scope.hpp"    // amalgamate: file already inlined
	// #include "alpaka/operation.hpp"    // amalgamate: file already inlined

	#include <type_traits>

	namespace alpaka::onAcc
	{
	    //! Executes the given operation atomically.
	    //!
	    //! \tparam TOp The operation type.
	    //! \tparam T The value type.
	    //! \param addr The value to change atomically.
	    //! \param value The value used in the atomic operation.
	    template<typename TOp, typename T, typename T_Scope = scope::Device>
	    constexpr auto atomicOp(auto const& acc, T* const addr, T const& value, T_Scope const scope = T_Scope()) -> T
	    {
	        static_assert(!std::is_same_v<T_Scope, scope::System>, "System scope is currently not supported.");
	        auto atomicImpl = trait::getAtomicImpl(acc[object::exec], scope);
	        return internalCompute::Atomic::Op<TOp, ALPAKA_TYPEOF(atomicImpl), T, T_Scope>::atomicOp(
	            atomicImpl,
	            addr,
	            value);
	    }

	    //! Executes the given operation atomically.
	    //!
	    //! \tparam TOp The operation type.
	    //! \tparam T The value type.
	    //! \param addr The value to change atomically.
	    //! \param compare The comparison value used in the atomic operation.
	    //! \param value The value used in the atomic operation.
	    template<typename TOp, typename T, typename T_Scope = scope::Device>
	    constexpr auto atomicOp(
	        auto const& acc,
	        T* const addr,
	        T const& compare,
	        T const& value,
	        T_Scope const scope = T_Scope()) -> T
	    {
	        static_assert(!std::is_same_v<T_Scope, scope::System>, "System scope is currently not supported.");
	        auto atomicImpl = trait::getAtomicImpl(acc[object::exec], scope);
	        return internalCompute::Atomic::Op<TOp, ALPAKA_TYPEOF(atomicImpl), T, T_Scope>::atomicOp(
	            atomicImpl,
	            addr,
	            compare,
	            value);
	    }

	    //! Executes an atomic add operation.
	    //!
	    //! Forwards to atomicOp with operation::Add.
	    //!
	    //! \tparam T The value type.
	    //! \tparam T_Scope The scope type, scope::Device by default.
	    //! \param acc Forwarded unchanged to atomicOp.
	    //! \param addr The value to change atomically.
	    //! \param value The value used in the atomic operation.
	    //! \param hier The scope instance, forwarded to atomicOp.
	    //! \return The value returned by atomicOp.
	    template<typename T, typename T_Scope = scope::Device>
	    constexpr auto atomicAdd(auto const& acc, T* const addr, T const& value, T_Scope const hier = T_Scope()) -> T
	    {
	        return atomicOp<operation::Add>(acc, addr, value, hier);
	    }

	    //! Executes an atomic sub operation.
	    //!
	    //! Forwards to atomicOp with operation::Sub.
	    //!
	    //! \tparam T The value type.
	    //! \tparam T_Scope The scope type, scope::Device by default.
	    //! \param acc Forwarded unchanged to atomicOp.
	    //! \param addr The value to change atomically.
	    //! \param value The value used in the atomic operation.
	    //! \param hier The scope instance, forwarded to atomicOp.
	    //! \return The value returned by atomicOp.
	    template<typename T, typename T_Scope = scope::Device>
	    constexpr auto atomicSub(auto const& acc, T* const addr, T const& value, T_Scope const hier = T_Scope()) -> T
	    {
	        return atomicOp<operation::Sub>(acc, addr, value, hier);
	    }

	    //! Executes an atomic min operation.
	    //!
	    //! Forwards to atomicOp with operation::Min.
	    //!
	    //! \tparam T The value type.
	    //! \tparam T_Scope The scope type, scope::Device by default.
	    //! \param acc Forwarded unchanged to atomicOp.
	    //! \param addr The value to change atomically.
	    //! \param value The value used in the atomic operation.
	    //! \param hier The scope instance, forwarded to atomicOp.
	    //! \return The value returned by atomicOp.
	    template<typename T, typename T_Scope = scope::Device>
	    constexpr auto atomicMin(auto const& acc, T* const addr, T const& value, T_Scope const hier = T_Scope()) -> T
	    {
	        return atomicOp<operation::Min>(acc, addr, value, hier);
	    }

	    //! Executes an atomic max operation.
	    //!
	    //! Forwards to atomicOp with operation::Max.
	    //!
	    //! \tparam T The value type.
	    //! \tparam T_Scope The scope type, scope::Device by default.
	    //! \param acc Forwarded unchanged to atomicOp.
	    //! \param addr The value to change atomically.
	    //! \param value The value used in the atomic operation.
	    //! \param hier The scope instance, forwarded to atomicOp.
	    //! \return The value returned by atomicOp.
	    template<typename T, typename T_Scope = scope::Device>
	    constexpr auto atomicMax(auto const& acc, T* const addr, T const& value, T_Scope const hier = T_Scope()) -> T
	    {
	        return atomicOp<operation::Max>(acc, addr, value, hier);
	    }

	    //! Executes an atomic exchange operation.
	    //!
	    //! Forwards to atomicOp with operation::Exch.
	    //!
	    //! \tparam T The value type.
	    //! \tparam T_Scope The scope type, scope::Device by default.
	    //! \param acc Forwarded unchanged to atomicOp.
	    //! \param addr The value to change atomically.
	    //! \param value The value used in the atomic operation.
	    //! \param hier The scope instance, forwarded to atomicOp.
	    //! \return The value returned by atomicOp.
	    template<typename T, typename T_Scope = scope::Device>
	    constexpr auto atomicExch(auto const& acc, T* const addr, T const& value, T_Scope const hier = T_Scope()) -> T
	    {
	        return atomicOp<operation::Exch>(acc, addr, value, hier);
	    }

	    //! Executes an atomic increment operation.
	    //!
	    //! Forwards to atomicOp with operation::Inc.
	    //!
	    //! \tparam T The value type.
	    //! \tparam T_Scope The scope type, scope::Device by default.
	    //! \param acc Forwarded unchanged to atomicOp.
	    //! \param addr The value to change atomically.
	    //! \param value The value used in the atomic operation.
	    //! \param hier The scope instance, forwarded to atomicOp.
	    //! \return The value returned by atomicOp.
	    template<typename T, typename T_Scope = scope::Device>
	    constexpr auto atomicInc(auto const& acc, T* const addr, T const& value, T_Scope const hier = T_Scope()) -> T
	    {
	        return atomicOp<operation::Inc>(acc, addr, value, hier);
	    }

	    //! Executes an atomic decrement operation.
	    //!
	    //! Forwards to atomicOp with operation::Dec.
	    //!
	    //! \tparam T The value type.
	    //! \tparam T_Scope The scope type, scope::Device by default.
	    //! \param acc Forwarded unchanged to atomicOp.
	    //! \param addr The value to change atomically.
	    //! \param value The value used in the atomic operation.
	    //! \param hier The scope instance, forwarded to atomicOp.
	    //! \return The value returned by atomicOp.
	    template<typename T, typename T_Scope = scope::Device>
	    constexpr auto atomicDec(auto const& acc, T* const addr, T const& value, T_Scope const hier = T_Scope()) -> T
	    {
	        return atomicOp<operation::Dec>(acc, addr, value, hier);
	    }

	    //! Executes an atomic and operation.
	    //!
	    //! Forwards to atomicOp with operation::And.
	    //!
	    //! \tparam T The value type.
	    //! \tparam T_Scope The scope type, scope::Device by default.
	    //! \param acc Forwarded unchanged to atomicOp.
	    //! \param addr The value to change atomically.
	    //! \param value The value used in the atomic operation.
	    //! \param hier The scope instance, forwarded to atomicOp.
	    //! \return The value returned by atomicOp.
	    template<typename T, typename T_Scope = scope::Device>
	    constexpr auto atomicAnd(auto const& acc, T* const addr, T const& value, T_Scope const hier = T_Scope()) -> T
	    {
	        return atomicOp<operation::And>(acc, addr, value, hier);
	    }

	    //! Executes an atomic or operation.
	    //!
	    //! Forwards to atomicOp with operation::Or.
	    //!
	    //! \tparam T The value type.
	    //! \tparam T_Scope The scope type, scope::Device by default.
	    //! \param acc Forwarded unchanged to atomicOp.
	    //! \param addr The value to change atomically.
	    //! \param value The value used in the atomic operation.
	    //! \param hier The scope instance, forwarded to atomicOp.
	    //! \return The value returned by atomicOp.
	    template<typename T, typename T_Scope = scope::Device>
	    constexpr auto atomicOr(auto const& acc, T* const addr, T const& value, T_Scope const hier = T_Scope()) -> T
	    {
	        return atomicOp<operation::Or>(acc, addr, value, hier);
	    }

	    //! Executes an atomic xor operation.
	    //!
	    //! Forwards to atomicOp with operation::Xor.
	    //!
	    //! \tparam T The value type.
	    //! \tparam T_Scope The scope type, scope::Device by default.
	    //! \param acc Forwarded unchanged to atomicOp.
	    //! \param addr The value to change atomically.
	    //! \param value The value used in the atomic operation.
	    //! \param hier The scope instance, forwarded to atomicOp.
	    //! \return The value returned by atomicOp.
	    template<typename T, typename T_Scope = scope::Device>
	    constexpr auto atomicXor(auto const& acc, T* const addr, T const& value, T_Scope const hier = T_Scope()) -> T
	    {
	        return atomicOp<operation::Xor>(acc, addr, value, hier);
	    }

	    //! Executes an atomic compare-and-swap operation.
	    //!
	    //! \tparam T The value type.
	    //! \param addr The value to change atomically.
	    //! \param compare The comparison value used in the atomic operation.
	    //! \param value The value used in the atomic operation.
	    template<typename T, typename T_Scope = scope::Device>
	    constexpr auto atomicCas(
	        auto const& acc,
	        T* const addr,
	        T const& compare,
	        T const& value,
	        T_Scope const hier = T_Scope()) -> T
	    {
	        // Delegate to the generic atomic dispatcher with the compare-and-swap operation selected.
	        // Unlike the binary atomics, the comparison value is forwarded in addition to the new value.
	        auto const result = atomicOp<operation::Cas>(acc, addr, compare, value, hier);
	        return result;
	    }

	    namespace atomic
	    {
	        /** Defines the equivalent of an atomic invoke for user defined functors.
	         *
	         * This function can be overloaded within the namespace of the user functor type and will be found via ADL.
	         * Typically, this functor is used within the reduce and transformReduce algorithm.
	         * The implementation must implement the functor equivalent atomic function.
	         *
	         * This generic overload is only a fallback: instantiating it always triggers the static_assert below.
	         *
	         * @param fn non-atomic user functor
	         * @param acc accelerator the atomic operation is executed with
	         * @param inOut pointer to the value which is updated
	         * @param args arguments normally forwarded to the user functor
	         */
	        ALPAKA_FN_ACC void atomicInvoke(auto&& fn, concepts::Acc auto const& acc, auto* inOut, auto&&... args)
	        {
	            alpaka::unused(acc, inOut, args...);
	            // The condition is always false but depends on the type of fn, so the assertion only fires when this
	            // fallback is actually instantiated for a functor without a matching overload.
	            static_assert(
	                sizeof(ALPAKA_TYPEOF(fn)) && false,
	                "You must specialize atomicInvoke() for your functor. Best place the overload in the namespace of the "
	                "functor, it will be found by ADL.");
	        }

	        /** Atomic equivalent of std::plus: forwards to atomicAdd on the value pointed to by inOut.
	         *
	         * The value returned by atomicAdd is discarded.
	         *
	         * @param acc accelerator the atomic operation is executed with
	         * @param inOut pointer to the value which is updated
	         * @param args arguments forwarded to atomicAdd after inOut
	         */
	        template<typename T>
	        ALPAKA_FN_ACC void atomicInvoke(std::plus<T>, concepts::Acc auto const& acc, auto* inOut, auto&&... args)
	        {
	            atomicAdd(acc, inOut, ALPAKA_FORWARD(args)...);
	        }
	    } // namespace atomic

	} // namespace alpaka::onAcc
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/atomic.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/globalMem.hpp ==
	// ==
	/* Copyright 2024 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/core/PP.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onAcc/internal/globalMem.hpp"    // amalgamate: file already inlined

	/** Forward declare an external device global variable.
	 *
	 * The variable is only forward declared as external symbol.
	 *
	 * @attention If you compile with a CUDA or HIP compiler without relocatable device code enabled
	 * (-rdc=true), no symbol will be exposed because device linking is disabled.
	 *
	 * @param attributes The keyword 'extern' is automatically set and the attribute 'inline' is allowed.
	 * @param dataType Type of the variable, if the type contains comma it must be wrapped in parentheses
	 * @param name Name of the variable you would use later to access the data.
	 */
	// Expands, in this order, to the host, CUDA/HIP and oneAPI extern-declaration macros; each receives
	// (attributes, dataType, name) unchanged.
	// NOTE(review): presumably each backend macro expands to nothing when its backend is not enabled - confirm in
	// alpaka/onAcc/internal/globalMem.hpp.
	#define ALPAKA_DEVICE_GLOBAL_EXTERN(attributes, dataType, name)                                                       \
	    ALPAKA_DEVICE_GLOBAL_DATA_HOST_EXTERN(attributes, dataType, name)                                                 \
	    ALPAKA_DEVICE_GLOBAL_DATA_CUDA_HIP_EXTERN(attributes, dataType, name)                                             \
	    ALPAKA_DEVICE_GLOBAL_DATA_ONEAPI_EXTERN(attributes, dataType, name)

	/** Define a device global variable.
	 *
	 * Initialize the variable with the given values.
	 * A type 'name_t' is created as alias to the wrapper type.
	 * To get access to the data you should call name.get(). If the dataType is a multidimensional C array with compile
	 * time extents the return type of '.get()' is an alpaka::MdSpanArray.
	 *
	 * @param attributes Attributes for the variable definition, can be empty or 'inline', 'static', 'const', 'constexpr',
	 * etc. If oneAPI with AMD backend is used attributes must be empty or 'inline' due to compiler limitations.
	 * @param dataType Type of the variable, if the type contains comma it must be wrapped in parentheses
	 * @param name Name of the variable you would use later to access the data.
	 * @param ... Initializer values for the variable, can be empty. The arguments will be forwarded to the constructor of
	 * dataType. If dataType is a C array the values must be provided in curly braces.
	 */
	// Expansion overview:
	//  1. the host, CUDA/HIP and oneAPI data-definition macros, each called with all macro arguments;
	//  2. a struct AlpakaGlobalStorage<name> whose get(api::Host) returns alpaka_onHost::name.value and whose
	//     getHandle(api::Host) returns alpaka_onHost::name, followed by the CUDA/HIP and oneAPI getter macros;
	//  3. the alias <name>_t = GlobalDeviceMemoryWrapper<AlpakaGlobalStorage<name>, dataType>
	//     (dataType with surrounding parentheses removed);
	//  4. a constexpr object `name` of type <name>_t.
	// The expansion ends after the closing brace of the initializer without a semicolon, so the macro invocation
	// must be terminated with ';' by the user.
	#define ALPAKA_DEVICE_GLOBAL(attributes, dataType, name, ...)                                                         \
	    ALPAKA_DEVICE_GLOBAL_DATA_HOST(attributes, dataType, name, __VA_ARGS__)                                           \
	    ALPAKA_DEVICE_GLOBAL_DATA_CUDA_HIP(attributes, dataType, name, __VA_ARGS__)                                       \
	    ALPAKA_DEVICE_GLOBAL_DATA_ONEAPI(attributes, dataType, name, __VA_ARGS__)                                         \
	                                                                                                                      \
	    struct ALPAKA_PP_CAT(AlpakaGlobalStorage, name)                                                                   \
	    {                                                                                                                 \
	        constexpr auto& get(alpaka::api::Host) const                                                                  \
	        {                                                                                                             \
	            return alpaka_onHost::name.value;                                                                         \
	        }                                                                                                             \
	        constexpr auto& getHandle(alpaka::api::Host) const                                                            \
	        {                                                                                                             \
	            return alpaka_onHost::name;                                                                               \
	        }                                                                                                             \
	        ALPAKA_DEVICE_GLOBAL_GET_CUDA_HIP(attributes, dataType, name, __VA_ARGS__)                                    \
	        ALPAKA_DEVICE_GLOBAL_GET_ONEAPI(attributes, dataType, name, __VA_ARGS__)                                      \
	    };                                                                                                                \
	                                                                                                                      \
	    using ALPAKA_PP_CAT(name, _t) = alpaka::onAcc::internal::                                                         \
	        GlobalDeviceMemoryWrapper<ALPAKA_PP_CAT(AlpakaGlobalStorage, name), ALPAKA_PP_REMOVE_BRACKETS(dataType)>;     \
	    [[maybe_unused]] constexpr auto name = ALPAKA_PP_CAT(name, _t)                                                    \
	    {                                                                                                                 \
	    }
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/globalMem.hpp ==
	// ============================================================================

// #include "alpaka/onAcc/interface.hpp"    // amalgamate: file already inlined
	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/interfaceImpl.hpp ==
	// ==
	/* Copyright 2024 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	/** @file This file contains specializations of interfaces for the accelerator scope.
	 * The specializations must be separated from the definitions to avoid cyclic include dependencies.
	 */

	// #include "alpaka/onAcc/internal/interface.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onAcc/internal/warp.hpp"    // amalgamate: file already inlined

	namespace alpaka::onAcc
	{
	    namespace internalCompute
	    {
	        template<concepts::Acc T_Acc>
	        struct GetIdxWithin::Op<T_Acc, ALPAKA_TYPEOF(origin::warp), ALPAKA_TYPEOF(unit::threads)>
	        {
	            /** Index of the calling thread within its warp.
	             *
	             * @param acc accelerator the lane index is queried from
	             * @return one-dimensional vector wrapping the result of warp::internal::getLaneIdx()
	             */
	            constexpr alpaka::concepts::Vector<uint32_t, 1u> auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::warp),
	                ALPAKA_TYPEOF(unit::threads)) const
	            {
	                auto const laneIdxVec = Vec{warp::internal::getLaneIdx(acc)};
	                return laneIdxVec;
	            }
	        };

	        template<typename T_Acc>
	        struct GetIdxWithin::Op<T_Acc, ALPAKA_TYPEOF(origin::block), ALPAKA_TYPEOF(unit::threads)>
	        {
	            /** Index of the calling thread within its block.
	             *
	             * @param acc accelerator providing the thread layer
	             * @return the index reported by the accelerator's thread layer
	             */
	            constexpr alpaka::concepts::Vector auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::block),
	                ALPAKA_TYPEOF(unit::threads)) const
	            {
	                auto const idxInBlock = acc[layer::thread].idx();
	                return idxInBlock;
	            }
	        };

	        template<concepts::Acc T_Acc>
	        struct GetIdxWithin::Op<T_Acc, ALPAKA_TYPEOF(origin::block), ALPAKA_TYPEOF(unit::warps)>
	        {
	            /** Index of the calling thread's warp within its block.
	             *
	             * @param acc accelerator the warp index is queried from
	             * @return one-dimensional vector wrapping the result of warp::internal::getWarpIdx()
	             */
	            constexpr alpaka::concepts::Vector<uint32_t, 1u> auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::block),
	                ALPAKA_TYPEOF(unit::warps)) const
	            {
	                auto const warpIdxVec = Vec{warp::internal::getWarpIdx(acc)};
	                return warpIdxVec;
	            }
	        };

	        template<typename T_Acc>
	        struct GetIdxWithin::Op<T_Acc, ALPAKA_TYPEOF(origin::grid), ALPAKA_TYPEOF(unit::threads)>
	        {
	            /** Index of the calling thread within the grid.
	             *
	             * Computed component-wise as (threads per block) * (block index) + (thread index in block).
	             *
	             * @param acc accelerator providing the block and thread layers
	             */
	            constexpr alpaka::concepts::Vector auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::grid),
	                ALPAKA_TYPEOF(unit::threads)) const
	            {
	                auto const threadsPerBlock = acc[layer::thread].count();
	                // offset of the first thread of the block the caller belongs to
	                auto const blockOffset = threadsPerBlock * acc[layer::block].idx();
	                return blockOffset + acc[layer::thread].idx();
	            }
	        };

	        template<concepts::Acc T_Acc>
	        struct GetIdxWithin::Op<T_Acc, ALPAKA_TYPEOF(origin::grid), ALPAKA_TYPEOF(unit::warps)>
	        {
	            /** Linear index of the calling thread's warp within the grid.
	             *
	             * Computed as (linear block index) * (warps per block) + (warp index within the block), so that every
	             * warp of the grid gets a distinct index in [0, numBlocks * warpsPerBlock). This matches the extent
	             * reported by GetExtentsOf for (origin::grid, unit::warps).
	             *
	             * @param acc accelerator the block index, block extent and warp index are queried from
	             * @return one-dimensional vector holding the linear warp index
	             */
	            constexpr alpaka::concepts::Vector<uint32_t, 1u> auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::grid),
	                ALPAKA_TYPEOF(unit::warps)) const
	            {
	                auto blockIdxInGrid = acc.getIdxWithin(onAcc::origin::grid, onAcc::unit::blocks);
	                auto numBlocksInGrid = acc.getExtentsOf(onAcc::origin::grid, onAcc::unit::blocks);
	                auto linearBlockIdx = linearize(numBlocksInGrid, blockIdxInGrid);

	                // Each block contributes this many warps; without this factor warps of neighbouring blocks would
	                // share indices (e.g. block 0 / warp 1 and block 1 / warp 0).
	                std::integral auto numWarpsInBlock
	                    = acc.getExtentsOf(onAcc::origin::block, onAcc::unit::warps).product();

	                // Cast back so the result keeps the type the linear block index had before scaling.
	                auto firstWarpOfBlock
	                    = static_cast<ALPAKA_TYPEOF(linearBlockIdx)>(linearBlockIdx * numWarpsInBlock);
	                return firstWarpOfBlock + Vec{warp::internal::getWarpIdx(acc)};
	            }
	        };

	        template<concepts::Acc T_Acc>
	        struct GetIdxWithin::Op<T_Acc, ALPAKA_TYPEOF(origin::grid), ALPAKA_TYPEOF(unit::blocks)>
	        {
	            /** Index of the calling thread's block within the grid.
	             *
	             * @param acc accelerator providing the block layer
	             * @return the index reported by the accelerator's block layer
	             */
	            constexpr alpaka::concepts::Vector auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::grid),
	                ALPAKA_TYPEOF(unit::blocks)) const
	            {
	                auto const idxInGrid = acc[layer::block].idx();
	                return idxInGrid;
	            }
	        };

	        template<concepts::Acc T_Acc>
	        struct GetIdxWithin::Op<T_Acc, ALPAKA_TYPEOF(origin::thread), ALPAKA_TYPEOF(unit::threads)>
	        {
	            /** Index of a thread relative to itself.
	             *
	             * The value type and dimensionality are taken from the thread layer's index type.
	             *
	             * @return compile-time vector with every component set to zero
	             */
	            constexpr alpaka::concepts::Vector auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::thread),
	                ALPAKA_TYPEOF(unit::threads)) const
	            {
	                // acc is only inspected for its types, never evaluated at run time
	                using ThreadIdxVec = ALPAKA_TYPEOF(acc[layer::thread].idx());
	                constexpr uint32_t numDims = ThreadIdxVec::dim();

	                return fillCVec<typename ThreadIdxVec::type, numDims, 0u>();
	            }
	        };

	        template<concepts::Acc T_Acc>
	        struct GetExtentsOf::Op<T_Acc, ALPAKA_TYPEOF(origin::warp), ALPAKA_TYPEOF(unit::threads)>
	        {
	            /** Number of threads in a warp.
	             *
	             * @param acc unused; the value is taken from T_Acc::getWarpSize() at compile time
	             * @return compile-time vector holding the warp size
	             */
	            constexpr alpaka::concepts::CVector<uint32_t> auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::warp),
	                ALPAKA_TYPEOF(unit::threads)) const
	            {
	                using WarpExtent = alpaka::CVec<uint32_t, T_Acc::getWarpSize()>;

	                alpaka::unused(acc);
	                return WarpExtent{};
	            }
	        };

	        template<concepts::Acc T_Acc>
	        struct GetExtentsOf::Op<T_Acc, ALPAKA_TYPEOF(origin::block), ALPAKA_TYPEOF(unit::threads)>
	        {
	            /** Number of threads in a block.
	             *
	             * @param acc accelerator providing the thread layer
	             * @return the count reported by the accelerator's thread layer
	             */
	            constexpr alpaka::concepts::Vector auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::block),
	                ALPAKA_TYPEOF(unit::threads)) const
	            {
	                auto const threadsPerBlock = acc[layer::thread].count();
	                return threadsPerBlock;
	            }
	        };

	        template<concepts::Acc T_Acc>
	        struct GetExtentsOf::Op<T_Acc, ALPAKA_TYPEOF(origin::block), ALPAKA_TYPEOF(unit::warps)>
	        {
	            /** Number of warps in a block.
	             *
	             * The total thread count of the block is divided by the warp size, rounding up.
	             *
	             * @param acc accelerator the block's thread extent is queried from
	             * @return one-dimensional vector holding the warp count
	             */
	            constexpr alpaka::concepts::Vector<alpaka::NotRequired, 1u> auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::block),
	                ALPAKA_TYPEOF(unit::warps)) const
	            {
	                std::integral auto numThreads = acc.getExtentsOf(onAcc::origin::block, onAcc::unit::threads).product();

	                // the warp size is converted to the index type of the thread count before dividing
	                using Index = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(numThreads)>;
	                return Vec{divCeil(numThreads, static_cast<Index>(T_Acc::getWarpSize()))};
	            }
	        };

	        template<concepts::Acc T_Acc>
	        struct GetExtentsOf::Op<T_Acc, ALPAKA_TYPEOF(origin::grid), ALPAKA_TYPEOF(unit::blocks)>
	        {
	            /** Number of blocks in the grid.
	             *
	             * @param acc accelerator providing the block layer
	             * @return the count reported by the accelerator's block layer
	             */
	            constexpr alpaka::concepts::Vector auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::grid),
	                ALPAKA_TYPEOF(unit::blocks)) const
	            {
	                auto const blocksPerGrid = acc[layer::block].count();
	                return blocksPerGrid;
	            }
	        };

	        template<concepts::Acc T_Acc>
	        struct GetExtentsOf::Op<T_Acc, ALPAKA_TYPEOF(origin::grid), ALPAKA_TYPEOF(unit::warps)>
	        {
	            /** Number of warps in the grid.
	             *
	             * Product of the total number of blocks in the grid and the number of warps per block.
	             *
	             * @param acc accelerator both extents are queried from
	             * @return one-dimensional vector holding the warp count
	             */
	            constexpr alpaka::concepts::Vector<alpaka::NotRequired, 1u> auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::grid),
	                ALPAKA_TYPEOF(unit::warps)) const
	            {
	                std::integral auto numBlocks = acc.getExtentsOf(onAcc::origin::grid, onAcc::unit::blocks).product();
	                std::integral auto warpsPerBlock
	                    = acc.getExtentsOf(onAcc::origin::block, onAcc::unit::warps).product();

	                return Vec{numBlocks * warpsPerBlock};
	            }
	        };

	        template<concepts::Acc T_Acc>
	        struct GetExtentsOf::Op<T_Acc, ALPAKA_TYPEOF(origin::grid), ALPAKA_TYPEOF(unit::threads)>
	        {
	            /** Number of threads in the grid.
	             *
	             * Computed as (blocks per grid) * (threads per block) on the counts reported by the two layers.
	             *
	             * @param acc accelerator providing the block and thread layers
	             */
	            constexpr alpaka::concepts::Vector auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::grid),
	                ALPAKA_TYPEOF(unit::threads)) const
	            {
	                auto const blocksPerGrid = acc[layer::block].count();
	                auto const threadsPerBlock = acc[layer::thread].count();
	                return blocksPerGrid * threadsPerBlock;
	            }
	        };

	        template<concepts::Acc T_Acc>
	        struct GetExtentsOf::Op<T_Acc, ALPAKA_TYPEOF(origin::thread), ALPAKA_TYPEOF(unit::threads)>
	        {
	            /** Extent of a single thread measured in threads.
	             *
	             * The value type and dimensionality are taken from the thread layer's count type.
	             *
	             * @return compile-time vector with every component set to one
	             */
	            constexpr alpaka::concepts::Vector auto operator()(
	                T_Acc const& acc,
	                ALPAKA_TYPEOF(origin::thread),
	                ALPAKA_TYPEOF(unit::threads)) const
	            {
	                // acc is only inspected for its types, never evaluated at run time
	                using ThreadCountVec = ALPAKA_TYPEOF(acc[layer::thread].count());
	                constexpr uint32_t numDims = ThreadCountVec::dim();

	                return fillCVec<typename ThreadCountVec::type, numDims, 1u>();
	            }
	        };
	    } // namespace internalCompute
	} // namespace alpaka::onAcc
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/interfaceImpl.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/memFence.hpp ==
	// ==
	/* Copyright 2025 Mehmet Yusufoglu, René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onAcc/internal/interface.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onAcc/memoryOrder.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onAcc/scope.hpp"    // amalgamate: file already inlined

	#include <type_traits>

	namespace alpaka::onAcc
	{
	    // Main entry point for thread fences
	    // The forwarder as a free function, forwarding to the internalCompute namespace
	    constexpr void memFence(auto const& acc, concepts::Scope auto const scope, concepts::MemoryOrder auto const order)
	    {
	        // All specialisations are in internalCompute namespace. Dispatching to the appropriate backend.
	        internalCompute::MemoryFence::Op<ALPAKA_TYPEOF(acc.getApi()), ALPAKA_TYPEOF(scope), ALPAKA_TYPEOF(order)>{}(
	            acc,
	            scope,
	            order);
	    }
	} // namespace alpaka::onAcc
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/memFence.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/range.hpp ==
	// ==
	/* Copyright 2024 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/IdxRange.hpp ==
		// ==
		/* Copyright 2024 René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/mem/IdxRange.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/WorkerGroup.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/tag.hpp"    // amalgamate: file already inlined

		namespace alpaka::onAcc::internal
		{
		    template<typename T_ExtentFn>
		    struct IdxRangeFn
		    {
		    private:
		        //! Callable that is invoked with the accelerator to obtain the extent of the range.
		        T_ExtentFn const m_extentFn;

		    public:
		        //! Keep a copy of the given extent callable.
		        constexpr IdxRangeFn(T_ExtentFn const& extentFn) : m_extentFn{extentFn}
		        {
		        }

		        //! Create the index range from the extent the stored callable returns for acc.
		        constexpr auto getIdxRange(auto const& acc) const
		        {
		            auto const range = IdxRange{m_extentFn(acc)};
		            return range;
		        }
		    };

		    template<concepts::Origin T_Origin, concepts::Unit T_Unit, typename T_MultiDimensional = MultiDimensional<true>>
		    struct IdxRangeLazy
		    {
		        //! The arguments only select the template parameters; no state is stored.
		        constexpr IdxRangeLazy(T_Origin const& origin, T_Unit const& unit, T_MultiDimensional = T_MultiDimensional{})
		        {
		            alpaka::unused(origin, unit);
		        }

		        /** Create the index range for the configured origin/unit pair.
		         *
		         * The extent is queried from the accelerator. If T_MultiDimensional::value is true the extent is used
		         * as it is, otherwise it is collapsed to a one-dimensional range spanning the product of its components.
		         */
		        constexpr auto getIdxRange(auto const& acc) const
		        {
		            using ExtentOp = internalCompute::GetExtentsOf::Op<ALPAKA_TYPEOF(acc), T_Origin, T_Unit>;
		            auto const extentMd = ExtentOp{}(acc, T_Origin{}, T_Unit{});

		            if constexpr(T_MultiDimensional::value)
		            {
		                return IdxRange{extentMd};
		            }
		            else
		            {
		                return IdxRange{Vec{extentMd.product()}};
		            }
		        }
		    };
		} // namespace alpaka::onAcc::internal

		namespace alpaka::trait
		{
		    template<concepts::SpecializationOf<onAcc::internal::IdxRangeLazy> T>
		    struct IsLazyIndexRange<T> : std::true_type
		    {
		    };

		    template<concepts::SpecializationOf<onAcc::internal::IdxRangeFn> T>
		    struct IsLazyIndexRange<T> : std::true_type
		    {
		    };
		} // namespace alpaka::trait
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/internal/IdxRange.hpp ==
		// ============================================================================


	namespace alpaka::onAcc
	{
	    /** Predefined lazy index ranges.
	     *
	     * Each constant is an internal::IdxRangeLazy that only encodes an origin/unit pair; the actual extent is queried
	     * from the accelerator when getIdxRange(acc) is called.
	     */
	    namespace range
	    {
	        /** Range of all threads in the grid, with the dimensionality of the extent preserved. */
	        constexpr auto threadsInGrid = internal::IdxRangeLazy{origin::grid, unit::threads};
	        /** Range of all blocks in the grid, with the dimensionality of the extent preserved. */
	        constexpr auto blocksInGrid = internal::IdxRangeLazy{origin::grid, unit::blocks};
	        /** Range of all threads in a block, with the dimensionality of the extent preserved. */
	        constexpr auto threadsInBlock = internal::IdxRangeLazy{origin::block, unit::threads};

	        // NOTE(review): the ranges below pass the `linearized` tag; presumably it selects the one-dimensional form
	        // (product of the extent's components) in IdxRangeLazy::getIdxRange - confirm against its definition.
	        /** Range of all threads in the grid, created with the `linearized` tag. */
	        constexpr auto linearThreadsInGrid = internal::IdxRangeLazy{origin::grid, unit::threads, linearized};
	        /** Range of all blocks in the grid, created with the `linearized` tag. */
	        constexpr auto linearBlocksInGrid = internal::IdxRangeLazy{origin::grid, unit::blocks, linearized};
	        /** Range of all warps in a grid. */
	        // no `linearized` tag: the extent for (grid, warps) is declared one-dimensional already
	        constexpr auto linearWarpsInGrid = internal::IdxRangeLazy{origin::grid, unit::warps};

	        /** Range of all warps in a block. */
	        // no `linearized` tag: the extent for (block, warps) is declared one-dimensional already
	        constexpr auto linearWarpsInBlock = internal::IdxRangeLazy{origin::block, unit::warps};
	        /** Range of all threads in a block, created with the `linearized` tag. */
	        constexpr auto linearThreadsInBlock = internal::IdxRangeLazy{origin::block, unit::threads, linearized};

	        /** Range of all threads in a warp. */
	        constexpr auto linearThreadsInWarp = internal::IdxRangeLazy{origin::warp, unit::threads};
	    } // namespace range
	} // namespace alpaka::onAcc
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/range.hpp ==
	// ============================================================================

// #include "alpaka/onAcc/tag.hpp"    // amalgamate: file already inlined
	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/warp.hpp ==
	// ==
	/* Copyright 2025 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego, Mehmet Yusufoglu,
	 *                René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 *
	 * Bridges runtime accelerator instances to the trait-based warp intrinsics so kernels can call them without tags.
	 * Exposes device-safe `alpaka::onAcc::warp::*` wrappers for ballots, shuffles, and lane queries.
	 * Reuses the compile-time warp trait specialisations instead of duplicating backend-specific logic in kernels.
	 * Supplies a uniform warp API across CUDA, HIP, SYCL, and host-emulation accelerators.
	 *
	 * Some example usages:
	 * - consteval `alpaka::getWarpSize(api::Cuda{}, deviceKind::NvidiaGpu{})` for tag-driven compile-time logic.
	 * - device-side `alpaka::onAcc::warp::getSize(acc)` to query the active warp inside a kernel.
	 * - device-side `alpaka::onAcc::warp::shfl(acc, 42, 0u) == 42`
	 */

	// #pragma once
	// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
	// #include "alpaka/interface.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onAcc/internal/warp.hpp"    // amalgamate: file already inlined
	// #include "alpaka/tag.hpp"    // amalgamate: file already inlined

	// #include <cstdint>    // amalgamate: file already included

	namespace alpaka::onAcc::warp
	{
	    /** Return the bit-mask of active lanes for the warp associated with the accelerator.
	     *
	     * @return bit mask where the Nth bit is set to 1 if the corresponding thread is participating in the call. The
	     * return type can be 64bit or 32bit depending on the API.
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    constexpr auto activemask(T_Acc const& acc) -> std::conditional_t<T_Acc::getWarpSize() <= 32u, uint32_t, uint64_t>
	    {
	        // The implementation is selected by the accelerator type and the API tag stored in the accelerator.
	        using ApiTag = ALPAKA_TYPEOF(acc[object::api]);
	        using Impl = internal::Activemask::Op<ALPAKA_TYPEOF(acc), ApiTag>;

	        return Impl{}(acc, ApiTag{});
	    }

	    /** Return the lane index of the current thread within its warp. */
	    constexpr uint32_t getLaneIdx(alpaka::onAcc::concepts::Acc auto const& acc)
	    {
	        // public entry point; the actual query lives in the internal namespace
	        uint32_t const laneIdx = internal::getLaneIdx(acc);
	        return laneIdx;
	    }

	    /** Return the warp index within the block. */
	    constexpr uint32_t getWarpIdx(alpaka::onAcc::concepts::Acc auto const& acc)
	    {
	        // public entry point; the actual query lives in the internal namespace
	        uint32_t const warpIdxInBlock = internal::getWarpIdx(acc);
	        return warpIdxInBlock;
	    }

	    /** Evaluates predicate for all active threads of the warp
	     *
	     * It follows the logic of __all_sync(__activemask(), predicate) in CUDA but returns a boolean.
	     *
	     * Note:
	     * * The programmer must ensure that all threads calling this function are executing
	     *   the same line of code. In particular, it is not portable to write
	     *   if(a) {all} else {all}.
	     *
	     * @param predicate The predicate value for current thread.
	     * @return true if the predicate is non-zero for all active threads, else false
	     */
	    constexpr bool all(alpaka::onAcc::concepts::Acc auto const& acc, int32_t predicate)
	    {
	        // The implementation is selected by the accelerator type and the API tag stored in the accelerator.
	        using ApiTag = ALPAKA_TYPEOF(acc[object::api]);
	        using Impl = internal::All::Op<ALPAKA_TYPEOF(acc), ApiTag>;

	        return Impl{}(acc, ApiTag{}, predicate);
	    }

	    /** Evaluates predicate for all active threads of the warp.
	     *
	     * It follows the logic of __any_sync(__activemask(), predicate) in CUDA but returns a boolean.
	     *
	     * Note:
	     * * The programmer must ensure that all threads calling this function are executing
	     *   the same line of code. In particular, it is not portable to write
	     *   if(a) {any} else {any}.
	     *
	     * @param predicate The predicate value for current thread.
	     * @return true if the predicate is non-zero for at least one active thread, else false
	     */
	    constexpr bool any(alpaka::onAcc::concepts::Acc auto const& acc, int32_t predicate)
	    {
	        // The implementation is selected by the accelerator type and the API tag stored in the accelerator.
	        using ApiTag = ALPAKA_TYPEOF(acc[object::api]);
	        using Impl = internal::Any::Op<ALPAKA_TYPEOF(acc), ApiTag>;

	        return Impl{}(acc, ApiTag{}, predicate);
	    }

	    /** Evaluates predicate for all non-exited threads in a warp and returns
	     * a 32- or 64-bit unsigned integer (depending on the accelerator)
	     * whose Nth bit is set if and only if predicate evaluates to non-zero
	     * for the Nth thread of the warp and the Nth thread is active.
	     *
	     * It follows the logic of __ballot_sync(__activemask(), predicate) in CUDA.
	     *
	     * Note:
	     * * The programmer must ensure that all threads calling this function are executing
	     *   the same line of code. In particular, it is not portable to write
	     *   if(a) {ballot} else {ballot}.
	     *
	     * @param predicate The predicate value for current thread.
	     * @return bit mask where the Nth bit is set to 1 if the corresponding threads predicate was non zero. The return
	     * type can be 64bit or 32bit depending on the API.
	     */
	    template<alpaka::onAcc::concepts::Acc T_Acc>
	    constexpr auto ballot(T_Acc const& acc, int32_t predicate)
	        -> std::conditional_t<T_Acc::getWarpSize() <= 32u, uint32_t, uint64_t>
	    {
	        // The implementation is selected by the accelerator type and the API tag stored in the accelerator.
	        using ApiTag = ALPAKA_TYPEOF(acc[object::api]);
	        using Impl = internal::Ballot::Op<ALPAKA_TYPEOF(acc), ApiTag>;

	        return Impl{}(acc, ApiTag{}, predicate);
	    }

	    /** Return the warp size.
	     *
	     * A warp is a collection of threads which work in lock step (executing the same command).
	     * The warp size can be larger than the number of threads executed in the kernel/ thread block.
	     * @{
	     */
	    template<concepts::Acc T_Acc>
	    constexpr uint32_t getSize()
	    {
	        // type-only query: no accelerator instance is required
	        uint32_t const lanesPerWarp = internal::getSize<T_Acc>();
	        return lanesPerWarp;
	    }

	    template<concepts::Acc T_Acc>
	    constexpr uint32_t getSize(T_Acc const&)
	    {
	        // the instance is only used to deduce T_Acc; the value comes from the accelerator type
	        uint32_t const lanesPerWarp = T_Acc::getWarpSize();
	        return lanesPerWarp;
	    }

	    /** @} */

	    /** Exchange data between threads within a warp.
	     *
	     * Effectively executes:
	     *
	     *     __shared__ int32_t values[warpsize];
	     *     values[threadIdx.x] = value;
	     *     __syncthreads();
	     *     return values[width*(threadIdx.x/width) + srcLane%width];
	     *
	     * However, it does not use shared memory.
	     *
	     *  Commonly used with width = warpsize (the default), (returns values[srcLane])
	     *
	     * This method supports to be called in diverging control flow branches if you only query values from threads
	     * within the same branch path.
	     *
	     * @param value value to broadcast, only used if other thread is addressing the lane of this thread.
	     * @param srcLane source lane index within the group range [0; width).
	     * @param width number of threads receiving a single value, must be a power of 2.
	     * @return val from the thread index srcLane.
	     */
	    template<typename T, alpaka::onAcc::concepts::Acc T_Acc>
	    requires(std::is_trivially_copyable_v<T>)
	    constexpr T shfl(T_Acc const& acc, T const& value, uint32_t srcLane, uint32_t width = getSize<T_Acc>())
	    {
	        // The implementation is selected by the accelerator type, its API tag and the value type.
	        using ApiTag = ALPAKA_TYPEOF(acc[object::api]);
	        using Impl = internal::Shfl::Op<ALPAKA_TYPEOF(acc), ApiTag, T>;

	        // a width of zero is replaced by the warp size
	        return Impl{}(acc, ApiTag{}, value, srcLane, (width == 0u) ? getSize<T_Acc>() : width);
	    }

	    /** Read data from threads with higher lane index within a warp.
	     *
	     * It copies from a lane with higher ID relative to caller.
	     * The lane ID is calculated by adding delta to the caller’s lane ID.
	     *
	     * Effectively executes:
	     *
	     *     __shared__ int32_t values[warpsize];
	     *     values[threadIdx.x] = value;
	     *     __syncthreads();
	     *     return (threadIdx.x % width + delta < width) ? values[threadIdx.x + delta] : values[threadIdx.x];
	     *
	     * However, it does not use shared memory.
	     *
	     * Notes:
	     * * The programmer must ensure that all threads calling this
	     *   function (and the srcLane) are executing the same line of code.
	     *   In particular it is not portable to write if(a) {shfl} else {shfl}.
	     *
	     * * Commonly used with width = warpsize (the default), (returns values[threadIdx.x+delta] if threadIdx.x+delta <
	     * warpsize)
	     *
	     * @param value value to broadcast
	     * @param delta corresponds to the delta used to compute the lane ID
	     * @param width size of the group participating in the shuffle operation, must be a power of 2.
	     * @return the value from the thread index lane ID + delta within the group build by width, else value.
	     */
	    template<typename T, alpaka::onAcc::concepts::Acc T_Acc>
	    requires(std::is_trivially_copyable_v<T>)
	    constexpr T shflDown(T_Acc const& acc, T const& value, uint32_t delta, uint32_t width = getSize<T_Acc>())
	    {
	        // The implementation is selected by the accelerator type, its API tag and the value type.
	        using ApiTag = ALPAKA_TYPEOF(acc[object::api]);
	        using Impl = internal::ShflDown::Op<ALPAKA_TYPEOF(acc), ApiTag, T>;

	        // a width of zero is replaced by the warp size
	        return Impl{}(acc, ApiTag{}, value, delta, (width == 0u) ? getSize<T_Acc>() : width);
	    }

	    /** Read data from threads with lower lane index within a warp.
	     *
	     * It copies from a lane with lower ID relative to caller.
	     * The lane ID is calculated by subtracting delta from the caller’s lane ID.
	     *
	     * Effectively executes:
	     *
	     *     __shared__ int32_t values[warpsize];
	     *     values[threadIdx.x] = value;
	     *     __syncthreads();
	     *     return (threadIdx.x % width >= delta) ? values[threadIdx.x - delta] : values[threadIdx.x];
	     *
	     * However, it does not use shared memory.
	     *
	     * Notes:
	     * * The programmer must ensure that all threads calling this
	     *   function (and the srcLane) are executing the same line of code.
	     *   In particular it is not portable to write if(a) {shfl} else {shfl}.
	     *
	     * Commonly used with width = warpsize (the default), (returns values[threadIdx.x - delta] if threadIdx.x >= delta)
	     *
	     * @param value value to broadcast
	     * @param delta corresponds to the delta used to compute the lane ID
	     * @param width size of the group participating in the shuffle operation, must be a power of 2.
	     * @return the value from the thread index lane ID - delta within the group built by width, else value.
	     */
	    template<typename T, alpaka::onAcc::concepts::Acc T_Acc>
	    requires(std::is_trivially_copyable_v<T>)
	    constexpr T shflUp(T_Acc const& acc, T const& value, uint32_t delta, uint32_t width = getSize<T_Acc>())
	    {
	        // The implementation is selected by the accelerator type, its API tag and the value type.
	        using ApiTag = ALPAKA_TYPEOF(acc[object::api]);
	        using Impl = internal::ShflUp::Op<ALPAKA_TYPEOF(acc), ApiTag, T>;

	        // a width of zero is replaced by the warp size
	        return Impl{}(acc, ApiTag{}, value, delta, (width == 0u) ? getSize<T_Acc>() : width);
	    }

	    /** Exchange data between threads within a warp.
	     *
	     * It copies from a lane based on bitwise XOR of own lane ID.
	     * The lane ID is calculated by performing a bitwise XOR of the caller’s lane ID with laneMask
	     *
	     * Effectively executes:
	     *
	     *     __shared__ int32_t values[warpsize];
	     *     values[threadIdx.x] = value;
	     *     __syncthreads();
	     *     int lane = threadIdx.x ^ laneMask;
	     *     return values[lane / width > threadIdx.x / width ? threadIdx.x : lane];
	     *
	     * However, it does not use shared memory.
	     *
	     * Notes:
	     * * The programmer must ensure that all threads calling this
	     *   function (and the srcLane) are executing the same line of code.
	     *   In particular it is not portable to write if(a) {shfl} else {shfl}.
	     *
	     * * Commonly used with width = warpsize (the default), (returns values[threadIdx.x^laneMask])
	     *
	     * * Width must be a power of 2.
	     * @param value value to broadcast
	     * @param laneMask mask applied to the thread lane index within the subgroup created by width.
	     * @param width size of the group participating in the shuffle operation, must be a power of 2.
	     * @return the value from the thread index lane ID
	     */
	    template<typename T, alpaka::onAcc::concepts::Acc T_Acc>
	    requires(std::is_trivially_copyable_v<T>)
	    constexpr T shflXor(T_Acc const& acc, T const& value, uint32_t laneMask, uint32_t width = getSize<T_Acc>())
	    {
	        // The implementation is selected by the accelerator type, its API tag and the value type.
	        using ApiTag = ALPAKA_TYPEOF(acc[object::api]);
	        using Impl = internal::ShflXor::Op<ALPAKA_TYPEOF(acc), ApiTag, T>;

	        // a width of zero is replaced by the warp size
	        return Impl{}(acc, ApiTag{}, value, laneMask, (width == 0u) ? getSize<T_Acc>() : width);
	    }
	} // namespace alpaka::onAcc::warp
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onAcc/warp.hpp ==
	// ============================================================================

// #include "alpaka/onHost/Device.hpp"    // amalgamate: file already inlined
// #include "alpaka/onHost/DeviceSelector.hpp"    // amalgamate: file already inlined
// #include "alpaka/onHost/Queue.hpp"    // amalgamate: file already inlined
	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/concurrent.hpp ==
	// ==
	/* Copyright 2025 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/internal/concurrent.hpp ==
		// ==
		/* Copyright 2025 René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
		// #include "alpaka/functor.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/SimdAlgo.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/logger/logger.hpp"    // amalgamate: file already inlined
		// #include "alpaka/trait.hpp"    // amalgamate: file already inlined

		namespace alpaka::onHost::internal
		{
		    /** Kernel functor used by concurrent().
		     *
		     * Hands the user functor and all inputs to onAcc::SimdAlgo::concurrent, using the threadsInGrid worker group.
		     */
		    struct SimdConcurrentKernel
		    {
		        /** Run the functor over the given extents.
		         *
		         * @param acc accelerator the kernel is executed with
		         * @param extents scalar or multi-dimensional number of elements
		         * @param func user functor, invoked through callFunctor(); it must return void
		         * @param inputs data sources forwarded to SimdAlgo::concurrent
		         */
		        ALPAKA_FN_ACC void operator()(
		            onAcc::concepts::Acc auto const& acc,
		            alpaka::concepts::VectorOrScalar auto const& extents,
		            auto const& func,
		            alpaka::concepts::IDataSource auto&&... inputs) const
		        {
		            // Convert the scalar or vector extents into a Vec.
		            Vec const extentVec = extents;
		            auto gridWorkers = onAcc::SimdAlgo{onAcc::worker::threadsInGrid};

		            gridWorkers.concurrent(
		                acc,
		                extentVec,
		                [&func](auto const& workerAcc, auto&&... args)
		                {
		                    using FunctorResult = decltype(callFunctor(workerAcc, func, ALPAKA_FORWARD(args)...));
		                    static_assert(
		                        std::same_as<FunctorResult, void>,
		                        "The return type for a stencil concurrent functor should be void.");
		                    callFunctor(workerAcc, func, ALPAKA_FORWARD(args)...);
		                },
		                ALPAKA_FORWARD(inputs)...);
		        }
		    };

		    template<typename T_DataType>
		    inline void concurrent(
		        auto const& queue,
		        alpaka::concepts::Executor auto const exec,
		        alpaka::concepts::VectorOrScalar auto const& extents,
		        auto&& fn,
		        alpaka::concepts::IDataSource auto&&... in)
		    {
		        Vec const extentMd = extents;
		        auto frameSpec = getSimdFrameSpec<T_DataType>(queue.getDevice(), exec, extentMd);

		        ALPAKA_LOG_INFO(
		            onHost::logger::memory,
		            [&]()
		            {
		                std::stringstream ss;
		                ss << "concurrent{ extents=" << extentMd << ", value_type=" << onHost::demangledName<T_DataType>()
		                   << ", " << frameSpec << ", fn=" << onHost::demangledName(fn) << " }";
		                return ss.str();
		            });

		        queue.enqueue(
		            frameSpec,
		            KernelBundle{SimdConcurrentKernel{}, extentMd, ALPAKA_FORWARD(fn), ALPAKA_FORWARD(in)...});
		    }
		} // namespace alpaka::onHost::internal
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/internal/concurrent.hpp ==
		// ============================================================================


	namespace alpaka::onHost
	{
	    /** Execute an n-ary function on each element of all input data.
	     *
	     * Concurrent is similar to a for-each algorithm, with the difference that the functor is allowed to write to
	     * any argument. This makes it possible to implement a transform with an arbitrary number of output arguments.
	     *
	     * @param queue The queue to execute the transformation.
	     * @param exec The executor to use for the kernel execution.
	     * @param extents multi dimensional or scalar number of elements
	     * @param fn The function to apply to each element of the input data.
	     *   The functor should support @see SimdPtr and therefore can be used for stencil evaluations.
	     *   It is not required to wrap the functor with @see StencilFunc.
	     *   If a stencil lookup is executed you should take care to not read outside of valid memory ranges
	     *   by using sub-views to your input/output data. Optionally, a function can have an accelerator as its first
	     *   argument.
	     * @param inOut The input/output data, all data is passed to fn.
	     *
	     * examples for a unary add one functor:
	     * @code{.cpp}
	     *   struct Foo {
	     *      constexpr auto operator()(onAcc::concepts::Acc auto const&, concepts::SimdPtr auto const& a) const {
	     *          a = a.load() + 1;
	     *      }
	     *   };
	     *   struct Bar {
	     *      constexpr auto operator()(concepts::SimdPtr auto const& a) const {
	     *          a = a.load() + 1;
	     *      }
	     *   };
	     * @endcode
	     *
	     * @{
	     */
	    template<typename T_DataType, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
	    inline void concurrent(
	        Queue<T_Device, T_QueueKind> const& queue,
	        alpaka::concepts::Executor auto const exec,
	        alpaka::concepts::VectorOrScalar auto const& extents,
	        auto&& fn,
	        alpaka::concepts::IDataSource auto&&... inOut)
	    {
	        if constexpr(!(exec == alpaka::exec::anyExecutor))
	        {
	            // A concrete executor was requested: pass it through unchanged.
	            internal::concurrent<T_DataType>(queue, exec, extents, ALPAKA_FORWARD(fn), ALPAKA_FORWARD(inOut)...);
	        }
	        else
	        {
	            // anyExecutor is replaced by the default executor of the queue's device.
	            internal::concurrent<T_DataType>(
	                queue,
	                defaultExecutor(queue.getDevice()),
	                extents,
	                ALPAKA_FORWARD(fn),
	                ALPAKA_FORWARD(inOut)...);
	        }
	    }

	    /**
	     * Overload without an executor argument. The executor returned by defaultExecutor() for the queue's device is
	     * used; the default executor is an executor with most parallelism/performance.
	     */
	    template<typename T_DataType, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
	    inline void concurrent(
	        Queue<T_Device, T_QueueKind> const& queue,
	        alpaka::concepts::VectorOrScalar auto const& extents,
	        auto&& fn,
	        alpaka::concepts::IDataSource auto&&... inOut)
	    {
	        auto const exec = defaultExecutor(queue.getDevice());
	        internal::concurrent<T_DataType>(queue, exec, extents, ALPAKA_FORWARD(fn), ALPAKA_FORWARD(inOut)...);
	    }

	    /** @} */
	} // namespace alpaka::onHost
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/concurrent.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/iota.hpp ==
	// ==
	/* Copyright 2025 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/internal/iota.hpp ==
		// ==
		/* Copyright 2025 René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once

		// #include "alpaka/SimdPtr.hpp"    // amalgamate: file already inlined
		// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/SimdAlgo.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/logger/logger.hpp"    // amalgamate: file already inlined
		// #include "alpaka/trait.hpp"    // amalgamate: file already inlined

		namespace alpaka::onHost::internal
		{
		    /** Kernel functor used by iota().
		     *
		     * For every SIMD pointer handed out by onAcc::SimdAlgo::concurrent it writes
		     * `initValue + linearize(extents, idx) + laneId` into all outputs.
		     */
		    struct SimdIotaKernel
		    {
		        /** Fill all outputs with increasing values.
		         *
		         * @tparam T_DataType type in which the iota values are computed
		         * @param acc accelerator the kernel is executed with
		         * @param extents extents used for the iteration and for linearizing the index
		         * @param initValue value added to every linearized index
		         * @param inputs output data; each one is assigned the computed values cast to its own value type
		         */
		        template<typename T_DataType>
		        ALPAKA_FN_ACC void operator()(
		            onAcc::concepts::Acc auto const& acc,
		            alpaka::concepts::Vector auto extents,
		            T_DataType const& initValue,
		            alpaka::concepts::IMdSpan auto&&... inputs) const
		        {
		            // Work is distributed over the threadsInGrid worker group.
		            auto simdGrid = onAcc::SimdAlgo{onAcc::worker::threadsInGrid};

		            return simdGrid.concurrent(
		                acc,
		                extents,
		                [&](onAcc::concepts::Acc auto const&,
		                    alpaka::concepts::SimdPtr auto&& in0,
		                    alpaka::concepts::SimdPtr auto&&... inOther)
		                {
		                    // The SIMD type is taken from what the first output loads.
		                    using SimdType = ALPAKA_TYPEOF(in0.load());
		                    // Index reported by the first SIMD pointer.
		                    // NOTE(review): presumably the multi-dimensional index of the package's first lane — confirm.
		                    alpaka::concepts::Vector auto iotaOffsetMd = in0.getIdx();
		                    // Linearized index (with respect to extents) shifted by the start value.
		                    T_DataType linearBaseOffset
		                        = static_cast<T_DataType>(linearize(extents, iotaOffsetMd)) + initValue;
		                    // Build the SIMD value from a generator: every lane adds its lane id to the base offset.
		                    alpaka::concepts::Simd auto result
		                        = SimdType([&](auto const& laneId) constexpr
		                                   { return linearBaseOffset + static_cast<T_DataType>(laneId); });
		                    // write output: the result is cast to the value type of each destination before assignment
		                    in0 = pCast<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(in0)>>(result);
		                    ((inOther = pCast<alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(inOther)>>(result)), ...);
		                },
		                ALPAKA_FORWARD(inputs)...);
		        }
		    };

		    template<typename T_DataType>
		    inline void iota(
		        auto const& queue,
		        alpaka::concepts::Executor auto const exec,
		        alpaka::concepts::Vector auto const& extents,
		        T_DataType const& initValue,
		        alpaka::concepts::IMdSpan auto&&... inputs)
		    {
		        Vec const extentMd = extents;
		        auto frameSpec = getSimdFrameSpec<T_DataType>(queue.getDevice(), exec, extentMd);

		        ALPAKA_LOG_INFO(
		            onHost::logger::memory,
		            [&]()
		            {
		                std::stringstream ss;
		                ss << "iota{ extents=" << extentMd << ", value_type=" << onHost::demangledName<T_DataType>() << ", "
		                   << frameSpec << " }";
		                return ss.str();
		            });

		        queue.enqueue(frameSpec, KernelBundle{SimdIotaKernel{}, extentMd, initValue, ALPAKA_FORWARD(inputs)...});
		    }
		} // namespace alpaka::onHost::internal
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/internal/iota.hpp ==
		// ============================================================================


	#include <type_traits>

	namespace alpaka::onHost
	{
	    /** Fill data with sequentially increasing index (iota value).
	     *
	     * For multidimensional memory, the iota value is increased fastest in the last dimension.
	     *
	     * @tparam T_DataType Iota type which is used. Type must be convertible to the value type of the output data. Only
	     * fundamental types are allowed.
	     * @param queue The queue used to execute the algorithm.
	     * @param exec The executor to use for the kernel execution.
	     * @param initValue Index of the first element.
	     * @param out0 Output data to set the iota value. Any kind of alpaka View/MdSpan is supported. The product of the
	     * extents must fit into the precision of the index_type.
	     * @param outOther Additional output data to set the iota value. The extents must be at least as large as out0. Any
	     * kind of alpaka View/MdSpan is supported.
	     * @{
	     */
	    template<typename T_DataType, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
	    requires(std::is_fundamental_v<T_DataType>)
	    inline void iota(
	        Queue<T_Device, T_QueueKind> const& queue,
	        alpaka::concepts::Executor auto const exec,
	        T_DataType const& initValue,
	        alpaka::concepts::IMdSpan auto&& out0,
	        alpaka::concepts::IMdSpan auto&&... outOther)
	        requires(
	            std::is_convertible_v<T_DataType, alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(out0)>>
	            && std::conjunction_v<
	                std::is_convertible<T_DataType, typename alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(outOther)>>...>)
	    {
	        // The extents of the first output define the iteration range for all outputs.
	        if constexpr(!(exec == alpaka::exec::anyExecutor))
	        {
	            // A concrete executor was requested: pass it through unchanged.
	            internal::iota<T_DataType>(
	                queue,
	                exec,
	                onHost::getExtents(out0),
	                initValue,
	                ALPAKA_FORWARD(out0),
	                ALPAKA_FORWARD(outOther)...);
	        }
	        else
	        {
	            // anyExecutor is replaced by the default executor of the queue's device.
	            internal::iota<T_DataType>(
	                queue,
	                defaultExecutor(queue.getDevice()),
	                onHost::getExtents(out0),
	                initValue,
	                ALPAKA_FORWARD(out0),
	                ALPAKA_FORWARD(outOther)...);
	        }
	    }

	    /**
	     * Overload without an executor argument. The executor returned by defaultExecutor() for the queue's device is
	     * used; the default executor is the executor with the most parallelism/performance.
	     */
	    template<typename T_DataType, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
	    requires(std::is_fundamental_v<T_DataType>)
	    inline void iota(
	        Queue<T_Device, T_QueueKind> const& queue,
	        T_DataType const& initValue,
	        alpaka::concepts::IMdSpan auto&& out0,
	        alpaka::concepts::IMdSpan auto&&... outOther)
	        requires(
	            std::is_convertible_v<T_DataType, alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(out0)>>
	            && std::conjunction_v<
	                std::is_convertible<T_DataType, typename alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(outOther)>>...>)
	    {
	        auto const exec = defaultExecutor(queue.getDevice());

	        // The extents of the first output define the iteration range for all outputs.
	        internal::iota<T_DataType>(
	            queue,
	            exec,
	            onHost::getExtents(out0),
	            initValue,
	            ALPAKA_FORWARD(out0),
	            ALPAKA_FORWARD(outOther)...);
	    }

	    /** @} */
	} // namespace alpaka::onHost
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/iota.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/reduce.hpp ==
	// ==
	/* Copyright 2025 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
	// #include "alpaka/mem/concepts/IDataStorage.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/internal/transformReduce.hpp ==
		// ==
		/* Copyright 2025 René Widera, Mehmet Yusufoglu
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
		// #include "alpaka/api/util.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
		// #include "alpaka/functor.hpp"    // amalgamate: file already inlined
		// #include "alpaka/mem/MdSpan.hpp"    // amalgamate: file already inlined
		// #include "alpaka/mem/concepts/IDataSource.hpp"    // amalgamate: file already inlined
		// #include "alpaka/mem/concepts/IMdSpan.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/SimdAlgo.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/atomic.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/logger/logger.hpp"    // amalgamate: file already inlined
		// #include "alpaka/trait.hpp"    // amalgamate: file already inlined

		namespace alpaka::onHost::internal
		{
		    /** Kernel functor used by transformReduce().
		     *
		     * Every thread accumulates partial results in dynamic shared memory, the block then combines these slots with
		     * a tree reduction, and the first thread of each block merges the block result into the output with
		     * atomicInvoke().
		     */
		    struct SimdTransformReduceKernel
		    {
		        // Size of the dynamic shared memory in bytes; transformReduce() sets it to
		        // frameExtents.product() * sizeof(T_DataType).
		        // NOTE(review): presumably read by the kernel launcher to size the dynamic shared memory — confirm.
		        uint32_t dynSharedMemBytes = 0u;

		        /** Reduce all inputs into the first element of output.
		         *
		         * @param acc accelerator the kernel is executed with
		         * @param numChunks number of chunks per dimension
		         * @param chunkExtents extents of a single chunk; also the extents of the shared memory buffer
		         * @param extentMd extents of the input data
		         * @param neutralElement neutral element of reduceFunc, its type must equal the output value type
		         * @param output destination; the block results are merged atomically into output.data()
		         * @param reduceFunc binary reduce functor
		         * @param transformFunc functor handed to SimdAlgo::transformReduce together with the inputs
		         * @param inputs input data sources
		         */
		        template<typename T_DataType>
		        ALPAKA_FN_ACC void operator()(
		            onAcc::concepts::Acc auto const& acc,
		            alpaka::concepts::Vector auto const& numChunks,
		            alpaka::concepts::Vector auto const& chunkExtents,
		            alpaka::concepts::Vector auto const& extentMd,
		            T_DataType const& neutralElement,
		            alpaka::concepts::IMdSpan auto output,
		            auto const& reduceFunc,
		            auto const& transformFunc,
		            alpaka::concepts::IDataSource auto&&... inputs) const
		        {
		            static_assert(
		                std::is_same_v<ALPAKA_TYPEOF(neutralElement), alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(output)>>,
		                "The neutral element type must match the data output type.");


		            // Shared memory for block-wide reduction, viewed as an MdSpan with the extents of one chunk
		            T_DataType* dynS = onAcc::getDynSharedMem<T_DataType>(acc);
		            auto pitchMd = alpaka::calculatePitchesFromExtents<T_DataType>(chunkExtents);
		            auto tbSum = MdSpan{dynS, chunkExtents, pitchMd};

		            // Index map that distributes the elements of one chunk over the threads of the block
		            auto traverseInFrame = alpaka::onAcc::makeIdxMap(
		                acc,
		                alpaka::onAcc::worker::threadsInBlock,
		                alpaka::IdxRange{chunkExtents});

		            // Initialize shared memory by setting all elements to the neutral element or identity value
		            // for the reduction operation.
		            for(auto elemIdxInFrame : traverseInFrame)
		            {
		                tbSum[elemIdxInFrame] = neutralElement;
		            }

		            // Chunk origins are visited block-wise: the range runs from zero to numChunks * chunkExtents with a
		            // stride of chunkExtents.
		            auto const chunkDataExtent = numChunks * chunkExtents;
		            auto traverseOverFrames = alpaka::onAcc::makeIdxMap(
		                acc,
		                alpaka::onAcc::worker::blocksInGrid,
		                alpaka::IdxRange{chunkDataExtent.fill(0), chunkDataExtent, chunkExtents});

		            for(auto chunkIdx : traverseOverFrames)
		            {
		                // The same index map as for the initialization is used, so each shared memory slot is
		                // accumulated with the same traversal that initialized it.
		                for(alpaka::concepts::Vector auto elemIdxInChunk : traverseInFrame)
		                {
		                    // Worker group described by this element's position (chunkIdx + elemIdxInChunk) within
		                    // chunkDataExtent.
		                    auto allThreads = alpaka::onAcc::SimdAlgo{
		                        alpaka::onAcc::WorkerGroup{chunkIdx + elemIdxInChunk, chunkDataExtent}};

		                    // reduce functor with simd package support
		                    auto reducedValue
		                        = allThreads
		                              .transformReduce(acc, extentMd, neutralElement, reduceFunc, transformFunc, inputs...);
		                    // Fold the partial result into the shared memory slot of this element.
		                    auto& tbSumRef = tbSum[elemIdxInChunk];
		                    tbSumRef = reduceFunc(tbSumRef, reducedValue);
		                }
		            }

		            // Linear thread index within the block and the total number of threads in the block
		            auto const laneIdInBlock = linearize(acc[alpaka::layer::thread].count(), acc[alpaka::layer::thread].idx());
		            auto const blockSize = acc[alpaka::layer::thread].count().product();
		            // Synchronize threads before aggregation
		            alpaka::onAcc::syncBlockThreads(acc);

		            // Aggregate shared memory slots: slots in the range [blockSize, chunkExtents.product()) are folded
		            // into the slot at the linear thread index.
		            // NOTE(review): dynS[laneIdInBlock] assumes the shared buffer holds at least blockSize elements —
		            // confirm that chunkExtents.product() >= blockSize always holds.
		            for(auto [linearSharedElemIdx] : alpaka::onAcc::makeIdxMap(
		                    acc,
		                    alpaka::onAcc::worker::linearThreadsInBlock,
		                    alpaka::IdxRange{blockSize, chunkExtents.product()}))
		            {
		                dynS[laneIdInBlock] = reduceFunc(dynS[laneIdInBlock], dynS[linearSharedElemIdx]);
		            }

		            alpaka::onAcc::syncBlockThreads(acc);

		            // Perform a parallel reduction within the block
		            // This is a tree reduction algorithm
		            // NOTE(review): halving the offset only covers all slots if blockSize is a power of two — confirm.
		            for(auto offset = blockSize / 2; offset > 0; offset /= 2)
		            {
		                // Threads are synchronized before each level reads the slots written by the previous level.
		                alpaka::onAcc::syncBlockThreads(acc);
		                if(laneIdInBlock < offset)
		                {
		                    dynS[laneIdInBlock] = reduceFunc(dynS[laneIdInBlock], dynS[laneIdInBlock + offset]);
		                }
		            }

		            // Atomic update of the global result, done only by the thread with linear index zero
		            if(laneIdInBlock == 0)
		            {
		                using alpaka::onAcc::atomic::atomicInvoke;
		                if constexpr(
		                    alpaka::concepts::SpecializationOf<ALPAKA_TYPEOF(reduceFunc), ScalarFunc>
		                    || alpaka::concepts::SpecializationOf<ALPAKA_TYPEOF(reduceFunc), StencilFunc>)
		                {
		                    // Handle wrapped reduce functors e.g. ScalarFunc or StencilFunc:
		                    // atomicInvoke is called with the wrapper cast to its nested Functor type.
		                    using ReduceFunctor = typename ALPAKA_TYPEOF(reduceFunc)::Functor;
		                    atomicInvoke(
		                        static_cast<ReduceFunctor const&>(reduceFunc),
		                        acc,
		                        output.data(),
		                        dynS[laneIdInBlock]);
		                }
		                else
		                    atomicInvoke(reduceFunc, acc, output.data(), dynS[laneIdInBlock]);
		            }
		        }
		    };

		    template<typename T_DataType>
		    inline void transformReduce(
		        auto const& queue,
		        alpaka::concepts::Executor auto const exec,
		        T_DataType const& neutralElement,
		        alpaka::concepts::IMdSpan auto out,
		        auto&& reduceFn,
		        auto&& transformFn,
		        auto&& in0,
		        alpaka::concepts::IDataSource auto&&... in)
		    {
		        auto extentMd = onHost::getExtents(in0);
		        using IndexType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(extentMd)>;
		        auto frameSpec = getSimdFrameSpec<T_DataType>(queue.getDevice(), exec, extentMd);

		        /* Adjust the launch parameters to not oversubscribe a device too much.
		         *
		         * @todo: This heuristic should be adjusted based on benchmarking different cases.
		         */
		        {
		            IndexType multiprocessorScaling = 1u;
		            if constexpr(!(ALPAKA_TYPEOF(queue.getDevice().getDeviceKind()){} == deviceKind::cpu))
		            {
		                // For non-CPU devices, we scale the number of frames based on an arbitrary number derived from
		                // testing with the dot kernel of the bablestream benchmark.
		                multiprocessorScaling = 32u;
		            }

		            auto const numMultiProcessors = queue.getDevice().getDeviceProperties().multiProcessorCount;
		            auto adjsutedNumFrames = alpaka::api::util::adjustToLimit(
		                frameSpec.getNumFrames(),
		                static_cast<IndexType>(numMultiProcessors * multiprocessorScaling));
		            frameSpec = FrameSpec{adjsutedNumFrames, frameSpec.getFrameExtents(), exec};
		        }

		        /* Derive the chunk size and number of chunks from the SIMD optimized frame specification.
		         * The chunking parameters influences the numerical precision because it provides the possibility to control
		         * the length of the accumulation chain of a single thread.
		         */
		        auto numChunks = frameSpec.getNumFrames();
		        auto chunkExtents = frameSpec.getFrameExtents();

		        auto kernelFn = SimdTransformReduceKernel{
		            static_cast<uint32_t>(frameSpec.getFrameExtents().product() * sizeof(T_DataType))};

		        ALPAKA_LOG_INFO(
		            onHost::logger::memory,
		            [&]()
		            {
		                std::stringstream ss;
		                ss << "transformReduce{ extents=" << extentMd << ", value_type=" << onHost::demangledName<T_DataType>()
		                   << ", " << frameSpec << ", reduceFn=" << onHost::demangledName(reduceFn)
		                   << ", transformFn=" << onHost::demangledName(transformFn) << " }";
		                return ss.str();
		            });

		        onHost::fill(queue, out, neutralElement, out.getExtents().fill(1));
		        queue.enqueue(
		            frameSpec,
		            KernelBundle{
		                kernelFn,
		                numChunks,
		                chunkExtents,
		                extentMd,
		                neutralElement,
		                out,
		                ALPAKA_FORWARD(reduceFn),
		                ALPAKA_FORWARD(transformFn),
		                ALPAKA_FORWARD(in0),
		                ALPAKA_FORWARD(in)...});
		    }
		} // namespace alpaka::onHost::internal
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/internal/transformReduce.hpp ==
		// ============================================================================


	namespace alpaka::onHost
	{
	    /** accumulate the results into a scalar value.
	     *
	     * @param queue The queue to execute the transformation.
	     * @param exec The executor to use for the kernel execution.
	     * @param neutralElement The neutral element in respect to binaryReduceFn.
	     * @param out MdSpan for the result. The value_type must be equal to neutralElement and the result of the binary
	     * reduce functor type. The result is written to the first element of the output data.
	     * @param binaryReduceFn Reduce binary functor, the functor operation must be associative and commutative.
	     *   The atomic operation atomic::atomicInvoke(ReduceFnType, onAcc::concepts::Acc, auto* destination,auto source)
	     * must be overloaded. The functor execution order is not specified. The functor should support Simd packages, if
	     * not you can enforce the element wise execution by wrapping into ScalarFunc.
	     * @param in The input data which should be reduced.
	     *
	     * @{
	     */
	    template<typename DataType, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
	    inline void reduce(
	        Queue<T_Device, T_QueueKind> const& queue,
	        alpaka::concepts::Executor auto const exec,
	        DataType const& neutralElement,
	        alpaka::concepts::IMdSpan auto out,
	        auto&& binaryReduceFn,
	        auto&& in) requires(std::same_as<DataType, alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(out)>>)
	    {
	        // A plain reduction is a transform-reduce with the identity as transformation.
	        if constexpr(!(exec == alpaka::exec::anyExecutor))
	        {
	            // A concrete executor was requested: pass it through unchanged.
	            internal::transformReduce(
	                queue,
	                exec,
	                neutralElement,
	                out,
	                ALPAKA_FORWARD(binaryReduceFn),
	                std::identity{},
	                ALPAKA_FORWARD(in));
	        }
	        else
	        {
	            // anyExecutor is replaced by the default executor of the queue's device.
	            internal::transformReduce(
	                queue,
	                defaultExecutor(queue.getDevice()),
	                neutralElement,
	                out,
	                ALPAKA_FORWARD(binaryReduceFn),
	                std::identity{},
	                ALPAKA_FORWARD(in));
	        }
	    }

	    /**
	     * Overload without an executor argument. An available default executor is selected automatically; the default
	     * executor is an executor with most parallelism/performance.
	     */
	    template<typename DataType, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
	    inline void reduce(
	        Queue<T_Device, T_QueueKind> const& queue,
	        DataType const& neutralElement,
	        alpaka::concepts::IMdSpan auto out,
	        auto&& binaryReduceFn,
	        alpaka::concepts::IDataSource auto&& in)
	        requires(std::same_as<DataType, alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(out)>>)
	    {
	        auto const exec = defaultExecutor(queue.getDevice());

	        // Delegate to the overload taking an explicit executor.
	        reduce(queue, exec, neutralElement, out, ALPAKA_FORWARD(binaryReduceFn), ALPAKA_FORWARD(in));
	    }

	    /** @} */
	} // namespace alpaka::onHost
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/reduce.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/scan.hpp ==
	// ==
	/* Copyright 2025 Anton Reinhard
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
	// #include "alpaka/concepts.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/internal/scan.hpp ==
		// ==
		/* Copyright 2025 Anton Reinhard
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once

		// #include "alpaka/CVec.hpp"    // amalgamate: file already inlined
		// #include "alpaka/Simd.hpp"    // amalgamate: file already inlined
		// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/SimdAlgo.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/warp.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/logger/logger.hpp"    // amalgamate: file already inlined
		// #include "alpaka/trait.hpp"    // amalgamate: file already inlined

		// #include <array> // std::array    // amalgamate: file already included
		// #include <cstddef> // std::size_t    // amalgamate: file already included
		// #include <tuple> // std::tuple    // amalgamate: file already included
		#include <type_traits> // is_same_v
		#include <typeinfo>

		namespace alpaka::onHost::internal
		{
		    /* Selects the scan variant; used as a non-type template parameter of the scan kernels
		     * (e.g. Scan_ScanBlocksKernel).
		     *
		     * NOTE(review): presumably the usual prefix-scan meaning (exclusive: element i is not part of result i,
		     * inclusive: it is) — confirm against the kernels.
		     */
		    enum ScanType
		    {
		        EXCLUSIVE_SCAN,
		        INCLUSIVE_SCAN
		    };

		    /* Chunk size used by the scan implementation.
		     *
		     * Declared inline so that this header-defined constant is a single entity across all translation units
		     * instead of one internal-linkage copy per translation unit.
		     */
		    inline constexpr std::size_t chunkSize = 2048u;

		    /* Compile-time upper bound for the mini block size of the given device kind.
		     *
		     * NVIDIA, AMD and Intel GPUs use 8. Every other device kind uses 32768 / sizeof(T_Data).
		     *
		     * The division is carried out in std::size_t and the quotient is converted to T_Idx exactly once. This
		     * avoids the implicit std::size_t -> T_Idx conversion on return and keeps the result correct for index
		     * types that cannot represent 32768 itself.
		     */
		    template<alpaka::concepts::DeviceKind TDeviceKind, typename T_Idx, typename T_Data>
		    consteval T_Idx maximumMiniBlockSize()
		    {
		        if constexpr(TDeviceKind{} == deviceKind::nvidiaGpu)
		            return static_cast<T_Idx>(8);
		        else if constexpr(TDeviceKind{} == deviceKind::amdGpu)
		            return static_cast<T_Idx>(8);
		        else if constexpr(TDeviceKind{} == deviceKind::intelGpu)
		            return static_cast<T_Idx>(8);
		        else
		            return static_cast<T_Idx>(std::size_t{32768u} / sizeof(T_Data));
		    }

		    /* Map a shared memory index to a padded index to reduce bank conflicts between threads.
		     *
		     * One extra slot is inserted for every warpSize elements, i.e. the result is n + n / warpSize, where warpSize
		     * is onAcc::warp::getSize<T_Acc>() for the accelerator type T_Acc.
		     *
		     * NOTE(review): whether CPU accelerators end up without padding depends on the warp size reported for
		     * them — confirm.
		     */
		    template<typename T_Acc, typename T_Idx>
		    constexpr T_Idx conflictFreeAccess(T_Idx const& n)
		    {
		        constexpr T_Idx lanesPerWarp = static_cast<T_Idx>(onAcc::warp::getSize<T_Acc>());
		        return n + n / lanesPerWarp;
		    }

		    /* Do a mutating (in-place) exclusive scan on the given miniblock, and return the total sum.
		     *
		     * block points to extent.x() elements. After the call each element holds the sum of all elements that
		     * preceded it; the sum over the whole miniblock is returned.
		     *
		     * NOTE(review): the stride is halved/doubled every round, which looks like it requires extent.x() to be
		     * a power of two — confirm.
		     */
		    template<typename T_Idx, typename T_Data>
		    ALPAKA_FN_ACC T_Data scanMiniBlock(T_Data* block, alpaka::concepts::CVector<T_Idx> auto const& extent)
		    {
		        auto const numElems = extent.x();
		        auto const lastIdx = numElems - T_Idx{1};

		        // -- UP-SWEEP / REDUCE --
		        // Each round adds the lower partner of a pair onto the upper one; the stride doubles every round.
		        T_Idx stride = T_Idx{1};
		        for(T_Idx pairs = numElems / T_Idx{2}; pairs > 0; pairs >>= 1)
		        {
		            for(auto elem = T_Idx{0}; elem < T_Idx{2} * pairs; elem += T_Idx{2})
		            {
		                T_Idx const lower = stride * (elem + T_Idx{1}) - T_Idx{1};
		                T_Idx const upper = stride * (elem + T_Idx{2}) - T_Idx{1};
		                block[upper] += block[lower];
		            }
		            stride <<= 1;
		        }

		        // The last element now holds the total; remember it and clear the slot for the down-sweep.
		        T_Data total = block[lastIdx];
		        block[lastIdx] = T_Data{0};

		        // -- DOWN-SWEEP --
		        // Each round moves the upper value down and adds the old lower value onto the upper one.
		        stride = numElems / T_Idx{2};
		        for(T_Idx pairs = 1; pairs < numElems; pairs <<= 1)
		        {
		            for(auto elem = T_Idx{0}; elem < T_Idx{2} * pairs; elem += T_Idx{2})
		            {
		                T_Idx const lower = stride * (elem + T_Idx{1}) - T_Idx{1};
		                T_Idx const upper = stride * (elem + T_Idx{2}) - T_Idx{1};
		                auto const previousLower = block[lower];
		                block[lower] = block[upper];
		                block[upper] += previousLower;
		            }
		            stride >>= 1;
		        }

		        return total;
		    }

		    /* Add `blockSum` to each of the `extent.x()` elements starting at `block`.
		     */
		    template<typename T_Idx, typename T_Data>
		    ALPAKA_FN_ACC void addIncrements(
		        T_Data* block,
		        T_Data const& blockSum,
		        alpaka::concepts::CVector<T_Idx> auto const& extent)
		    {
		        T_Idx pos = T_Idx{0};
		        while(pos < extent.x())
		        {
		            *(block + pos) += blockSum;
		            ++pos;
		        }
		    }

		    /* This kernel calculates an exclusive scan for each block individually. The algorithm is based on Blelloch, with
		     * the improvement from Lichterman, written up in the CUDA blog (see 39.2.5):
		     * https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
		     *
		     * Template parameters:
		     *  - SCAN_TYPE: for INCLUSIVE_SCAN the input element is added to the scanned value during write back.
		     *  - T_Idx: index type used for all index arithmetic.
		     *  - T_Data: element type of the scanned data.
		     *
		     * Call parameters:
		     *  - acc: the accelerator, used to query the thread count and to distribute the work.
		     *  - numChunks: number of chunks to process.
		     *  - largeChunkExtents: compile-time chunk extent; divided by the number of threads per block to obtain the
		     *    number of elements each thread handles.
		     *  - inputVec / outputVec: data to read from / write the result to.
		     *  - blockSums (optional): if at least one extra argument is passed, the first one receives the total sum of
		     *    each chunk at index `chunkIdx`. Further arguments are ignored.
		     *
		     * The comments below use "frame" and "chunk" interchangeably.
		     */
		    template<ScanType SCAN_TYPE, typename T_Idx, typename T_Data>
		    class Scan_ScanBlocksKernel
		    {
		    public:
		        ALPAKA_FN_ACC void operator()(
		            auto const& acc,
		            alpaka::concepts::Vector auto const numChunks,
		            alpaka::concepts::CVector auto const largeChunkExtents,
		            alpaka::concepts::IDataSource auto const& inputVec,
		            alpaka::concepts::IMdSpan auto outputVec,
		            auto... blockSums) const
		        {
		            using DeviceType = ALPAKA_TYPEOF(acc.getDeviceKind());
		            using AccType = ALPAKA_TYPEOF(acc);

		            // the thread count is a compile-time vector, so the per-thread work split below is constexpr
		            alpaka::concepts::CVector auto numThreadsPerBlock = acc[layer::thread].count();
		            constexpr std::integral auto elsPerThread = largeChunkExtents.x() / numThreadsPerBlock.x();
		            // chunk extent actually processed: a multiple of the thread count (integer division above)
		            alpaka::concepts::CVector auto chunkExtent = CVec<T_Idx, elsPerThread * numThreadsPerBlock.x()>{};
		            alpaka::concepts::Vector auto numElements = inputVec.getExtents();

		            // a thread's elements are split into mini blocks which are scanned sequentially by that thread
		            constexpr std::integral auto miniBlockSize
		                = std::min(maximumMiniBlockSize<DeviceType, T_Idx, T_Data>(), elsPerThread);
		            constexpr std::integral auto miniBlocksPerThread = elsPerThread / miniBlockSize;
		            constexpr std::integral auto miniBlocksPerChunk = chunkExtent.x() / miniBlockSize;

		            constexpr auto LocalArrayLength = miniBlocksPerThread * miniBlockSize;
		            using LocalArray = T_Data[LocalArrayLength];

		            // number of valid elements in the last chunk; equals chunkExtent when the data fills it completely
		            auto const validElementsInLastFrame = (numElements - T_Idx{1}) % chunkExtent + T_Idx{1};

		            /* This kernel is called with 1-dimensional frame extents.
		             *
		             * All thread blocks will be used to iterate over the frames. Each thread block will handle one or more
		             * frames.
		             */
		            for(auto chunkIdx :
		                onAcc::makeIdxMap(acc, onAcc::worker::blocksInGrid, IdxRange{Vec<T_Idx, 1u>{0}, numChunks}))
		            {
		                bool const lastFrameFull = validElementsInLastFrame == chunkExtent;
		                bool const isLastFrame = chunkIdx == numChunks - T_Idx{1};

		                // allocate "per-thread" register memory to store all mini blocks of a thread persistently
		                LocalArray regMem;

		                // scratch array for one sum per mini block; its size is the padded index of the last mini block
		                // plus one
		                constexpr auto conflictFreeAdr = conflictFreeAccess<AccType>(miniBlocksPerChunk - T_Idx{1}) + T_Idx{1};
		                auto tmp = onAcc::declareSharedMdArray<T_Data, uniqueId()>(acc, CVec<T_Idx, conflictFreeAdr>{});
		                // index of the first element of this chunk in the input/output data
		                auto const frameOffset = chunkExtent * chunkIdx;

		                // the range advances in steps of elsPerThread, so frameElem is the first element of a thread's share
		                for(auto frameElem : onAcc::makeIdxMap(
		                        acc,
		                        onAcc::worker::threadsInBlock,
		                        IdxRange{CVec<T_Idx, 0u>{}, chunkExtent, CVec<T_Idx, elsPerThread>{}}))
		                {
		                    // -- COPY TO SHARED MEM --
		                    // element-wise path: used for a partially filled last chunk or if the per-thread element count
		                    // is not a multiple of 4; elements behind the end of the data are replaced by zero
		                    if((!lastFrameFull && isLastFrame) || elsPerThread % T_Idx{4} != T_Idx{0})
		                    {
		                        // load into miniblocks buffer, from frameElem to frameElem + elsPerThread
		                        for(auto i = T_Idx{0}; i < elsPerThread; ++i)
		                        {
		                            if(frameOffset + frameElem + i < numElements)
		                                regMem[i] = inputVec[frameOffset + frameElem + i];
		                            else
		                                regMem[i] = 0;
		                        }
		                    }
		                    else
		                    {
		                        // packed path: copy four elements at a time through SimdPtr views
		                        // NOTE(review): regMem is declared without an explicit alignment specifier while the views
		                        // request Alignment<16> - confirm that this alignment is guaranteed.
		                        MdSpanArray<LocalArray, T_Idx, alpaka::Alignment<16>> regMemMd{regMem};

		                        for(auto i = T_Idx{0}; i < elsPerThread; i += T_Idx{4})
		                        {
		                            auto inputVecView = SimdPtr{
		                                inputVec,
		                                Vec{frameOffset + frameElem + i},
		                                Alignment<16>{},
		                                CVec<T_Idx, 4>{}};
		                            auto regView = SimdPtr{regMemMd, Vec{i}, Alignment<16>{}, CVec<T_Idx, 4>{}};

		                            regView = inputVecView.load();
		                        }
		                    }

		                    // -- HANDLE MINI BLOCKS OF THIS THREAD --
		                    for(auto miniBlockOffset = T_Idx{0}; miniBlockOffset < elsPerThread;
		                        miniBlockOffset += miniBlockSize)
		                    {
		                        // scan miniblock
		                        auto miniBlockSum
		                            = scanMiniBlock<T_Idx, T_Data>(regMem + miniBlockOffset, CVec<T_Idx, miniBlockSize>{});

		                        // write miniblock sum into shared memory
		                        tmp[conflictFreeAccess<AccType>((frameElem + miniBlockOffset) / miniBlockSize)] = miniBlockSum;
		                    }
		                }

		                // -- UP-SWEEP / REDUCE --
		                // tree reduction over the mini block sums in tmp; the threads are synchronized before every level
		                // NOTE(review): halving d / doubling offset presumably requires miniBlocksPerChunk to be a power of
		                // two - confirm.
		                for(T_Idx d = miniBlocksPerChunk / T_Idx{2}, offset = T_Idx{1}; d > 0; d >>= 1, offset <<= 1)
		                {
		                    onAcc::syncBlockThreads(acc);
		                    for(auto frameElem : onAcc::makeIdxMap(
		                            acc,
		                            onAcc::worker::threadsInBlock,
		                            IdxRange{CVec<T_Idx, 0>{}, Vec<T_Idx, 1>{T_Idx{2} * d}, T_Idx{2}}))
		                    {
		                        // logical pair indices first, then mapped to the padded layout of tmp
		                        T_Idx left = offset * (frameElem + T_Idx{1}).x() - T_Idx{1};
		                        T_Idx right = offset * (frameElem + T_Idx{2}).x() - T_Idx{1};
		                        left = conflictFreeAccess<AccType>(left);
		                        right = conflictFreeAccess<AccType>(right);
		                        tmp[right] += tmp[left];
		                    }
		                }
		                onAcc::syncBlockThreads(acc);

		                // range with a single index: the body below is executed once per chunk
		                for([[maybe_unused]] auto frameElem :
		                    onAcc::makeIdxMap(acc, onAcc::worker::threadsInBlock, IdxRange{1}))
		                {
		                    // -- SAVE BLOCK SUMS --
		                    // after the up-sweep the last entry of tmp holds the total of the chunk; it is only stored if
		                    // the caller passed a blockSums argument
		                    if constexpr(sizeof...(blockSums))
		                    {
		                        auto _blockSums = std::get<0>(std::make_tuple(blockSums...));
		                        _blockSums[chunkIdx] = tmp[conflictFreeAccess<AccType>(miniBlocksPerChunk - T_Idx{1})];
		                    }

		                    // -- SET 0 --
		                    tmp[conflictFreeAccess<AccType>(miniBlocksPerChunk - T_Idx{1})] = 0;
		                }

		                // -- DOWN-SWEEP --
		                // mirrors the up-sweep in reverse; afterwards tmp holds the exclusive scan of the mini block sums
		                for(T_Idx d = 1, offset = miniBlocksPerChunk / T_Idx{2}; d < miniBlocksPerChunk; d <<= 1, offset >>= 1)
		                {
		                    onAcc::syncBlockThreads(acc);
		                    for(auto frameElem : onAcc::makeIdxMap(
		                            acc,
		                            onAcc::worker::threadsInBlock,
		                            IdxRange{CVec<T_Idx, 0>{}, Vec<T_Idx, 1>{T_Idx{2} * d}, T_Idx{2}}))
		                    {
		                        T_Idx left = offset * (frameElem.x() + T_Idx{1}) - T_Idx{1};
		                        T_Idx right = offset * (frameElem.x() + T_Idx{2}) - T_Idx{1};
		                        left = conflictFreeAccess<AccType>(left);
		                        right = conflictFreeAccess<AccType>(right);
		                        auto t = tmp[left];
		                        tmp[left] = tmp[right];
		                        tmp[right] += t;
		                    }
		                }
		                onAcc::syncBlockThreads(acc);

		                // -- WRITE BACK --
		                // same index range as the load loop above; regMem still holds the mini block scans computed there
		                for(auto frameElem : onAcc::makeIdxMap(
		                        acc,
		                        onAcc::worker::threadsInBlock,
		                        IdxRange{CVec<T_Idx, 0u>{}, chunkExtent, CVec<T_Idx, elsPerThread>{}}))
		                {
		                    // -- HANDLE MINI BLOCKS OF THIS THREAD --
		                    for(auto miniBlockOffset = T_Idx{0}; miniBlockOffset < elsPerThread;
		                        miniBlockOffset += miniBlockSize)
		                    {
		                        // load block sum from shared memory
		                        // mini blocks that start behind the end of the data keep an offset of zero
		                        T_Data blockSum{0};
		                        if(frameOffset + frameElem + miniBlockOffset < numElements)
		                        {
		                            blockSum
		                                = tmp[conflictFreeAccess<AccType>((frameElem.x() + miniBlockOffset) / miniBlockSize)];
		                        }

		                        // add block sum to mini block
		                        addIncrements<T_Idx>(regMem + miniBlockOffset, blockSum, CVec<T_Idx, miniBlockSize>{});
		                    }

		                    // same path selection as for loading: element-wise with bounds check or four elements at a time
		                    if((!lastFrameFull && isLastFrame) || elsPerThread % T_Idx{4} != T_Idx{0})
		                    {
		                        // write back to global mem, from frameElem to frameElem + elsPerThread
		                        for(auto i = T_Idx{0}; i < elsPerThread; ++i)
		                        {
		                            if(frameOffset + frameElem + i < numElements)
		                            {
		                                // regMem holds the exclusive result; the inclusive result adds the input element
		                                if constexpr(SCAN_TYPE == EXCLUSIVE_SCAN)
		                                    outputVec[frameOffset + frameElem + i] = regMem[i];
		                                else if constexpr(SCAN_TYPE == INCLUSIVE_SCAN)
		                                    outputVec[frameOffset + frameElem + i]
		                                        = inputVec[frameOffset + frameElem + i] + regMem[i];
		                            }
		                        }
		                    }
		                    else
		                    {
		                        MdSpanArray<LocalArray, T_Idx, alpaka::Alignment<16>> regMemMd{regMem};

		                        for(auto i = T_Idx{0}; i < elsPerThread; i += T_Idx{4})
		                        {
		                            auto outputVecView = SimdPtr{
		                                outputVec,
		                                Vec{frameOffset + frameElem + i},
		                                Alignment<16>{},
		                                CVec<T_Idx, 4>{}};
		                            auto regView = SimdPtr{regMemMd, Vec{i}, Alignment<16>{}, CVec<T_Idx, 4>{}};
		                            if constexpr(SCAN_TYPE == EXCLUSIVE_SCAN)
		                                outputVecView = regView.load();
		                            else if constexpr(SCAN_TYPE == INCLUSIVE_SCAN)
		                            {
		                                auto inputVecView = SimdPtr{
		                                    inputVec,
		                                    Vec{frameOffset + frameElem + i},
		                                    Alignment<16>{},
		                                    CVec<T_Idx, 4>{}};
		                                outputVecView = inputVecView.load() + regView.load();
		                            }
		                        }
		                    }
		                }
		                // synchronize before the next chunk is started (presumably because tmp is reused - confirm)
		                onAcc::syncBlockThreads(acc);
		            }
		        }
		    };

		    /* Kernel that adds to every output element the entry of `blockSums` belonging to the chunk the element lies
		     * in (element index divided by the chunk extent).
		     *
		     * The chunk extent is derived from `largeChunkExtents` and the number of threads per block in the same way
		     * as in the block scan kernel.
		     */
		    template<typename T_Idx>
		    class Scan_AddIncrementsKernel
		    {
		    public:
		        ALPAKA_FN_ACC void operator()(
		            auto const& acc,
		            alpaka::concepts::CVector auto const largeChunkExtents,
		            alpaka::concepts::IMdSpan auto const& blockSums,
		            alpaka::concepts::IMdSpan auto outputVec) const
		        {
		            // compile-time split of a chunk over the threads of a block
		            alpaka::concepts::CVector auto threadsPerBlock = acc[layer::thread].count();
		            constexpr auto perThread = largeChunkExtents.x() / threadsPerBlock.x();
		            alpaka::concepts::CVector auto chunkExtent = CVec<T_Idx, perThread * threadsPerBlock.x()>{};

		            alpaka::concepts::Vector auto totalExtents = outputVec.getExtents();

		            // all threads of the grid work concurrently on the whole output
		            auto gridAlgo = onAcc::SimdAlgo{onAcc::worker::threadsInGrid};
		            gridAlgo.concurrent(
		                acc,
		                totalExtents,
		                [&](auto const&, auto&& simdOut) constexpr
		                {
		                    // the position of the pack selects the increment of its chunk
		                    simdOut = simdOut.load() + blockSums[simdOut.getIdx() / chunkExtent];
		                },
		                outputVec);
		        }
		    };

		    /* Size in bytes of the intermediate buffer needed to scan `extent` elements of type T_Data.
		     *
		     * Every level that consists of more than one chunk contributes one T_Data entry per chunk; the number of
		     * entries of a level is the number of elements of the next one.
		     */
		    template<typename T_Data>
		    auto scanBufferSize(std::integral auto const& extent)
		    {
		        using T_Idx = ALPAKA_TYPEOF(extent);

		        T_Idx totalEntries{0};
		        for(auto levelEntries = divCeil(extent, T_Idx{chunkSize}); levelEntries > T_Idx{1};
		            levelEntries = divCeil(levelEntries, T_Idx{chunkSize}))
		        {
		            totalEntries += levelEntries;
		        }

		        return totalEntries * T_Idx{sizeof(T_Data)};
		    }

		    /* Vector flavour of scanBufferSize(): accepts one dimensional extents only and returns the byte count
		     * wrapped in a Vec.
		     */
		    template<typename T_Data>
		    auto scanBufferSize(alpaka::concepts::Vector auto const& extents)
		    {
		        using ExtentsType = ALPAKA_TYPEOF(extents);
		        static_assert(ExtentsType::dim() == 1, "scan is only usable for one dimensional buffers");

		        return Vec{scanBufferSize<T_Data>(extents.x())};
		    }

		    /* Enqueue all kernels needed to scan `inputVec` into `outputVec`, using `buffer` as scratch space.
		     *
		     * The data is split into chunks of `chunkSize` elements. If there is more than one chunk, the per-chunk
		     * totals are written to the front of `buffer`, scanned by a recursive call (always exclusive) and finally
		     * added to the elements of the corresponding chunks. The remainder of `buffer` is handed to the recursion.
		     *
		     * @tparam SCAN_TYPE inclusive or exclusive scan
		     * @param queue queue the kernels are enqueued to
		     * @param devAcc device, only passed on to the recursive call
		     * @param exec executor, only passed on to the recursive call
		     * @param buffer scratch buffer; its size in bytes must be at least the number of chunks times
		     *               sizeof(T_Data) for this level (checked with assert)
		     * @param outputVec receives the result, must have the same value type as inputVec
		     * @param inputVec data to scan
		     */
		    template<ScanType SCAN_TYPE>
		    void scan(
		        auto& queue,
		        alpaka::onHost::concepts::Device auto& devAcc,
		        alpaka::concepts::Executor auto& exec,
		        alpaka::concepts::IMdSpan auto& buffer,
		        alpaka::concepts::IMdSpan auto& outputVec,
		        alpaka::concepts::IDataSource auto& inputVec)
		    {
		        using T_Data = typename ALPAKA_TYPEOF(inputVec)::value_type;
		        using T_Idx = typename ALPAKA_TYPEOF(inputVec)::index_type;

		        static_assert(
		            std::is_same_v<T_Data, typename ALPAKA_TYPEOF(outputVec)::value_type>,
		            "output vector must have the same data type as input vector");

		        // Instantiate the kernel function object with the given scan type
		        Scan_ScanBlocksKernel<SCAN_TYPE, T_Idx, T_Data> scanBlocks;

		        // Define chunkExtent
		        constexpr auto chunkExtent = CVec<T_Idx, chunkSize>{};
		        alpaka::Vec numChunks = divCeil(inputVec.getExtents(), chunkExtent);
		        // one frame per chunk, each frame with an extent of 256
		        auto const frameSpec = onHost::FrameSpec{numChunks, CVec<T_Idx, 256u>{}};

		        ALPAKA_LOG_INFO(
		            onHost::logger::memory,
		            [&]()
		            {
		                std::stringstream ss;
		                // the first entry is written without a leading separator so that the message does not start
		                // with "{, "
		                ss << "scan: {";
		                if(SCAN_TYPE == INCLUSIVE_SCAN)
		                    ss << "scanType= INCLUSIVE_SCAN";
		                else if(SCAN_TYPE == EXCLUSIVE_SCAN)
		                    ss << "scanType= EXCLUSIVE_SCAN";
		                ss << ", numFrames= " << numChunks;
		                ss << ", chunkExtent= " << chunkExtent;
		                ss << ", value_type=" << onHost::demangledName<T_Data>();
		                ss << "}";
		                return ss.str();
		            });

		        if(frameSpec.getNumFrames() > T_Idx{1})
		        {
		            // problem does not fit in 1 frame, recurse
		            Scan_AddIncrementsKernel<T_Idx> addIncrements;

		            // one T_Data entry per chunk is needed to store the chunk totals of this level
		            auto bufSizeBytes = frameSpec.getNumFrames() * T_Idx{sizeof(T_Data)};
		            assert(buffer.getExtents() * T_Idx{sizeof(typename ALPAKA_TYPEOF(buffer)::value_type)} >= bufSizeBytes);

		            // get the view to the necessary elements in the buffer for increments
		            auto subBuf = buffer.getSubView(bufSizeBytes);
		            auto increments = MdSpan{
		                reinterpret_cast<T_Data*>(subBuf.data()),
		                frameSpec.getNumFrames(),
		                Vec<T_Idx, 1>{sizeof(T_Data)}};

		            // the unused elements in the buffer are used for recursion to the next scan call
		            auto bufferNext = buffer.getSubView(bufSizeBytes, buffer.getExtents() - bufSizeBytes);

		            // enqueue the kernel execution tasks
		            queue.enqueue(
		                frameSpec,
		                KernelBundle{scanBlocks, numChunks, chunkExtent, inputVec, outputVec, increments});

		            // always recurse into exclusive scan
		            // (the chunk totals are scanned in place, giving each chunk the sum of all chunks before it)
		            scan<EXCLUSIVE_SCAN>(queue, devAcc, exec, bufferNext, increments, increments);
		            queue.enqueue(frameSpec, KernelBundle{addIncrements, chunkExtent, increments, outputVec});
		        }
		        else
		        {
		            // problem fits within 1 frame
		            queue.enqueue(frameSpec, KernelBundle{scanBlocks, numChunks, chunkExtent, inputVec, outputVec});
		        }
		    }

		    template<ScanType SCAN_TYPE>
		    void scan(
		        auto& queue,
		        alpaka::onHost::concepts::Device auto& devAcc,
		        alpaka::concepts::Executor auto& exec,
		        alpaka::concepts::IMdSpan auto& outputVec,
		        alpaka::concepts::IDataSource auto const& inputVec)
		    {
		        using T_Data = ALPAKA_TYPEOF(inputVec)::value_type;

		        /* We do not use allocDeferred here since we measured up to a factor 40 higher latency compared to alloc for
		         * CUDA 12.8 on an A30 for the first call. The reason is the cuda per stream caching pool setup time.
		         */
		        auto buf = onHost::alloc<char>(devAcc, scanBufferSize<T_Data>(inputVec.getExtents()));

		        scan<SCAN_TYPE>(queue, devAcc, exec, buf, outputVec, inputVec);

		        buf.keepAlive(queue);
		    }

		} // namespace alpaka::onHost::internal
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/internal/scan.hpp ==
		// ============================================================================


	// TODO: add assertion function for whether a device/api is compatible with a number of buffers
	// (`onHost::isDataAccessible`)

	namespace alpaka::onHost
	{
	    /** @brief Query how large the intermediate buffer of a scan with the given extents has to be.
	     *
	     * Use this when several scans with the same extents are executed (for example in a loop): the buffer can then
	     * be allocated once and passed to every scan call, which avoids repeated allocation and deallocation. The scan
	     * overloads without a buffer argument allocate and release the buffer themselves.
	     *
	     * @tparam T_Data The type of the data to be scanned.
	     * @param extents The extents of the scan.
	     * @return The size of the buffer to allocate in **number of bytes**.
	     */
	    template<typename T_Data>
	    auto getScanBufferSize(alpaka::concepts::VectorOrScalar auto const& extents)
	    {
	        auto numBytes = internal::scanBufferSize<T_Data>(extents);
	        return numBytes;
	    }

	    /** @brief Perform an inclusive scan on the input data and write the result to the output data.
	     *
	     * @param queue The queue to enqueue to.
	     * @param exec The executor to run with. If it compares equal to alpaka::exec::anyExecutor, the default executor
	     * of the queue's device is used.
	     * @param buffer (optional) The internally used buffer. Use the getScanBufferSize() function to check how big
	     * the buffer needs to be. If omitted, it will be allocated and destructed on the fly. If you call this method
	     * repeatedly, it is recommended to reuse the buffer whenever possible, or to provide a buffer allocated with
	     * onHost::allocDeferred() to reduce the overhead of allocating and deallocating the buffer on each call.
	     * @param outputVec The output data. To perform an in-place scan, use inclusiveScanInPlace().
	     * @param inputVec The input data. Can be const.
	     *
	     * @{
	     */
	    void inclusiveScan(
	        auto const& queue,
	        alpaka::concepts::Executor auto exec,
	        alpaka::concepts::IMdSpan auto& buffer,
	        alpaka::concepts::IMdSpan auto& outputVec,
	        alpaka::concepts::IDataSource auto const& inputVec)
	    {
	        auto devAcc = queue.getDevice();
	        if constexpr(exec == alpaka::exec::anyExecutor)
	        {
	            // internal::scan takes the executor by non-const lvalue reference, so the default executor gets a name
	            auto selectedExec = defaultExecutor(devAcc);
	            internal::scan<internal::INCLUSIVE_SCAN>(queue, devAcc, selectedExec, buffer, outputVec, inputVec);
	        }
	        else
	            internal::scan<internal::INCLUSIVE_SCAN>(queue, devAcc, exec, buffer, outputVec, inputVec);
	    }

	    /* Overload without a user provided buffer: the intermediate buffer is allocated internally for this call.
	     * An executor equal to alpaka::exec::anyExecutor is replaced by the default executor of the queue's device.
	     */
	    void inclusiveScan(
	        auto const& queue,
	        alpaka::concepts::Executor auto exec,
	        alpaka::concepts::IMdSpan auto& outputVec,
	        alpaka::concepts::IDataSource auto const& inputVec)
	    {
	        auto devAcc = queue.getDevice();
	        if constexpr(exec == alpaka::exec::anyExecutor)
	        {
	            // internal::scan takes the executor by non-const lvalue reference, so the default executor gets a name
	            auto selectedExec = defaultExecutor(devAcc);
	            internal::scan<internal::INCLUSIVE_SCAN>(queue, devAcc, selectedExec, outputVec, inputVec);
	        }
	        else
	            internal::scan<internal::INCLUSIVE_SCAN>(queue, devAcc, exec, outputVec, inputVec);
	    }

	    /** @} */

	    /** @brief Perform an inclusive scan on data in-place.
	     *
	     * @param queue The queue to enqueue to.
	     * @param exec The executor to run with. If it compares equal to alpaka::exec::anyExecutor, the default executor
	     * of the queue's device is used.
	     * @param buffer (optional) The internally used buffer. Use the getScanBufferSize() function to check how big
	     * the buffer needs to be. If omitted, it will be allocated and destructed on the fly. If you call this method
	     * repeatedly, it is recommended to reuse the buffer whenever possible, or to provide a buffer allocated with
	     * onHost::allocDeferred() to reduce the overhead of allocating and deallocating the buffer on each call.
	     * @param dataVec The vector to scan, will be overwritten with the result.
	     *
	     * @{
	     */
	    void inclusiveScanInPlace(
	        auto const& queue,
	        alpaka::concepts::Executor auto exec,
	        alpaka::concepts::IMdSpan auto& buffer,
	        alpaka::concepts::IMdSpan auto& dataVec)
	    {
	        auto devAcc = queue.getDevice();
	        // dataVec is passed both as output and as input of the scan
	        if constexpr(exec == alpaka::exec::anyExecutor)
	        {
	            // internal::scan takes the executor by non-const lvalue reference, so the default executor gets a name
	            auto selectedExec = defaultExecutor(devAcc);
	            internal::scan<internal::INCLUSIVE_SCAN>(queue, devAcc, selectedExec, buffer, dataVec, dataVec);
	        }
	        else
	            internal::scan<internal::INCLUSIVE_SCAN>(queue, devAcc, exec, buffer, dataVec, dataVec);
	    }

	    /* Overload without a user provided buffer: the intermediate buffer is allocated internally for this call.
	     * An executor equal to alpaka::exec::anyExecutor is replaced by the default executor of the queue's device.
	     */
	    void inclusiveScanInPlace(
	        auto const& queue,
	        alpaka::concepts::Executor auto exec,
	        alpaka::concepts::IMdSpan auto& dataVec)
	    {
	        auto devAcc = queue.getDevice();
	        // dataVec is passed both as output and as input of the scan
	        if constexpr(exec == alpaka::exec::anyExecutor)
	        {
	            // internal::scan takes the executor by non-const lvalue reference, so the default executor gets a name
	            auto selectedExec = defaultExecutor(devAcc);
	            internal::scan<internal::INCLUSIVE_SCAN>(queue, devAcc, selectedExec, dataVec, dataVec);
	        }
	        else
	            internal::scan<internal::INCLUSIVE_SCAN>(queue, devAcc, exec, dataVec, dataVec);
	    }

	    /** @} */

	    /** @brief Perform an exclusive scan on the input data and write the result to the output data.
	     *
	     * @param queue The queue to enqueue to.
	     * @param exec The executor to run with. If it compares equal to alpaka::exec::anyExecutor, the default executor
	     * of the queue's device is used.
	     * @param buffer (optional) The internally used buffer. Use the getScanBufferSize() function to check how big
	     * the buffer needs to be. If omitted, it will be allocated and destructed on the fly. If you call this method
	     * repeatedly, it is recommended to reuse the buffer whenever possible, or to provide a buffer allocated with
	     * onHost::allocDeferred() to reduce the overhead of allocating and deallocating the buffer on each call.
	     * @param outputVec The output data. To perform an in-place scan, use exclusiveScanInPlace().
	     * @param inputVec The input data. Can be const.
	     *
	     * @{
	     */
	    void exclusiveScan(
	        auto const& queue,
	        alpaka::concepts::Executor auto exec,
	        alpaka::concepts::IMdSpan auto& buffer,
	        alpaka::concepts::IMdSpan auto& outputVec,
	        alpaka::concepts::IDataSource auto const& inputVec)
	    {
	        auto devAcc = queue.getDevice();
	        if constexpr(exec == alpaka::exec::anyExecutor)
	        {
	            // internal::scan takes the executor by non-const lvalue reference, so the default executor gets a name
	            auto selectedExec = defaultExecutor(devAcc);
	            internal::scan<internal::EXCLUSIVE_SCAN>(queue, devAcc, selectedExec, buffer, outputVec, inputVec);
	        }
	        else
	            internal::scan<internal::EXCLUSIVE_SCAN>(queue, devAcc, exec, buffer, outputVec, inputVec);
	    }

	    /* Overload without a user provided buffer: the intermediate buffer is allocated internally for this call.
	     * An executor equal to alpaka::exec::anyExecutor is replaced by the default executor of the queue's device.
	     */
	    void exclusiveScan(
	        auto const& queue,
	        alpaka::concepts::Executor auto exec,
	        alpaka::concepts::IMdSpan auto& outputVec,
	        alpaka::concepts::IDataSource auto const& inputVec)
	    {
	        auto devAcc = queue.getDevice();
	        if constexpr(exec == alpaka::exec::anyExecutor)
	        {
	            // internal::scan takes the executor by non-const lvalue reference, so the default executor gets a name
	            auto selectedExec = defaultExecutor(devAcc);
	            internal::scan<internal::EXCLUSIVE_SCAN>(queue, devAcc, selectedExec, outputVec, inputVec);
	        }
	        else
	            internal::scan<internal::EXCLUSIVE_SCAN>(queue, devAcc, exec, outputVec, inputVec);
	    }

	    /** @} */

	    /** @brief Perform an exclusive scan on data in-place.
	     *
	     * @param queue The queue to enqueue to.
	     * @param exec The executor to run with. If it compares equal to alpaka::exec::anyExecutor, the default executor
	     * of the queue's device is used.
	     * @param buffer (optional) The internally used buffer. Use the getScanBufferSize() function to check how big
	     * the buffer needs to be. If omitted, it will be allocated and destructed on the fly. If you call this method
	     * repeatedly, it is recommended to reuse the buffer whenever possible, or to provide a buffer allocated with
	     * onHost::allocDeferred() to reduce the overhead of allocating and deallocating the buffer on each call.
	     * @param dataVec The vector to scan, will be overwritten with the result.
	     *
	     * @{
	     */
	    void exclusiveScanInPlace(
	        auto const& queue,
	        alpaka::concepts::Executor auto exec,
	        alpaka::concepts::IMdSpan auto& buffer,
	        alpaka::concepts::IMdSpan auto& dataVec)
	    {
	        auto devAcc = queue.getDevice();
	        // dataVec is passed both as output and as input of the scan
	        if constexpr(exec == alpaka::exec::anyExecutor)
	        {
	            // internal::scan takes the executor by non-const lvalue reference, so the default executor gets a name
	            auto selectedExec = defaultExecutor(devAcc);
	            internal::scan<internal::EXCLUSIVE_SCAN>(queue, devAcc, selectedExec, buffer, dataVec, dataVec);
	        }
	        else
	            internal::scan<internal::EXCLUSIVE_SCAN>(queue, devAcc, exec, buffer, dataVec, dataVec);
	    }

	    /* Overload without a user provided buffer: the intermediate buffer is allocated internally for this call.
	     * An executor equal to alpaka::exec::anyExecutor is replaced by the default executor of the queue's device.
	     */
	    void exclusiveScanInPlace(
	        auto const& queue,
	        alpaka::concepts::Executor auto exec,
	        alpaka::concepts::IMdSpan auto& dataVec)
	    {
	        auto devAcc = queue.getDevice();
	        // dataVec is passed both as output and as input of the scan
	        if constexpr(exec == alpaka::exec::anyExecutor)
	        {
	            // internal::scan takes the executor by non-const lvalue reference, so the default executor gets a name
	            auto selectedExec = defaultExecutor(devAcc);
	            internal::scan<internal::EXCLUSIVE_SCAN>(queue, devAcc, selectedExec, dataVec, dataVec);
	        }
	        else
	            internal::scan<internal::EXCLUSIVE_SCAN>(queue, devAcc, exec, dataVec, dataVec);
	    }

	    /** @} */
	} // namespace alpaka::onHost
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/scan.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/transform.hpp ==
	// ==
	/* Copyright 2025 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
	// #include "alpaka/mem/concepts/IDataSource.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/internal/transform.hpp ==
		// ==
		/* Copyright 2025 René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once

		// #include "alpaka/Simd.hpp"    // amalgamate: file already inlined
		// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
		// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
		// #include "alpaka/functor.hpp"    // amalgamate: file already inlined
		// #include "alpaka/mem/MdSpan.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/Acc.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onAcc/SimdAlgo.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined
		// #include "alpaka/onHost/logger/logger.hpp"    // amalgamate: file already inlined
		// #include "alpaka/trait.hpp"    // amalgamate: file already inlined

		namespace alpaka::onHost::internal
		{
		    /* Kernel functor that writes `func` applied to the inputs into `output`, distributing the work over all
		     * threads of the grid via onAcc::SimdAlgo.
		     *
		     * How `func` is invoked depends on its type:
		     *  - StencilFunc: the functor receives the input arguments as handed over by the algorithm (not loaded).
		     *  - ScalarFunc: the inputs are loaded and the functor is called once per lane index of the output pack; the
		     *    results are combined into one Simd value.
		     *  - anything else: the functor is called once with the loaded inputs.
		     */
		    struct SimdTransformKernel
		    {
		        ALPAKA_FN_ACC void operator()(
		            onAcc::concepts::Acc auto const& acc,
		            alpaka::concepts::IMdSpan auto&& output,
		            auto const& func,
		            alpaka::concepts::IDataSource auto&&... inputs) const
		        {
		            using Functor = ALPAKA_TYPEOF(func);
		            auto gridAlgo = onAcc::SimdAlgo{onAcc::worker::threadsInGrid};

		            if constexpr(isSpecializationOf_v<Functor, StencilFunc>)
		            {
		                gridAlgo.concurrent(
		                    acc,
		                    output.getExtents(),
		                    [&](auto const& workerAcc, auto dst, auto&&... src)
		                    { dst = callFunctor(workerAcc, func, ALPAKA_FORWARD(src)...); },
		                    ALPAKA_FORWARD(output),
		                    ALPAKA_FORWARD(inputs)...);
		            }
		            else if constexpr(isSpecializationOf_v<Functor, ScalarFunc>)
		            {
		                gridAlgo.concurrent(
		                    acc,
		                    output.getExtents(),
		                    [&](auto const& workerAcc, auto dstPtr, auto const&... srcPtr) constexpr
		                    {
		                        // one functor call per lane of the output pack
		                        dstPtr = loadAncExecuteScalarOp(
		                            std::make_integer_sequence<uint32_t, ALPAKA_TYPEOF(dstPtr)::width()>{},
		                            [](alpaka::concepts::CVector auto lane,
		                               auto const& laneAcc,
		                               auto&& scalarFn,
		                               auto&&... packs) constexpr
		                            { return callFunctor(laneAcc, scalarFn, packs[lane.x()]...); },
		                            workerAcc,
		                            func,
		                            srcPtr.load()...);
		                    },
		                    ALPAKA_FORWARD(output),
		                    ALPAKA_FORWARD(inputs)...);
		            }
		            else
		            {
		                gridAlgo.concurrent(
		                    acc,
		                    output.getExtents(),
		                    [&](auto const& workerAcc, auto dst, auto const&... src)
		                    { dst = callFunctor(workerAcc, func, src.load()...); },
		                    ALPAKA_FORWARD(output),
		                    ALPAKA_FORWARD(inputs)...);
		            }
		        }

		        /* Call `op` once for every index in `T_idx` (passed as compile-time vector) together with `acc`, `func`
		         * and `data`, and build a Simd value from the results.
		         */
		        template<uint32_t... T_idx>
		        ALPAKA_FN_INLINE static constexpr auto loadAncExecuteScalarOp(
		            std::integer_sequence<uint32_t, T_idx...>,
		            auto&& op,
		            auto const& acc,
		            auto&& func,
		            auto&&... data)
		        {
		            return Simd{op(CVec<uint32_t, T_idx>{}, acc, ALPAKA_FORWARD(func), ALPAKA_FORWARD(data)...)...};
		        }
		    };

		    inline void transform(
		        auto const& queue,
		        alpaka::concepts::Executor auto const exec,
		        alpaka::concepts::IMdSpan auto&& out,
		        auto&& fn,
		        alpaka::concepts::IDataSource auto&&... in)
		    {
		        auto extentMd = onHost::getExtents(out);
		        using DataType = alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(out)>;
		        auto frameSpec = getSimdFrameSpec<DataType>(queue.getDevice(), exec, extentMd);

		        ALPAKA_LOG_INFO(
		            onHost::logger::memory,
		            [&]()
		            {
		                std::stringstream ss;
		                ss << "transform{ extents=" << extentMd << ", value_type=" << onHost::demangledName<DataType>() << ", "
		                   << frameSpec << ", fn=" << onHost::demangledName(fn) << " }";
		                return ss.str();
		            });

		        queue.enqueue(
		            frameSpec,
		            KernelBundle{SimdTransformKernel{}, ALPAKA_FORWARD(out), ALPAKA_FORWARD(fn), ALPAKA_FORWARD(in)...});
		    }
		} // namespace alpaka::onHost::internal
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/internal/transform.hpp ==
		// ============================================================================


	namespace alpaka::onHost
	{
	    /** Apply a functor to the input data and store the result in the output data.
	     *
	     * fn may be a lambda if all of its arguments are fully specialized. Such a fully specialized functor has to be
	     * wrapped in @see ScalarFunc in most cases. Generic lambdas are not supported by some backends, e.g. CUDA/HIP.
	     * A lambda must have the following form and should capture its arguments only by copy.
	     *
	     * @code{.cpp}
	     *   [] ALPAKA_FN_ACC(){};
	     * @endcode
	     *
	     * @param queue The queue used to execute the transformation.
	     * @param exec The executor used for the kernel execution. If it compares equal to alpaka::exec::anyExecutor,
	     *   the default executor of the queue's device is used instead.
	     * @param out The output data the result is written to.
	     * @param fn The function applied to each element of the input data.
	     *   The functor should support Simd packages. If it does not, element-wise execution can be enforced by
	     *   wrapping it in @see ScalarFunc. To support stencil executions, wrap fn in @see StencilFunc; a StencilFunc
	     *   receives all arguments as @see SimdPtr. When StencilFunc is used, take care not to read outside of valid
	     *   memory ranges, e.g. by using sub-views of the input and output data. Optionally, fn can take an
	     *   accelerator as its first argument.
	     * @param in The input data to transform; all input data is passed to fn.
	     *
	     * Examples of an identity unary transform functor:
	     * @code{.cpp}
	     *   struct Foo {
	     *      constexpr auto operator()(onAcc::concepts::Acc auto const&, concepts::SimdPtr auto const& a) const {
	     *          return a.load();
	     *      }
	     *   };
	     *   struct Bar {
	     *      constexpr auto operator()(concepts::SimdPtr auto const& a) const {
	     *          return a.load();
	     *      }
	     *   };
	     * @endcode
	     *
	     * @{
	     */
	    template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
	    inline void transform(
	        Queue<T_Device, T_QueueKind> const& queue,
	        alpaka::concepts::Executor auto const exec,
	        alpaka::concepts::IMdSpan auto&& out,
	        auto&& fn,
	        alpaka::concepts::IDataSource auto&&... in)
	    {
	        if constexpr(exec == alpaka::exec::anyExecutor)
	        {
	            // The placeholder executor is replaced by the default executor of the queue's device.
	            internal::transform(
	                queue,
	                defaultExecutor(queue.getDevice()),
	                ALPAKA_FORWARD(out),
	                ALPAKA_FORWARD(fn),
	                ALPAKA_FORWARD(in)...);
	        }
	        else
	        {
	            // A concrete executor was requested and is passed on unchanged.
	            internal::transform(queue, exec, ALPAKA_FORWARD(out), ALPAKA_FORWARD(fn), ALPAKA_FORWARD(in)...);
	        }
	    }

	    /**
	     * Overload without an explicit executor: an available default executor is selected automatically. The default
	     * executor is an executor with the most parallelism/performance.
	     */
	    template<typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
	    inline void transform(
	        Queue<T_Device, T_QueueKind> const& queue,
	        alpaka::concepts::IMdSpan auto&& out,
	        auto&& fn,
	        alpaka::concepts::IDataSource auto&&... in)
	    {
	        // Delegates to the overload that takes an executor.
	        transform(queue, defaultExecutor(queue.getDevice()), ALPAKA_FORWARD(out), ALPAKA_FORWARD(fn), ALPAKA_FORWARD(in)...);
	    }

	    /** @} */
	} // namespace alpaka::onHost
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/transform.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/transformReduce.hpp ==
	// ==
	/* Copyright 2025 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #pragma once
	// #include "alpaka/api/trait.hpp"    // amalgamate: file already inlined
	// #include "alpaka/mem/concepts/IDataStorage.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onHost/algo/internal/transformReduce.hpp"    // amalgamate: file already inlined

	namespace alpaka::onHost
	{
	    /** Apply a functor to the input data and reduce the transformed values into a single result.
	     *
	     * transformFn may be a lambda if all of its arguments are fully specialized. Such a fully specialized functor
	     * has to be wrapped in @see ScalarFunc in most cases. Generic lambdas are not supported by some backends, e.g.
	     * CUDA/HIP. A lambda must have the following form and should capture its arguments only by copy.
	     *
	     * @code{.cpp}
	     *   [] ALPAKA_FN_ACC(){};
	     * @endcode
	     *
	     * @param queue The queue used to execute the transformation.
	     * @param exec The executor used for the kernel execution. If it compares equal to alpaka::exec::anyExecutor,
	     *   the default executor of the queue's device is used instead.
	     * @param neutralElement The neutral element with respect to binaryReduceFn.
	     * @param out MdSpan for the result. Its value type must be the type of neutralElement (enforced by the
	     *   requires clause) and the result type of the binary reduce functor. The result is written to the first
	     *   element of the output data.
	     * @param binaryReduceFn Binary reduce functor. Because the execution order of the functor is not specified, the
	     *   operation must be associative and commutative.
	     *   The atomic operation atomic::atomicInvoke(ReduceFnType, onAcc::concepts::Acc, auto* destination, auto source)
	     *   must be overloaded. The functor should support Simd packages; if it does not, element-wise execution can
	     *   be enforced by wrapping it in @see ScalarFunc.
	     * @param transformFn The function applied to each element of the input data.
	     *   The functor should support Simd packages. If it does not, element-wise execution can be enforced by
	     *   wrapping it in ScalarFunc. To support stencil executions, wrap it in StencilFunc; a StencilFunc receives
	     *   all arguments as SimdPtr. When StencilFunc is used, take care not to read outside of valid memory ranges,
	     *   e.g. by using sub-views of the input and output data. Optionally, transformFn can take an accelerator as
	     *   its first argument.
	     * @param in The input data to transform; all input data is passed to transformFn, which must accept as many
	     *   arguments as input data is provided (plus the optional leading accelerator argument).
	     *
	     * Examples of an identity unary transform functor:
	     * @code{.cpp}
	     *   struct Foo {
	     *      constexpr auto operator()(onAcc::concepts::Acc auto const&, concepts::SimdPtr auto const& a) const {
	     *          return a.load();
	     *      }
	     *   };
	     *   struct Bar {
	     *      constexpr auto operator()(concepts::SimdPtr auto const& a) const {
	     *          return a.load();
	     *      }
	     *   };
	     * @endcode
	     *
	     * @{
	     */
	    template<typename DataType, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
	    inline void transformReduce(
	        Queue<T_Device, T_QueueKind> const& queue,
	        alpaka::concepts::Executor auto const exec,
	        DataType const& neutralElement,
	        alpaka::concepts::IMdSpan auto out,
	        auto&& binaryReduceFn,
	        auto&& transformFn,
	        alpaka::concepts::IDataSource auto&&... in)
	        requires(std::same_as<DataType, alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(out)>>)
	    {
	        if constexpr(exec == alpaka::exec::anyExecutor)
	        {
	            // The placeholder executor is replaced by the default executor of the queue's device.
	            internal::transformReduce(
	                queue,
	                defaultExecutor(queue.getDevice()),
	                neutralElement,
	                out,
	                ALPAKA_FORWARD(binaryReduceFn),
	                ALPAKA_FORWARD(transformFn),
	                ALPAKA_FORWARD(in)...);
	        }
	        else
	        {
	            // A concrete executor was requested and is passed on unchanged.
	            internal::transformReduce(
	                queue,
	                exec,
	                neutralElement,
	                out,
	                ALPAKA_FORWARD(binaryReduceFn),
	                ALPAKA_FORWARD(transformFn),
	                ALPAKA_FORWARD(in)...);
	        }
	    }

	    /**
	     * Overload without an explicit executor: an available default executor is selected automatically. The default
	     * executor is the executor with the most parallelism/performance.
	     */
	    template<typename DataType, typename T_Device, alpaka::concepts::QueueKind T_QueueKind>
	    inline void transformReduce(
	        Queue<T_Device, T_QueueKind> const& queue,
	        DataType const& neutralElement,
	        alpaka::concepts::IMdSpan auto out,
	        auto&& binaryReduceFn,
	        auto&& transformFn,
	        alpaka::concepts::IDataSource auto&&... in)
	        requires(std::same_as<DataType, alpaka::trait::GetValueType_t<ALPAKA_TYPEOF(out)>>)
	    {
	        // Delegates to the overload that takes an executor.
	        transformReduce(
	            queue,
	            defaultExecutor(queue.getDevice()),
	            neutralElement,
	            out,
	            ALPAKA_FORWARD(binaryReduceFn),
	            ALPAKA_FORWARD(transformFn),
	            ALPAKA_FORWARD(in)...);
	    }

	    /** @} */
	} // namespace alpaka::onHost
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/algo/transformReduce.hpp ==
	// ============================================================================

// #include "alpaka/onHost/demangledName.hpp"    // amalgamate: file already inlined
	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/executeForEach.hpp ==
	// ==
	/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego, Andrea Bocci
	 * SPDX-License-Identifier: MPL-2.0
	 */

	// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onHost/DeviceSelector.hpp"    // amalgamate: file already inlined

	// #include <functional>    // amalgamate: file already included
	// #include <tuple>    // amalgamate: file already included
	// #include <utility>    // amalgamate: file already included

	// #pragma once
	namespace alpaka::onHost
	{
	    //! Execute a callable once for every element of a tuple of backends.
	    //
	    // The callable is captured by reference, so it is not copied and does not need a const call operator.
	    // The invocation results are combined with a fold over `||`. With the built-in operator this
	    // short-circuits: elements following the first result that converts to true are not invoked.
	    // An empty tuple yields false.
	    //
	    // @param callable callable which can be invoked with each element of backends
	    // @param backends tuple-like object that is unpacked with std::apply
	    // @return disjunction of the invocation results
	    //
	    inline auto executeForEach(auto&& callable, auto const& backends)
	    {
	        // The lambda is only used while this function runs, therefore capturing by reference is safe.
	        return std::apply([&](auto const&... backend) { return (callable(backend) || ...); }, backends);
	    }

	    //! Execute a callable for each backend for which a device is available.
	    //
	    // For every element of the tuple a device selector is created from backend[object::deviceSpec]. The callable
	    // is invoked with the backend only if the selector reports that it is available (runtime check).
	    //
	    // The value returned by the callable is discarded: every element, executed or not, contributes EXIT_SUCCESS
	    // to the result. Because of that the `||` fold never stops early on the callable's outcome.
	    //
	    // The callable is captured by reference, so it is not copied and does not need a const call operator.
	    //
	    // @param callable callable which can be invoked with the backend
	    // @param tupleOfBackends tuple-like object of backends that is unpacked with std::apply
	    // @return the per-element EXIT_SUCCESS values folded with `||`
	    //
	    inline auto executeForEachIfHasDevice(auto&& callable, auto const& tupleOfBackends)
	    {
	        // Both lambdas are only used while this function runs, therefore capturing by reference is safe.
	        auto invokeIfAvailable = [&](auto const& backend)
	        {
	            auto devSelector = onHost::makeDeviceSelector(backend[object::deviceSpec]);
	            if(devSelector.isAvailable())
	            {
	                callable(backend);
	            }
	            return EXIT_SUCCESS;
	        };
	        return std::apply(
	            [&](auto const&... backends) { return (invokeIfAvailable(backends) || ...); },
	            tupleOfBackends);
	    }

	    //! Execute a callable for each device specification for which a device is available.
	    //
	    // Overload for a std::tuple of device specifications. For every element a device selector is created from
	    // the device specification. The callable is invoked with the device specification only if the selector
	    // reports that it is available (runtime check).
	    //
	    // The value returned by the callable is discarded: every element, executed or not, contributes EXIT_SUCCESS
	    // to the result.
	    //
	    // The callable is captured by reference, so it is not copied and does not need a const call operator.
	    //
	    // @param callable callable which can be invoked with a device specification
	    // @param tupleOfDeviceSpecs tuple of device specifications
	    // @return the per-element EXIT_SUCCESS values folded with `||`
	    //
	    template<onHost::concepts::DeviceSpec... T_DeviceSpecs>
	    inline auto executeForEachIfHasDevice(auto&& callable, std::tuple<T_DeviceSpecs...> const& tupleOfDeviceSpecs)
	    {
	        // Both lambdas are only used while this function runs, therefore capturing by reference is safe.
	        auto invokeIfAvailable = [&](auto const& devSpec)
	        {
	            auto devSelector = onHost::makeDeviceSelector(devSpec);
	            if(devSelector.isAvailable())
	            {
	                callable(devSpec);
	            }
	            return EXIT_SUCCESS;
	        };
	        return std::apply(
	            [&](auto const&... devSpecs) { return (invokeIfAvailable(devSpecs) || ...); },
	            tupleOfDeviceSpecs);
	    }
	} // namespace alpaka::onHost
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/executeForEach.hpp ==
	// ============================================================================

// #include "alpaka/onHost/interface.hpp"    // amalgamate: file already inlined
// #include "alpaka/onHost/logger/logger.hpp"    // amalgamate: file already inlined
	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/mem/stdContainer.hpp ==
	// ==
	/* Copyright 2024 René Widera, Bernhard Manfred Gruber
	 * SPDX-License-Identifier: MPL-2.0
	 */


	// #pragma once
	// #include "alpaka/CVec.hpp"    // amalgamate: file already inlined
	// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
	// #include "alpaka/api/api.hpp"    // amalgamate: file already inlined
	// #include "alpaka/core/config.hpp"    // amalgamate: file already inlined
	// #include "alpaka/onHost/internal/interface.hpp"    // amalgamate: file already inlined

	// #include <array>    // amalgamate: file already included
	// #include <cstdint>    // amalgamate: file already included
	#include <span>
	// #include <vector>    // amalgamate: file already included

	namespace alpaka
	{
	    // GetApi::Op specializations for standard containers.
	    namespace internal
	    {
	        /** std::vector is always reported with the host Api; the argument itself is not inspected. */
	        template<typename T_Type, typename T_Allocator>
	        struct GetApi::Op<std::vector<T_Type, T_Allocator>>
	        {
	            inline constexpr auto operator()(auto&& stdVector) const
	            {
	                alpaka::unused(stdVector);
	                return api::Host{};
	            }
	        };

	        /** The Api is the Api of the caller scope (result of thisApi()); the argument itself is not inspected. */
	        template<typename T_Type, size_t T_size>
	        struct GetApi::Op<std::array<T_Type, T_size>>
	        {
	            inline constexpr auto operator()(auto&& stdArray) const
	            {
	                alpaka::unused(stdArray);
	                return thisApi();
	            }
	        };
	    } // namespace internal

	    // GetExtents / GetPitches specializations for std::vector, std::span and std::array.
	    // All three containers are treated as one-dimensional.
	    namespace onHost::internal
	    {
	        /** Extents of a std::vector: a runtime one-component vector holding size(). */
	        template<typename T_Type, typename T_Allocator>
	        struct GetExtents::Op<std::vector<T_Type, T_Allocator>>
	        {
	            decltype(auto) operator()(auto&& stdVector) const
	            {
	                // stdVector is read below, so it must not be flagged as unused.
	                return Vec{stdVector.size()};
	            }
	        };

	        /** Extents of a std::span: a runtime one-component vector holding size(). */
	        template<typename T_Type, size_t T_size>
	        struct GetExtents::Op<std::span<T_Type, T_size>>
	        {
	            decltype(auto) operator()(auto&& stdSpan) const
	            {
	                return Vec{stdSpan.size()};
	            }
	        };

	        /** Extents of a std::array: the size is part of the type and is returned as a compile-time vector. */
	        template<typename T_Type, size_t T_size>
	        struct GetExtents::Op<std::array<T_Type, T_size>>
	        {
	            decltype(auto) operator()(auto&& stdArray) const
	            {
	                alpaka::unused(stdArray);
	                return CVec<size_t, T_size>{};
	            }
	        };

	        /** Pitches of a std::vector: one component holding sizeof(T_Type). */
	        template<typename T_Type, typename T_Allocator>
	        struct GetPitches::Op<std::vector<T_Type, T_Allocator>>
	        {
	            decltype(auto) operator()(auto&& stdVector) const
	            {
	                alpaka::unused(stdVector);
	                return Vec{sizeof(T_Type)};
	            }
	        };

	        /** Pitches of a std::span: one component holding sizeof(T_Type). */
	        template<typename T_Type, size_t T_size>
	        struct GetPitches::Op<std::span<T_Type, T_size>>
	        {
	            decltype(auto) operator()(auto&& stdSpan) const
	            {
	                alpaka::unused(stdSpan);
	                return Vec{sizeof(T_Type)};
	            }
	        };

	        /** Pitches of a std::array: sizeof(T_Type) returned as a compile-time vector. */
	        template<typename T_Type, size_t T_size>
	        struct GetPitches::Op<std::array<T_Type, T_size>>
	        {
	            decltype(auto) operator()(auto&& stdArray) const
	            {
	                alpaka::unused(stdArray);
	                return CVec<size_t, sizeof(T_Type)>{};
	            }
	        };
	    } // namespace onHost::internal

	    // Trait specializations for std::vector, std::span and std::array.
	    namespace trait
	    {
	        /** The value type of a std::vector is its element type. */
	        template<typename T_Type, typename T_Allocator>
	        struct GetValueType<std::vector<T_Type, T_Allocator>>
	        {
	            using type = T_Type;
	        };

	        /** The value type of a std::span is its element type (including any cv-qualification of T_Type). */
	        template<typename T_Type, size_t T_size>
	        struct GetValueType<std::span<T_Type, T_size>>
	        {
	            using type = T_Type;
	        };

	        /** The value type of a std::array is its element type. */
	        template<typename T_Type, size_t T_size>
	        struct GetValueType<std::array<T_Type, T_size>>
	        {
	            using type = T_Type;
	        };

	        /** A std::span is one-dimensional. */
	        template<typename T_Type, size_t T_size>
	        struct GetDim<std::span<T_Type, T_size>>
	        {
	            static constexpr uint32_t value = 1u;
	        };

	        /** A std::vector is one-dimensional. */
	        template<typename T_Type, typename T_Allocator>
	        struct GetDim<std::vector<T_Type, T_Allocator>>
	        {
	            static constexpr uint32_t value = 1u;
	        };

	        /** A std::array is one-dimensional. */
	        template<typename T_Type, size_t T_size>
	        struct GetDim<std::array<T_Type, T_size>>
	        {
	            static constexpr uint32_t value = 1u;
	        };
	    } // namespace trait
	} // namespace alpaka
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/mem/stdContainer.hpp ==
	// ============================================================================

	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/distribution/NormalReal.hpp ==
	// ==
	/* Copyright 2025 Mehmet Yusufoglu, Tim Hanel
	 * SPDX-License-Identifier: MPL-2.0
	 */
	// #pragma once
	// #include "alpaka/math.hpp"    // amalgamate: file already inlined
		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/concepts.hpp ==
		// ==
		/* Copyright 2025 Tim Hanel, René Widera
		 * SPDX-License-Identifier: MPL-2.0
		 */
		// #pragma once
		// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/distribution/interval.hpp ==
			// ==
			/* Copyright 2025 Tim Hanel
			 * SPDX-License-Identifier: MPL-2.0
			 */
			// #pragma once
			#include <type_traits>

			namespace alpaka::rand::interval
			{
			    namespace detail
			    {
			        /** Common empty base class; deriving from it marks a type as an interval tag. */
			        struct IntervalBase {};
			    } // namespace detail

			    namespace trait
			    {
			        /** True for every type that is (derived from) detail::IntervalBase. */
			        template<typename T_Interval>
			        struct IsInterval : std::is_base_of<detail::IntervalBase, T_Interval> {};
			    } // namespace trait

			    /** @brief Tag type for the interval (a, b]: lower bound excluded, upper bound included. */
			    struct OC : detail::IntervalBase {};

			    /** @brief Tag object for the interval (a, b], @see OC */
			    constexpr OC oc{};

			    /** @brief Tag type for the interval [a, b): lower bound included, upper bound excluded. */
			    struct CO : detail::IntervalBase {};

			    /** @brief Tag object for the interval [a, b), @see CO */
			    constexpr CO co{};

			    /** @brief Tag type for the interval [a, b]: both bounds included. */
			    struct CC : detail::IntervalBase {};

			    /** @brief Tag object for the interval [a, b], @see CC */
			    constexpr CC cc{};

			    /** @brief Tag type for the interval (a, b): both bounds excluded. */
			    struct OO : detail::IntervalBase {};

			    /** @brief Tag object for the interval (a, b), @see OO */
			    constexpr OO oo{};

			    /** Shorthand for trait::IsInterval<T>::value. */
			    template<typename T>
			    constexpr bool isInterval_v = trait::IsInterval<T>::value;
			} // namespace alpaka::rand::interval
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/distribution/interval.hpp ==
			// ============================================================================


		// #include <concepts>    // amalgamate: file already included

		namespace alpaka::rand
		{
		    namespace concepts
		    {
		        /** @brief Concept defining a valid interval tag used to specify distribution bounds.
		         *
		         * Satisfied when interval::isInterval_v<T> is true.
		         */
		        template<typename T>
		        concept Interval = interval::isInterval_v<T>;

		        /** @brief Concept wrapper for std::uniform_random_bit_generator using alpaka scheme.
		         * @see https://en.cppreference.com/w/cpp/numeric/random/UniformRandomBitGenerator
		         */
		        template<typename T>
		        concept UniformStdEngine = std::uniform_random_bit_generator<T>;

		        /**
		         * Concept for random engines which return a vector. This mirrors std::uniform_random_bit_generator, except
		         * that the return type of the call operator must be a Vector.
		         *
		         * Requirements checked here:
		         *  - T& is invocable without arguments and the result satisfies alpaka::concepts::Vector,
		         *  - T::min() and T::max() return exactly the nested `type` of that result
		         *    (presumably the element type of the vector -- confirm against Vec),
		         *  - T::min() < T::max() holds as a constant expression.
		         */
		        template<typename T>
		        concept UniformVectorEngine
		            = std::invocable<T&> && alpaka::concepts::Vector<std::invoke_result_t<T&>> && requires {
		                  { T::min() } -> std::same_as<typename std::invoke_result_t<T&>::type>;
		                  { T::max() } -> std::same_as<typename std::invoke_result_t<T&>::type>;
		                  requires std::bool_constant<(T::min() < T::max())>::value;
		              };

		        /**
		         * Unified concept for alpaka-compatible uniform random engines.
		         * A type satisfies this concept if it is either a standard
		         * uniform random bit generator or UniformVectorEngine.
		         */
		        template<typename T>
		        concept UniformRandomEngine = UniformStdEngine<T> || UniformVectorEngine<T>;
		    } // namespace concepts

		    /** Two interval tags compare equal exactly when they have the same type; the objects are not inspected. */
		    constexpr bool operator==(concepts::Interval auto lhs, concepts::Interval auto rhs) noexcept
		    {
		        using Left = ALPAKA_TYPEOF(lhs);
		        using Right = ALPAKA_TYPEOF(rhs);
		        return std::is_same_v<Left, Right>;
		    }

		    /** Negation of the equality comparison of two interval tags. */
		    constexpr bool operator!=(concepts::Interval auto lhs, concepts::Interval auto rhs) noexcept
		    {
		        bool const sameInterval = (lhs == rhs);
		        return !sameInterval;
		    }
		} // namespace alpaka::rand
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/concepts.hpp ==
		// ============================================================================

		// ============================================================================
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/distribution/UniformReal.hpp ==
		// ==
		/* Copyright 2025 Mehmet Yusufoglu, Tim Hanel
		 * SPDX-License-Identifier: MPL-2.0
		 */

		// #pragma once
		// #include "alpaka/rand/concepts.hpp"    // amalgamate: file already inlined
		// #include "alpaka/rand/distribution/interval.hpp"    // amalgamate: file already inlined
			// ============================================================================
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/philox.hpp ==
			// ==
			/* Copyright 2022 Jiří Vyskočil, Jan Stephan, Bernhard Manfred Gruber
			 * SPDX-License-Identifier: MPL-2.0
			 */

			// #pragma once
			// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined
				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/PhiloxSingle.hpp ==
				// ==
				/* Copyright 2022 Jiri Vyskocil, Rene Widera, Bernhard Manfred Gruber
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
					// ============================================================================
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/PhiloxBaseCommon.hpp ==
					// ==
					/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber, Jeffrey Kelling
					 * SPDX-License-Identifier: MPL-2.0
					 */

					// #pragma once
						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/PhiloxState.hpp ==
						// ==
						/* Copyright 2022-2025 Jiri Vyskocil, Rene Widera, Bernhard Manfred Gruber, Tim Hanel
						 * SPDX-License-Identifier: MPL-2.0
						 */

						// #pragma once
						// #include <cstdint>    // amalgamate: file already included

						namespace alpaka::rand::engine::internal
						{
						    // Forward declarations of the engine implementations; they are only used as tags to select a
						    // PhiloxState specialization.
						    template<typename T_Params>
						    class PhiloxSingle;
						    template<typename T_Params>
						    class PhiloxVector;

						    /**  Philox state
						     *
						     * Primary template, declared only; the layout is provided by the specializations below.
						     *
						     * @tparam T_Counter Type of the Counter array
						     * @tparam T_Key Type of the Key array
						     * @tparam Impl Engine implementation the state belongs to (PhiloxVector or PhiloxSingle)
						     */
						    template<typename T_Counter, typename T_Key, typename Impl>
						    struct PhiloxState;

						    /** Philox state specialization for vector engine
						     * more memory/register efficient
						     *
						     * Stores only the counter and the key.
						     *
						     * @tparam T_Counter Type of the Counter array
						     * @tparam T_Key Type of the Key array
						     * @tparam T_Params Parameter type of the PhiloxVector engine
						     */
						    template<typename T_Counter, typename T_Key, typename T_Params>
						    struct PhiloxState<T_Counter, T_Key, PhiloxVector<T_Params>>
						    {
						        using Counter = T_Counter;
						        using Key = T_Key;
						        Counter counter;
						        Key key;
						    };

						    /** Philox state specialization for single value engine
						     *
						     * In addition to counter and key it stores a result of the Counter type and a position.
						     *
						     * @tparam T_Counter Type of the Counter array
						     * @tparam T_Key Type of the Key array
						     * @tparam T_Params Parameter type of the PhiloxSingle engine
						     */
						    template<typename T_Counter, typename T_Key, typename T_Params>
						    struct PhiloxState<T_Counter, T_Key, PhiloxSingle<T_Params>>
						    {
						        using Counter = T_Counter;
						        using Key = T_Key;

						        Counter counter;
						        Key key;
						        // NOTE(review): presumably the most recently generated block of values -- confirm in PhiloxSingle
						        Counter result;
						        // NOTE(review): presumably the index of the next value to hand out from result -- confirm
						        std::uint32_t position;
						    };

						} // namespace alpaka::rand::engine::internal
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/PhiloxState.hpp ==
						// ============================================================================

						// ============================================================================
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/PhiloxStateless.hpp ==
						// ==
						/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber, Jeffrey Kelling
						 * SPDX-License-Identifier: MPL-2.0
						 */


						// #pragma once
						// #include "alpaka/Vec.hpp"    // amalgamate: file already inlined
							// ============================================================================
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/PhiloxConstants.hpp ==
							// ==
							/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber
							 * SPDX-License-Identifier: MPL-2.0
							 */

							// #pragma once
								// ============================================================================
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/multiplyAndSplit64to32.hpp ==
								// ==
								/* Copyright 2023 Jiří Vyskočil, Bernhard Manfred Gruber, Jan Stephan
								 * SPDX-License-Identifier: MPL-2.0
								 */

								// #pragma once
								// #include "alpaka/core/common.hpp"    // amalgamate: file already inlined

								// #include <cstdint>    // amalgamate: file already included

								namespace alpaka::rand::engine::internal
								{
								    /// Extract the upper half (bits 32..63) of a 64-bit number
								    constexpr auto high32Bits(std::uint64_t const x) -> std::uint32_t
								    {
								        constexpr unsigned halfWidth = 32u;
								        return static_cast<std::uint32_t>(x >> halfWidth);
								    }

								    /// Extract the lower half (bits 0..31) of a 64-bit number
								    constexpr auto low32Bits(std::uint64_t const x) -> std::uint32_t
								    {
								        constexpr std::uint64_t lowerHalfMask = 0xffff'ffffu;
								        return static_cast<std::uint32_t>(x & lowerHalfMask);
								    }

								    /** Multiply two numbers and split the 64-bit product into two 32-bit halves, also known as "mulhilo32"
								     *
								     * The product is computed in unsigned 64-bit arithmetic, so only its lowest 64 bits are split.
								     *
								     * @param a first multiplier, passed as a 64-bit value
								     * @param b second multiplier, passed as a 64-bit value
								     * @param resultHigh receives bits 32..63 of the product a*b
								     * @param resultLow receives bits 0..31 of the product a*b
								     */
								    // TODO: See single-instruction implementations in original Philox source code
								    constexpr void multiplyAndSplit64to32(
								        std::uint64_t const a,
								        std::uint64_t const b,
								        std::uint32_t& resultHigh,
								        std::uint32_t& resultLow)
								    {
								        std::uint64_t const product = a * b;
								        resultHigh = high32Bits(product);
								        resultLow = low32Bits(product);
								    }
								} // namespace alpaka::rand::engine::internal
								// ==
								// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/multiplyAndSplit64to32.hpp ==
								// ============================================================================


							// #include <cstdint>    // amalgamate: file already included
							// #include <utility>    // amalgamate: file already included

							namespace alpaka::rand::engine::internal
							{
							    /** Numeric constants of the Philox algorithm
							     *
							     * The values are taken from the reference Philox implementation:
							     *
							     * J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
							     * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking,
							     * Storage and Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
							     *
							     * The constants are exposed as functions instead of static const data members, because GCC
							     * assumes types with static data members to be not mappable and makes no
							     * exception for constexpr ones. This is a valid interpretation of the
							     * OpenMP <= 4.5 standard. In OpenMP >= 5.0 types with any kind of static
							     * data member are mappable.
							     */
							    class PhiloxConstants
							    {
							    public:
							        /// First Weyl sequence parameter: the golden ratio
							        static consteval auto WEYL_64_0() -> std::uint64_t
							        {
							            return std::uint64_t{0x9E37'79B9'7F4A'7C15};
							        }

							        /// Second Weyl sequence parameter: \f$ \sqrt{3}-1 \f$
							        static consteval auto WEYL_64_1() -> std::uint64_t
							        {
							            return std::uint64_t{0xBB67'AE85'84CA'A73B};
							        }

							        /// 32-bit variant of the first Weyl parameter: the upper half of WEYL_64_0()
							        static consteval auto WEYL_32_0() -> std::uint32_t
							        {
							            constexpr std::uint64_t wide = WEYL_64_0();
							            return high32Bits(wide);
							        }

							        /// 32-bit variant of the second Weyl parameter: the upper half of WEYL_64_1()
							        static consteval auto WEYL_32_1() -> std::uint32_t
							        {
							            constexpr std::uint64_t wide = WEYL_64_1();
							            return high32Bits(wide);
							        }

							        /// First Philox S-box multiplier (the identifier spelling is part of the existing interface)
							        static consteval auto MULTIPLITER_4x32_0() -> std::uint32_t
							        {
							            return std::uint32_t{0xCD9E'8D57};
							        }

							        /// Second Philox S-box multiplier (the identifier spelling is part of the existing interface)
							        static consteval auto MULTIPLITER_4x32_1() -> std::uint32_t
							        {
							            return std::uint32_t{0xD251'1F53};
							        }
							    };
							} // namespace alpaka::rand::engine::internal
							// ==
							// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/PhiloxConstants.hpp ==
							// ============================================================================

						// #include "alpaka/rand/engine/philox/multiplyAndSplit64to32.hpp"    // amalgamate: file already inlined

						// #include <utility>    // amalgamate: file already included

						namespace alpaka::rand::engine::internal
						{
						    /** Compile-time parameter bundle selecting one Philox variant
						     *
						     * The template arguments are republished unchanged as static members so that engines can query them
						     * through a single parameter type.
						     *
						     * @tparam TCounterSize number of elements in the counter
						     * @tparam TWidth width of one counter element (in bits)
						     * @tparam TRounds number of S-box rounds
						     */
						    template<unsigned TCounterSize, unsigned TWidth, unsigned TRounds>
						    struct PhiloxParams
						    {
						        /// Number of elements in the counter
						        static constexpr unsigned counterSize{TCounterSize};
						        /// Width of one counter element in bits
						        static constexpr unsigned width{TWidth};
						        /// Number of S-box rounds
						        static constexpr unsigned rounds{TRounds};
						    };

						    /** Stateless core of the Philox family of counter-based PRNGs
						     *
						     * Validates the algorithm parameters at compile time and provides the Philox shuffle as static
						     * functions of a counter and a key; no state is kept in this class.
						     *
						     * @tparam T_Params Philox algorithm parameters \sa PhiloxParams
						     */
						    template<typename T_Params>
						    class PhiloxStateless
						    {
						        /// Number of rounds requested by the parameters
						        static constexpr unsigned numRounds()
						        {
						            return T_Params::rounds;
						        }

						        /// Number of elements in the counter
						        static constexpr unsigned vectorSize()
						        {
						            return T_Params::counterSize;
						        }

						        /// Width of one counter element in bits
						        static constexpr unsigned numberWidth()
						        {
						            return T_Params::width;
						        }

						        static_assert(numRounds() > 0, "Number of Philox rounds must be > 0.");
						        static_assert(vectorSize() % 2 == 0, "Philox counter size must be an even number.");
						        static_assert(vectorSize() <= 16, "Philox SP network is not specified for sizes > 16.");
						        static_assert(numberWidth() % 8 == 0, "Philox number width in bits must be a multiple of 8.");

						        static_assert(numberWidth() == 32, "Philox implemented only for 32 bit numbers.");

						    public:
						        /// Counter: counterSize elements of 32 bits each
						        using Counter = alpaka::Vec<std::uint32_t, T_Params::counterSize>;
						        /// Key: half as many 32-bit elements as the counter
						        using Key = alpaka::Vec<std::uint32_t, T_Params::counterSize / 2>;

						    protected:
						        /** One round of the Philox shuffle
						         *
						         * Multiplies counter elements 0 and 2 by the two S-box multipliers and recombines the high and low
						         * product halves with counter elements 1 and 3 and both key elements. Only elements 0..3 of the
						         * counter and 0..1 of the key are read.
						         *
						         * @param counter state of the counter
						         * @param key value of the key
						         * @return shuffled counter
						         */
						        static constexpr auto singleRound(Counter const& counter, Key const& key)
						        {
						            std::uint32_t hi0{};
						            std::uint32_t lo0{};
						            std::uint32_t hi1{};
						            std::uint32_t lo1{};
						            multiplyAndSplit64to32(counter[0], PhiloxConstants::MULTIPLITER_4x32_0(), hi0, lo0);
						            multiplyAndSplit64to32(counter[2], PhiloxConstants::MULTIPLITER_4x32_1(), hi1, lo1);
						            return Counter{hi1 ^ counter[1] ^ key[0], lo1, hi0 ^ counter[3] ^ key[1], lo0};
						        }

						        /** Step the \a key along the Weyl sequence
						         *
						         * @param key the key to be bumped
						         * @return the key with the 32-bit Weyl constants added element-wise
						         */
						        static constexpr auto bumpKey(Key const& key)
						        {
						            return Key{key[0] + PhiloxConstants::WEYL_32_0(), key[1] + PhiloxConstants::WEYL_32_1()};
						        }

						        /** Run the complete Philox shuffle
						         *
						         * One round is applied with the unmodified key; afterwards the key is bumped and another round is
						         * applied numRounds() times. In total singleRound() therefore runs numRounds() + 1 times.
						         * NOTE(review): the reference algorithm applies exactly R rounds; confirm whether the additional
						         * round is intended before changing it, since doing so alters every generated sequence.
						         *
						         * @param counter_in initial state of the counter
						         * @param key_in initial state of the key
						         * @return result of the PRNG shuffle; has the same size as the counter
						         */
						        static constexpr auto nRounds(Counter const& counter_in, Key const& key_in) -> Counter
						        {
						            Key key{key_in};
						            Counter counter = singleRound(counter_in, key);

						            // Keep the trip count in a constexpr variable so that it is a compile-time constant.
						            constexpr unsigned rounds = numRounds();

						            for(unsigned remaining = rounds; remaining != 0u; --remaining)
						            {
						                key = bumpKey(key);
						                counter = singleRound(counter, key);
						            }

						            return counter;
						        }

						    public:
						        /** Generate one block of random numbers (\p TCounterSize x 32 bit)
						         *
						         * @param counter initial state of the counter
						         * @param key initial state of the key
						         * @return result of the PRNG shuffle; has the same size as the counter
						         */
						        static constexpr auto generate(Counter const& counter, Key const& key) -> Counter
						        {
						            return nRounds(counter, key);
						        }
						    };
						} // namespace alpaka::rand::engine::internal
						// ==
						// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/PhiloxStateless.hpp ==
						// ============================================================================


					namespace alpaka::rand::engine::internal
					{
					    /** Common class for Philox family engines
					     *
					     * Relies on `PhiloxStateless` to provide the PRNG and adds the state needed to keep track of the counter.
					     *
					     * @tparam T_Params Philox algorithm parameters \sa PhiloxParams
					     * @tparam T_Impl engine type implementation (CRTP)
					     *
					     * static const data members are transformed into functions, because GCC
					     * assumes types with static data members to be not mappable and makes no
					     * exception for constexpr ones. This is a valid interpretation of the
					     * OpenMP <= 4.5 standard. In OpenMP >= 5.0 types with any kind of static
					     * data member are mappable.
					     */
					    template<typename T_Params, typename T_Impl>
					    class PhiloxBaseCommon : public PhiloxStateless<T_Params>
					    {
					    public:
					        using Counter = typename PhiloxStateless<T_Params>::Counter;
					        using Key = typename PhiloxStateless<T_Params>::Key;
					        /// State type
					        using State = PhiloxState<Counter, Key, T_Impl>;

					        /// Internal engine state
					        State state;
					        /// Distribution container type
					        template<typename TDistributionResultScalar>
					        using ResultContainer = Vec<TDistributionResultScalar, T_Params::counterSize>;

					        /// Take over the initial engine state
					        constexpr explicit PhiloxBaseCommon(State&& state) : state(std::move(state))
					        {
					        }

					    protected:
					        /** Advance the \a counter to the next state
					         *
					         * Increments the passed-in \a counter by one with a 128-bit carry: elements 0..3 are treated as
					         * limbs of increasing significance and a limb that wraps around to zero carries into the next one.
					         *
					         * @param counter reference to the counter which is to be advanced
					         */
					        template<typename T, auto N>
					        static constexpr void advanceCounter(alpaka::Vec<T, N>& counter)
					        {
					            ++counter[0];

					            /* 128-bit carry */
					            if(counter[0] == 0)
					            {
					                ++counter[1];
					                if(counter[1] == 0)
					                {
					                    ++counter[2];
					                    if(counter[2] == 0)
					                    {
					                        ++counter[3];
					                    }
					                }
					            }
					        }

					        /** Advance the internal state counter by \a offset N-vectors (N = counter size)
					         *
					         * Adds the 64-bit \a offset to the 128-bit value held in this->state.counter (element 0 is the
					         * least significant 32-bit limb). An overflow out of element 3 is discarded.
					         *
					         * @param offset number of N-vectors to skip
					         */
					        constexpr void skip4(uint64_t offset)
					        {
					            Counter& counter = this->state.counter;

					            // Every limb sum is formed in 64 bits: its lower half is the new limb value and its upper
					            // half is exactly the carry into the next limb. Comparing the new limb against the old one
					            // is not sufficient once a carry-in takes part in the addition.
					            std::uint64_t sum = static_cast<std::uint64_t>(counter[0]) + low32Bits(offset);
					            counter[0] = low32Bits(sum);

					            sum = static_cast<std::uint64_t>(counter[1]) + high32Bits(offset) + high32Bits(sum);
					            counter[1] = low32Bits(sum);

					            sum = static_cast<std::uint64_t>(counter[2]) + high32Bits(sum);
					            counter[2] = low32Bits(sum);

					            counter[3] += high32Bits(sum);
					        }

					        /** Advance the counter by the length of \a subsequence
					         *
					         * Adds the 64-bit \a subsequence to the upper half of this->state.counter (elements 2 and 3).
					         * An overflow out of element 3 is discarded.
					         *
					         * @param subsequence number of subsequences to skip
					         */
					        constexpr void skipSubsequence(uint64_t subsequence)
					        {
					            Counter& counter = this->state.counter;
					            Counter temp = counter;
					            counter[2] += low32Bits(subsequence);
					            // Element 2 wrapped around exactly if it is now smaller than before the addition.
					            counter[3] += high32Bits(subsequence) + (counter[2] < temp[2] ? 1 : 0);
					        }
					    };
					} // namespace alpaka::rand::engine::internal
					// ==
					// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/PhiloxBaseCommon.hpp ==
					// ============================================================================

				// #include "alpaka/rand/engine/philox/multiplyAndSplit64to32.hpp"    // amalgamate: file already inlined

				// #include <utility>    // amalgamate: file already included

				namespace alpaka::rand::engine::internal
				{


				    /** Philox engine generating a single number
				     *
				     * This engine's operator() returns a single number. One Philox invocation produces as many numbers as
				     * the counter has elements, so the generated block is stored between individual invocations of
				     * operator(), together with the count of numbers already handed out from it (the position).
				     *
				     * @tparam TParams Basic parameters for the Philox algorithm
				     */
				    template<typename TParams>
				    class PhiloxSingle : public PhiloxBaseCommon<TParams, PhiloxSingle<TParams>>
				    {
				    public:
				        using Base = PhiloxBaseCommon<TParams, PhiloxSingle<TParams>>;

				        /// Counter type
				        using Counter = typename Base::Counter;
				        /// Key type
				        using Key = typename Base::Key;
				        using State = PhiloxState<Counter, Key, PhiloxSingle<TParams>>;


				    protected:
				        /** Advance internal counter to the next value
				         *
				         * Advances the full internal counter array, stores the freshly generated block to be recalled when
				         * the user requests a number and resets the position to zero.
				         */
				        constexpr void advanceState()
				        {
				            this->advanceCounter(this->state.counter);
				            this->state.result = this->nRounds(this->state.counter, this->state.key);
				            this->state.position = 0;
				        }

				        /** Get the next random number and advance internal state
				         *
				         * The stored block holds N = TParams::counterSize numbers. The next number is always kept in
				         * element zero. After handing it out, either a new block is generated (if all N numbers have been
				         * used) or the remaining numbers are moved one element towards the front.
				         *
				         * @return The next random number
				         */
				        constexpr auto nextNumber()
				        {
				            // Element zero will always contain the next valid random number.
				            auto result = this->state.result[0];
				            ++this->state.position;
				            if(this->state.position == TParams::counterSize)
				            {
				                advanceState();
				            }
				            else
				            {
				                /* Shift state results to allow hard coded access to element zero.
				                 * This will avoid high register usage on NVIDIA devices.
				                 * @todo Check if this shifting of the result vector is decreasing CPU performance.
				                 *       If so this optimization for GPUs (mostly NVIDIA/AMD) should be made optional.
				                 */
				                this->state.result[0] = this->state.result[1];
				                this->state.result[1] = this->state.result[2];
				                this->state.result[2] = this->state.result[3];
				            }

				            return result;
				        }

				        /** Skips the next \a offset numbers
				         *
				         * Leaves the engine in the state that \a offset consecutive calls of nextNumber() would produce,
				         * but jumps over whole blocks by advancing the counter directly. Whenever the skip leaves the
				         * current block, the target block is generated, so the stored result always matches the counter.
				         * A position equal to the block size (all numbers consumed) is accepted as input; the constructor
				         * uses this to generate the very first block.
				         *
				         * @param offset number of random numbers to skip
				         */
				        constexpr void skip(uint64_t offset)
				        {
				            static_assert(TParams::counterSize == 4, "Only counterSize == 4 is supported.");
				            using Position = decltype(this->state.position);

				            // Index the next number would have if the current block were unbounded (0..7).
				            std::uint64_t const target = static_cast<std::uint64_t>(this->state.position) + (offset & 3);
				            // Number of block boundaries crossed; offset / 4 < 2^62, so adding one cannot overflow.
				            std::uint64_t const blocks = offset / 4 + (target < 4 ? 0u : 1u);

				            // Within the current block only the newly skipped numbers have to be shifted out.
				            std::uint64_t shifts = offset & 3;
				            if(blocks != 0)
				            {
				                // advanceState() steps the counter once more, hence jump over one block less here.
				                this->skip4(blocks - 1);
				                advanceState();
				                // The block is fresh: shift out everything in front of the target position.
				                shifts = target & 3;
				            }

				            for(; shifts > 0; --shifts)
				            {
				                // Shift state results to allow hard coded access to element zero.
				                // This will avoid high register usage on NVIDIA devices.
				                this->state.result[0] = this->state.result[1];
				                this->state.result[1] = this->state.result[2];
				                this->state.result[2] = this->state.result[3];
				            }
				            this->state.position = static_cast<Position>(target & 3);
				        }

				    public:
				        /** Construct a new Philox engine with single-value output
				         *
				         * @param seed Set the Philox generator key
				         * @param subsequence Select a subsequence of size 2^64
				         * @param offset Skip \a offset numbers from the start of the subsequence
				         */
				        constexpr PhiloxSingle(uint64_t seed = 0, uint64_t subsequence = 0, uint64_t offset = 0)
				            : Base(State{{0, 0, 0, 0}, {low32Bits(seed), high32Bits(seed)}, {0, 0, 0, 0}, 0u})
				        {
				            this->skipSubsequence(subsequence);
				            // No block has been generated yet. Marking the empty result buffer as fully consumed makes
				            // skip() generate the block that contains number `offset` and position the engine on it, so
				            // the part of the offset that is not a multiple of the block size is honored as well.
				            this->state.position = static_cast<decltype(this->state.position)>(TParams::counterSize);
				            skip(offset);
				        }

				        /** Get the next random number
				         *
				         * @return The next random number
				         */
				        constexpr auto operator()()
				        {
				            return nextNumber();
				        }
				    };
				} // namespace alpaka::rand::engine::internal
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/PhiloxSingle.hpp ==
				// ============================================================================

				// ============================================================================
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/PhiloxVector.hpp ==
				// ==
				/* Copyright 2022 Jiri Vyskocil, Bernhard Manfred Gruber
				 * SPDX-License-Identifier: MPL-2.0
				 */

				// #pragma once
				// #include "alpaka/rand/engine/philox/PhiloxBaseCommon.hpp"    // amalgamate: file already inlined
				// #include "alpaka/rand/engine/philox/multiplyAndSplit64to32.hpp"    // amalgamate: file already inlined

				namespace alpaka::rand::engine::internal
				{
				    /** Philox engine generating a vector of numbers
				     *
				     * This engine's operator() will return a vector of numbers corresponding to the full size of its counter.
				     * This is a convenience vs. memory size tradeoff since the user has to deal with the output array
				     * themselves, but the internal state consists only of a single counter and a key.
				     *
				     * @tparam T_Params Basic parameters for the Philox algorithm
				     */
				    template<typename T_Params>
				    class PhiloxVector : public PhiloxBaseCommon<T_Params, PhiloxVector<T_Params>>
				    {
				    public:
				        using Base = PhiloxBaseCommon<T_Params, PhiloxVector<T_Params>>;

				        /// Counter type
				        using Counter = typename Base::Counter;
				        /// Key type
				        using Key = typename Base::Key;
				        using State = PhiloxState<Counter, Key, PhiloxVector<T_Params>>;
				        template<typename TDistributionResultScalar>
				        using ResultContainer = typename Base::template ResultContainer<TDistributionResultScalar>;

				    protected:
				        /** Get the next array of random numbers and advance internal state
				         *
				         * The counter is advanced first; the returned vector is generated from the advanced counter.
				         *
				         * @return The next array of random numbers
				         */
				        constexpr auto nextVector()
				        {
				            this->advanceCounter(this->state.counter);
				            return this->nRounds(this->state.counter, this->state.key);
				        }

				        /** Skips the next \a offset vectors
				         *
				         * Unlike its counterpart in \a PhiloxSingle, this function advances the state in multiples of the
				         * counter size thus skipping the entire array of numbers.
				         *
				         * @param offset number of vectors to skip
				         */
				        constexpr void skip(uint64_t offset)
				        {
				            this->skip4(offset);
				        }

				    public:
				        /** Construct a new Philox engine with vector output
				         *
				         * @param seed Set the Philox generator key
				         * @param subsequence Select a subsequence of size 2^64
				         * @param offset Skip \a offset vectors from the start of the subsequence
				         */
				        constexpr explicit PhiloxVector(uint64_t seed = 0, uint64_t subsequence = 0, uint64_t offset = 0)
				            : Base(State{{0, 0, 0, 0}, {low32Bits(seed), high32Bits(seed)}})
				        {
				            this->skipSubsequence(subsequence);
				            skip(offset);
				            // Step the counter once, exactly as a discarded nextVector() call would, but without
				            // computing a block of random numbers that nobody reads.
				            this->advanceCounter(this->state.counter);
				        }

				        /** Get the next vector of random numbers
				         *
				         * @return The next vector of random numbers
				         */
				        constexpr auto operator()()
				        {
				            return nextVector();
				        }
				    };
				} // namespace alpaka::rand::engine::internal
				// ==
				// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/PhiloxVector.hpp ==
				// ============================================================================


			// #include <cstdint>    // amalgamate: file already included
			// #include <limits>    // amalgamate: file already included
			#include <random>
			#include <type_traits>

			namespace alpaka::rand::engine
			{

			    /** Most common Philox engine variant, outputs single number
			     *
			     * This variant of the Philox engine hands out one 32-bit unsigned integer per call. The counter size is
			     * \f$4 \times 32 = 128\f$ bits and the engine is parameterized with 10 rounds of the bit shuffle.
			     * Since the engine returns a single number, the generated result, which has the same
			     * size as the counter, has to be stored between invocations. Additionally a 32 bit pointer is stored. The
			     * total size of the state is 352 bits = 44 bytes.
			     *
			     * Ref.: J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
			     * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and
			     * Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
			     */
			    class Philox4x32x10
			    {
			    public:
			        /// Philox algorithm: 10 rounds, 4 numbers of size 32.
			        using EngineParams = internal::PhiloxParams<4, 32, 10>;
			        /// Engine outputs a single number
			        using EngineVariant = internal::PhiloxSingle<EngineParams>;

			        // The members below model the STL UniformRandomBitGenerator requirements; min() and max()
			        // describe the range of the generated numbers.
			        // https://en.cppreference.com/w/cpp/named_req/UniformRandomBitGenerator
			        using result_type = std::uint32_t;

			        /** Initialize a new Philox engine
			         *
			         * @param seed Set the Philox generator key
			         * @param subsequence Select a subsequence of size 2^64
			         * @param offset Skip \a offset numbers from the start of the subsequence
			         */
			        constexpr explicit Philox4x32x10(
			            std::uint64_t const seed = 0,
			            std::uint64_t const subsequence = 0,
			            std::uint64_t const offset = 0)
			            : engineVariant(seed, subsequence, offset)
			        {
			        }

			        /// Smallest value operator() can return
			        static constexpr auto min() -> result_type
			        {
			            return std::numeric_limits<result_type>::min();
			        }

			        /// Largest value operator() can return
			        static constexpr auto max() -> result_type
			        {
			            return std::numeric_limits<result_type>::max();
			        }

			        /// Draw the next number from the wrapped engine
			        constexpr auto operator()() -> result_type
			        {
			            result_type const number = engineVariant();
			            return number;
			        }

			    private:
			        /// The wrapped single-number engine
			        EngineVariant engineVariant;
			    };

			    /** Most common Philox engine variant, outputs a 4-vector of numbers
			     *
			     * This variant of the Philox engine returns the whole generated block: a vector of four 32-bit unsigned
			     * integers. The counter size is \f$4 \times 32 = 128\f$ bits. Since the engine returns the whole generated
			     * vector, it is up to the user to extract individual numbers as they need. The benefit is smaller state
			     * size since the state does not contain the intermediate results. The total size of the state is
			     * 192 bits = 24 bytes.
			     *
			     * Ref.: J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, 2, 3,"
			     * SC '11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and
			     * Analysis, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
			     */
			    class Philox4x32x10Vector
			    {
			    public:
			        /// Philox algorithm: 10 rounds, 4 numbers of size 32.
			        using EngineParams = internal::PhiloxParams<4, 32, 10>;
			        /// Engine outputs a whole vector of numbers
			        using EngineVariant = internal::PhiloxVector<EngineParams>;

			        /** Initialize a new Philox engine
			         *
			         * All arguments are 64 bits wide, matching the wrapped engine and Philox4x32x10, so that the upper
			         * half of the seed reaches the key and large subsequences/offsets are not truncated.
			         *
			         * @param seed Set the Philox generator key
			         * @param subsequence Select a subsequence of size 2^64
			         * @param offset Number of vectors to skip from the start of the subsequence.
			         */
			        constexpr explicit Philox4x32x10Vector(
			            std::uint64_t const seed = 0,
			            std::uint64_t const subsequence = 0,
			            std::uint64_t const offset = 0)
			            : engineVariant(seed, subsequence, offset)
			        {
			        }

			        /// Container a distribution uses to return one value of type TScalar per generated number
			        template<typename TScalar>
			        using ResultContainer = EngineVariant::ResultContainer<TScalar>;

			        /// Type of a single generated number
			        using ResultInt = std::uint32_t;
			        /// Type of the vector returned by operator()
			        using ResultVec = decltype(std::declval<EngineVariant>()());

			        /// Smallest value a single generated number can take
			        static constexpr auto min() -> ResultInt
			        {
			            return 0;
			        }

			        /// Largest value a single generated number can take
			        static constexpr auto max() -> ResultInt
			        {
			            return std::numeric_limits<ResultInt>::max();
			        }

			        /// Draw the next vector of numbers from the wrapped engine
			        constexpr auto operator()() -> ResultVec
			        {
			            return engineVariant();
			        }

			    private:
			        /// The wrapped vector engine
			        EngineVariant engineVariant;
			    };


			} // namespace alpaka::rand::engine
			// ==
			// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/engine/philox/philox.hpp ==
			// ============================================================================


		namespace alpaka::rand::distribution::internal
		{
		    /** Returns a constant, which is equivalent to std::nextafter(T_Floating{1}, T_Floating{0})
		     * or more specifically: returns the highest floating-point value lower than one.
		     * There does not seem to exist a representation of this particular floating-point number in
		     * std::numeric_limits and std::nextafter is not constexpr (C++20).
		     *
		     * The value is built from its bit pattern (sign 0, exponent one below that of 1.0, all mantissa bits set)
		     * and is only provided for floating-point types of 4 and 8 bytes; any other size is rejected at compile
		     * time instead of silently running off the end of the function.
		     *
		     * @tparam T_Floating floating-point type of the result
		     * @return the largest value of T_Floating that is smaller than one
		     */
		    template<std::floating_point T_Floating>
		    consteval T_Floating prevOne() noexcept
		    {
		        static_assert(
		            sizeof(T_Floating) == 4 || sizeof(T_Floating) == 8,
		            "prevOne() is only implemented for 32-bit and 64-bit floating-point types.");

		        if constexpr(sizeof(T_Floating) == 4)
		        {
		            return std::bit_cast<T_Floating>(static_cast<std::uint32_t>(0x3f7f'ffff));
		        }
		        else
		        {
		            return std::bit_cast<T_Floating>(static_cast<std::uint64_t>(0x3fef'ffff'ffff'ffff));
		        }
		    }

		    /**
		     * Named compile-time constants for converting random integer bits to floating-point values; they exist
		     * to keep the conversion code readable.
		     *
		     * @tparam T_Integer unsigned integer type delivering the random bits
		     * @tparam T_Floating floating-point type the bits are converted to
		     */
		    template<std::unsigned_integral T_Integer, std::floating_point T_Floating>
		    struct Constants
		    {
		        /// width of one bucket when mapping the integer range onto the floating-point range [0,1]:
		        /// one divided by the largest integer value
		        static constexpr T_Floating normalizedIntervalBin
		            = static_cast<T_Floating>(1) / static_cast<T_Floating>(std::numeric_limits<T_Integer>::max());
		        /// half of a bucket; this offset has been used by nvidia curand to respect the lower bounds criteria:
		        /// it essentially shifts the distribution to the bin center (on the upper bounds this shift is
		        /// mitigated due to rounding -- meaning 1 is not exceeded)
		        static constexpr T_Floating halfBinWidth = normalizedIntervalBin / static_cast<T_Floating>(2);
		        /// slightly smaller bucket width, std::nextafter(1,0) divided by the largest integer value, used to
		        /// enforce the (open-) upper bounds criteria
		        static constexpr T_Floating normalizedOpenIntervalBin
		            = prevOne<T_Floating>() / static_cast<T_Floating>(std::numeric_limits<T_Integer>::max());
		    };

		    /** Convert an integer RNG result to a floating-point value.
		     *
		     * Primary template: it is only declared here, never defined. The conversions are provided by partial
		     * specializations, one per interval type; requesting an interval for which no specialization exists
		     * names an incomplete type and is therefore rejected at compile time.
		     *
		     * @tparam T_Engine engine type the conversion is associated with
		     * @tparam T_Interval interval type selecting the specialization
		     * @tparam T_Integer integer type delivered by the engine
		     * @tparam T_Floating floating-point type of the converted value
		     */
		    template<typename T_Engine, concepts::Interval T_Interval, std::integral T_Integer, std::floating_point T_Floating>
		    struct IntervalAwareConversion;

		    /** Converts an integer RNG output to a floating-point type in the interval [0, 1).
		     * The integer is scaled by the "open" bucket width, i.e. the integer range is mapped onto the range
		     * from zero up to std::nextafter(1,0), the highest representable floating-point value lower than one.
		     */
		    template<typename T_Engine, std::integral T_Integer, std::floating_point T_Floating>
		    struct IntervalAwareConversion<T_Engine, interval::CO, T_Integer, T_Floating>
		    {
		        /// @param i random integer to convert
		        /// @return \a i scaled by the open bucket width
		        constexpr auto operator()(T_Integer const& i) const
		        {
		            using Scale = Constants<T_Integer, T_Floating>;
		            return static_cast<T_Floating>(i) * Scale::normalizedOpenIntervalBin;
		        }
		    };

		    /** Convert an integer RNG output to a floating-point type in the interval (0, 1).
		     * The integer is first scaled by the "open" bucket width, i.e. mapped onto the range from zero up to
		     * std::nextafter(1,0), the highest representable floating-point value lower than one.
		     * Afterwards half a bucket is added to avoid generating zero - inspired by NVIDIA's cuRAND approach.
		     * **Note:** The bounds criteria is only strictly valid for the base interval (0, 1).
		     * When applying a scaling factor > 1, rounding effects may still cause the lower or upper bound
		     * to be hit. See **scaleInterval()** for the post-scaling correction.
		     */
		    template<typename T_Engine, std::integral T_Integer, std::floating_point T_Floating>
		    struct IntervalAwareConversion<T_Engine, interval::OO, T_Integer, T_Floating>
		    {
		        /// @param i random integer to convert
		        /// @return \a i scaled by the open bucket width, shifted up by half a bucket
		        constexpr auto operator()(T_Integer const& i) const
		        {
		            using Scale = Constants<T_Integer, T_Floating>;
		            // Kept as one expression, exactly like the original multiply-then-add.
		            return static_cast<T_Floating>(i) * Scale::normalizedOpenIntervalBin + Scale::halfBinWidth;
		        }
		    };

		    /** Convert an integer RNG output to a floating-point type in the interval (0, 1].
		     * The integer is scaled by the regular bucket width; afterwards half a bucket is added to avoid
		     * generating zero - inspired by NVIDIA's cuRAND approach.
		     * **Note:** The bounds criteria is only strictly valid for the base interval (0, 1].
		     * When applying a scaling factor > 1, rounding effects may still cause the lower or upper bound
		     * to be hit. See **scaleInterval()** for the post-scaling correction.
		     */
		    template<typename T_Engine, std::integral T_Integer, std::floating_point T_Floating>
		    struct IntervalAwareConversion<T_Engine, interval::OC, T_Integer, T_Floating>
		    {
		        /// @param i random integer to convert
		        /// @return \a i scaled by the bucket width, shifted up by half a bucket
		        constexpr auto operator()(T_Integer const& i) const
		        {
		            using Scale = Constants<T_Integer, T_Floating>;
		            // Kept as one expression, exactly like the original multiply-then-add.
		            return static_cast<T_Floating>(i) * Scale::normalizedIntervalBin + Scale::halfBinWidth;
		        }
		    };

		    /** Converts an integer RNG output to a floating point type in the closed interval [0, 1].
		     * The integer is scaled by the regular bucket width (one divided by the largest integer value).
		     */
		    template<typename T_Engine, std::integral T_Integer, std::floating_point T_Floating>
		    struct IntervalAwareConversion<T_Engine, interval::CC, T_Integer, T_Floating>
		    {
		        /// @param i random integer to convert
		        /// @return \a i scaled by the bucket width
		        constexpr auto operator()(T_Integer const& i) const
		        {
		            using Scale = Constants<T_Integer, T_Floating>;
		            return static_cast<T_Floating>(i) * Scale::normalizedIntervalBin;
		        }
		    };

		    /** Adapt the bit length of the engine output to match the target type.
		     *
		     * This is the default case where the engine result type already matches and thus the engine is simply
		     * invoked. It is selected whenever none of the width-converting partial specializations applies.
		     *
		     * @tparam T_Engine callable random bit generator
		     * @tparam byteLengthEngineResult size in bytes of the value returned by the engine (must be 4 or 8)
		     * @tparam byteLengthRealType size in bytes of the requested floating-point type (must be 4 or 8)
		     */
		    template<typename T_Engine, uint32_t byteLengthEngineResult, uint32_t byteLengthRealType>
		    struct BitLengthConformityAdapter
		    {
		        // Each assertion validates exactly the quantity its message refers to, so that an unsupported
		        // engine width and an unsupported floating-point width are reported with the matching diagnostic.
		        static_assert(
		            (byteLengthEngineResult == 4u || byteLengthEngineResult == 8u),
		            "Result returned by the randomBitGenerator does not have a length that is accepted by the uniformReal "
		            "distribution!");
		        static_assert(
		            (byteLengthRealType == 4u || byteLengthRealType == 8u),
		            "The requested floating point type does not have a length that is accepted by the uniformReal "
		            "distribution!");
		        // Mixed widths (4 -> 8, 8 -> 4) are handled by dedicated specializations; reaching the primary
		        // template with differing widths would indicate a dispatch error.
		        static_assert(
		            byteLengthEngineResult == byteLengthRealType,
		            "By logic this should never fail in case the compiler accepts the specialization of the adapter!");

		        /** Invoke the engine once and forward its result unchanged.
		         *
		         * @param engine random bit generator to draw from
		         * @return the value produced by `engine()`
		         */
		        constexpr auto operator()(T_Engine& engine)
		        {
		            return engine();
		        }
		    };

		    /** Adapts a 32-bit engine output to a 64-bit value. This involves invoking the engine twice.
		     *
		     * The first engine result becomes the upper 32 bits, the second engine result the lower 32 bits.
		     */
		    template<typename T_Engine>
		    struct BitLengthConformityAdapter<T_Engine, 4u, 8u>
		    {
		        /** Draw two values from the engine and concatenate them.
		         *
		         * @param engine random bit generator returning 32-bit values
		         * @return 64-bit value `(first << 32) | second`
		         */
		        constexpr auto operator()(T_Engine& engine)
		        {
		            // The two draws are separate statements on purpose: the operands of `|` have no guaranteed
		            // evaluation order, so drawing inside one expression would let the compiler decide which
		            // engine output ends up in the upper word and make the stream compiler dependent.
		            auto const highBits = static_cast<uint64_t>(engine());
		            auto const lowBits = static_cast<uint64_t>(engine());
		            return (highBits << 32) | lowBits;
		        }
		    };

		    /** Adapt a 64-bit engine output to a 32-bit value.
		     *
		     * The engine is invoked once and its result is narrowed with a static_cast to uint32_t.
		     */
		    template<typename T_Engine>
		    struct BitLengthConformityAdapter<T_Engine, 8u, 4u>
		    {
		        /** @param engine random bit generator returning 64-bit values
		         *  @return the engine result converted to uint32_t
		         */
		        constexpr auto operator()(T_Engine& engine)
		        {
		            auto const wideBits = engine();
		            return static_cast<uint32_t>(wideBits);
		        }
		    };

		    /** Generate a floating-point value in the requested interval.
		     *
		     * Two steps are chained:
		     *  1. BitLengthConformityAdapter draws from `engine` and yields an integer whose byte length equals
		     *     sizeof(T_Result).
		     *  2. IntervalAwareConversion turns that integer into a normalized T_Result respecting the bounds
		     *     criteria of T_Interval.
		     *
		     * @tparam T_Interval interval tag selecting the bounds criteria
		     * @tparam T_Result floating-point type to return
		     * @param engine random bit generator; invoked through the adapter
		     * @return normalized floating-point value
		     */
		    template<concepts::Interval T_Interval, std::floating_point T_Result, typename T_Engine>
		    constexpr auto getNormalizedUniformReal(T_Engine& engine) -> T_Result
		    {
		        using T_EngineResult = std::remove_cvref_t<decltype(engine())>;
		        constexpr auto engineBytes = static_cast<uint32_t>(sizeof(T_EngineResult));
		        constexpr auto resultBytes = static_cast<uint32_t>(sizeof(T_Result));

		        // integer with the same byte length as T_Result
		        using Adapter = BitLengthConformityAdapter<T_Engine, engineBytes, resultBytes>;
		        auto adaptedBits = Adapter{}(engine);

		        // integer -> floating point, honoring the requested bounds criteria
		        using Conversion = IntervalAwareConversion<T_Engine, T_Interval, ALPAKA_TYPEOF(adaptedBits), T_Result>;
		        return Conversion{}(adaptedBits);
		    }
		    /** Wrapper around an engine that returns a vector of values.
		     *
		     * Only declared here; the definitions are provided by partial specializations selected on the
		     * template arguments.
		     *
		     * @tparam T_Engine engine type satisfying concepts::UniformVectorEngine
		     * @tparam TResultSize byte size the caller passes for the requested result type
		     * @tparam TElemSize byte size the caller passes for one element of the engine result
		     * @tparam TElems number of elements the caller passes for the engine result
		     */
		    template<concepts::UniformVectorEngine T_Engine, uint32_t TResultSize, uint32_t TElemSize, uint32_t TElems>
		    struct vectorDispatchWrapper;

		    /** Wrapper specialization for a result size of 4 bytes.
		     *
		     * Each call invokes the wrapped engine once and uses only element 0 of the returned vector,
		     * converted to uint32_t.
		     */
		    template<concepts::UniformVectorEngine T_Engine, uint32_t TElemSize, uint32_t TElems>
		    struct vectorDispatchWrapper<T_Engine, 4u, TElemSize, TElems>
		    {
		        static_assert(TElems > 0, "RandomEngine did not return any elements!");

		        //! wrapped engine; stored by reference, so the engine must outlive the wrapper
		        T_Engine& ph;

		        /** @param eng engine to wrap */
		        constexpr explicit vectorDispatchWrapper(T_Engine& eng) : ph(eng)
		        {
		        }

		        /** @return element 0 of a fresh engine result, as uint32_t */
		        constexpr uint32_t operator()() const
		        {
		            auto lanes = ph();
		            return static_cast<uint32_t>(lanes[0]);
		        }
		    };

		    /** Wrapper specialization enabling efficient generation of 64-bit values from vectorized engines
		     * without requiring two engine calls.
		     *
		     * Selected for an 8 byte result built from 4 byte elements: a single engine invocation supplies both
		     * halves, element 0 as the upper and element 1 as the lower 32 bits.
		     */
		    template<concepts::UniformVectorEngine T_Engine, uint32_t TElems>
		    struct vectorDispatchWrapper<T_Engine, 8u, 4u, TElems>
		    {
		        //! wrapped engine; stored by reference, so the engine must outlive the wrapper
		        T_Engine& ph;
		        //! type returned by one engine invocation
		        using TResult = decltype(ph());
		        //! value reported by TResult::dim()
		        static constexpr auto dim = TResult::dim();
		        static_assert(TElems >= 2, "Engine result dimension must be >= 2, to be usable in UniformReal<double>");

		        /** @param eng engine to wrap */
		        constexpr explicit vectorDispatchWrapper(T_Engine& eng) : ph(eng)
		        {
		        }

		        /** @return `(element 0 << 32) | element 1` of a fresh engine result */
		        constexpr uint64_t operator()() const
		        {
		            auto lanes = ph();
		            auto const upperWord = static_cast<uint64_t>(lanes[0]) << 32;
		            auto const lowerWord = static_cast<uint64_t>(lanes[1]);
		            return upperWord | lowerWord;
		        }
		    };

		    /** Parameter storage shared by the uniform real distribution.
		     *
		     * Keeps the two bounds and their difference `max - min`. The difference is stored as computed,
		     * i.e. it is negative when `min > max`; no absolute value is taken.
		     *
		     * @tparam T_Floating floating-point type of the bounds and of the generated values
		     * @tparam T_Interval interval tag; only used as a type, the passed object is ignored
		     */
		    template<std::floating_point T_Floating, concepts::Interval T_Interval>
		    class UniformRealBase
		    {
		    public:
		        using result_type = T_Floating;

		        using Interval_type = T_Interval;

		        /** @param min first bound of the interval
		         *  @param max second bound of the interval
		         */
		        constexpr explicit UniformRealBase(T_Floating min, T_Floating max, [[maybe_unused]] T_Interval)
		            : m_min(min)
		            , m_max(max)
		            , m_range(max - min)
		        {
		        }

		    protected:
		        //! first bound as passed to the constructor
		        T_Floating const m_min;
		        //! second bound as passed to the constructor
		        T_Floating const m_max;
		        //! max - min, used as scaling factor for normalized values
		        T_Floating const m_range;
		    };
		} // namespace alpaka::rand::distribution::internal

		namespace alpaka::rand::distribution
		{
		    /** Select a floating-point value from a uniform interval.
		     *
		     * This generator produces floating-point values of type `T_Result` between the configured bounds
		     * `a = min` and `b = max`. Whether a bound itself may be returned is selected through the interval tag
		     * `T_Interval`: `interval::CO` -> [a, b) (default), `interval::OC` -> (a, b], `interval::OO` -> (a, b),
		     * `interval::CC` -> [a, b]. The interface mirrors `std::uniform_real_distribution`, and can be invoked
		     * with any engine adhering to the 'UniformRandomEngine' concept, which includes std uniform engines.
		     *
		     * **Supported result types:** floating-point types of 4 or 8 bytes (`float`, `double`)
		     * **Supported engine result widths:** 32-bit and 64-bit integers.
		     *
		     * @note This distribution is subject to a small non-uniform bias; see
		     *       **UniformReal::operator()** for details.
		     */
		    template<std::floating_point T_Result, concepts::Interval T_Interval = interval::CO>
		    struct UniformReal : internal::UniformRealBase<T_Result, T_Interval>
		    {
		        static_assert(sizeof(T_Result) == 4u || sizeof(T_Result) == 8u);

		        /** Compile-time check that the integer type delivered by an engine is 4 or 8 bytes wide. */
		        template<std::integral T_Value>
		        static consteval void checkValueConformity()
		        {
		            static_assert(sizeof(T_Value) == 4u || sizeof(T_Value) == 8u);
		        }

		        using Base = internal::UniformRealBase<T_Result, T_Interval>;

		        /** Distribution over the bounds 0 and 1.
		         *
		         * @param interval interval tag object, only forwarded to the base class
		         */
		        constexpr explicit UniformReal([[maybe_unused]] T_Interval interval = T_Interval{}) : Base(0, 1, interval)
		        {
		        }

		        /** Distribution over user defined bounds.
		         *
		         * @param min first bound
		         * @param max second bound
		         * @param interval interval tag object, only forwarded to the base class
		         */
		        constexpr UniformReal(T_Result min, T_Result max, [[maybe_unused]] T_Interval interval = T_Interval{})
		            : Base(min, max, interval)
		        {
		        }

		        /** Selects a value from a uniform distribution over the configured (min, max) interval,
		         * respecting the specified interval bounds.
		         *
		         * @param engine a random engine conforming to the `UniformRandomEngine` concept
		         * (currently accepts stdlib uniform engines and alpaka engines included in the alpaka::rand::engine
		         * namespace)
		         * @return a floating-point value sampled from the configured distribution.
		         *
		         * @note This distribution introduces a slight numerical bias due to floating-point
		         * rounding effects and the use of a **1 / MAX** integer-to-floating-point conversion method.
		         * @see Goualard, F. (2020). Generating Random Floating-Point Numbers by Dividing Integers: A Case Study.
		         * https://doi.org/10.1007/978-3-030-50417-5_2 or Allen B. Downey Generating Pseudo-random Floating-Point
		         * Values https://allendowney.com/research/rand/.
		         * Additionally, using an interval with an open bound
		         * (OC,CO,OO) may introduce yet another small non-uniform bias -- @see scaleInterval().
		         */
		        template<concepts::UniformRandomEngine T_Engine>
		        constexpr auto operator()(T_Engine& engine) const -> T_Result
		        {
		            return engineDispatch(engine);
		        }

		    private:
		        /** Dispatch for std engines and alpaka abbreviations conforming to the
		         * std::uniform_random_bit_generator concept (e.g. Philox4x32x10).
		         *
		         * The engine result type is validated, a normalized value is generated and finally scaled to
		         * the configured bounds.
		         */
		        template<concepts::UniformStdEngine T_Engine>
		        constexpr auto engineDispatch(T_Engine& engine) const -> T_Result
		        {
		            checkValueConformity<ALPAKA_TYPEOF(engine())>();
		            // @TODO potentially add underflow protection as suggested by https://doi.org/10.1145/3503512
		            return scaleInterval(internal::getNormalizedUniformReal<T_Interval, T_Result, T_Engine>(engine));
		        }

		        /** Dispatch for vector engines (uniform bit generators that return a vector, e.g.
		         * Philox4x32x10Vector) to enable efficient double precision uniform_real generation (reducing the
		         * number of invocations).
		         *
		         * The engine is wrapped into internal::vectorDispatchWrapper so that the scalar code path
		         * can be reused.
		         */
		        template<concepts::UniformVectorEngine T_Engine>
		        constexpr auto engineDispatch(T_Engine& engine) const -> T_Result
		        {
		            using T_EngineResult = ALPAKA_TYPEOF(engine());
		            using T_Element = typename T_EngineResult::type;
		            checkValueConformity<T_Element>();

		            constexpr auto dim = getDim(T_EngineResult{});
		            using T_Wrapper = internal::vectorDispatchWrapper<
		                T_Engine,
		                static_cast<uint32_t>(sizeof(T_Result)),
		                static_cast<uint32_t>(sizeof(T_Element)),
		                dim>;
		            T_Wrapper scalarView(engine);

		            return scaleInterval(internal::getNormalizedUniformReal<T_Interval, T_Result, T_Wrapper>(scalarView));
		        }

		        /** Scale a normalized value to the configured bounds and enforce open bounds.
		         *
		         * For open bounds, the normalized value in (0, 1) may (still) hit the lower or upper bound
		         * due to floating-point rounding (after scaling is applied). To enforce adherence to the requested
		         * interval, a result equal to an open bound is shifted to the next representable value towards the
		         * opposite bound using std::nextafter. The lower bound is handled before the upper bound.
		         *
		         * WARNING: std::nextafter is not constexpr in cpp20 -> this might cause problems on some devices
		         * (regarding missing __device__ annotations) and requires often unnecessary runtime evaluation --
		         * future maintainers should consider creating a constexpr adaptation of std::nextafter
		         *
		         * @note This introduces a(-/another) small non-uniform bias. The current implementation is inherently
		         *       non-uniform due to integer-to-floating-point mapping @see
		         *       https://doi.org/10.1007/978-3-030-50417-5_2
		         *
		         * @param normalizedVal value produced by internal::getNormalizedUniformReal
		         * @return `normalizedVal * range + min`, corrected for open bounds
		         */
		        constexpr auto scaleInterval(T_Result const& normalizedVal) const -> T_Result
		        {
		            constexpr bool lowerBoundOpen
		                = std::is_same_v<T_Interval, interval::OC> || std::is_same_v<T_Interval, interval::OO>;
		            constexpr bool upperBoundOpen
		                = std::is_same_v<T_Interval, interval::CO> || std::is_same_v<T_Interval, interval::OO>;

		            T_Result scaled = normalizedVal * this->m_range + this->m_min;

		            if constexpr(lowerBoundOpen)
		            {
		                if(scaled == this->m_min)
		                    scaled = std::nextafter(this->m_min, this->m_max);
		            }
		            if constexpr(upperBoundOpen)
		            {
		                if(scaled == this->m_max)
		                    scaled = std::nextafter(this->m_max, this->m_min);
		            }
		            return scaled;
		        }
		    };
		} // namespace alpaka::rand::distribution
		// ==
		// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/distribution/UniformReal.hpp ==
		// ============================================================================


	// #include <cmath>    // amalgamate: file already included
	#include <numbers>

	namespace alpaka::rand::distribution
	{
	    namespace internal
	    {
	        /**
	         * @brief Scalar Box–Muller transform for floating-point types.
	         *
	         * Generates standard normal samples from a uniform scalar engine. One transform draws two
	         * uniform values from the open interval (0, 1) and yields two normal values; the second one is
	         * cached and handed out by the next call without invoking the engine.
	         *
	         * The object carries state (cached value and flag), therefore operator() is not const.
	         *
	         * Ref.: G. E. P. Box and M. E. Muller, "A Note on the Generation of Random Normal Deviates,"
	         */
	        template<std::floating_point T_Fp>
	        struct BoxMuller
	        {
	            using result_type = T_Fp;

	            /** Return the next standard normal sample.
	             *
	             * @param rng engine satisfying concepts::UniformStdEngine; only invoked when no cached sample
	             *            is available
	             * @return sample of the standard normal distribution
	             */
	            template<concepts::UniformStdEngine T_Rng>
	            constexpr T_Fp operator()(T_Rng& rng)
	            {
	                // Re-use cached second sample
	                if(m_hasSecondRngNumber)
	                {
	                    m_hasSecondRngNumber = false;
	                    return m_secondRngNumber;
	                }
	                // Generate two uniform floats in (0,1); the open interval keeps log(u1) finite
	                constexpr auto uniformDist = UniformReal{T_Fp{0}, T_Fp{1}, interval::oo};

	                T_Fp u1 = uniformDist(rng);
	                T_Fp u2 = uniformDist(rng);

	                // Box-Muller transform
	                T_Fp r = math::sqrt(T_Fp{-2} * math::log(u1));
	                // std::numbers::pi_v is the standard constant; the M_PI macro is not part of ISO C++
	                // and is not defined by every standard library without extra feature macros.
	                T_Fp theta = T_Fp{2} * std::numbers::pi_v<T_Fp> * u2;
	                T_Fp z0 = r * math::cos(theta);
	                T_Fp z1 = r * math::sin(theta);

	                m_secondRngNumber = z1;
	                m_hasSecondRngNumber = true;
	                return z0;
	            }

	        private:
	            //! second sample of the last transform; only meaningful while m_hasSecondRngNumber is true
	            T_Fp m_secondRngNumber{};
	            //! true if m_secondRngNumber holds a sample that was not handed out yet
	            bool m_hasSecondRngNumber = false;
	        };
	    } // namespace internal

	    /**
	     * @brief Samples floating-point values from a normal (gaussian) distribution N(mean, stdDev).
	     *
	     * Usage is analogous to std::normal_distribution<T_Result> @see
	     * https://en.cppreference.com/w/cpp/numeric/random/normal_distribution.html
	     *
	     * Standard normal values are produced with the Box-Muller method from a uniform distribution and are
	     * then scaled by the standard deviation and shifted by the mean.
	     *
	     * @note: currently supports 32 and 64 bit floating point types.
	     */
	    template<std::floating_point T_Result>
	    struct NormalReal
	    {
	        using result_type = T_Result;

	        /**
	         * @brief Constructs a normal (gaussian) distribution with the given parameters.
	         *
	         * @param mean   Mean of the target normal distribution.
	         * @param stdDev Standard deviation of the target normal distribution.
	         *
	         * The default is N(0,1). The Box-Muller implementation keeps an internal state, therefore each
	         * thread/worker must use its own instance to avoid data races when the same object is accessed
	         * concurrently by multiple workers.
	         */
	        constexpr explicit NormalReal(T_Result mean = T_Result{0}, T_Result stdDev = T_Result{1})
	            : m_mean(mean)
	            , m_stdDev(stdDev)
	        {
	        }

	        /** Selects a value from the normal distribution for the configured (mean, stdDev) settings.
	         *
	         * @param engine a random engine satisfying concepts::UniformStdEngine
	         * @return a floating-point value sampled from the configured distribution.
	         */
	        template<concepts::UniformStdEngine T_Engine>
	        constexpr result_type operator()(T_Engine& engine)
	        {
	            result_type const standardNormal = m_impl(engine);
	            return standardNormal * m_stdDev + m_mean;
	        }

	    private:
	        //! sampling method currently in use
	        using T_Impl = internal::BoxMuller<T_Result>;
	        //! Box-Muller is stateful, so the instance must live as a member of the distribution
	        T_Impl m_impl;
	        //! shift applied to every standard normal sample
	        result_type const m_mean;
	        //! scale applied to every standard normal sample
	        result_type const m_stdDev;
	    };

	} // namespace alpaka::rand::distribution
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/rand/distribution/NormalReal.hpp ==
	// ============================================================================

// #include "alpaka/rand/distribution/UniformReal.hpp"    // amalgamate: file already inlined
// #include "alpaka/rand/engine/philox/philox.hpp"    // amalgamate: file already inlined
	// ============================================================================
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/math.hpp ==
	// ==
	/* Copyright 2025 René Widera
	 * SPDX-License-Identifier: MPL-2.0
	 */

	/** @file This file provides a basic implementation of a SIMD vector.
	 *
	 * The implementation is based on the class Vec:
	 *   - the storage policy should become the native SIMD implementation e.g. std::simd
	 *   - load/store and SIMD specifics should be implemented in the storage policy
	 *   - the name of the storage policy should be changed
	 *
	 *   The current operator operations rely on the compiler's auto-vectorization.
	 */

	// #pragma once
	// #include "alpaka/Simd.hpp"    // amalgamate: file already inlined
	// #include "alpaka/math/internal/math.hpp"    // amalgamate: file already inlined
	// #include "alpaka/simd/concepts.hpp"    // amalgamate: file already inlined

	// #include <concepts>    // amalgamate: file already included
	#include <type_traits>

	namespace alpaka::math::internal
	{

	    /** Specialize a unary math function for SIMD types.
	     *
	     * The macro expands to a partial specialization `className::Op<T_MathImpl, T_Arg>` for every `T_Arg`
	     * satisfying alpaka::concepts::Simd. Inside the call operator `std::funcName` is made visible with a
	     * using-declaration and a requires-expression checks whether `funcName(arg.asNativeType())` is a valid call:
	     *   - if it is, that call is evaluated once for the native type of the SIMD pack and wrapped into `T_Arg`;
	     *   - otherwise the alpaka math functor `className::Op` for the lane type is called for each of the
	     *     `T_Arg::width()` SIMD lanes and the results are written lane by lane into a value-initialized pack.
	     *
	     * The expansion does not end with a semicolon, the macro invocation has to provide it.
	     *
	     * @param className class name of the math trait within alpaka
	     * @param funcName math function name within STL
	     * @return SIMD pack (always of type `T_Arg`) with the result of the math function for each lane
	     */
	#define ALPAKA_SIMD_MATH_UNARY_OP(className, funcName)                                                                \
	    template<typename T_MathImpl, alpaka::concepts::Simd T_Arg>                                                       \
	    struct className::Op<T_MathImpl, T_Arg>                                                                           \
	    {                                                                                                                 \
	        constexpr auto operator()(T_MathImpl mathImpl, T_Arg const& arg) const -> T_Arg                               \
	        {                                                                                                             \
	            using std::funcName;                                                                                      \
	            if constexpr(requires { funcName(arg.asNativeType()); })                                                  \
	            {                                                                                                         \
	                return T_Arg{funcName(arg.asNativeType())};                                                           \
	            }                                                                                                         \
	            else                                                                                                      \
	            {                                                                                                         \
	                T_Arg ret{};                                                                                          \
	                for(uint32_t i = 0u; i < T_Arg::width(); i++)                                                         \
	                    ret[i] = className::Op<T_MathImpl, ALPAKA_TYPEOF(arg[i])>{}(mathImpl, arg[i]);                    \
	                return ret;                                                                                           \
	            }                                                                                                         \
	        }                                                                                                             \
	    }

	    // SIMD specializations of the unary math traits, grouped by function family.
	    // Each line generates one independent partial specialization, so the order is irrelevant.

	    // trigonometric functions and their inverses
	    ALPAKA_SIMD_MATH_UNARY_OP(Sin, sin);
	    ALPAKA_SIMD_MATH_UNARY_OP(Cos, cos);
	    ALPAKA_SIMD_MATH_UNARY_OP(Tan, tan);
	    ALPAKA_SIMD_MATH_UNARY_OP(Asin, asin);
	    ALPAKA_SIMD_MATH_UNARY_OP(Acos, acos);
	    ALPAKA_SIMD_MATH_UNARY_OP(Atan, atan);

	    // hyperbolic functions and their inverses
	    ALPAKA_SIMD_MATH_UNARY_OP(Sinh, sinh);
	    ALPAKA_SIMD_MATH_UNARY_OP(Cosh, cosh);
	    ALPAKA_SIMD_MATH_UNARY_OP(Tanh, tanh);
	    ALPAKA_SIMD_MATH_UNARY_OP(Asinh, asinh);
	    ALPAKA_SIMD_MATH_UNARY_OP(Acosh, acosh);
	    ALPAKA_SIMD_MATH_UNARY_OP(Atanh, atanh);

	    // exponential, logarithm, roots and error function
	    ALPAKA_SIMD_MATH_UNARY_OP(Exp, exp);
	    ALPAKA_SIMD_MATH_UNARY_OP(Log, log);
	    ALPAKA_SIMD_MATH_UNARY_OP(Log2, log2);
	    ALPAKA_SIMD_MATH_UNARY_OP(Log10, log10);
	    ALPAKA_SIMD_MATH_UNARY_OP(Sqrt, sqrt);
	    ALPAKA_SIMD_MATH_UNARY_OP(Cbrt, cbrt);
	    ALPAKA_SIMD_MATH_UNARY_OP(Erf, erf);

	    // absolute value and rounding
	    ALPAKA_SIMD_MATH_UNARY_OP(Abs, abs);
	    ALPAKA_SIMD_MATH_UNARY_OP(Ceil, ceil);
	    ALPAKA_SIMD_MATH_UNARY_OP(Floor, floor);
	    ALPAKA_SIMD_MATH_UNARY_OP(Trunc, trunc);
	    ALPAKA_SIMD_MATH_UNARY_OP(Round, round);
	    ALPAKA_SIMD_MATH_UNARY_OP(Lround, lround);
	    ALPAKA_SIMD_MATH_UNARY_OP(Llround, llround);

	    // classification
	    ALPAKA_SIMD_MATH_UNARY_OP(Isinf, isinf);
	    ALPAKA_SIMD_MATH_UNARY_OP(Isfinite, isfinite);
	    ALPAKA_SIMD_MATH_UNARY_OP(Isnan, isnan);

	// the helper macro is an implementation detail of this file and must not leak to users
	#undef ALPAKA_SIMD_MATH_UNARY_OP
	} // namespace alpaka::math::internal
	// ==
	// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/simd/math.hpp ==
	// ============================================================================

// #include "alpaka/simd/simdized.hpp"    // amalgamate: file already inlined
// #include "alpaka/tag.hpp"    // amalgamate: file already inlined
// #include "alpaka/utility.hpp"    // amalgamate: file already inlined

/** Main alpaka namespace.
 *
 * - namespace onHost::* contains all functionality which is usable on the host CPU controller thread.
 * - namespace onAcc::* contains all functionality which is usable on the accelerator compute device from within a
 *   kernel.
 * - namespace alpaka contains all functionality which is generic and can be used from within the host controller
 *   thread and within compute device kernels.
 *
 * The namespace is intentionally empty here; it is opened only to attach this documentation.
 */
namespace alpaka
{
}
// ==
// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/alpaka.hpp ==
// ============================================================================

// ============================================================================
// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/example/executors.hpp ==
// ==
/* Copyright 2024 René Widera, Mehmet Yusufoglu
 * SPDX-License-Identifier: MPL-2.0
 */

// #pragma once
// #include "alpaka/executor.hpp"    // amalgamate: file already inlined

namespace alpaka::onHost::example
{
    /** List of enabled executors.
     *
     * Copy of alpaka::exec::enabledExecutors, kept only for backward compatibility.
     *
     * Declared `inline` because this definition lives in a header: a plain namespace-scope `constexpr`
     * variable has internal linkage, i.e. every translation unit would get its own object.
     *
     * @deprecated The variable will be removed before the release 3.0.0, please use alpaka::exec::enabledExecutors
     *
     * @see exec::enabledExecutors
     */
    [[deprecated(
        "This variable will be removed before the release 3.0.0, please use alpaka::exec::enabledExecutors "
        "instead!")]] inline constexpr auto enabledExecutors
        = alpaka::exec::enabledExecutors;
} // namespace alpaka::onHost::example
// ==
// == /home/docs/checkouts/readthedocs.org/user_builds/alpaka3/checkouts/latest/include/alpaka/onHost/example/executors.hpp ==
// ============================================================================