summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorrenjithravindrankannath <94420380+renjithravindrankannath@users.noreply.github.com>2024-01-22 10:19:28 -0800
committerGitHub <noreply@github.com>2024-01-22 10:19:28 -0800
commitc673979feeaadcf03fc8803e2261809c40df8362 (patch)
treef496d602a3bb56d9648db4755a8f7096cc41bb05
parent7acd5bdc7f0fa646cf4ac1dd7acf7c85d62e3193 (diff)
downloadspack-c673979feeaadcf03fc8803e2261809c40df8362.tar.gz
spack-c673979feeaadcf03fc8803e2261809c40df8362.tar.bz2
spack-c673979feeaadcf03fc8803e2261809c40df8362.tar.xz
spack-c673979feeaadcf03fc8803e2261809c40df8362.zip
Bump up the version for ROCm-6.0.0 (#42026)
* Bump up the version for ROCm-6.0.0 * Adding patch files * Style check failure fix * Style check fixes * Style check error fixes * Patch to remove hipblas client file installation in 6.0 * Patch needs to be applied on all 5.7 releases * 6.0 update for math libs and other packages, new GitHub URL, etc. * Correct package-audit failures * Correcting shasum for rocfft patch and limiting patch in rocblas * Reverting updates in rocprofiler-dev due to ci-gitlab failure * Fixes for ci-gitlab failure due to disabling hip backward compatibility * Adding patch file to Change HIP_PLATFORM from HCC to AMD and NVCC to NVIDIA * Use the gcnArchName in place of gcnArch as gcnArch is deprecated from rocm-6.0.0 * Patches to fix magma and blaspp build error with rocm 6.0.0 * Patch for mfem and arborx for rocm 6.0 * Style check error fix * Correcting style check errors * Updating dependent version * Update for petsc to build with rocm 6.0; needs reverting-operator-mixup-fix-for-slate.patch for rocm 6.0 * Reverting the change in url for 2.7.4-rocm-enhanced * hip-tensor 6.0.0 update
-rw-r--r--var/spack/repos/builtin/packages/amdsmi/package.py4
-rw-r--r--var/spack/repos/builtin/packages/aomp/package.py6
-rw-r--r--var/spack/repos/builtin/packages/arborx/0001-update-major-version-required-for-rocm-6.0.patch24
-rw-r--r--var/spack/repos/builtin/packages/arborx/package.py1
-rw-r--r--var/spack/repos/builtin/packages/atmi/package.py6
-rw-r--r--var/spack/repos/builtin/packages/aws-ofi-rccl/package.py6
-rw-r--r--var/spack/repos/builtin/packages/babelstream/package.py2
-rw-r--r--var/spack/repos/builtin/packages/blaspp/0001-fix-blaspp-build-error-with-rocm-6.0.0.patch50
-rw-r--r--var/spack/repos/builtin/packages/blaspp/package.py7
-rw-r--r--var/spack/repos/builtin/packages/comgr/package.py10
-rw-r--r--var/spack/repos/builtin/packages/composable-kernel/package.py20
-rw-r--r--var/spack/repos/builtin/packages/heffte/package.py2
-rw-r--r--var/spack/repos/builtin/packages/hip-examples/package.py6
-rw-r--r--var/spack/repos/builtin/packages/hip-rocclr/package.py16
-rw-r--r--var/spack/repos/builtin/packages/hip-tensor/package.py3
-rw-r--r--var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.6.0.patch61
-rw-r--r--var/spack/repos/builtin/packages/hip/0018-reverting-hipMemoryType-with-memoryType.patch17
-rw-r--r--var/spack/repos/builtin/packages/hip/package.py69
-rw-r--r--var/spack/repos/builtin/packages/hipblas/package.py16
-rw-r--r--var/spack/repos/builtin/packages/hipblas/remove-hipblas-clients-file-installation-6.0.patch32
-rw-r--r--var/spack/repos/builtin/packages/hipcub/package.py8
-rw-r--r--var/spack/repos/builtin/packages/hipfft/001-remove-submodule-and-sync-shared-files-from-rocFFT.patch11431
-rw-r--r--var/spack/repos/builtin/packages/hipfft/package.py10
-rw-r--r--var/spack/repos/builtin/packages/hipfort/package.py8
-rw-r--r--var/spack/repos/builtin/packages/hipify-clang/package.py10
-rw-r--r--var/spack/repos/builtin/packages/hiprand/package.py8
-rw-r--r--var/spack/repos/builtin/packages/hipsolver/package.py8
-rw-r--r--var/spack/repos/builtin/packages/hipsparse/package.py8
-rw-r--r--var/spack/repos/builtin/packages/hsa-rocr-dev/package.py13
-rw-r--r--var/spack/repos/builtin/packages/hsakmt-roct/package.py11
-rw-r--r--var/spack/repos/builtin/packages/legion/package.py5
-rw-r--r--var/spack/repos/builtin/packages/legion/update-hip-path-legion-23.06.0.patch13
-rw-r--r--var/spack/repos/builtin/packages/llvm-amdgpu/package.py28
-rw-r--r--var/spack/repos/builtin/packages/magma/0001-fix-magma-build-error-with-rocm-6.0.0.patch99
-rw-r--r--var/spack/repos/builtin/packages/magma/package.py3
-rw-r--r--var/spack/repos/builtin/packages/mfem/mfem-hip.patch24
-rw-r--r--var/spack/repos/builtin/packages/mfem/package.py2
-rw-r--r--var/spack/repos/builtin/packages/migraphx/package.py12
-rw-r--r--var/spack/repos/builtin/packages/miopen-hip/package.py12
-rw-r--r--var/spack/repos/builtin/packages/miopen-opencl/package.py6
-rw-r--r--var/spack/repos/builtin/packages/miopen-tensile/package.py8
-rw-r--r--var/spack/repos/builtin/packages/miopengemm/package.py10
-rw-r--r--var/spack/repos/builtin/packages/mivisionx/package.py7
-rw-r--r--var/spack/repos/builtin/packages/mlirmiopen/package.py6
-rw-r--r--var/spack/repos/builtin/packages/petsc/Handle-hipsparse-api-changes-for-rocm-6.0.patch70
-rw-r--r--var/spack/repos/builtin/packages/petsc/package.py5
-rw-r--r--var/spack/repos/builtin/packages/raja/package.py8
-rw-r--r--var/spack/repos/builtin/packages/rccl-tests/package.py6
-rw-r--r--var/spack/repos/builtin/packages/rccl/0004-Set-rocm-core-path-for-version-file.patch13
-rw-r--r--var/spack/repos/builtin/packages/rccl/package.py15
-rw-r--r--var/spack/repos/builtin/packages/rdc/package.py13
-rw-r--r--var/spack/repos/builtin/packages/rocalution/package.py8
-rw-r--r--var/spack/repos/builtin/packages/rocblas/package.py27
-rw-r--r--var/spack/repos/builtin/packages/rocfft/package.py19
-rw-r--r--var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py10
-rw-r--r--var/spack/repos/builtin/packages/rocm-clang-ocl/package.py10
-rw-r--r--var/spack/repos/builtin/packages/rocm-cmake/package.py9
-rw-r--r--var/spack/repos/builtin/packages/rocm-core/package.py5
-rw-r--r--var/spack/repos/builtin/packages/rocm-dbgapi/package.py10
-rw-r--r--var/spack/repos/builtin/packages/rocm-debug-agent/package.py15
-rw-r--r--var/spack/repos/builtin/packages/rocm-device-libs/package.py10
-rw-r--r--var/spack/repos/builtin/packages/rocm-gdb/package.py10
-rw-r--r--var/spack/repos/builtin/packages/rocm-opencl/package.py8
-rw-r--r--var/spack/repos/builtin/packages/rocm-openmp-extras/package.py26
-rw-r--r--var/spack/repos/builtin/packages/rocm-smi-lib/package.py9
-rw-r--r--var/spack/repos/builtin/packages/rocm-smi/package.py6
-rw-r--r--var/spack/repos/builtin/packages/rocm-tensile/package.py12
-rw-r--r--var/spack/repos/builtin/packages/rocm-validation-suite/009-replacing-rocm-path-with-package-path.patch636
-rw-r--r--var/spack/repos/builtin/packages/rocm-validation-suite/package.py11
-rw-r--r--var/spack/repos/builtin/packages/rocminfo/package.py10
-rw-r--r--var/spack/repos/builtin/packages/rocmlir/package.py6
-rw-r--r--var/spack/repos/builtin/packages/rocprim/package.py8
-rw-r--r--var/spack/repos/builtin/packages/rocprofiler-dev/package.py8
-rw-r--r--var/spack/repos/builtin/packages/rocrand/package.py14
-rw-r--r--var/spack/repos/builtin/packages/rocsolver/package.py12
-rw-r--r--var/spack/repos/builtin/packages/rocsparse/package.py9
-rw-r--r--var/spack/repos/builtin/packages/rocthrust/package.py8
-rw-r--r--var/spack/repos/builtin/packages/roctracer-dev-api/package.py7
-rw-r--r--var/spack/repos/builtin/packages/roctracer-dev/package.py11
-rw-r--r--var/spack/repos/builtin/packages/rocwmma/package.py10
-rw-r--r--var/spack/repos/builtin/packages/rpp/0003-include-half-through-spack-package.patch61
-rw-r--r--var/spack/repos/builtin/packages/rpp/package.py10
-rw-r--r--var/spack/repos/builtin/packages/sundials/Change-HIP_PLATFORM-from-HCC-to-AMD-and-NVCC-to-NVIDIA.patch28
-rw-r--r--var/spack/repos/builtin/packages/sundials/package.py4
-rw-r--r--var/spack/repos/builtin/packages/trilinos/0001-use-the-gcnArchName-inplace-of-gcnArch-as-gcnArch-is.patch26
-rw-r--r--var/spack/repos/builtin/packages/trilinos/package.py5
86 files changed, 13036 insertions, 280 deletions
diff --git a/var/spack/repos/builtin/packages/amdsmi/package.py b/var/spack/repos/builtin/packages/amdsmi/package.py
index ecd2ca1f1d..e7543fdb8b 100644
--- a/var/spack/repos/builtin/packages/amdsmi/package.py
+++ b/var/spack/repos/builtin/packages/amdsmi/package.py
@@ -12,8 +12,8 @@ class Amdsmi(CMakePackage):
is a C library for Linux that provides a user space interface for
applications to monitor and control AMD device."""
- homepage = "https://github.com/RadeonOpenCompute/amdsmi"
- url = "https://github.com/RadeonOpenCompute/amdsmi/archive/refs/tags/rocm-5.6.0.tar.gz"
+ homepage = "https://github.com/ROCm/amdsmi"
+ url = "https://github.com/ROCm/amdsmi/archive/refs/tags/rocm-5.6.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
diff --git a/var/spack/repos/builtin/packages/aomp/package.py b/var/spack/repos/builtin/packages/aomp/package.py
index e32dc705e3..6a9603a9de 100644
--- a/var/spack/repos/builtin/packages/aomp/package.py
+++ b/var/spack/repos/builtin/packages/aomp/package.py
@@ -7,8 +7,8 @@ import re
from spack.package import *
-tools_url = "https://github.com/ROCm-Developer-Tools"
-compute_url = "https://github.com/RadeonOpenCompute"
+tools_url = "https://github.com/ROCm"
+compute_url = "https://github.com/ROCm"
aomp = [
@@ -368,7 +368,7 @@ class Aomp(Package):
"-DCMAKE_C_COMPILER={0}".format(self.compiler.cc),
"-DCMAKE_CXX_COMPILER={0}".format(self.compiler.cxx),
"-DCMAKE_ASM_COMPILER={0}".format(self.compiler.cc),
- "-DBUG_REPORT_URL=https://github.com/ROCm-Developer-Tools/aomp",
+ "-DBUG_REPORT_URL=https://github.com/ROCm/aomp",
"-DLLVM_ENABLE_BINDINGS=OFF",
"-DLLVM_INCLUDE_BENCHMARKS=OFF",
"-DLLVM_BUILD_TESTS=OFF",
diff --git a/var/spack/repos/builtin/packages/arborx/0001-update-major-version-required-for-rocm-6.0.patch b/var/spack/repos/builtin/packages/arborx/0001-update-major-version-required-for-rocm-6.0.patch
new file mode 100644
index 0000000000..009a40f984
--- /dev/null
+++ b/var/spack/repos/builtin/packages/arborx/0001-update-major-version-required-for-rocm-6.0.patch
@@ -0,0 +1,24 @@
+From a31d3766f5a7a3a3e20d5bc0c315ad6295a82298 Mon Sep 17 00:00:00 2001
+From: Afzal Patel <afzal.patel@amd.com>
+Date: Wed, 17 Jan 2024 11:50:18 -0800
+Subject: [PATCH] Changed required version of rocthrust to 3 for rocm 6.0
+
+---
+ CMakeLists.txt | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 8c3c99a..1af6d13 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -22,7 +22,7 @@ if(Kokkos_ENABLE_HIP AND ARBORX_ENABLE_ROCTHRUST)
+ # Require at least rocThrust-2.10.5 (that comes with ROCm 3.9) because
+ # rocPRIM dependency is not set properly in exported configuration for
+ # earlier versions
+- find_package(rocthrust 2.10.5 REQUIRED CONFIG)
++ find_package(rocthrust 3 REQUIRED CONFIG)
+ target_link_libraries(ArborX INTERFACE roc::rocthrust)
+ endif()
+
+--
+2.25.1
diff --git a/var/spack/repos/builtin/packages/arborx/package.py b/var/spack/repos/builtin/packages/arborx/package.py
index 6eb003252c..1414a22d7a 100644
--- a/var/spack/repos/builtin/packages/arborx/package.py
+++ b/var/spack/repos/builtin/packages/arborx/package.py
@@ -96,6 +96,7 @@ class Arborx(CMakePackage, CudaPackage, ROCmPackage):
depends_on("trilinos@13.4.0:", when="@1.3+trilinos")
depends_on("trilinos@14.0.0:", when="@1.4:+trilinos")
patch("trilinos14.0-kokkos-major-version.patch", when="@1.4+trilinos ^trilinos@14.0.0")
+ patch("0001-update-major-version-required-for-rocm-6.0.patch", when="+rocm ^hip@6.0:")
conflicts("~serial", when="+trilinos")
conflicts("+cuda", when="+trilinos")
diff --git a/var/spack/repos/builtin/packages/atmi/package.py b/var/spack/repos/builtin/packages/atmi/package.py
index 98fc5999f5..96c588174f 100644
--- a/var/spack/repos/builtin/packages/atmi/package.py
+++ b/var/spack/repos/builtin/packages/atmi/package.py
@@ -13,9 +13,9 @@ class Atmi(CMakePackage):
consistent, declarative API to create task graphs on CPUs and GPUs
(integrated and discrete)."""
- homepage = "https://github.com/RadeonOpenCompute/atmi"
- git = "https://github.com/RadeonOpenCompute/atmi.git"
- url = "https://github.com/RadeonOpenCompute/atmi/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/atmi"
+ git = "https://github.com/ROCm/atmi.git"
+ url = "https://github.com/ROCm/atmi/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
license("MIT")
diff --git a/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py b/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py
index 28045fd8ef..f831c88537 100644
--- a/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py
+++ b/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py
@@ -11,9 +11,9 @@ class AwsOfiRccl(AutotoolsPackage):
libfabric as a network provider while running AMD's RCCL based
applications."""
- homepage = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl"
- git = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git"
- url = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git"
+ homepage = "https://github.com/ROCm/aws-ofi-rccl"
+ git = "https://github.com/ROCm/aws-ofi-rccl.git"
+ url = "https://github.com/ROCm/aws-ofi-rccl.git"
tags = ["rocm"]
maintainers("bvanessen")
diff --git a/var/spack/repos/builtin/packages/babelstream/package.py b/var/spack/repos/builtin/packages/babelstream/package.py
index 0d09e2f5d1..4b2a1c5857 100644
--- a/var/spack/repos/builtin/packages/babelstream/package.py
+++ b/var/spack/repos/builtin/packages/babelstream/package.py
@@ -157,7 +157,7 @@ class Babelstream(CMakePackage, CudaPackage, ROCmPackage):
when="+thrust",
msg="Which Thrust implementation to use, supported options include:\
- CUDA (via https://github.com/NVIDIA/thrust)\
- - ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust)",
+ - ROCM (via https://github.com/ROCm/rocThrust)",
)
# This applies to all
diff --git a/var/spack/repos/builtin/packages/blaspp/0001-fix-blaspp-build-error-with-rocm-6.0.0.patch b/var/spack/repos/builtin/packages/blaspp/0001-fix-blaspp-build-error-with-rocm-6.0.0.patch
new file mode 100644
index 0000000000..3ce15f0859
--- /dev/null
+++ b/var/spack/repos/builtin/packages/blaspp/0001-fix-blaspp-build-error-with-rocm-6.0.0.patch
@@ -0,0 +1,50 @@
+From a75f399bfa77680e7736d126ef3e5a520e1a1702 Mon Sep 17 00:00:00 2001
+From: sreenivasa murthy kolam <sreenivasamurthy.kolam@amd.com>
+Date: Wed, 17 Jan 2024 12:55:06 +0000
+Subject: [PATCH] fix build error with rocm-6.0.0 by adding extra parameters
+ for rocblas function calls rocblas_ztrmm() ,rocblas_strmm(),
+ rocblas_ctrmm(),rocblas_dtrmm()
+
+---
+ src/rocblas_wrappers.cc | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/src/rocblas_wrappers.cc b/src/rocblas_wrappers.cc
+index 0e01a95..44ab150 100644
+--- a/src/rocblas_wrappers.cc
++++ b/src/rocblas_wrappers.cc
+@@ -667,6 +667,7 @@ void trmm(
+ m, n,
+ &alpha,
+ dA, ldda,
++ dB, lddb,
+ dB, lddb ) );
+ }
+
+@@ -686,6 +687,7 @@ void trmm(
+ m, n,
+ &alpha,
+ dA, ldda,
++ dB, lddb,
+ dB, lddb ) );
+ }
+
+@@ -705,6 +707,7 @@ void trmm(
+ m, n,
+ (rocblas_float_complex*) &alpha,
+ (rocblas_float_complex*) dA, ldda,
++ (rocblas_float_complex*) dB, lddb,
+ (rocblas_float_complex*) dB, lddb ) );
+ }
+
+@@ -724,6 +727,7 @@ void trmm(
+ m, n,
+ (rocblas_double_complex*) &alpha,
+ (rocblas_double_complex*) dA, ldda,
++ (rocblas_double_complex*) dB, lddb,
+ (rocblas_double_complex*) dB, lddb ) );
+ }
+
+--
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/blaspp/package.py b/var/spack/repos/builtin/packages/blaspp/package.py
index e0de779540..78a2fce1d6 100644
--- a/var/spack/repos/builtin/packages/blaspp/package.py
+++ b/var/spack/repos/builtin/packages/blaspp/package.py
@@ -22,6 +22,9 @@ class Blaspp(CMakePackage, CudaPackage, ROCmPackage):
version("master", branch="master")
version(
+ "2023.11.05", sha256="62dfc03ec07c0826e0466dc2c204b460caa929d53ad4f050cb132d92670be7ce"
+ )
+ version(
"2023.08.25", sha256="1d9c7227a6d8776944aa866592142b7b51c6e4ba5529d168eb8ae2b329c47401"
)
version(
@@ -76,6 +79,10 @@ class Blaspp(CMakePackage, CudaPackage, ROCmPackage):
requires("%oneapi", when="+sycl", msg="blaspp+sycl must be compiled with %oneapi")
+ patch(
+ "0001-fix-blaspp-build-error-with-rocm-6.0.0.patch", when="@2023.06.00: ^hip@6.0.0 +rocm"
+ )
+
def cmake_args(self):
spec = self.spec
backend_config = "-Duse_cuda=%s" % ("+cuda" in spec)
diff --git a/var/spack/repos/builtin/packages/comgr/package.py b/var/spack/repos/builtin/packages/comgr/package.py
index f713ccba6d..f8bbd4e526 100644
--- a/var/spack/repos/builtin/packages/comgr/package.py
+++ b/var/spack/repos/builtin/packages/comgr/package.py
@@ -12,9 +12,9 @@ class Comgr(CMakePackage):
"""This provides various Lightning Compiler related services. It currently
contains one library, the Code Object Manager (Comgr)"""
- homepage = "https://github.com/RadeonOpenCompute/ROCm-CompilerSupport"
- git = "https://github.com/RadeonOpenCompute/ROCm-CompilerSupport.git"
- url = "https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/ROCm-CompilerSupport"
+ git = "https://github.com/ROCm/ROCm-CompilerSupport.git"
+ url = "https://github.com/ROCm/ROCm-CompilerSupport/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath", "haampie")
@@ -23,6 +23,7 @@ class Comgr(CMakePackage):
license("NCSA")
version("master", branch="amd-stg-open")
+ version("6.0.0", sha256="04353d27a512642a5e5339532a39d0aabe44e0964985de37b150a2550385800a")
version("5.7.1", sha256="3b9433b4a0527167c3e9dfc37a3c54e0550744b8d4a8e1be298c8d4bcedfee7c")
version("5.7.0", sha256="e234bcb93d602377cfaaacb59aeac5796edcd842a618162867b7e670c3a2c42c")
version("5.6.1", sha256="0a85d84619f98be26ca7a32c71f94ed3c4e9866133789eabb451be64ce739300")
@@ -152,6 +153,7 @@ class Comgr(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
"master",
]:
# llvm libs are linked statically, so this *could* be a build dep
@@ -163,7 +165,7 @@ class Comgr(CMakePackage):
"rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver)
)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
root_cmakelists_dir = join_path("lib", "comgr")
diff --git a/var/spack/repos/builtin/packages/composable-kernel/package.py b/var/spack/repos/builtin/packages/composable-kernel/package.py
index afbb86f01f..10bdf7183c 100644
--- a/var/spack/repos/builtin/packages/composable-kernel/package.py
+++ b/var/spack/repos/builtin/packages/composable-kernel/package.py
@@ -11,14 +11,15 @@ class ComposableKernel(CMakePackage):
"""Composable Kernel: Performance Portable Programming Model
for Machine Learning Tensor Operators."""
- homepage = "https://github.com/ROCmSoftwarePlatform/composable_kernel"
- git = "https://github.com/ROCmSoftwarePlatform/composable_kernel.git"
- url = "https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/refs/tags/rocm-5.7.1.tar.gz"
+ homepage = "https://github.com/ROCm/composable_kernel"
+ git = "https://github.com/ROCm/composable_kernel.git"
+ url = "https://github.com/ROCm/composable_kernel/archive/refs/tags/rocm-5.7.1.tar.gz"
maintainers("srekolam", "afzpatel")
license("MIT")
version("master", branch="develop")
+ version("6.0.0", sha256="a8f736f2f2a8afa4cddd06301205be27774d85f545429049b4a2bbbe6fcd67df")
version("5.7.1", sha256="75f66e023c2e31948e91fa26366eaeac72d871fc2e5188361d4465179f13876e")
version("5.7.0", sha256="d9624dbaef04e0138f9f73596c49b4fe9ded69974bae7236354baa32649bf21a")
version("5.6.1", commit="f5ec04f091fa5c48c67d7bacec36a414d0be06a5")
@@ -46,7 +47,18 @@ class ComposableKernel(CMakePackage):
depends_on("pkgconfig", type="build")
depends_on("cmake@3.16:", type="build")
- for ver in ["master", "5.7.1", "5.7.0", "5.6.1", "5.6.0", "5.5.1", "5.5.0", "5.4.3", "5.4.0"]:
+ for ver in [
+ "master",
+ "6.0.0",
+ "5.7.1",
+ "5.7.0",
+ "5.6.1",
+ "5.6.0",
+ "5.5.1",
+ "5.5.0",
+ "5.4.3",
+ "5.4.0",
+ ]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("llvm-amdgpu@" + ver, when="@" + ver)
depends_on("rocm-cmake@" + ver, when="@" + ver, type="build")
diff --git a/var/spack/repos/builtin/packages/heffte/package.py b/var/spack/repos/builtin/packages/heffte/package.py
index 228e813973..1472116be9 100644
--- a/var/spack/repos/builtin/packages/heffte/package.py
+++ b/var/spack/repos/builtin/packages/heffte/package.py
@@ -114,7 +114,7 @@ class Heffte(CMakePackage, CudaPackage, ROCmPackage):
if "none" not in rocm_arch:
args.append("-DCMAKE_CXX_FLAGS={0}".format(self.hip_flags(rocm_arch)))
- # See https://github.com/ROCmSoftwarePlatform/rocFFT/issues/322
+ # See https://github.com/ROCm/rocFFT/issues/322
if self.spec.satisfies("^cmake@3.21.0:3.21.2"):
args.append(self.define("__skip_rocmclang", "ON"))
diff --git a/var/spack/repos/builtin/packages/hip-examples/package.py b/var/spack/repos/builtin/packages/hip-examples/package.py
index c2e8aaa97e..22f5705389 100644
--- a/var/spack/repos/builtin/packages/hip-examples/package.py
+++ b/var/spack/repos/builtin/packages/hip-examples/package.py
@@ -11,9 +11,9 @@ from spack.package import *
class HipExamples(Package):
"""Examples for HIP"""
- homepage = "https://github.com/ROCm-Developer-Tools/HIP-Examples/"
- git = "https://github.com/ROCm-Developer-Tools/HIP-Examples.git"
- url = "https://github.com/ROCm-Developer-Tools/HIP-Examples/archive/rocm-5.4.3.tar.gz"
+ homepage = "https://github.com/ROCm/HIP-Examples/"
+ git = "https://github.com/ROCm/HIP-Examples.git"
+ url = "https://github.com/ROCm/HIP-Examples/archive/rocm-5.4.3.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath", "afzpatel")
diff --git a/var/spack/repos/builtin/packages/hip-rocclr/package.py b/var/spack/repos/builtin/packages/hip-rocclr/package.py
index 2ae9e375e0..22c1232e2c 100644
--- a/var/spack/repos/builtin/packages/hip-rocclr/package.py
+++ b/var/spack/repos/builtin/packages/hip-rocclr/package.py
@@ -12,8 +12,8 @@ class HipRocclr(CMakePackage):
with to different backends such as ROCr or PAL This abstraction allows
runtimes to work on Windows as well as on Linux without much effort."""
- homepage = "https://github.com/ROCm-Developer-Tools/ROCclr"
- git = "https://github.com/ROCm-Developer-Tools/ROCclr.git"
+ homepage = "https://github.com/ROCm/ROCclr"
+ git = "https://github.com/ROCm/ROCclr.git"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
@@ -21,9 +21,9 @@ class HipRocclr(CMakePackage):
def url_for_version(self, version):
# Fix up a typo in the 3.5.0 release.
if version == Version("3.5.0"):
- return "https://github.com/ROCm-Developer-Tools/ROCclr/archive/roc-3.5.0.tar.gz"
+ return "https://github.com/ROCm/ROCclr/archive/roc-3.5.0.tar.gz"
- url = "https://github.com/ROCm-Developer-Tools/ROCclr/archive/rocm-{0}.tar.gz"
+ url = "https://github.com/ROCm/ROCclr/archive/rocm-{0}.tar.gz"
return url.format(version)
license("MIT")
@@ -152,13 +152,13 @@ class HipRocclr(CMakePackage):
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
depends_on("comgr@" + ver, when="@" + ver)
- # See: https://github.com/ROCm-Developer-Tools/ROCclr/pull/16
+ # See: https://github.com/ROCm/ROCclr/pull/16
# In 3.7.0 the find opengl things have changed slightly.
patch("opengl.patch", when="@3.5.0")
resource(
name="opencl-on-vdi",
- url="https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/archive/roc-3.5.0.tar.gz",
+ url="https://github.com/ROCm/ROCm-OpenCL-Runtime/archive/roc-3.5.0.tar.gz",
sha256="511b617d5192f2d4893603c1a02402b2ac9556e9806ff09dd2a91d398abf39a0",
expand=True,
destination="",
@@ -197,7 +197,7 @@ class HipRocclr(CMakePackage):
]:
resource(
name="opencl-on-vdi",
- url="https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/archive/rocm-{0}.tar.gz".format(
+ url="https://github.com/ROCm/ROCm-OpenCL-Runtime/archive/rocm-{0}.tar.gz".format(
d_version
),
sha256=d_shasum,
@@ -209,7 +209,7 @@ class HipRocclr(CMakePackage):
resource(
name="opencl-on-vdi",
- git="https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git",
+ git="https://github.com/ROCm/ROCm-OpenCL-Runtime.git",
destination="",
placement="opencl-on-vdi",
branch="main",
diff --git a/var/spack/repos/builtin/packages/hip-tensor/package.py b/var/spack/repos/builtin/packages/hip-tensor/package.py
index e925031945..86fd4e385d 100644
--- a/var/spack/repos/builtin/packages/hip-tensor/package.py
+++ b/var/spack/repos/builtin/packages/hip-tensor/package.py
@@ -17,10 +17,11 @@ class HipTensor(CMakePackage, ROCmPackage):
maintainers("srekolam", "afzpatel")
version("master", branch="master")
+ version("6.0.0", sha256="268d7f114784b7e824f89c21c65c2efedbb5486f09a356a56dca1b89bde1ef7a")
version("5.7.1", sha256="96743d4e695fe865aef4097ae31d9b4e42a2d5a92135a005b0d187d9c0b17645")
version("5.7.0", sha256="4b17f6d43b17fe2dc1d0c61e9663d4752006f7898cc94231206444a1663eb252")
- for ver in ["5.7.0", "5.7.1", "master"]:
+ for ver in ["5.7.0", "5.7.1", "6.0.0", "master"]:
depends_on(f"composable-kernel@{ver}", when=f"@{ver}")
depends_on(f"rocm-cmake@{ver}", when=f"@{ver}")
diff --git a/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.6.0.patch b/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.6.0.patch
new file mode 100644
index 0000000000..597baa2e5d
--- /dev/null
+++ b/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.6.0.patch
@@ -0,0 +1,61 @@
+diff --git a/clr/hipamd/CMakeLists.txt b/clr/hipamd/CMakeLists.txt
+index 7ad3001..aaf6ad0 100755
+--- a/clr/hipamd/CMakeLists.txt
++++ b/clr/hipamd/CMakeLists.txt
+@@ -297,16 +297,6 @@ if(HIP_RUNTIME STREQUAL "rocclr")
+ add_subdirectory(src)
+ endif()
+
+-# Download libamdhip64.so.5
+-if(HIP_PLATFORM STREQUAL "amd")
+- if(NOT WIN32)
+- execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/download_libamhip64_v5.sh" WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND_ECHO STDERR RESULT_VARIABLE DWLD_HIP_SO_RC)
+- if (DWLD_HIP_SO_RC AND NOT DWLD_HIP_SO_RC EQUAL 0)
+- message(FATAL_ERROR "Failed to download libamdhip64.so.5")
+- endif()
+- endif()
+-endif()
+-
+ # Build doxygen documentation
+ find_program(DOXYGEN_EXE doxygen)
+ if(DOXYGEN_EXE)
+@@ -408,8 +398,6 @@ if (NOT ${HIPCC_BIN_DIR} STREQUAL "")
+ install(PROGRAMS ${HIPCC_BIN_DIR}/hipcc.pl DESTINATION bin)
+ install(PROGRAMS ${HIPCC_BIN_DIR}/hipconfig.pl DESTINATION bin)
+ install(PROGRAMS ${HIPCC_BIN_DIR}/hipvars.pm DESTINATION bin)
+- install(PROGRAMS ${HIPCC_BIN_DIR}/hipcc.bat DESTINATION bin)
+- install(PROGRAMS ${HIPCC_BIN_DIR}/hipconfig.bat DESTINATION bin)
+ endif()
+
+ #############################
+diff --git a/hipcc/bin/hipcc.pl b/hipcc/bin/hipcc.pl
+index 513a427..cd2d6ac 100755
+--- a/hipcc/bin/hipcc.pl
++++ b/hipcc/bin/hipcc.pl
+@@ -160,11 +160,14 @@ if ($HIP_PLATFORM eq "amd") {
+ if($isWindows) {
+ $execExtension = ".exe";
+ }
+- $HIPCC=get_normalized_path("$HIP_CLANG_PATH/clang++" . $execExtension);
++ # llvm_path is set inside the hip recipe
++ $LLVM_PATH= $ENV{'LLVM_PATH'};
++ $HIPCC="${LLVM_PATH}/bin/clang++" . $execExtension;
+
+ # If $HIPCC clang++ is not compiled, use clang instead
+ if ( ! -e $HIPCC ) {
+- $HIPCC=get_normalized_path("$HIP_CLANG_PATH/clang" . $execExtension);
++ $LLVM_PATH= $ENV{'LLVM_PATH'};
++ $HIPCC="${LLVM_PATH}/bin/clang" . $execExtension;
+ $HIPLDFLAGS = "--driver-mode=g++";
+ }
+ # to avoid using dk linker or MSVC linker
+@@ -484,7 +487,8 @@ if($HIP_PLATFORM eq "amd"){
+ $targetsStr = $ENV{HCC_AMDGPU_TARGET};
+ } elsif (not $isWindows) {
+ # Else try using rocm_agent_enumerator
+- $ROCM_AGENT_ENUM = "${ROCM_PATH}/bin/rocm_agent_enumerator";
++ $ROCMINFO_PATH = $ENV{'ROCMINFO_PATH'} // $ROCMINFO_PATH;
++ $ROCM_AGENT_ENUM = "${ROCMINFO_PATH}/bin/rocm_agent_enumerator";
+ $targetsStr = `${ROCM_AGENT_ENUM} -t GPU`;
+ $targetsStr =~ s/\n/,/g;
+ }
diff --git a/var/spack/repos/builtin/packages/hip/0018-reverting-hipMemoryType-with-memoryType.patch b/var/spack/repos/builtin/packages/hip/0018-reverting-hipMemoryType-with-memoryType.patch
new file mode 100644
index 0000000000..c77075d640
--- /dev/null
+++ b/var/spack/repos/builtin/packages/hip/0018-reverting-hipMemoryType-with-memoryType.patch
@@ -0,0 +1,17 @@
+diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
+index 88e6850..d280504 100644
+--- a/include/hip/hip_runtime_api.h
++++ b/include/hip/hip_runtime_api.h
+@@ -259,7 +259,11 @@ typedef enum hipMemoryType {
+ * Pointer attributes
+ */
+ typedef struct hipPointerAttribute_t {
+- enum hipMemoryType type;
++ union {
++ // Deprecated, use instead type
++ enum hipMemoryType memoryType;
++ enum hipMemoryType type;
++ };
+ int device;
+ void* devicePointer;
+ void* hostPointer;
diff --git a/var/spack/repos/builtin/packages/hip/package.py b/var/spack/repos/builtin/packages/hip/package.py
index 29b23fecca..a6fd946955 100644
--- a/var/spack/repos/builtin/packages/hip/package.py
+++ b/var/spack/repos/builtin/packages/hip/package.py
@@ -16,9 +16,9 @@ class Hip(CMakePackage):
create portable applications for AMD and NVIDIA GPUs from
single source code."""
- homepage = "https://github.com/ROCm-Developer-Tools/HIP"
- git = "https://github.com/ROCm-Developer-Tools/HIP.git"
- url = "https://github.com/ROCm-Developer-Tools/HIP/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/HIP"
+ git = "https://github.com/ROCm/HIP.git"
+ url = "https://github.com/ROCm/HIP/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath", "haampie")
@@ -27,6 +27,7 @@ class Hip(CMakePackage):
license("MIT")
version("master", branch="master")
+ version("6.0.0", sha256="0d575788e0b731124a8489a36652014a165b9ebab92d5456ec3c976e062f3a82")
version("5.7.1", sha256="eaa0e14a9ae45c58ed37863797b683a7778b3cbbf92f5b6529ec65fd61d61f3e")
version("5.7.0", sha256="cb61234eec7879fb7e20937659ad535b93a6e66fc8de0a543da8b7702474f2fc")
version("5.6.1", sha256="4b3c4dfcf8595da0e1b8c3e8067b1ccebeaac337762ff098db14375fa8dd4487")
@@ -172,6 +173,7 @@ class Hip(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hsakmt-roct@" + ver, when="@" + ver)
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
@@ -180,12 +182,22 @@ class Hip(CMakePackage):
depends_on("rocminfo@" + ver, when="@" + ver)
depends_on("roctracer-dev-api@" + ver, when="@" + ver)
- for ver in ["5.4.0", "5.4.3", "5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in [
+ "5.4.0",
+ "5.4.3",
+ "5.5.0",
+ "5.5.1",
+ "5.6.0",
+ "5.6.1",
+ "5.7.0",
+ "5.7.1",
+ "6.0.0",
+ ]:
depends_on("hipify-clang", when="@" + ver)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
# hipcc likes to add `-lnuma` by default :(
- # ref https://github.com/ROCm-Developer-Tools/HIP/pull/2202
+ # ref https://github.com/ROCm/HIP/pull/2202
depends_on("numactl", when="@3.7.0:")
# roc-obj-ls requirements
@@ -212,9 +224,7 @@ class Hip(CMakePackage):
]:
resource(
name="hipamd",
- url="https://github.com/ROCm-Developer-Tools/hipamd/archive/rocm-{0}.tar.gz".format(
- d_version
- ),
+ url="https://github.com/ROCm/hipamd/archive/rocm-{0}.tar.gz".format(d_version),
sha256=d_shasum,
expand=True,
destination="",
@@ -241,7 +251,7 @@ class Hip(CMakePackage):
]:
resource(
name="opencl",
- url="https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/archive/rocm-{0}.tar.gz".format(
+ url="https://github.com/ROCm/ROCm-OpenCL-Runtime/archive/rocm-{0}.tar.gz".format(
d_version
),
sha256=d_shasum,
@@ -269,9 +279,7 @@ class Hip(CMakePackage):
]:
resource(
name="rocclr",
- url="https://github.com/ROCm-Developer-Tools/ROCclr/archive/rocm-{0}.tar.gz".format(
- d_version
- ),
+ url="https://github.com/ROCm/ROCclr/archive/rocm-{0}.tar.gz".format(d_version),
sha256=d_shasum,
expand=True,
destination="",
@@ -280,6 +288,7 @@ class Hip(CMakePackage):
)
# Add hip-clr sources thru the below
for d_version, d_shasum in [
+ ("6.0.0", "798b55b5b5fb90dd19db54f136d8d8e1da9ae1e408d5b12b896101d635f97e50"),
("5.7.1", "c78490335233a11b4d8a5426ace7417c555f5e2325de10422df06c0f0f00f7eb"),
("5.7.0", "bc2447cb6fd86dff6a333b04e77ce85755104d9011a14a044af53caf02449573"),
("5.6.1", "0b88af1e99643899d11b1c8cf8a3c46601051b328a5e0ffbd44ee88b7eb0db33"),
@@ -287,9 +296,7 @@ class Hip(CMakePackage):
]:
resource(
name="clr",
- url="https://github.com/ROCm-Developer-Tools/clr/archive/refs/tags/rocm-{0}.tar.gz".format(
- d_version
- ),
+ url="https://github.com/ROCm/clr/archive/refs/tags/rocm-{0}.tar.gz".format(d_version),
sha256=d_shasum,
expand=True,
destination="",
@@ -299,6 +306,7 @@ class Hip(CMakePackage):
# Add hipcc sources thru the below
for d_version, d_shasum in [
+ ("6.0.0", "e9cfaaecaf0e6ed363946439197f340c115e8e1189f96dbd716cf20245c29255"),
("5.7.1", "d47d27ef2b5de7f49cdfd8547832ac9b437a32e6fc6f0e9c1646f4b704c90aee"),
("5.7.0", "9f839bf7226e5e26f3150f8ba6eca507ab9a668e68b207736301b3bb9040c973"),
("5.6.1", "5800fac92b841ef6f52acda78d9bf86f83970bec0fb848a6265d239bdb7eb51a"),
@@ -306,7 +314,7 @@ class Hip(CMakePackage):
]:
resource(
name="hipcc",
- url="https://github.com/ROCm-Developer-Tools/HIPCC/archive/refs/tags/rocm-{0}.tar.gz".format(
+ url="https://github.com/ROCm/HIPCC/archive/refs/tags/rocm-{0}.tar.gz".format(
d_version
),
sha256=d_shasum,
@@ -317,6 +325,7 @@ class Hip(CMakePackage):
)
# Add hiptests sources thru the below
for d_version, d_shasum in [
+ ("6.0.0", "e8f92a0f5d1f6093ca1fb24ff1b7140128900fcdc6e9f01f153d6907e5c2d807"),
("5.7.1", "28fbdf49f405adfee903bc0f05a43ac392c55b34c514c3582dfb7d6d67e79985"),
("5.7.0", "b1dae3cfc715e71dce92ac1da94265a9398944c76cee85ffab8f0c93665a48d6"),
("5.6.1", "5b3002ddfafda162329e4d9e6ac1200eeb48ff08e666b342aa8aeca30750f48b"),
@@ -324,7 +333,7 @@ class Hip(CMakePackage):
]:
resource(
name="hip-tests",
- url="https://github.com/ROCm-Developer-Tools/hip-tests/archive/refs/tags/rocm-{0}.tar.gz".format(
+ url="https://github.com/ROCm/hip-tests/archive/refs/tags/rocm-{0}.tar.gz".format(
d_version
),
sha256=d_shasum,
@@ -366,10 +375,10 @@ class Hip(CMakePackage):
)
patch("0013-remove-compiler-rt-linkage-for-host.5.3.0.patch", when="@5.3.0:5.4")
- # See https://github.com/ROCm-Developer-Tools/HIP/pull/2141
+ # See https://github.com/ROCm/HIP/pull/2141
patch("0002-Fix-detection-of-HIP_CLANG_ROOT.patch", when="@:3.9.0")
- # See https://github.com/ROCm-Developer-Tools/HIP/pull/2218
+ # See https://github.com/ROCm/HIP/pull/2218
patch("0003-Improve-compilation-without-git-repo.3.7.0.patch", when="@3.7.0:3.9.0")
patch("0003-Improve-compilation-without-git-repo.3.10.0.patch", when="@3.10.0:4.0.0")
patch("0003-Improve-compilation-without-git-repo.4.1.0.patch", when="@4.1.0")
@@ -383,7 +392,7 @@ class Hip(CMakePackage):
"_disabletests.4.5.0.patch",
when="@4.5.0:4.5.3",
)
- # See https://github.com/ROCm-Developer-Tools/HIP/pull/2219
+ # See https://github.com/ROCm/HIP/pull/2219
patch("0004-Drop-clang-rt-builtins-linking-on-hip-host.3.7.0.patch", when="@3.7.0:3.9.0")
patch("0004-Drop-clang-rt-builtins-linking-on-hip-host.3.10.0.patch", when="@3.10.0:4.1.0")
@@ -400,14 +409,16 @@ class Hip(CMakePackage):
patch("0014-remove-compiler-rt-linkage-for-host.5.5.0.patch", when="@5.5")
patch("0014-remove-compiler-rt-linkage-for-host.5.6.0.patch", when="@5.6.0:5.6")
patch("0014-Remove-compiler-rt-linkage-for-host-for-5.7.0.patch", when="@5.7.0:5.7")
- patch("0015-reverting-operator-mixup-fix-for-slate.patch", when="@5.6:")
- patch("0017-Set-PARAMETERS_MIN_ALIGNMENT-to-the-native-alignment.patch", when="@5.7")
+ patch("0014-remove-compiler-rt-linkage-for-host.6.0.patch", when="@6.0:")
+ patch("0015-reverting-operator-mixup-fix-for-slate.patch", when="@5.6:6.0")
+ patch("0017-Set-PARAMETERS_MIN_ALIGNMENT-to-the-native-alignment.patch", when="@5.7:6.0")
+ patch("0018-reverting-hipMemoryType-with-memoryType.patch", when="@6.0")
- # See https://github.com/ROCm-Developer-Tools/HIP/pull/3206
+ # See https://github.com/ROCm/HIP/pull/3206
patch(
- "https://github.com/ROCm-Developer-Tools/HIP/commit/50ee82f6bc4aad10908ce09198c9f7ebfb2a3561.patch?full_index=1",
+ "https://github.com/ROCm/HIP/commit/50ee82f6bc4aad10908ce09198c9f7ebfb2a3561.patch?full_index=1",
sha256="c2ee21cdc55262c7c6ba65546b5ca5f65ea89730",
- when="@5.2:",
+ when="@5.2:5.7",
)
@property
@@ -533,7 +544,7 @@ class Hip(CMakePackage):
# This is a variable that does not exist in hipcc but was introduced
# in a patch of ours since 3.5.0 to locate rocm_agent_enumerator:
- # https://github.com/ROCm-Developer-Tools/HIP/pull/2138
+ # https://github.com/ROCm/HIP/pull/2138
env.set("ROCMINFO_PATH", paths["rocminfo"])
# This one is used in hipcc to run `clang --hip-device-lib-path=...`
@@ -548,7 +559,7 @@ class Hip(CMakePackage):
# Used in comgr and seems necessary when using the JIT compiler, e.g.
# hiprtcCreateProgram:
- # https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/blob/rocm-4.0.0/lib/comgr/src/comgr-env.cpp
+ # https://github.com/ROCm/ROCm-CompilerSupport/blob/rocm-4.0.0/lib/comgr/src/comgr-env.cpp
env.set("LLVM_PATH", paths["llvm-amdgpu"])
env.set("COMGR_PATH", paths["comgr"])
@@ -560,7 +571,7 @@ class Hip(CMakePackage):
# and parsing of the <prefix>/bin/.hipVersion file. Let's just set this
# to the hip prefix directory for non-external builds so that the
# bin/.hipVersion file can still be parsed.
- # See also https://github.com/ROCm-Developer-Tools/HIP/issues/2223
+ # See also https://github.com/ROCm/HIP/issues/2223
if "@3.8.0:" in self.spec:
env.append_path(
"HIPCC_COMPILE_FLAGS_APPEND",
diff --git a/var/spack/repos/builtin/packages/hipblas/package.py b/var/spack/repos/builtin/packages/hipblas/package.py
index e05dfd3768..46b02ad352 100644
--- a/var/spack/repos/builtin/packages/hipblas/package.py
+++ b/var/spack/repos/builtin/packages/hipblas/package.py
@@ -12,9 +12,9 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage):
"""hipBLAS is a BLAS marshalling library, with multiple
supported backends"""
- homepage = "https://github.com/ROCmSoftwarePlatform/hipBLAS"
- git = "https://github.com/ROCmSoftwarePlatform/hipBLAS.git"
- url = "https://github.com/ROCmSoftwarePlatform/hipBLAS/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/hipBLAS"
+ git = "https://github.com/ROCm/hipBLAS.git"
+ url = "https://github.com/ROCm/hipBLAS/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie")
@@ -24,6 +24,7 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage):
version("develop", branch="develop")
version("master", branch="master")
+ version("6.0.0", sha256="8fbd0c244fe82eded866e06d2399b1d91ab5d43d2ebcb73382c7ce1ae48d9cb3")
version("5.7.1", sha256="794e9298f48ffbe3bd1c1ab87a5c2c2b953713500155fdec9ef8cbb11f81fc8a")
version("5.7.0", sha256="8c6cd2ffa4ce6ab03e05feffe074685b5525610870aebe9d78f817b3037f33a4")
version("5.6.1", sha256="f9da82fbefc68b84081ea0ed0139b91d2a540357fcf505c7f1d57eab01eb327c")
@@ -136,11 +137,14 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage):
patch("link-clients-blas.patch", when="@4.3.0:4.3.2")
patch("link-clients-blas-4.5.0.patch", when="@4.5.0:4.5.2")
patch("hipblas-link-clients-blas-5.0.0.patch", when="@5.0.0:5.0.2")
- patch("remove-hipblas-clients-file-installation.patch", when="@5.5:")
+ patch("remove-hipblas-clients-file-installation.patch", when="@5.5:5.7.1")
+ patch("remove-hipblas-clients-file-installation-6.0.patch", when="@6.0:")
- depends_on("rocm-cmake@5.2.0:", type="build", when="@5.2.0:")
+ depends_on("rocm-cmake@5.2.0:", type="build", when="@5.2.0:5.7")
depends_on("rocm-cmake@4.5.0:", type="build", when="@4.5.0:")
depends_on("rocm-cmake@3.5.0:", type="build")
+    # rocm-cmake is only needed at build time; keep the dependency type in line
+    # with the other rocm-cmake constraints declared just above.
+    for ver in ["6.0.0"]:
+        depends_on("rocm-cmake@" + ver, type="build", when="+rocm @" + ver)
depends_on("hip +cuda", when="+cuda")
@@ -174,12 +178,12 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
"master",
"develop",
]:
depends_on("rocsolver@" + ver, when="+rocm @" + ver)
depends_on("rocblas@" + ver, when="+rocm @" + ver)
-
for tgt in ROCmPackage.amdgpu_targets:
depends_on(
"rocblas amdgpu_target={0}".format(tgt), when="+rocm amdgpu_target={0}".format(tgt)
diff --git a/var/spack/repos/builtin/packages/hipblas/remove-hipblas-clients-file-installation-6.0.patch b/var/spack/repos/builtin/packages/hipblas/remove-hipblas-clients-file-installation-6.0.patch
new file mode 100644
index 0000000000..ca6fa8f413
--- /dev/null
+++ b/var/spack/repos/builtin/packages/hipblas/remove-hipblas-clients-file-installation-6.0.patch
@@ -0,0 +1,32 @@
+From 120af1b2483868ebdc2ee5f137418d23c14178ad Mon Sep 17 00:00:00 2001
+From: Renjith Ravindran <Renjith.RavindranKannath@amd.com>
+Date: Wed, 10 Jan 2024 04:28:15 +0000
+Subject: [PATCH] Remove hipblas clients file installation
+
+---
+ clients/CMakeLists.txt | 12 ------------
+ 1 file changed, 12 deletions(-)
+
+diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt
+index 8206ad7..6a59808 100644
+--- a/clients/CMakeLists.txt
++++ b/clients/CMakeLists.txt
+@@ -135,15 +135,3 @@ add_custom_command( OUTPUT "${HIPBLAS_GENTEST}"
+
+ add_custom_target( hipblas-common DEPENDS "${HIPBLAS_COMMON}" "${HIPBLAS_TEMPLATE}" "${HIPBLAS_SMOKE}" "${HIPBLAS_GENTEST}" )
+
+-if( BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_BENCHMARKS )
+- rocm_install(
+- FILES ${HIPBLAS_COMMON} ${HIPBLAS_TEMPLATE} ${HIPBLAS_SMOKE}
+- DESTINATION "${CMAKE_INSTALL_BINDIR}"
+- COMPONENT clients-common
+- )
+- rocm_install(
+- PROGRAMS ${HIPBLAS_GENTEST}
+- DESTINATION "${CMAKE_INSTALL_BINDIR}"
+- COMPONENT clients-common
+- )
+-endif()
+--
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/hipcub/package.py b/var/spack/repos/builtin/packages/hipcub/package.py
index 61c05e7431..34e16cd4bc 100644
--- a/var/spack/repos/builtin/packages/hipcub/package.py
+++ b/var/spack/repos/builtin/packages/hipcub/package.py
@@ -9,14 +9,15 @@ from spack.package import *
class Hipcub(CMakePackage, CudaPackage, ROCmPackage):
"""Radeon Open Compute Parallel Primitives Library"""
- homepage = "https://github.com/ROCmSoftwarePlatform/hipCUB"
- git = "https://github.com/ROCmSoftwarePlatform/hipCUB.git"
- url = "https://github.com/ROCmSoftwarePlatform/hipCUB/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/hipCUB"
+ git = "https://github.com/ROCm/hipCUB.git"
+ url = "https://github.com/ROCm/hipCUB/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
license("BSD-3-Clause")
maintainers("srekolam", "renjithravindrankannath")
+ version("6.0.0", sha256="8d9f6e1e3f8433a2ceae1b0efd6727c21383980077e264725d00d5fee165bd30")
version("5.7.1", sha256="9b23a58408bc4c549d3c754196cb3e2c1a50e177ab0a286101cbea2f7f173945")
version("5.7.0", sha256="899356867f662d9a6f3870bb4a496f605a3143c6ad4d1fa9e9faead68fa8d13b")
version("5.6.1", sha256="4b9479daa40424c9ddbc14ce967aa170680f8ca1ed01a514e6e30ccfa22552ce")
@@ -157,6 +158,7 @@ class Hipcub(CMakePackage, CudaPackage, ROCmPackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocprim@" + ver, when="+rocm @" + ver)
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/hipfft/001-remove-submodule-and-sync-shared-files-from-rocFFT.patch b/var/spack/repos/builtin/packages/hipfft/001-remove-submodule-and-sync-shared-files-from-rocFFT.patch
new file mode 100644
index 0000000000..537794d3cc
--- /dev/null
+++ b/var/spack/repos/builtin/packages/hipfft/001-remove-submodule-and-sync-shared-files-from-rocFFT.patch
@@ -0,0 +1,11431 @@
+From 27ae15a459f45f1acfcb1a9b1c8d491d9f731fd4 Mon Sep 17 00:00:00 2001
+From: Steve Leung <Steve.Leung@amd.com>
+Date: Thu, 4 Jan 2024 16:36:08 -0700
+Subject: [PATCH] remove submodule and sync shared files from rocFFT, update
+ CHANGELOG.md
+
+---
+ clients/CMakeLists.txt | 15 -
+ clients/bench/CMakeLists.txt | 4 +-
+ clients/bench/bench.cpp | 2 +-
+ clients/hipfft_params.h | 2 +-
+ clients/tests/CMakeLists.txt | 11 +-
+ clients/tests/accuracy_test_1D.cpp | 8 +-
+ clients/tests/accuracy_test_2D.cpp | 8 +-
+ clients/tests/accuracy_test_3D.cpp | 8 +-
+ clients/tests/accuracy_test_callback.cpp | 2 +-
+ clients/tests/gtest_main.cpp | 6 +-
+ clients/tests/hipfft_accuracy_test.cpp | 11 +-
+ clients/tests/hipfft_accuracy_test.h | 2 +-
+ clients/tests/multi_device_test.cpp | 2 +-
+ cmake/dependencies.cmake | 3 -
+ library/src/amd_detail/hipfft.cpp | 8 +-
+ shared/accuracy_test.h | 1949 +++++++++++++
+ shared/arithmetic.h | 61 +
+ shared/array_predicate.h | 47 +
+ shared/array_validator.cpp | 549 ++++
+ shared/array_validator.h | 31 +
+ shared/concurrency.h | 41 +
+ shared/data_gen_device.h | 1303 +++++++++
+ shared/data_gen_host.h | 881 ++++++
+ shared/device_properties.h | 74 +
+ shared/enum_to_string.h | 81 +
+ shared/environment.h | 97 +
+ shared/fft_params.h | 3274 ++++++++++++++++++++++
+ shared/fftw_transform.h | 493 ++++
+ shared/gpubuf.h | 134 +
+ shared/hip_object_wrapper.h | 86 +
+ shared/hostbuf.h | 158 ++
+ shared/increment.h | 100 +
+ shared/precision_type.h | 70 +
+ shared/printbuffer.h | 108 +
+ shared/ptrdiff.h | 40 +
+ shared/rocfft_accuracy_test.h | 29 +
+ shared/rocfft_against_fftw.h | 231 ++
+ shared/rocfft_complex.h | 346 +++
+ shared/rocfft_hip.h | 52 +
+ shared/rocfft_params.h | 585 ++++
+ shared/test_params.h | 51 +
+ shared/work_queue.h | 49 +
+ 46 files changed, 10966 insertions(+), 66 deletions(-)
+ create mode 100644 shared/accuracy_test.h
+ create mode 100644 shared/arithmetic.h
+ create mode 100644 shared/array_predicate.h
+ create mode 100644 shared/array_validator.cpp
+ create mode 100644 shared/array_validator.h
+ create mode 100644 shared/concurrency.h
+ create mode 100644 shared/data_gen_device.h
+ create mode 100644 shared/data_gen_host.h
+ create mode 100644 shared/device_properties.h
+ create mode 100644 shared/enum_to_string.h
+ create mode 100644 shared/environment.h
+ create mode 100644 shared/fft_params.h
+ create mode 100644 shared/fftw_transform.h
+ create mode 100644 shared/gpubuf.h
+ create mode 100644 shared/hip_object_wrapper.h
+ create mode 100644 shared/hostbuf.h
+ create mode 100644 shared/increment.h
+ create mode 100644 shared/precision_type.h
+ create mode 100644 shared/printbuffer.h
+ create mode 100644 shared/ptrdiff.h
+ create mode 100644 shared/rocfft_accuracy_test.h
+ create mode 100644 shared/rocfft_against_fftw.h
+ create mode 100644 shared/rocfft_complex.h
+ create mode 100644 shared/rocfft_hip.h
+ create mode 100644 shared/rocfft_params.h
+ create mode 100644 shared/test_params.h
+ create mode 100644 shared/work_queue.h
+
+diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt
+index 1db0d9c..b99a9e5 100644
+--- a/clients/CMakeLists.txt
++++ b/clients/CMakeLists.txt
+@@ -65,21 +65,6 @@ if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" AND NOT CMAKE_CXX_COMPILER_ID STR
+ endif()
+
+
+-if( GIT_FOUND AND EXISTS "${CMAKE_SOURCE_DIR}/.git" )
+- message(STATUS "rocFFT submodule update")
+- execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive
+- WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/clients/rocFFT
+- RESULT_VARIABLE GIT_SUBMOD_RESULT)
+- if( NOT GIT_SUBMOD_RESULT EQUAL "0" )
+- message(FATAL_ERROR "git submodule update --init --recursive failed with ${GIT_SUBMOD_RESULT}, please checkout submodules manually.")
+- endif( )
+-endif( )
+-
+-if( NOT EXISTS "${CMAKE_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt" )
+- message(FATAL_ERROR "The rocFFT submodule is not present! Please update git submodules and try again. ${CMAKE_CURRENT_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt")
+-endif( )
+-
+-
+ # This option only works for make/nmake and the ninja generators, but no reason it shouldn't be on
+ # all the time
+ # This tells cmake to create a compile_commands.json file that can be used with clang tooling or vim
+diff --git a/clients/bench/CMakeLists.txt b/clients/bench/CMakeLists.txt
+index b5cef9b..ccb8c29 100644
+--- a/clients/bench/CMakeLists.txt
++++ b/clients/bench/CMakeLists.txt
+@@ -26,8 +26,8 @@ find_package( Boost COMPONENTS program_options REQUIRED)
+ set( Boost_USE_STATIC_LIBS OFF )
+
+
+-set( hipfft_bench_source bench.cpp ../rocFFT/shared/array_validator.cpp )
+-set( hipfft_bench_includes bench.h ../rocFFT/shared/array_validator.h )
++set( hipfft_bench_source bench.cpp ../../shared/array_validator.cpp )
++set( hipfft_bench_includes bench.h ../../shared/array_validator.h )
+
+ add_executable( hipfft-bench ${hipfft_bench_source} ${hipfft_bench_includes} )
+
+diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp
+index 894769c..a906879 100644
+--- a/clients/bench/bench.cpp
++++ b/clients/bench/bench.cpp
+@@ -29,7 +29,7 @@
+ #include <boost/program_options.hpp>
+ namespace po = boost::program_options;
+
+-#include "../rocFFT/shared/gpubuf.h"
++#include "../../shared/gpubuf.h"
+
+ int main(int argc, char* argv[])
+ {
+diff --git a/clients/hipfft_params.h b/clients/hipfft_params.h
+index b8b58ac..75d9db9 100644
+--- a/clients/hipfft_params.h
++++ b/clients/hipfft_params.h
+@@ -23,9 +23,9 @@
+
+ #include <optional>
+
++#include "../shared/fft_params.h"
+ #include "hipfft/hipfft.h"
+ #include "hipfft/hipfftXt.h"
+-#include "rocFFT/shared/fft_params.h"
+
+ inline fft_status fft_status_from_hipfftparams(const hipfftResult_t val)
+ {
+diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt
+index 9742a45..2d1aac0 100644
+--- a/clients/tests/CMakeLists.txt
++++ b/clients/tests/CMakeLists.txt
+@@ -37,14 +37,7 @@ set( hipfft-test_source
+ accuracy_test_3D.cpp
+ accuracy_test_callback.cpp
+ multi_device_test.cpp
+- ../rocFFT/shared/array_validator.cpp
+- )
+-
+-set( hipfft-test_includes
+- ../rocFFT/clients/tests/fftw_transform.h
+- ../rocFFT/clients/tests/rocfft_against_fftw.h
+- ../rocFFT/clients/tests/misc/include/test_exception.h
+- ../rocFFT/shared/array_validator.h
++ ../../shared/array_validator.cpp
+ )
+
+ add_executable( hipfft-test ${hipfft-test_source} ${hipfft-test_includes} )
+@@ -56,8 +49,6 @@ target_include_directories(
+ $<BUILD_INTERFACE:${FFTW_INCLUDE_DIRS}>
+ $<BUILD_INTERFACE:${hip_INCLUDE_DIRS}>
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../library/include>
+- $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../rocFFT/library/include>
+- $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../rocFFT/clients/tests>
+ )
+
+
+diff --git a/clients/tests/accuracy_test_1D.cpp b/clients/tests/accuracy_test_1D.cpp
+index 27e849d..57d846a 100644
+--- a/clients/tests/accuracy_test_1D.cpp
++++ b/clients/tests/accuracy_test_1D.cpp
+@@ -23,11 +23,11 @@
+ #include <stdexcept>
+ #include <vector>
+
+-#include "../rocFFT/shared/fft_params.h"
++#include "../../shared/fft_params.h"
+
+-#include "accuracy_test.h"
+-#include "fftw_transform.h"
+-#include "rocfft_against_fftw.h"
++#include "../../shared/accuracy_test.h"
++#include "../../shared/fftw_transform.h"
++#include "../../shared/rocfft_against_fftw.h"
+
+ using ::testing::ValuesIn;
+
+diff --git a/clients/tests/accuracy_test_2D.cpp b/clients/tests/accuracy_test_2D.cpp
+index 1674593..6f618c0 100644
+--- a/clients/tests/accuracy_test_2D.cpp
++++ b/clients/tests/accuracy_test_2D.cpp
+@@ -23,11 +23,11 @@
+ #include <stdexcept>
+ #include <vector>
+
+-#include "../rocFFT/shared/fft_params.h"
++#include "../../shared/fft_params.h"
+
+-#include "accuracy_test.h"
+-#include "fftw_transform.h"
+-#include "rocfft_against_fftw.h"
++#include "../../shared/accuracy_test.h"
++#include "../../shared/fftw_transform.h"
++#include "../../shared/rocfft_against_fftw.h"
+
+ using ::testing::ValuesIn;
+
+diff --git a/clients/tests/accuracy_test_3D.cpp b/clients/tests/accuracy_test_3D.cpp
+index a87476a..941ec24 100644
+--- a/clients/tests/accuracy_test_3D.cpp
++++ b/clients/tests/accuracy_test_3D.cpp
+@@ -23,11 +23,11 @@
+ #include <stdexcept>
+ #include <vector>
+
+-#include "../rocFFT/shared/fft_params.h"
++#include "../../shared/fft_params.h"
+
+-#include "accuracy_test.h"
+-#include "fftw_transform.h"
+-#include "rocfft_against_fftw.h"
++#include "../../shared/accuracy_test.h"
++#include "../../shared/fftw_transform.h"
++#include "../../shared/rocfft_against_fftw.h"
+
+ using ::testing::ValuesIn;
+
+diff --git a/clients/tests/accuracy_test_callback.cpp b/clients/tests/accuracy_test_callback.cpp
+index 4782830..b5cc4a7 100644
+--- a/clients/tests/accuracy_test_callback.cpp
++++ b/clients/tests/accuracy_test_callback.cpp
+@@ -18,7 +18,7 @@
+ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ // THE SOFTWARE.
+
+-#include "accuracy_test.h"
++#include "../../shared/accuracy_test.h"
+
+ std::vector<std::vector<size_t>> callback_sizes = {
+ // some single kernel sizes
+diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp
+index 1f0ae83..2f7674e 100644
+--- a/clients/tests/gtest_main.cpp
++++ b/clients/tests/gtest_main.cpp
+@@ -30,10 +30,10 @@
+ #include <streambuf>
+ #include <string>
+
++#include "../../shared/concurrency.h"
++#include "../../shared/environment.h"
++#include "../../shared/work_queue.h"
+ #include "../hipfft_params.h"
+-#include "../rocFFT/shared/concurrency.h"
+-#include "../rocFFT/shared/environment.h"
+-#include "../rocFFT/shared/work_queue.h"
+ #include "hipfft/hipfft.h"
+ #include "hipfft_accuracy_test.h"
+ #include "hipfft_test_params.h"
+diff --git a/clients/tests/hipfft_accuracy_test.cpp b/clients/tests/hipfft_accuracy_test.cpp
+index 2abaf74..609239a 100644
+--- a/clients/tests/hipfft_accuracy_test.cpp
++++ b/clients/tests/hipfft_accuracy_test.cpp
+@@ -29,11 +29,12 @@
+ #include "hipfft/hipfft.h"
+
+ #include "../hipfft_params.h"
+-#include "../rocFFT/clients/tests/fftw_transform.h"
+-#include "../rocFFT/clients/tests/rocfft_accuracy_test.h"
+-#include "../rocFFT/clients/tests/rocfft_against_fftw.h"
+-#include "../rocFFT/shared/gpubuf.h"
+-#include "../rocFFT/shared/rocfft_complex.h"
++
++#include "../../shared/accuracy_test.h"
++#include "../../shared/fftw_transform.h"
++#include "../../shared/gpubuf.h"
++#include "../../shared/rocfft_against_fftw.h"
++#include "../../shared/rocfft_complex.h"
+
+ void fft_vs_reference(hipfft_params& params, bool round_trip)
+ {
+diff --git a/clients/tests/hipfft_accuracy_test.h b/clients/tests/hipfft_accuracy_test.h
+index 0491bd9..181150e 100644
+--- a/clients/tests/hipfft_accuracy_test.h
++++ b/clients/tests/hipfft_accuracy_test.h
+@@ -23,8 +23,8 @@
+ #ifndef ROCFFT_ACCURACY_TEST
+ #define ROCFFT_ACCURACY_TEST
+
++#include "../../shared/accuracy_test.h"
+ #include "../hipfft_params.h"
+-#include "../rocFFT/clients/tests/accuracy_test.h"
+
+ void fft_vs_reference(hipfft_params& params, bool round_trip = false);
+
+diff --git a/clients/tests/multi_device_test.cpp b/clients/tests/multi_device_test.cpp
+index b3dc4c9..3274b80 100644
+--- a/clients/tests/multi_device_test.cpp
++++ b/clients/tests/multi_device_test.cpp
+@@ -18,7 +18,7 @@
+ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ // THE SOFTWARE.
+
+-#include "accuracy_test.h"
++#include "../../shared/accuracy_test.h"
+ #include <gtest/gtest.h>
+ #include <hip/hip_runtime_api.h>
+
+diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake
+index 5810e37..bdbf689 100644
+--- a/cmake/dependencies.cmake
++++ b/cmake/dependencies.cmake
+@@ -21,9 +21,6 @@
+ #
+ # #############################################################################
+
+-# Git
+-find_package(Git REQUIRED)
+-
+ # HIP
+ if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
+ if( NOT BUILD_WITH_LIB STREQUAL "CUDA" )
+diff --git a/library/src/amd_detail/hipfft.cpp b/library/src/amd_detail/hipfft.cpp
+index c2f7036..3d4f61f 100644
+--- a/library/src/amd_detail/hipfft.cpp
++++ b/library/src/amd_detail/hipfft.cpp
+@@ -27,10 +27,10 @@
+ #include <string>
+ #include <vector>
+
+-#include "../../../clients/rocFFT/shared/arithmetic.h"
+-#include "../../../clients/rocFFT/shared/gpubuf.h"
+-#include "../../../clients/rocFFT/shared/ptrdiff.h"
+-#include "../../../clients/rocFFT/shared/rocfft_hip.h"
++#include "../../../shared/arithmetic.h"
++#include "../../../shared/gpubuf.h"
++#include "../../../shared/ptrdiff.h"
++#include "../../../shared/rocfft_hip.h"
+
+ #define ROC_FFT_CHECK_ALLOC_FAILED(ret) \
+ { \
+diff --git a/shared/accuracy_test.h b/shared/accuracy_test.h
+new file mode 100644
+index 0000000..362a7c1
+--- /dev/null
++++ b/shared/accuracy_test.h
+@@ -0,0 +1,1949 @@
++// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++
++#ifndef ACCURACY_TEST
++#define ACCURACY_TEST
++
++#include <algorithm>
++#include <functional>
++#include <future>
++#include <iterator>
++#include <string>
++#include <vector>
++
++#include "enum_to_string.h"
++#include "fft_params.h"
++#include "fftw_transform.h"
++#include "gpubuf.h"
++#include "rocfft_against_fftw.h"
++#include "test_params.h"
++
++extern int verbose;
++extern size_t ramgb;
++extern bool fftw_compare;
++
++// Number of bytes in one gibibyte (2^30).
++static const size_t ONE_GiB = static_cast<size_t>(1) << 30;
++
++// Convert a byte count to whole GiB, rounding up; zero bytes maps to zero.
++// Dividing before adding one (instead of adding ONE_GiB - 1 before dividing)
++// keeps the result correct for byte counts within ONE_GiB of SIZE_MAX, where
++// the intermediate sum would wrap around.
++inline size_t bytes_to_GiB(const size_t bytes)
++{
++    return bytes == 0 ? 0 : (bytes - 1) / ONE_GiB + 1;
++}
++
++// Tuple of (transform type, result placement, array type, array type).
++// NOTE(review): the two array types are presumably input then output, going
++// by the "io" in the name -- confirm against the code that builds these.
++typedef std::tuple<fft_transform_type, fft_result_placement, fft_array_type, fft_array_type>
++ type_place_io_t;
++
++// Remember the results of the last FFT we computed with FFTW. Tests
++// are ordered so that later cases can often reuse this result.
++struct last_cpu_fft_cache
++{
++ // keys to the cache
++ // (a cached result is identified by the values of these five members)
++ std::vector<size_t> length;
++ size_t nbatch = 0;
++ fft_transform_type transform_type = fft_transform_type_complex_forward;
++ bool run_callbacks = false;
++ fft_precision precision = fft_precision_single;
++
++ // FFTW input/output
++ // (host-side buffers holding the cached data)
++ std::vector<hostbuf> cpu_input;
++ std::vector<hostbuf> cpu_output;
++};
++// Single shared cache instance; only declared here, defined elsewhere.
++extern last_cpu_fft_cache last_cpu_fft_data;
++
++// Pair of byte counts describing system memory: the total amount and the
++// amount currently free. Both default to zero.
++struct system_memory
++{
++ size_t total_bytes = 0;
++ size_t free_bytes = 0;
++};
++// Only declared here, defined elsewhere.
++// NOTE(review): presumably a snapshot taken at program start -- confirm.
++extern system_memory start_memory;
++
++// Returns a system_memory value; the implementation is not part of this header.
++system_memory get_system_memory();
++
++// Estimate, in bytes, how much host memory the buffers for this transform
++// will occupy.  When verbose is nonzero the estimate, rounded up to whole
++// GiB, is also written to stdout.
++inline size_t needed_ram_buffers(const fft_params& params, const int verbose)
++{
++    // The estimate treats the data as contiguous; noncontiguous buffers are
++    // taken to need a close enough amount of space for this purpose.
++
++    // Elements in one batch: the product of all dimension lengths.
++    size_t elems_per_batch = 1;
++    for(const size_t len : params.length)
++    {
++        elems_per_batch *= len;
++    }
++
++    // Transforms other than real forward/inverse are counted at double size.
++    const bool is_real_transform
++        = params.transform_type == fft_transform_type_real_forward
++          || params.transform_type == fft_transform_type_real_inverse;
++    const size_t type_factor = is_real_transform ? 1 : 2;
++
++    // Scale by the precision; a precision not listed leaves the size as is.
++    size_t precision_factor = 1;
++    switch(params.precision)
++    {
++    case fft_precision_half:
++        precision_factor = 2;
++        break;
++    case fft_precision_single:
++        precision_factor = 4;
++        break;
++    case fft_precision_double:
++        precision_factor = 8;
++        break;
++    }
++
++    const size_t needed_ram
++        = 6 * elems_per_batch * type_factor * precision_factor * params.nbatch;
++
++    if(verbose)
++    {
++        std::cout << "required host memory for buffers (GiB): " << bytes_to_GiB(needed_ram) << "\n";
++    }
++
++    return needed_ram;
++}
++
++// Report whether the given FFTW plan uses the Bluestein algorithm, judged by
++// looking for "bluestein" in the plan's printed description.  Builds without
++// FFTW_HAVE_SPRINT_PLAN cannot inspect the plan and always answer true.
++template <typename Tfloat>
++bool fftw_plan_uses_bluestein(const typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan)
++{
++#ifndef FFTW_HAVE_SPRINT_PLAN
++    // assume worst case (bluestein is always used)
++    return true;
++#else
++    char* const       plan_text = fftw_sprint_plan<Tfloat>(cpu_plan);
++    const std::string description(plan_text);
++    // The description has been copied, so the C string can be released now.
++    free(plan_text);
++    const bool mentions_bluestein = description.find("bluestein") != std::string::npos;
++    return mentions_bluestein;
++#endif
++}
++
++// Estimate the amount of host memory (in bytes) needed for fftw.
++// Only plans reported as using Bluestein are charged for; for any other plan
++// the estimate is zero.  When verbose is nonzero the estimate, rounded up to
++// whole GiB, is also written to stdout.
++template <typename Tfloat>
++inline size_t needed_ram_fftw(const fft_params&                                  contiguous_params,
++                              const typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan,
++                              const int                                          verbose)
++{
++    const size_t total_length = std::accumulate(contiguous_params.length.begin(),
++                                                contiguous_params.length.end(),
++                                                static_cast<size_t>(1),
++                                                std::multiplies<size_t>());
++    size_t       needed_ram   = 0;
++    // Detect Bluestein in plan
++    if(fftw_plan_uses_bluestein<Tfloat>(cpu_plan))
++    {
++        for(size_t dim : contiguous_params.length)
++        {
++            // A zero-length dimension needs no memory and would otherwise
++            // cause a division by zero below.
++            if(dim == 0)
++            {
++                continue;
++            }
++
++            // Round dim up to a power of two by smearing the highest set bit
++            // of (dim - 1) into every lower bit.  The shifts must start at 1
++            // (starting at 2 leaves values such as 6 unrounded) and cover the
++            // whole width of size_t.  Working in size_t also keeps the
++            // product below from being truncated to 32 bits.
++            size_t needed_ram_dim = dim - 1;
++            for(size_t shift = 1; shift < 8 * sizeof(size_t); shift <<= 1)
++            {
++                needed_ram_dim |= needed_ram_dim >> shift;
++            }
++            needed_ram_dim++;
++
++            // Next-plus-one power of two, multiplied by all the other lengths
++            needed_ram_dim *= 2 * (total_length / dim);
++
++            // Keep the largest requirement over all dimensions.
++            if(needed_ram_dim > needed_ram)
++            {
++                needed_ram = needed_ram_dim;
++            }
++        }
++    }
++
++    // Account for precision and data type:
++    if(contiguous_params.transform_type != fft_transform_type_real_forward
++       && contiguous_params.transform_type != fft_transform_type_real_inverse)
++    {
++        needed_ram *= 2;
++    }
++    switch(contiguous_params.precision)
++    {
++    case fft_precision_half:
++        needed_ram *= 2;
++        break;
++    case fft_precision_single:
++        needed_ram *= 4;
++        break;
++    case fft_precision_double:
++        needed_ram *= 8;
++        break;
++    }
++
++    needed_ram *= contiguous_params.nbatch;
++
++    if(verbose)
++    {
++        std::cout << "required host memory for FFTW (GiB): " << bytes_to_GiB(needed_ram) << "\n";
++    }
++
++    return needed_ram;
++}
++
++// Base gtest class for comparison with FFTW.
++// Value-parameterized fixture: each test instance is given one fft_params.
++class accuracy_test : public ::testing::TestWithParam<fft_params>
++{
++protected:
++ // No per-test setup or teardown work is done.
++ void SetUp() override {}
++ void TearDown() override {}
++
++public:
++ // Name for one parameterized test instance: whatever token() returns for
++ // that instance's fft_params.
++ static std::string TestName(const testing::TestParamInfo<accuracy_test::ParamType>& info)
++ {
++ return info.param.token();
++ }
++};
++
++// Batch counts to test, in this order.
++const static std::vector<size_t> batch_range = {2, 1};
++
++// Precisions to test: all three, or only double and single.
++const static std::vector<fft_precision> precision_range_full
++ = {fft_precision_double, fft_precision_single, fft_precision_half};
++const static std::vector<fft_precision> precision_range_sp_dp
++ = {fft_precision_double, fft_precision_single};
++
++// Result placements to test: in-place and not-in-place.
++const static std::vector<fft_result_placement> place_range
++ = {fft_placement_inplace, fft_placement_notinplace};
++// Transform types to test. Only forward types are listed: both complex and
++// real, complex only, or real only.
++const static std::vector<fft_transform_type> trans_type_range
++ = {fft_transform_type_complex_forward, fft_transform_type_real_forward};
++const static std::vector<fft_transform_type> trans_type_range_complex
++ = {fft_transform_type_complex_forward};
++const static std::vector<fft_transform_type> trans_type_range_real
++ = {fft_transform_type_real_forward};
++
++// Given one vector of candidate lengths per dimension, return every
++// combination that takes one length from each dimension, sorted and with
++// duplicates removed.  The result is empty when the input is empty or when
++// any dimension has no candidate lengths.
++inline std::vector<std::vector<size_t>>
++    generate_lengths(const std::vector<std::vector<size_t>>& inlengths)
++{
++    std::vector<std::vector<size_t>> output;
++    if(inlengths.size() == 0)
++    {
++        return output;
++    }
++    const size_t        dim = inlengths.size();
++    std::vector<size_t> looplength(dim);
++    for(size_t i = 0; i < dim; ++i)
++    {
++        looplength[i] = inlengths[i].size();
++        // A dimension without candidates admits no combination at all, and
++        // indexing into it below would read out of range.
++        if(looplength[i] == 0)
++        {
++            return output;
++        }
++    }
++
++    // One sweep over the index space is enough: every sweep starts from the
++    // same all-zero index, so repeating it once per dimension would only add
++    // duplicates for the sort/unique pass below to throw away.
++    // increment_rowmajor is expected to advance index within looplength and
++    // to return false once the index space is exhausted.
++    std::vector<size_t> index(dim);
++    do
++    {
++        std::vector<size_t> length(dim);
++        for(size_t i = 0; i < dim; ++i)
++        {
++            length[i] = inlengths[i][index[i]];
++        }
++        output.push_back(length);
++    } while(increment_rowmajor(index, looplength));
++
++    // uniquify the result
++    std::sort(output.begin(), output.end());
++    output.erase(std::unique(output.begin(), output.end()), output.end());
++    return output;
++}
++
++// List the (input, output) array-type pairs that are valid for the given
++// transform type and placement.  Planar layouts are offered only when planar
++// is true; mixed interleaved/planar pairs and hermitian-planar pairs are
++// offered only for not-in-place transforms.  Throws std::runtime_error for a
++// transform type other than the four handled here.
++inline std::vector<std::pair<fft_array_type, fft_array_type>>
++    iotypes(const fft_transform_type   transformType,
++            const fft_result_placement place,
++            const bool                 planar = true)
++{
++    std::vector<std::pair<fft_array_type, fft_array_type>> pairs;
++    const bool out_of_place = (place == fft_placement_notinplace);
++
++    if(transformType == fft_transform_type_complex_forward
++       || transformType == fft_transform_type_complex_inverse)
++    {
++        pairs.emplace_back(fft_array_type_complex_interleaved,
++                           fft_array_type_complex_interleaved);
++        if(planar)
++        {
++            pairs.emplace_back(fft_array_type_complex_planar, fft_array_type_complex_planar);
++            if(out_of_place)
++            {
++                pairs.emplace_back(fft_array_type_complex_planar,
++                                   fft_array_type_complex_interleaved);
++                pairs.emplace_back(fft_array_type_complex_interleaved,
++                                   fft_array_type_complex_planar);
++            }
++        }
++    }
++    else if(transformType == fft_transform_type_real_forward)
++    {
++        pairs.emplace_back(fft_array_type_real, fft_array_type_hermitian_interleaved);
++        if(planar && out_of_place)
++        {
++            pairs.emplace_back(fft_array_type_real, fft_array_type_hermitian_planar);
++        }
++    }
++    else if(transformType == fft_transform_type_real_inverse)
++    {
++        pairs.emplace_back(fft_array_type_hermitian_interleaved, fft_array_type_real);
++        if(planar && out_of_place)
++        {
++            pairs.emplace_back(fft_array_type_hermitian_planar, fft_array_type_real);
++        }
++    }
++    else
++    {
++        throw std::runtime_error("Invalid transform type");
++    }
++
++    return pairs;
++}
++
++ // For one transform type, build every (transform type, placement, input
++ // type, output type) tuple: each placement in place_range is combined with
++ // each input/output pair that iotypes() reports for it.
++ static std::vector<type_place_io_t>
++     generate_types(fft_transform_type                       transform_type,
++                    const std::vector<fft_result_placement>& place_range,
++                    const bool                               planar)
++ {
++     std::vector<type_place_io_t> combos;
++     for(const auto placement : place_range)
++     {
++         const auto io_pairs = iotypes(transform_type, placement, planar);
++         for(const auto& io : io_pairs)
++         {
++             combos.push_back(std::make_tuple(transform_type, placement, io.first, io.second));
++         }
++     }
++     return combos;
++ }
++
++ // Produces the (stride, dist) combinations to test for a given problem.
++ //
++ // The base implementation ignores the problem description and returns each
++ // stride vector it was constructed with, paired with a dist of 0. Derived
++ // generators override generate() to add problem-dependent entries.
++ struct stride_generator
++ {
++     // One stride vector together with the batch distance that goes with it.
++     struct stride_dist
++     {
++         stride_dist(const std::vector<size_t>& s, size_t d)
++             : stride(s)
++             , dist(d)
++         {
++         }
++         std::vector<size_t> stride;
++         size_t              dist;
++     };
++
++     // NOTE: allow for this ctor to be implicit, so it's less typing for a test writer
++     //
++     // cppcheck-suppress noExplicitConstructor
++     stride_generator(const std::vector<std::vector<size_t>>& stride_list_in)
++         : stride_list(stride_list_in)
++     {
++     }
++
++     // This type is used polymorphically (generate() is virtual), so give it
++     // a virtual destructor; copy and move stay defaulted.
++     virtual ~stride_generator()                          = default;
++     stride_generator(const stride_generator&)            = default;
++     stride_generator& operator=(const stride_generator&) = default;
++     stride_generator(stride_generator&&)                 = default;
++     stride_generator& operator=(stride_generator&&)      = default;
++
++     // Return one stride_dist per configured stride vector, each with dist 0.
++     // `lengths` and `batch` are unused here; they exist for overriders.
++     virtual std::vector<stride_dist> generate(const std::vector<size_t>& lengths,
++                                               size_t                     batch) const
++     {
++         (void)lengths;
++         (void)batch;
++
++         std::vector<stride_dist> ret;
++         ret.reserve(stride_list.size());
++         for(const auto& s : stride_list)
++             ret.emplace_back(s, 0);
++         return ret;
++     }
++     std::vector<std::vector<size_t>> stride_list;
++ };
++
++ // Stride generator that additionally lays out a 3D problem with the batch
++ // as the innermost (fastest-moving) index.
++ // e.g. given a batch-2 4x3x2 transform which logically looks like:
++ //
++ // batch0:
++ // A B A B
++ // A B A B
++ // A B A B
++ //
++ // A B A B
++ // A B A B
++ // A B A B
++ //
++ // batch1:
++ // A B A B
++ // A B A B
++ // A B A B
++ //
++ // A B A B
++ // A B A B
++ // A B A B
++ //
++ // we instead do stride-2 4x3x2 transform where first batch is the
++ // A's and second batch is the B's.
++ struct stride_generator_3D_inner_batch : public stride_generator
++ {
++     explicit stride_generator_3D_inner_batch(const std::vector<std::vector<size_t>>& stride_list_in)
++         : stride_generator(stride_list_in)
++     {
++     }
++
++     // Returns the base generator's entries followed by one extra entry whose
++     // strides are all scaled by `batch` and whose dist is 1.
++     // Reads lengths[1] and lengths[2], so `lengths` must hold three entries.
++     std::vector<stride_dist> generate(const std::vector<size_t>& lengths,
++                                       size_t                     batch) const override
++     {
++         auto result = stride_generator::generate(lengths, batch);
++
++         const size_t slowest = lengths[1] * lengths[2] * batch;
++         const size_t middle  = lengths[2] * batch;
++         const size_t fastest = batch;
++         result.emplace_back(std::vector<size_t>{slowest, middle, fastest}, 1);
++         return result;
++     }
++ };
++
++ // Create an array of parameters to pass to gtest. Base generator
++ // that allows choosing transform type.
++ //
++ // One fft_params is built for every combination of transform type, length,
++ // precision, batch, (placement, input type, output type) tuple produced by
++ // types_generator, input/output stride+dist, and input/output offset.
++ // A combination is dropped when:
++ //  - the length vector is empty or has more than three entries;
++ //  - run_callbacks is set and either side uses a planar array type;
++ //  - it is planar and a pseudo-random roll exceeds planar_prob;
++ //  - run_callbacks is set and a pseudo-random roll exceeds callback_prob;
++ //  - param.valid(0) rejects it.
++ // The rolls are seeded from random_seed plus a hash of the parameter token,
++ // so the selection is repeatable for a given seed.
++ inline auto param_generator_base(const std::vector<fft_transform_type>&   type_range,
++                                  const std::vector<std::vector<size_t>>&  v_lengths,
++                                  const std::vector<fft_precision>&        precision_range,
++                                  const std::vector<size_t>&               batch_range,
++                                  decltype(generate_types)                 types_generator,
++                                  const stride_generator&                  istride,
++                                  const stride_generator&                  ostride,
++                                  const std::vector<std::vector<size_t>>&  ioffset_range,
++                                  const std::vector<std::vector<size_t>>&  ooffset_range,
++                                  const std::vector<fft_result_placement>& place_range,
++                                  const bool                               planar        = true,
++                                  const bool                               run_callbacks = false)
++ {
++
++     std::vector<fft_params> params;
++
++     // For any length, we compute double-precision CPU reference
++     // for largest batch size first and reuse for smaller batch
++     // sizes, then convert to single-precision.
++
++     for(auto& transform_type : type_range)
++     {
++         for(const auto& lengths : v_lengths)
++         {
++             // try to ensure that we are given literal lengths, not
++             // something to be passed to generate_lengths
++             if(lengths.empty() || lengths.size() > 3)
++             {
++                 continue;
++             }
++             for(const auto precision : precision_range)
++             {
++                 for(const auto batch : batch_range)
++                 {
++                     for(const auto& types : types_generator(transform_type, place_range, planar))
++                     {
++                         for(const auto& istride_dist : istride.generate(lengths, batch))
++                         {
++                             for(const auto& ostride_dist : ostride.generate(lengths, batch))
++                             {
++                                 for(const auto& ioffset : ioffset_range)
++                                 {
++                                     for(const auto& ooffset : ooffset_range)
++                                     {
++                                         fft_params param;
++
++                                         param.length         = lengths;
++                                         param.istride        = istride_dist.stride;
++                                         param.ostride        = ostride_dist.stride;
++                                         param.nbatch         = batch;
++                                         param.precision      = precision;
++                                         param.transform_type = std::get<0>(types);
++                                         param.placement      = std::get<1>(types);
++                                         param.idist          = istride_dist.dist;
++                                         param.odist          = ostride_dist.dist;
++                                         param.itype          = std::get<2>(types);
++                                         param.otype          = std::get<3>(types);
++                                         param.ioffset        = ioffset;
++                                         param.ooffset        = ooffset;
++
++                                         if(run_callbacks)
++                                         {
++                                             // add a test if both input and output support callbacks
++                                             if(param.itype != fft_array_type_complex_planar
++                                                && param.itype != fft_array_type_hermitian_planar
++                                                && param.otype != fft_array_type_complex_planar
++                                                && param.otype != fft_array_type_hermitian_planar)
++                                             {
++                                                 param.run_callbacks = true;
++                                             }
++                                             else
++                                             {
++                                                 continue;
++                                             }
++                                         }
++                                         param.validate();
++
++                                         // Keeping the random number generator here
++                                         // allows one to run the same tests for a given
++                                         // random seed; ie the test suite is repeatable.
++                                         std::hash<std::string> hasher;
++                                         std::ranlux24_base     gen(random_seed
++                                                                + hasher(param.token()));
++                                         std::uniform_real_distribution<> dis(0.0, 1.0);
++
++                                         if(param.is_planar())
++                                         {
++                                             const double roll = dis(gen);
++                                             if(roll > planar_prob)
++                                             {
++                                                 if(verbose > 4)
++                                                 {
++                                                     // printed relation matches the skip
++                                                     // condition (roll > planar_prob)
++                                                     std::cout << "Planar transform skipped "
++                                                                  "(planar_prob: "
++                                                               << planar_prob << " < " << roll
++                                                               << ")\n";
++                                                 }
++                                                 continue;
++                                             }
++                                         }
++                                         if(run_callbacks)
++                                         {
++                                             const double roll = dis(gen);
++                                             if(roll > callback_prob)
++                                             {
++
++                                                 if(verbose > 4)
++                                                 {
++                                                     // report the probability that actually
++                                                     // caused the skip: callback_prob
++                                                     std::cout << "Callback transform skipped "
++                                                                  "(callback_prob: "
++                                                               << callback_prob << " < " << roll
++                                                               << ")\n";
++                                                 }
++                                                 continue;
++                                             }
++                                         }
++
++                                         if(param.valid(0))
++                                         {
++                                             params.push_back(param);
++                                         }
++                                     }
++                                 }
++                             }
++                         }
++                     }
++                 }
++             }
++         }
++     }
++     return params;
++ }
++
++ // Build the gtest parameter list for the default set of transform types
++ // (trans_type_range). All other arguments are forwarded unchanged to
++ // param_generator_base, with generate_types as the type generator.
++ inline auto param_generator(const std::vector<std::vector<size_t>>&  v_lengths,
++                             const std::vector<fft_precision>&        precision_range,
++                             const std::vector<size_t>&               batch_range,
++                             const stride_generator&                  istride,
++                             const stride_generator&                  ostride,
++                             const std::vector<std::vector<size_t>>&  ioffset_range,
++                             const std::vector<std::vector<size_t>>&  ooffset_range,
++                             const std::vector<fft_result_placement>& place_range,
++                             const bool                               planar,
++                             const bool                               run_callbacks = false)
++ {
++     const auto& transform_types = trans_type_range;
++     return param_generator_base(transform_types, v_lengths, precision_range, batch_range,
++                                 generate_types, istride, ostride, ioffset_range,
++                                 ooffset_range, place_range, planar, run_callbacks);
++ }
++
++ // Build the gtest parameter list for complex transforms only
++ // (trans_type_range_complex). All other arguments are forwarded unchanged
++ // to param_generator_base, with generate_types as the type generator.
++ inline auto param_generator_complex(const std::vector<std::vector<size_t>>&  v_lengths,
++                                     const std::vector<fft_precision>&        precision_range,
++                                     const std::vector<size_t>&               batch_range,
++                                     const stride_generator&                  istride,
++                                     const stride_generator&                  ostride,
++                                     const std::vector<std::vector<size_t>>&  ioffset_range,
++                                     const std::vector<std::vector<size_t>>&  ooffset_range,
++                                     const std::vector<fft_result_placement>& place_range,
++                                     const bool                               planar,
++                                     const bool run_callbacks = false)
++ {
++     const auto& transform_types = trans_type_range_complex;
++     return param_generator_base(transform_types, v_lengths, precision_range, batch_range,
++                                 generate_types, istride, ostride, ioffset_range,
++                                 ooffset_range, place_range, planar, run_callbacks);
++ }
++
++ // Build the gtest parameter list for real transforms only
++ // (trans_type_range_real). All other arguments are forwarded unchanged to
++ // param_generator_base, with generate_types as the type generator.
++ inline auto param_generator_real(const std::vector<std::vector<size_t>>&  v_lengths,
++                                  const std::vector<fft_precision>&        precision_range,
++                                  const std::vector<size_t>&               batch_range,
++                                  const stride_generator&                  istride,
++                                  const stride_generator&                  ostride,
++                                  const std::vector<std::vector<size_t>>&  ioffset_range,
++                                  const std::vector<std::vector<size_t>>&  ooffset_range,
++                                  const std::vector<fft_result_placement>& place_range,
++                                  const bool                               planar,
++                                  const bool                               run_callbacks = false)
++ {
++     const auto& transform_types = trans_type_range_real;
++     return param_generator_base(transform_types, v_lengths, precision_range, batch_range,
++                                 generate_types, istride, ostride, ioffset_range,
++                                 ooffset_range, place_range, planar, run_callbacks);
++ }
++
++ // Build one fft_params per token: each entry is default-constructed and then
++ // filled in by fft_params::from_token. The output keeps the tokens' order.
++ template <class Tcontainer>
++ auto param_generator_token(const Tcontainer& tokens)
++ {
++     std::vector<fft_params> result;
++     result.reserve(tokens.size());
++     for(auto token : tokens)
++     {
++         result.emplace_back();
++         auto& parsed = result.back();
++         parsed.from_token(token);
++     }
++     return result;
++ }
++
++ // Payload handed to the test load/store callbacks. execute_gpu_fft fills one
++ // instance on the host and copies it byte-for-byte to the device, so the
++ // member layout must stay as it is.
++ struct callback_test_data
++ {
++ // scalar to modify the input/output with
++ double scalar;
++ // base address of input, to ensure that each callback gets an offset from that base
++ void* base;
++ };
++
++ // Callback helpers; only declared here, the definitions are not in this chunk.
++ //
++ // get_load_callback_host / get_store_callback_host return an opaque pointer
++ // that execute_gpu_fft passes to params.set_callbacks() as the load / store
++ // callback for the given array type and precision.
++ void* get_load_callback_host(fft_array_type itype,
++ fft_precision precision,
++ bool round_trip_inverse);
++ // Called by execute_cpu_fft on the CPU input before the FFTW transform runs.
++ void apply_load_callback(const fft_params& params, std::vector<hostbuf>& input);
++ // Called by execute_cpu_fft on the CPU output after the FFTW transform runs.
++ void apply_store_callback(const fft_params& params, std::vector<hostbuf>& output);
++ void* get_store_callback_host(fft_array_type otype,
++ fft_precision precision,
++ bool round_trip_inverse);
++
++ // Allocate the host buffer(s) used for the CPU reference transform.
++ // Forwards to allocate_host_buffer, substituting single precision whenever
++ // half precision is requested.
++ static auto allocate_cpu_fft_buffer(const fft_precision        precision,
++                                     const fft_array_type       type,
++                                     const std::vector<size_t>& size)
++ {
++     // FFTW does not support half-precision, so a half-precision request
++     // gets a buffer large enough for single-precision data instead.
++     if(precision == fft_precision_half)
++     {
++         return allocate_host_buffer(fft_precision_single, type, size);
++     }
++     return allocate_host_buffer(precision, type, size);
++ }
++
++ // Run the CPU (FFTW) reference transform.
++ //
++ // params            - the parameters under test; consulted for callbacks
++ // contiguous_params - description of the contiguous CPU-side problem
++ // cpu_plan          - FFTW plan to execute; destroyed and reset to nullptr
++ // cpu_input         - reference input, left unmodified when a copy is used
++ // cpu_output        - receives the result; allocated here if still empty
++ template <typename Tfloat>
++ inline void execute_cpu_fft(fft_params&                                  params,
++                             fft_params&                                  contiguous_params,
++                             typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan,
++                             std::vector<hostbuf>&                        cpu_input,
++                             std::vector<hostbuf>&                        cpu_output)
++ {
++     // FFTW may never have needed an output buffer during planning, in which
++     // case nobody has allocated one for us yet.
++     if(cpu_output.empty())
++     {
++         cpu_output = allocate_cpu_fft_buffer(
++             contiguous_params.precision, contiguous_params.otype, contiguous_params.osize);
++     }
++
++     // C2R transforms and callbacks both modify the input, so in those cases
++     // the transform is run on a copy and the caller's input stays intact.
++     const bool input_gets_modified
++         = params.run_callbacks
++           || contiguous_params.transform_type == fft_transform_type_real_inverse;
++
++     std::vector<hostbuf> scratch_input(cpu_input.size());
++     if(input_gets_modified)
++     {
++         for(size_t buf = 0; buf != cpu_input.size(); ++buf)
++         {
++             scratch_input[buf] = cpu_input[buf].copy();
++         }
++     }
++     std::vector<hostbuf>& fftw_input = input_gets_modified ? scratch_input : cpu_input;
++
++     // run FFTW (which may destroy its input)
++     apply_load_callback(params, fftw_input);
++     fftw_run<Tfloat>(contiguous_params.transform_type, cpu_plan, fftw_input, cpu_output);
++
++     // Destroy the plan, then ask FFTW to fully clean up, since it tries to
++     // cache plan details.
++     fftw_destroy_plan_type(cpu_plan);
++     fftw_cleanup();
++     cpu_plan = nullptr;
++
++     apply_store_callback(params, cpu_output);
++ }
++
++ // Execute the GPU transform described by params.
++ //
++ // When params.run_callbacks is set, load/store callback data is first copied
++ // to the device and registered via params.set_callbacks(); for a round-trip
++ // inverse the load and store scalars are swapped. After execution, and only
++ // if fftw_compare is set, the device output is copied into gpu_output.
++ //
++ // HIP failures increment n_hip_failures and then skip or fail the current
++ // test depending on skip_runtime_fails; both macros return from this
++ // function. Callback registration and execution failures throw
++ // std::runtime_error.
++ template <class Tparams>
++ inline void execute_gpu_fft(Tparams&              params,
++                             std::vector<void*>&   pibuffer,
++                             std::vector<void*>&   pobuffer,
++                             std::vector<gpubuf>&  obuffer,
++                             std::vector<hostbuf>& gpu_output,
++                             bool                  round_trip_inverse = false)
++ {
++     // Declared at function scope, presumably so the device-side callback
++     // data outlives the params.execute() call below.
++     gpubuf_t<callback_test_data> load_cb_data_dev;
++     gpubuf_t<callback_test_data> store_cb_data_dev;
++     if(params.run_callbacks)
++     {
++         void* load_cb_host
++             = get_load_callback_host(params.itype, params.precision, round_trip_inverse);
++
++         callback_test_data load_cb_data_host;
++
++         if(round_trip_inverse)
++         {
++             load_cb_data_host.scalar = params.store_cb_scalar;
++         }
++         else
++         {
++             load_cb_data_host.scalar = params.load_cb_scalar;
++         }
++
++         load_cb_data_host.base = pibuffer.front();
++
++         auto hip_status = hipSuccess;
++
++         hip_status = load_cb_data_dev.alloc(sizeof(callback_test_data));
++         if(hip_status != hipSuccess)
++         {
++             ++n_hip_failures;
++             if(skip_runtime_fails)
++             {
++                 GTEST_SKIP() << "load callback data allocation failure";
++             }
++             else
++             {
++                 GTEST_FAIL() << "load callback data allocation failure";
++             }
++         }
++         hip_status = hipMemcpy(load_cb_data_dev.data(),
++                                &load_cb_data_host,
++                                sizeof(callback_test_data),
++                                hipMemcpyHostToDevice);
++         if(hip_status != hipSuccess)
++         {
++             ++n_hip_failures;
++             if(skip_runtime_fails)
++             {
++                 GTEST_SKIP() << "hipMemcpy failure for load callback data";
++             }
++             else
++             {
++                 GTEST_FAIL() << "hipMemcpy failure for load callback data";
++             }
++         }
++
++         void* store_cb_host
++             = get_store_callback_host(params.otype, params.precision, round_trip_inverse);
++
++         callback_test_data store_cb_data_host;
++
++         if(round_trip_inverse)
++         {
++             store_cb_data_host.scalar = params.load_cb_scalar;
++         }
++         else
++         {
++             store_cb_data_host.scalar = params.store_cb_scalar;
++         }
++
++         store_cb_data_host.base = pobuffer.front();
++
++         hip_status = store_cb_data_dev.alloc(sizeof(callback_test_data));
++         if(hip_status != hipSuccess)
++         {
++             ++n_hip_failures;
++             if(skip_runtime_fails)
++             {
++                 GTEST_SKIP() << "store callback data allocation failure";
++             }
++             else
++             {
++                 GTEST_FAIL() << "store callback data allocation failure";
++             }
++         }
++
++         hip_status = hipMemcpy(store_cb_data_dev.data(),
++                                &store_cb_data_host,
++                                sizeof(callback_test_data),
++                                hipMemcpyHostToDevice);
++         if(hip_status != hipSuccess)
++         {
++             ++n_hip_failures;
++             if(skip_runtime_fails)
++             {
++                 GTEST_SKIP() << "hipMemcpy failure for store callback data";
++             }
++             else
++             {
++                 GTEST_FAIL() << "hipMemcpy failure for store callback data";
++             }
++         }
++
++         auto fft_status = params.set_callbacks(
++             load_cb_host, load_cb_data_dev.data(), store_cb_host, store_cb_data_dev.data());
++         if(fft_status != fft_status_success)
++             throw std::runtime_error("set callback failure");
++     }
++
++     // Execute the transform:
++     auto fft_status = params.execute(pibuffer.data(), pobuffer.data());
++     if(fft_status != fft_status_success)
++         throw std::runtime_error("rocFFT plan execution failure");
++
++     // if not comparing, then just executing the GPU FFT is all we
++     // need to do
++     if(!fftw_compare)
++         return;
++
++     // finalize a multi-GPU transform
++     params.multi_gpu_finalize(obuffer, pobuffer);
++
++     // Copy each device output buffer back into the matching host buffer.
++     ASSERT_TRUE(!gpu_output.empty()) << "no output buffers";
++     for(unsigned int idx = 0; idx < gpu_output.size(); ++idx)
++     {
++         ASSERT_TRUE(gpu_output[idx].data() != nullptr)
++             << "output buffer index " << idx << " is empty";
++         auto hip_status = hipMemcpy(gpu_output[idx].data(),
++                                     pobuffer.at(idx),
++                                     gpu_output[idx].size(),
++                                     hipMemcpyDeviceToHost);
++         if(hip_status != hipSuccess)
++         {
++             ++n_hip_failures;
++             if(skip_runtime_fails)
++             {
++                 GTEST_SKIP() << "hipMemcpy failure";
++             }
++             else
++             {
++                 GTEST_FAIL() << "hipMemcpy failure";
++             }
++         }
++     }
++     if(verbose > 2)
++     {
++         std::cout << "GPU output:\n";
++         params.print_obuffer(gpu_output);
++     }
++     if(verbose > 5)
++     {
++         std::cout << "flat GPU output:\n";
++         params.print_obuffer_flat(gpu_output);
++     }
++ }
++
++ // Assert that element `idx` of `output` still holds `orig_value`.
++ // Only declared here; the behaviour comes from the explicit specializations
++ // that follow (float, double, rocfft_complex<float>, rocfft_complex<double>).
++ template <typename Tfloat>
++ static void assert_init_value(const std::vector<hostbuf>& output,
++ const size_t idx,
++ const Tfloat orig_value);
++
++ // float: read element `idx` of the first buffer as a float and require it to
++ // equal orig_value.
++ template <>
++ void assert_init_value(const std::vector<hostbuf>& output, const size_t idx, const float orig_value)
++ {
++     const auto* values       = reinterpret_cast<const float*>(output.front().data());
++     const float actual_value = values[idx];
++     ASSERT_EQ(actual_value, orig_value) << "index " << idx;
++ }
++
++ // double: read element `idx` of the first buffer as a double and require it
++ // to equal orig_value.
++ template <>
++ void assert_init_value(const std::vector<hostbuf>& output,
++                        const size_t                idx,
++                        const double                orig_value)
++ {
++     const auto*  values       = reinterpret_cast<const double*>(output.front().data());
++     const double actual_value = values[idx];
++     ASSERT_EQ(actual_value, orig_value) << "index " << idx;
++ }
++
++ // rocfft_complex<float>: require both components of element `idx` to equal
++ // orig_value. With a single buffer the data is read as interleaved complex
++ // values; otherwise (planar) the x component comes from the first buffer
++ // and the y component from the last buffer.
++ template <>
++ void assert_init_value(const std::vector<hostbuf>& output,
++                        const size_t                idx,
++                        const rocfft_complex<float> orig_value)
++ {
++     const bool interleaved = output.size() == 1;
++
++     const rocfft_complex<float> actual_value
++         = interleaved
++               ? reinterpret_cast<const rocfft_complex<float>*>(output.front().data())[idx]
++               : rocfft_complex<float>{reinterpret_cast<const float*>(output.front().data())[idx],
++                                       reinterpret_cast<const float*>(output.back().data())[idx]};
++
++     ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
++     ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
++ }
++
++ // rocfft_complex<double>: require both components of element `idx` to equal
++ // orig_value. With a single buffer the data is read as interleaved complex
++ // values; otherwise (planar) the x component comes from the first buffer
++ // and the y component from the last buffer.
++ template <>
++ void assert_init_value(const std::vector<hostbuf>&  output,
++                        const size_t                 idx,
++                        const rocfft_complex<double> orig_value)
++ {
++     const bool interleaved = output.size() == 1;
++
++     const rocfft_complex<double> actual_value
++         = interleaved
++               ? reinterpret_cast<const rocfft_complex<double>*>(output.front().data())[idx]
++               : rocfft_complex<double>{
++                     reinterpret_cast<const double*>(output.front().data())[idx],
++                     reinterpret_cast<const double*>(output.back().data())[idx]};
++
++     ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
++     ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
++ }
++
++ // Byte value used to pre-fill output buffers (hipMemset) and to build the
++ // expected "untouched" element value (memset) when checking output strides.
++ static const int OUTPUT_INIT_PATTERN = 0xcd;
++ // Recursively verify that the padding elements of a strided output still
++ // hold the initial fill pattern, i.e. that the transform did not write
++ // between the elements it owns.
++ //
++ // output - host copy of the output buffer(s)
++ // offset - element index at which dimension `i` of the current slice starts
++ // length - lengths per dimension, slowest first
++ // stride - strides per dimension, slowest first
++ // i      - dimension being checked in this call
++ //
++ // Every checked index is relative to `offset`, so each slice visited by the
++ // recursion is checked at its own location.
++ template <class Tfloat>
++ void check_single_output_stride(const std::vector<hostbuf>& output,
++                                 const size_t                offset,
++                                 const std::vector<size_t>&  length,
++                                 const std::vector<size_t>&  stride,
++                                 const size_t                i)
++ {
++     // Value an untouched element has: every byte equal to the fill pattern.
++     Tfloat orig;
++     memset(static_cast<void*>(&orig), OUTPUT_INIT_PATTERN, sizeof(Tfloat));
++
++     const size_t curLength         = length[i];
++     const size_t curStride         = stride[i];
++     const size_t nextSmallerLength = i == length.size() - 1 ? 0 : length[i + 1];
++     const size_t nextSmallerStride = i == stride.size() - 1 ? 0 : stride[i + 1];
++
++     // Nothing to check, and (curLength - 1) below would wrap around.
++     if(curLength == 0)
++         return;
++
++     if(nextSmallerLength == 0)
++     {
++         // this is the fastest dim, indexes that are not multiples of
++         // the stride should be the initial value
++         for(size_t idx = 0; idx < (curLength - 1) * curStride; ++idx)
++         {
++             if(idx % curStride != 0)
++                 assert_init_value<Tfloat>(output, offset + idx, orig);
++         }
++     }
++     else
++     {
++         for(size_t lengthIdx = 0; lengthIdx < curLength; ++lengthIdx)
++         {
++             const size_t sliceStart = offset + lengthIdx * curStride;
++
++             // check that the space after the next smaller dim and the
++             // end of this dim is initial value. The gap after the last
++             // element is skipped: it may lie past the end of the buffer.
++             if(lengthIdx + 1 < curLength)
++             {
++                 for(size_t idx = nextSmallerLength * nextSmallerStride; idx < curStride; ++idx)
++                     assert_init_value<Tfloat>(output, sliceStart + idx, orig);
++             }
++
++             check_single_output_stride<Tfloat>(output, sliceStart, length, stride, i + 1);
++         }
++     }
++ }
++
++ // Verify that the padding of the strided output described by `params` was
++ // left untouched, by running check_single_output_stride with the element
++ // type that matches the output precision and array type.
++ //
++ // NOTE(review): any precision other than single (including half) takes the
++ // double-precision path here; confirm that is intended.
++ template <class Tparams>
++ void check_output_strides(const std::vector<hostbuf>& output, Tparams& params)
++ {
++     std::vector<size_t> length;
++     std::vector<size_t> stride;
++
++     // With more than one batch, batch count and dist act as one extra
++     // slowest dimension in front of the real ones.
++     if(params.nbatch > 1)
++     {
++         length.push_back(params.nbatch);
++         stride.push_back(params.odist);
++     }
++
++     const auto olength = params.olength();
++     length.insert(length.end(), olength.begin(), olength.end());
++     stride.insert(stride.end(), params.ostride.begin(), params.ostride.end());
++
++     const bool single_precision = params.precision == fft_precision_single;
++     const bool real_output      = params.otype == fft_array_type_real;
++
++     if(single_precision && real_output)
++         check_single_output_stride<float>(output, 0, length, stride, 0);
++     else if(single_precision)
++         check_single_output_stride<rocfft_complex<float>>(output, 0, length, stride, 0);
++     else if(real_output)
++         check_single_output_stride<double>(output, 0, length, stride, 0);
++     else
++         check_single_output_stride<rocfft_complex<double>>(output, 0, length, stride, 0);
++ }
++
++ // Run the inverse transform of a round-trip test on the GPU.
++ //
++ // Validates params, creates the plan, optionally pre-fills the output
++ // buffers with OUTPUT_INIT_PATTERN (not-in-place transforms with
++ // check_output_strides set), then calls execute_gpu_fft with
++ // round_trip_inverse = true. Work-buffer allocation and hipMemset failures
++ // count as HIP failures and skip or fail the test per skip_runtime_fails.
++ template <class Tparams>
++ inline void run_round_trip_inverse(Tparams&              params,
++                                    std::vector<gpubuf>&  obuffer,
++                                    std::vector<void*>&   pibuffer,
++                                    std::vector<void*>&   pobuffer,
++                                    std::vector<hostbuf>& gpu_output)
++ {
++     params.validate();
++
++     // Make sure that the parameters make sense:
++     ASSERT_TRUE(params.valid(verbose));
++
++     // Plan creation also allocates the work buffer and reports a failure of
++     // that step through a dedicated exception type.
++     auto plan_status = fft_status_success;
++     try
++     {
++         plan_status = params.create_plan();
++     }
++     catch(fft_params::work_buffer_alloc_failure&)
++     {
++         ++n_hip_failures;
++         std::stringstream ss;
++         ss << "Failed to allocate work buffer (size: " << params.workbuffersize << ")";
++         if(skip_runtime_fails)
++         {
++             GTEST_SKIP() << ss.str();
++         }
++         else
++         {
++             GTEST_FAIL() << ss.str();
++         }
++     }
++     ASSERT_EQ(plan_status, fft_status_success) << "round trip inverse plan creation failed";
++
++     const auto obuffer_sizes = params.obuffer_sizes();
++
++     // When validating output strides, fill the output with a known pattern
++     // first so that elements the transform must not touch can be checked
++     // afterwards.
++     const bool prefill_output
++         = params.placement != fft_placement_inplace && params.check_output_strides;
++     if(prefill_output)
++     {
++         for(size_t i = 0; i < obuffer_sizes.size(); ++i)
++         {
++             const auto hip_status
++                 = hipMemset(obuffer[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]);
++             if(hip_status == hipSuccess)
++             {
++                 continue;
++             }
++             ++n_hip_failures;
++             if(skip_runtime_fails)
++             {
++                 GTEST_SKIP() << "hipMemset failure";
++             }
++             else
++             {
++                 GTEST_FAIL() << "hipMemset failure";
++             }
++         }
++     }
++
++     // execute GPU transform
++     execute_gpu_fft(params, pibuffer, pobuffer, obuffer, gpu_output, true);
++ }
++
++ // Compare the GPU inverse-transform output of a round trip against the
++ // original CPU forward input.
++ //
++ // The GPU output is scaled by 1 / total_length inside distance(). The test
++ // expects the Linf difference to stay within
++ //   type_epsilon * cpu_input_norm.l_inf * log(total_length)
++ // and the relative L2 difference to stay below
++ //   sqrt(log2(total_length)) * type_epsilon.
++ // The per-precision max_*_eps_* trackers are updated along the way.
++ template <class Tparams>
++ inline void compare_round_trip_inverse(Tparams&              params,
++                                        fft_params&           contiguous_params,
++                                        std::vector<hostbuf>& gpu_output,
++                                        std::vector<hostbuf>& cpu_input,
++                                        const VectorNorms&    cpu_input_norm,
++                                        size_t                total_length)
++ {
++     if(params.check_output_strides)
++     {
++         check_output_strides<Tparams>(gpu_output, params);
++     }
++
++     // compute GPU output norm asynchronously while the comparison runs
++     std::shared_future<VectorNorms> gpu_norm = std::async(std::launch::async, [&]() {
++         return norm(gpu_output,
++                     params.olength(),
++                     params.nbatch,
++                     params.precision,
++                     params.otype,
++                     params.ostride,
++                     params.odist,
++                     params.ooffset);
++     });
++
++     // Individual Linf failures are only collected at higher verbosity.
++     std::unique_ptr<std::vector<std::pair<size_t, size_t>>> linf_failures;
++     if(verbose > 1)
++     {
++         linf_failures = std::make_unique<std::vector<std::pair<size_t, size_t>>>();
++     }
++     const double linf_cutoff
++         = type_epsilon(params.precision) * cpu_input_norm.l_inf * log(total_length);
++
++     // compare GPU inverse output to CPU forward input
++     VectorNorms diff = distance(cpu_input,
++                                 gpu_output,
++                                 params.olength(),
++                                 params.nbatch,
++                                 params.precision,
++                                 contiguous_params.itype,
++                                 contiguous_params.istride,
++                                 contiguous_params.idist,
++                                 params.otype,
++                                 params.ostride,
++                                 params.odist,
++                                 linf_failures.get(),
++                                 linf_cutoff,
++                                 {0},
++                                 params.ooffset,
++                                 1.0 / total_length);
++
++     if(verbose > 1)
++     {
++         const auto& norms = gpu_norm.get();
++         std::cout << "GPU output Linf norm: " << norms.l_inf << "\n";
++         std::cout << "GPU output L2 norm: " << norms.l_2 << "\n";
++         std::cout << "GPU linf norm failures:";
++         std::sort(linf_failures->begin(), linf_failures->end());
++         for(const auto& failure : *linf_failures)
++         {
++             std::cout << " (" << failure.first << "," << failure.second << ")";
++         }
++         std::cout << std::endl;
++     }
++
++     EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str();
++     EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str();
++
++     // Normalised errors of this run; folded into the tracker that belongs
++     // to the precision under test.
++     const auto linf_eps = diff.l_inf / cpu_input_norm.l_inf / log(total_length);
++     const auto l2_eps   = diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length));
++     switch(params.precision)
++     {
++     case fft_precision_half:
++         max_linf_eps_half = std::max(max_linf_eps_half, linf_eps);
++         max_l2_eps_half   = std::max(max_l2_eps_half, l2_eps);
++         break;
++     case fft_precision_single:
++         max_linf_eps_single = std::max(max_linf_eps_single, linf_eps);
++         max_l2_eps_single   = std::max(max_l2_eps_single, l2_eps);
++         break;
++     case fft_precision_double:
++         max_linf_eps_double = std::max(max_linf_eps_double, linf_eps);
++         max_l2_eps_double   = std::max(max_l2_eps_double, l2_eps);
++         break;
++     }
++
++     if(verbose > 1)
++     {
++         std::cout << "L2 diff: " << diff.l_2 << "\n";
++         std::cout << "Linf diff: " << diff.l_inf << "\n";
++     }
++
++     EXPECT_TRUE(diff.l_inf <= linf_cutoff)
++         << "Linf test failed. Linf:" << diff.l_inf
++         << "\tnormalized Linf: " << diff.l_inf / cpu_input_norm.l_inf << "\tcutoff: " << linf_cutoff
++         << params.str();
++
++     EXPECT_TRUE(diff.l_2 / cpu_input_norm.l_2
++                 < sqrt(log2(total_length)) * type_epsilon(params.precision))
++         << "L2 test failed. L2: " << diff.l_2
++         << "\tnormalized L2: " << diff.l_2 / cpu_input_norm.l_2
++         << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision)
++         << params.str();
++ }
++
++ // Scope guard that hands CPU reference data back to the cache: when the
++ // object is destroyed, the referenced input/output buffers are swapped into
++ // last_cpu_fft_data. Only references are held, so the referenced vectors
++ // must outlive this object.
++ struct StoreCPUDataToCache
++ {
++     StoreCPUDataToCache(std::vector<hostbuf>& input, std::vector<hostbuf>& output)
++         : cpu_input{input}
++         , cpu_output{output}
++     {
++     }
++
++     ~StoreCPUDataToCache()
++     {
++         // output first, then input
++         cpu_output.swap(last_cpu_fft_data.cpu_output);
++         cpu_input.swap(last_cpu_fft_data.cpu_input);
++     }
++
++     std::vector<hostbuf>& cpu_input;
++     std::vector<hostbuf>& cpu_output;
++ };
++
++// run CPU + rocFFT transform with the given params and compare
++template <class Tfloat, class Tparams>
++inline void fft_vs_reference_impl(Tparams& params, bool round_trip)
++{
++ // Call hipGetLastError to reset any errors
++ // returned by previous HIP runtime API calls.
++ hipError_t hip_status = hipGetLastError();
++
++ // Make sure that the parameters make sense:
++ ASSERT_TRUE(params.valid(verbose));
++
++ size_t needed_ram = needed_ram_buffers(params, verbose);
++
++ if(ramgb > 0 && needed_ram > ramgb * ONE_GiB)
++ {
++ GTEST_SKIP() << "needed_ramgb: " << bytes_to_GiB(needed_ram) << ", ramgb limit: " << ramgb
++ << ".\n";
++ }
++
++ auto ibuffer_sizes = params.ibuffer_sizes();
++ auto obuffer_sizes = params.obuffer_sizes();
++
++ size_t vram_avail = 0;
++
++ if(vramgb == 0)
++ {
++ // Check free and total available memory:
++ size_t free = 0;
++ size_t total = 0;
++ auto hip_status = hipMemGetInfo(&free, &total);
++ if(hip_status != hipSuccess || total == 0)
++ {
++ ++n_hip_failures;
++ std::stringstream ss;
++ if(total == 0)
++ ss << "hipMemGetInfo claims there there isn't any vram";
++ else
++ ss << "hipMemGetInfo failure with error " << hip_status;
++ if(skip_runtime_fails)
++ {
++ GTEST_SKIP() << ss.str();
++ }
++ else
++ {
++ GTEST_FAIL() << ss.str();
++ }
++ }
++ vram_avail = total;
++ }
++ else
++ {
++ vram_avail = vramgb * ONE_GiB;
++ }
++
++ // First try a quick estimation of vram footprint, to speed up skipping tests
++ // that are too large to fit in the gpu (no plan created with the rocFFT backend)
++ const auto raw_vram_footprint
++ = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);
++
++ if(!vram_fits_problem(raw_vram_footprint, vram_avail))
++ {
++ GTEST_SKIP() << "Raw problem size (" << bytes_to_GiB(raw_vram_footprint)
++ << " GiB) raw data too large for device";
++ }
++
++ if(verbose > 2)
++ {
++ std::cout << "Raw problem size: " << raw_vram_footprint << std::endl;
++ }
++
++ // If it passed the quick estimation test, go for the more
++ // accurate calculation that actually creates the plan and
++ // take into account the work buffer size
++ const auto vram_footprint = params.vram_footprint();
++ if(!vram_fits_problem(vram_footprint, vram_avail))
++ {
++ if(verbose)
++ {
++ std::cout << "Problem raw data won't fit on device; skipped." << std::endl;
++ }
++ GTEST_SKIP() << "Problem size (" << bytes_to_GiB(vram_footprint)
++ << " GiB) raw data too large for device";
++ }
++
++ // Create FFT plan - this will also allocate work buffer, but
++ // will throw a specific exception if that step fails
++ auto plan_status = fft_status_success;
++ try
++ {
++ plan_status = params.create_plan();
++ }
++ catch(fft_params::work_buffer_alloc_failure& e)
++ {
++ ++n_hip_failures;
++ std::stringstream ss;
++ ss << "Work buffer allocation failed with size: " << params.workbuffersize;
++ if(skip_runtime_fails)
++ {
++ GTEST_SKIP() << ss.str();
++ }
++ else
++ {
++ GTEST_FAIL() << ss.str();
++ }
++ }
++ ASSERT_EQ(plan_status, fft_status_success) << "plan creation failed";
++
++ if(!vram_fits_problem(vram_footprint, vram_avail))
++ {
++ if(verbose)
++ {
++ std::cout << "Problem won't fit on device; skipped." << std::endl;
++ }
++ GTEST_SKIP() << "Problem size (" << vram_footprint << ") too large for device";
++ return;
++ }
++
++ fft_params contiguous_params;
++ contiguous_params.length = params.length;
++ contiguous_params.precision = params.precision;
++ contiguous_params.placement = fft_placement_notinplace;
++ contiguous_params.transform_type = params.transform_type;
++ contiguous_params.nbatch = params.nbatch;
++ contiguous_params.itype = contiguous_itype(params.transform_type);
++ contiguous_params.otype = contiguous_otype(contiguous_params.transform_type);
++
++ contiguous_params.validate();
++
++ if(!contiguous_params.valid(verbose))
++ {
++ throw std::runtime_error("Invalid contiguous params");
++ }
++
++ if(verbose > 3)
++ {
++ std::cout << "CPU params:\n";
++ std::cout << contiguous_params.str("\n\t") << std::endl;
++ }
++
++ std::vector<gpubuf> ibuffer(ibuffer_sizes.size());
++ std::vector<void*> pibuffer(ibuffer_sizes.size());
++ for(unsigned int i = 0; i < ibuffer.size(); ++i)
++ {
++ hip_status = ibuffer[i].alloc(ibuffer_sizes[i]);
++ if(hip_status != hipSuccess)
++ {
++ std::stringstream ss;
++ ss << "hipMalloc failure for input buffer " << i << " size " << ibuffer_sizes[i] << "("
++ << bytes_to_GiB(ibuffer_sizes[i]) << " GiB)"
++ << " with code " << hipError_to_string(hip_status);
++ ++n_hip_failures;
++ if(skip_runtime_fails)
++ {
++ GTEST_SKIP() << ss.str();
++ }
++ else
++ {
++ GTEST_FAIL() << ss.str();
++ }
++ }
++ pibuffer[i] = ibuffer[i].data();
++ }
++
++ // allocation counts in elements, ibuffer_sizes is in bytes
++ auto ibuffer_sizes_elems = ibuffer_sizes;
++ for(auto& buf : ibuffer_sizes_elems)
++ buf /= var_size<size_t>(params.precision, params.itype);
++
++ // Check cache first - nbatch is a >= comparison because we compute
++ // the largest batch size and cache it. Smaller batch runs can
++ // compare against the larger data.
++ std::vector<hostbuf> cpu_input;
++ std::vector<hostbuf> cpu_output;
++ std::shared_future<void> convert_cpu_output_precision;
++ std::shared_future<void> convert_cpu_input_precision;
++ bool run_fftw = true;
++ std::unique_ptr<StoreCPUDataToCache> store_to_cache;
++ if(fftw_compare && last_cpu_fft_data.length == params.length
++ && last_cpu_fft_data.transform_type == params.transform_type
++ && last_cpu_fft_data.run_callbacks == params.run_callbacks)
++ {
++ if(last_cpu_fft_data.nbatch >= params.nbatch)
++ {
++ // use the cached input/output
++ cpu_input.swap(last_cpu_fft_data.cpu_input);
++ cpu_output.swap(last_cpu_fft_data.cpu_output);
++ run_fftw = false;
++
++ store_to_cache = std::make_unique<StoreCPUDataToCache>(cpu_input, cpu_output);
++
++ if(params.precision != last_cpu_fft_data.precision)
++ {
++ // Tests should be ordered so we do wider first, then narrower.
++ switch(params.precision)
++ {
++ case fft_precision_double:
++ std::cerr
++ << "test ordering is incorrect: double precision follows a narrower one"
++ << std::endl;
++ abort();
++ break;
++ case fft_precision_single:
++ if(last_cpu_fft_data.precision != fft_precision_double)
++ {
++ std::cerr
++ << "test ordering is incorrect: float precision follows a narrower one"
++ << std::endl;
++ abort();
++ }
++ // convert the input/output to single-precision
++ convert_cpu_output_precision = std::async(std::launch::async, [&]() {
++ narrow_precision_inplace<double, float>(cpu_output.front());
++ });
++ convert_cpu_input_precision = std::async(std::launch::async, [&]() {
++ narrow_precision_inplace<double, float>(cpu_input.front());
++ });
++ break;
++ case fft_precision_half:
++ // convert to half precision
++ if(last_cpu_fft_data.precision == fft_precision_double)
++ {
++ convert_cpu_output_precision = std::async(std::launch::async, [&]() {
++ narrow_precision_inplace<double, _Float16>(cpu_output.front());
++ });
++ convert_cpu_input_precision = std::async(std::launch::async, [&]() {
++ narrow_precision_inplace<double, _Float16>(cpu_input.front());
++ });
++ }
++ else if(last_cpu_fft_data.precision == fft_precision_single)
++ {
++ convert_cpu_output_precision = std::async(std::launch::async, [&]() {
++ narrow_precision_inplace<float, _Float16>(cpu_output.front());
++ });
++ convert_cpu_input_precision = std::async(std::launch::async, [&]() {
++ narrow_precision_inplace<float, _Float16>(cpu_input.front());
++ });
++ }
++ else
++ {
++ std::cerr << "unhandled previous precision, cannot convert to half"
++ << std::endl;
++ abort();
++ }
++ break;
++ }
++ last_cpu_fft_data.precision = params.precision;
++ }
++ }
++ // If the last result has a smaller batch than the new
++ // params, that might be a developer error - tests should be
++ // ordered to generate the bigger batch first. But if tests
++ // got filtered or skipped due to insufficient memory, we
++ // might never have tried to generate the bigger batch first.
++ // So just fall through and redo the CPU FFT.
++ }
++ else
++ {
++ // Clear cache explicitly so that even if we didn't get a hit,
++ // we're not uselessly holding on to cached cpu input/output
++ last_cpu_fft_data = last_cpu_fft_cache();
++ }
++
++ // Allocate CPU input
++ if(run_fftw)
++ {
++ cpu_input = allocate_cpu_fft_buffer(
++ contiguous_params.precision, contiguous_params.itype, contiguous_params.isize);
++ }
++
++ // Create FFTW plan - this may write to input, but that's fine
++ // since there's nothing in there right now
++ typename fftw_trait<Tfloat>::fftw_plan_type cpu_plan = nullptr;
++ if(run_fftw)
++ {
++ // Normally, we would want to defer allocation of CPU output
++ // buffer until when we actually do the CPU FFT. But if we're
++ // using FFTW wisdom, FFTW needs an output buffer at plan
++ // creation time.
++ if(use_fftw_wisdom)
++ {
++ cpu_output = allocate_cpu_fft_buffer(
++ contiguous_params.precision, contiguous_params.otype, contiguous_params.osize);
++ }
++ cpu_plan = fftw_plan_via_rocfft<Tfloat>(contiguous_params.length,
++ contiguous_params.istride,
++ contiguous_params.ostride,
++ contiguous_params.nbatch,
++ contiguous_params.idist,
++ contiguous_params.odist,
++ contiguous_params.transform_type,
++ cpu_input,
++ cpu_output);
++
++ needed_ram += needed_ram_fftw<Tfloat>(contiguous_params, cpu_plan, verbose);
++
++ if(ramgb > 0 && needed_ram > ramgb * ONE_GiB)
++ {
++ if(verbose)
++ {
++ std::cout << "Problem exceeds memory limit; skipped [rocfft_transform]."
++ << std::endl;
++ }
++ GTEST_SKIP();
++ return;
++ }
++ }
++
++ std::vector<hostbuf> gpu_input_data;
++
++ // allocate and populate the input buffer (cpu/gpu)
++ if(run_fftw)
++ {
++ gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems);
++
++ //generate the input directly on the gpu
++ params.compute_input(ibuffer);
++
++ // Copy the input to CPU
++ if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride
++ || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize)
++ {
++ // Copy input to CPU
++ for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
++ {
++ hip_status = hipMemcpy(gpu_input_data.at(idx).data(),
++ ibuffer[idx].data(),
++ ibuffer_sizes[idx],
++ hipMemcpyDeviceToHost);
++ if(hip_status != hipSuccess)
++ {
++ ++n_hip_failures;
++ if(skip_runtime_fails)
++ {
++ GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
++ }
++ else
++ {
++ GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
++ }
++ }
++ }
++
++ copy_buffers(gpu_input_data,
++ cpu_input,
++ params.ilength(),
++ params.nbatch,
++ params.precision,
++ params.itype,
++ params.istride,
++ params.idist,
++ contiguous_params.itype,
++ contiguous_params.istride,
++ contiguous_params.idist,
++ params.ioffset,
++ contiguous_params.ioffset);
++ }
++ else
++ {
++ // Copy input to CPU
++ for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
++ {
++ hip_status = hipMemcpy(cpu_input.at(idx).data(),
++ ibuffer[idx].data(),
++ ibuffer_sizes[idx],
++ hipMemcpyDeviceToHost);
++ if(hip_status != hipSuccess)
++ {
++ ++n_hip_failures;
++ if(skip_runtime_fails)
++ {
++ GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
++ }
++ else
++ {
++ GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
++ }
++ }
++ }
++ }
++ }
++ else if(fftw_compare)
++ {
++ gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems);
++
++ // In case the cached cpu input needed conversion, wait for it
++ if(convert_cpu_input_precision.valid())
++ convert_cpu_input_precision.get();
++
++ // gets a pre-computed gpu input buffer from the cpu cache
++ std::vector<hostbuf>* gpu_input = &cpu_input;
++
++ if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride
++ || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize)
++ {
++ copy_buffers(cpu_input,
++ gpu_input_data,
++ params.ilength(),
++ params.nbatch,
++ params.precision,
++ contiguous_params.itype,
++ contiguous_params.istride,
++ contiguous_params.idist,
++ params.itype,
++ params.istride,
++ params.idist,
++ {0},
++ params.ioffset);
++ gpu_input = &gpu_input_data;
++ }
++
++ // Copy input to GPU
++ for(unsigned int idx = 0; idx < gpu_input->size(); ++idx)
++ {
++ hip_status = hipMemcpy(ibuffer[idx].data(),
++ gpu_input->at(idx).data(),
++ ibuffer_sizes[idx],
++ hipMemcpyHostToDevice);
++
++ if(hip_status != hipSuccess)
++ {
++ ++n_hip_failures;
++ if(skip_runtime_fails)
++ {
++ GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
++ }
++ else
++ {
++ GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
++ }
++ }
++ }
++ }
++
++ if(verbose > 3)
++ {
++ std::cout << "CPU input:\n";
++ contiguous_params.print_ibuffer(cpu_input);
++ }
++
++ // compute input norm
++ std::shared_future<VectorNorms> cpu_input_norm;
++ if(fftw_compare)
++ cpu_input_norm = std::async(std::launch::async, [&]() {
++ // in case the cached cpu input needed conversion, wait for it
++ if(convert_cpu_input_precision.valid())
++ convert_cpu_input_precision.get();
++
++ auto input_norm = norm(cpu_input,
++ contiguous_params.ilength(),
++ contiguous_params.nbatch,
++ contiguous_params.precision,
++ contiguous_params.itype,
++ contiguous_params.istride,
++ contiguous_params.idist,
++ contiguous_params.ioffset);
++ if(verbose > 2)
++ {
++ std::cout << "CPU Input Linf norm: " << input_norm.l_inf << "\n";
++ std::cout << "CPU Input L2 norm: " << input_norm.l_2 << "\n";
++ }
++ return input_norm;
++ });
++
++ std::vector<gpubuf> obuffer_data;
++ std::vector<gpubuf>* obuffer = &obuffer_data;
++ std::vector<void*> pobuffer;
++
++ // allocate the output buffer
++
++ if(params.placement == fft_placement_inplace)
++ {
++ obuffer = &ibuffer;
++ }
++ else
++ {
++ auto obuffer_sizes = params.obuffer_sizes();
++ obuffer_data.resize(obuffer_sizes.size());
++ for(unsigned int i = 0; i < obuffer_data.size(); ++i)
++ {
++ hip_status = obuffer_data[i].alloc(obuffer_sizes[i]);
++ if(hip_status != hipSuccess)
++ {
++ ++n_hip_failures;
++ std::stringstream ss;
++ ss << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i]
++ << "(" << bytes_to_GiB(obuffer_sizes[i]) << " GiB)"
++ << " with code " << hipError_to_string(hip_status);
++ if(skip_runtime_fails)
++ {
++ GTEST_SKIP() << ss.str();
++ }
++ else
++ {
++ GTEST_FAIL() << ss.str();
++ }
++ }
++
++ // If we're validating output strides, init the
++ // output buffer to a known pattern and we can check
++ // that the pattern is untouched in places that
++ // shouldn't have been touched.
++ if(params.check_output_strides)
++ {
++ hip_status
++ = hipMemset(obuffer_data[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]);
++ if(hip_status != hipSuccess)
++ {
++ ++n_hip_failures;
++ if(skip_runtime_fails)
++ {
++ GTEST_SKIP() << "hipMemset failure with error " << hip_status;
++ }
++ else
++ {
++ GTEST_FAIL() << "hipMemset failure with error " << hip_status;
++ }
++ }
++ }
++ }
++ }
++ pobuffer.resize(obuffer->size());
++ for(unsigned int i = 0; i < obuffer->size(); ++i)
++ {
++ pobuffer[i] = obuffer->at(i).data();
++ }
++
++ // Run CPU transform
++ //
++ // NOTE: This must happen after input is copied to GPU and input
++ // norm is computed, since the CPU FFT may overwrite the input.
++ VectorNorms cpu_output_norm;
++ std::shared_future<void> cpu_fft;
++ if(fftw_compare)
++ cpu_fft = std::async(std::launch::async, [&]() {
++ // wait for input norm to finish, since we might overwrite input
++ cpu_input_norm.get();
++
++ if(run_fftw)
++ execute_cpu_fft<Tfloat>(params, contiguous_params, cpu_plan, cpu_input, cpu_output);
++ // in case the cached cpu output needed conversion, wait for it
++ else if(convert_cpu_output_precision.valid())
++ convert_cpu_output_precision.get();
++
++ if(verbose > 3)
++ {
++ std::cout << "CPU output:\n";
++ contiguous_params.print_obuffer(cpu_output);
++ }
++
++ cpu_output_norm = norm(cpu_output,
++ params.olength(),
++ params.nbatch,
++ params.precision,
++ contiguous_params.otype,
++ contiguous_params.ostride,
++ contiguous_params.odist,
++ contiguous_params.ooffset);
++ if(verbose > 2)
++ {
++ std::cout << "CPU Output Linf norm: " << cpu_output_norm.l_inf << "\n";
++ std::cout << "CPU Output L2 norm: " << cpu_output_norm.l_2 << "\n";
++ }
++ });
++
++ // scatter data out to multi-GPUs if this is a multi-GPU test
++ params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer);
++
++ // execute GPU transform
++ std::vector<hostbuf> gpu_output
++ = allocate_host_buffer(params.precision, params.otype, params.osize);
++
++ execute_gpu_fft(params, pibuffer, pobuffer, *obuffer, gpu_output);
++
++ params.free();
++
++ if(params.check_output_strides)
++ {
++ check_output_strides<Tparams>(gpu_output, params);
++ }
++
++ // compute GPU output norm
++ std::shared_future<VectorNorms> gpu_norm;
++ if(fftw_compare)
++ gpu_norm = std::async(std::launch::async, [&]() {
++ return norm(gpu_output,
++ params.olength(),
++ params.nbatch,
++ params.precision,
++ params.otype,
++ params.ostride,
++ params.odist,
++ params.ooffset);
++ });
++
++ // compare output
++ //
++ // Compute the l-infinity and l-2 distance between the CPU and GPU output:
++ // wait for cpu FFT so we can compute cutoff
++
++ const auto total_length = std::accumulate(params.length.begin(),
++ params.length.end(),
++ static_cast<size_t>(1),
++ std::multiplies<size_t>());
++
++ std::unique_ptr<std::vector<std::pair<size_t, size_t>>> linf_failures;
++ if(verbose > 1)
++ linf_failures = std::make_unique<std::vector<std::pair<size_t, size_t>>>();
++ double linf_cutoff;
++ VectorNorms diff;
++
++ std::shared_future<void> compare_output;
++ if(fftw_compare)
++ compare_output = std::async(std::launch::async, [&]() {
++ cpu_fft.get();
++ linf_cutoff
++ = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(total_length);
++
++ diff = distance(cpu_output,
++ gpu_output,
++ params.olength(),
++ params.nbatch,
++ params.precision,
++ contiguous_params.otype,
++ contiguous_params.ostride,
++ contiguous_params.odist,
++ params.otype,
++ params.ostride,
++ params.odist,
++ linf_failures.get(),
++ linf_cutoff,
++ {0},
++ params.ooffset);
++ });
++
++ // Update the cache if this current transform is different from
++ // what's stored. But if this transform only has a smaller batch
++ // than what's cached, we can still keep the cache around since
++ // the input/output we already have is still valid.
++ const bool update_last_cpu_fft_data
++ = last_cpu_fft_data.length != params.length
++ || last_cpu_fft_data.transform_type != params.transform_type
++ || last_cpu_fft_data.run_callbacks != params.run_callbacks
++ || last_cpu_fft_data.precision != params.precision
++ || params.nbatch > last_cpu_fft_data.nbatch;
++
++ // store cpu output in cache
++ if(update_last_cpu_fft_data)
++ {
++ last_cpu_fft_data.length = params.length;
++ last_cpu_fft_data.nbatch = params.nbatch;
++ last_cpu_fft_data.transform_type = params.transform_type;
++ last_cpu_fft_data.run_callbacks = params.run_callbacks;
++ last_cpu_fft_data.precision = params.precision;
++ }
++
++ if(compare_output.valid())
++ compare_output.get();
++
++ if(!store_to_cache)
++ store_to_cache = std::make_unique<StoreCPUDataToCache>(cpu_input, cpu_output);
++
++ Tparams params_inverse;
++
++ if(round_trip)
++ {
++ params_inverse.inverse_from_forward(params);
++
++ run_round_trip_inverse<Tparams>(
++ params_inverse, ibuffer, pobuffer, pibuffer, gpu_input_data);
++ }
++
++ if(fftw_compare)
++ {
++ ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_2));
++ ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_inf));
++
++ ASSERT_TRUE(std::isfinite(cpu_output_norm.l_2));
++ ASSERT_TRUE(std::isfinite(cpu_output_norm.l_inf));
++
++ if(verbose > 1)
++ {
++ std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n";
++ std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n";
++ std::cout << "GPU linf norm failures:";
++ std::sort(linf_failures->begin(), linf_failures->end());
++ for(const auto& i : *linf_failures)
++ {
++ std::cout << " (" << i.first << "," << i.second << ")";
++ }
++ std::cout << std::endl;
++ }
++
++ EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str();
++ EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str();
++ }
++
++ switch(params.precision)
++ {
++ case fft_precision_half:
++ max_linf_eps_half
++ = std::max(max_linf_eps_half, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
++ max_l2_eps_half
++ = std::max(max_l2_eps_half, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length)));
++ break;
++ case fft_precision_single:
++ max_linf_eps_single
++ = std::max(max_linf_eps_single, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
++ max_l2_eps_single = std::max(max_l2_eps_single,
++ diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length)));
++ break;
++ case fft_precision_double:
++ max_linf_eps_double
++ = std::max(max_linf_eps_double, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
++ max_l2_eps_double = std::max(max_l2_eps_double,
++ diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length)));
++ break;
++ }
++
++ if(verbose > 1)
++ {
++ std::cout << "L2 diff: " << diff.l_2 << "\n";
++ std::cout << "Linf diff: " << diff.l_inf << "\n";
++ }
++
++ if(fftw_compare)
++ {
++ EXPECT_TRUE(diff.l_inf <= linf_cutoff)
++ << "Linf test failed. Linf:" << diff.l_inf
++ << "\tnormalized Linf: " << diff.l_inf / cpu_output_norm.l_inf
++ << "\tcutoff: " << linf_cutoff << params.str();
++
++ EXPECT_TRUE(diff.l_2 / cpu_output_norm.l_2
++ < sqrt(log2(total_length)) * type_epsilon(params.precision))
++ << "L2 test failed. L2: " << diff.l_2
++ << "\tnormalized L2: " << diff.l_2 / cpu_output_norm.l_2
++ << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision)
++ << params.str();
++ }
++
++ if(round_trip && fftw_compare)
++ {
++ compare_round_trip_inverse<Tparams>(params_inverse,
++ contiguous_params,
++ gpu_input_data,
++ cpu_input,
++ cpu_input_norm.get(),
++ total_length);
++ }
++}
++
++#endif
+diff --git a/shared/arithmetic.h b/shared/arithmetic.h
+new file mode 100644
+index 0000000..774d342
+--- /dev/null
++++ b/shared/arithmetic.h
+@@ -0,0 +1,61 @@
++/******************************************************************************
++* Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a copy
++* of this software and associated documentation files (the "Software"), to deal
++* in the Software without restriction, including without limitation the rights
++* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++* copies of the Software, and to permit persons to whom the Software is
++* furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice shall be included in
++* all copies or substantial portions of the Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++* THE SOFTWARE.
++*******************************************************************************/
++
++#pragma once
++
++#include <numeric>
++#include <stddef.h>
++
++// arithmetic helper functions
++
++static inline bool IsPo2(size_t u) // true iff u is a power of two (exactly one bit set)
++{
++    return !((u == 0) || ((u & (u - 1)) != 0));
++}
++
++// Returns the exponent t of the smallest power of two with 2^t >= n,
++// e.g. CeilPo2(7) == 3 and CeilPo2(8) == 3; CeilPo2(0) and CeilPo2(1) are 0.
++// Counting the bits of n - 1, rather than doubling a probe value until it
++// reaches n, keeps the loop finite for every n: above the largest power of
++// two that fits in size_t the result is simply the bit width of size_t.
++static inline size_t CeilPo2(size_t n)
++{
++    size_t t = 0;
++    for(size_t rest = (n > 1) ? n - 1 : 0; rest != 0; rest >>= 1)
++    {
++        ++t;
++    }
++    return t;
++}
++
++template <typename T> // (a + b - 1) / b: a / b rounded up for positive integers
++static inline T DivRoundingUp(T a, T b)
++{
++    return ((b - 1) + a) / b;
++}
++
++template <typename Titer> // product of the elements in [begin, end); 1 when empty
++typename Titer::value_type product(Titer begin, Titer end)
++{
++    using T = typename Titer::value_type; // lambda: no <functional> needed for the multiply
++    return std::accumulate(begin, end, T(1), [](const T& a, const T& b) -> T { return a * b; });
++}
+diff --git a/shared/array_predicate.h b/shared/array_predicate.h
+new file mode 100644
+index 0000000..92e45b4
+--- /dev/null
++++ b/shared/array_predicate.h
+@@ -0,0 +1,47 @@
++// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_ARRAY_PREDICATE_H
++#define ROCFFT_ARRAY_PREDICATE_H
++
++#include "rocfft/rocfft.h"
++
++namespace
++{
++    // True when type is one of the two *_interleaved array types.
++    bool array_type_is_interleaved(rocfft_array_type type)
++    {
++        return type == rocfft_array_type_hermitian_interleaved
++               || type == rocfft_array_type_complex_interleaved;
++    }
++    // True when type is one of the two *_planar array types.
++    bool array_type_is_planar(rocfft_array_type type)
++    {
++        return type == rocfft_array_type_hermitian_planar
++               || type == rocfft_array_type_complex_planar;
++    }
++    // True for the complex and hermitian array types, interleaved or planar.
++    bool array_type_is_complex(rocfft_array_type type)
++    {
++        return array_type_is_interleaved(type) || array_type_is_planar(type);
++    }
++}
++
++#endif
+diff --git a/shared/array_validator.cpp b/shared/array_validator.cpp
+new file mode 100644
+index 0000000..70abb08
+--- /dev/null
++++ b/shared/array_validator.cpp
+@@ -0,0 +1,549 @@
++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#include <iostream>
++#include <numeric>
++#include <unordered_set>
++
++#include "array_validator.h"
++#include "increment.h"
++
++// Check a 2D array for collisions: equal strides are rejected outright, and
++// otherwise the axes collide only when both extents reach lcm(s0, s1).
++bool valid_length_stride_2d(const size_t l0, const size_t l1, const size_t s0, const size_t s1)
++{
++    const bool same_stride = (s0 == s1);
++    const size_t meet = same_stride ? s0 : std::lcm(s0, s1);
++    const bool both_reach = (s0 * (l0 - 1) >= meet) && (s1 * (l1 - 1) >= meet);
++    return !same_stride && !both_reach;
++}
++
++// Compare a 1D direction with a multi-index hyperface for collisions.
++// Axis idx of (l, s) is the 1D direction (l0, s0); the other axes form (l1, s1).
++// Returns false when an offset i * s0, 0 < i < l0, is also hit by (l1, s1).
++bool valid_length_stride_1d_multi(const unsigned int idx,
++                                  const std::vector<size_t> l,
++                                  const std::vector<size_t> s,
++                                  const int verbose)
++{
++    size_t l0{0}, s0{0};
++    std::vector<size_t> l1{}, s1{};
++    for(unsigned int i = 0; i < l.size(); ++i)
++    {
++        if(i == idx)
++        {
++            l0 = l[i];
++            s0 = s[i];
++        }
++        else
++        {
++            l1.push_back(l[i]);
++            s1.push_back(s[i]);
++        }
++    }
++
++    // Shared formatter for the diagnostics: the label, then " value" per entry.
++    const auto print_values = [](const char* label, const auto& values) {
++        std::cout << label;
++        for(const auto value : values)
++            std::cout << " " << value;
++    };
++    if(verbose > 4)
++        std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl;
++
++    // A zero stride would make the modulo below divide by zero; valid only if l0 <= 1.
++    if(s0 == 0)
++        return l0 <= 1;
++
++    // We only need to go to the maximum pointer offset for (l1,s1), which is
++    // reached at the last index of every axis: the sum of (l1[k] - 1) * s1[k].
++    size_t max_offset = 0;
++    for(size_t k = 0; k < l1.size(); ++k)
++        if(l1[k] > 0)
++            max_offset += (l1[k] - 1) * s1[k];
++    std::unordered_set<size_t> a0{};
++    for(size_t i = 1; i < l0; ++i)
++    {
++        const auto val = i * s0;
++        if(val <= max_offset)
++            a0.insert(val);
++        else
++            break;
++    }
++
++    if(verbose > 5)
++    {
++        print_values("a0:", a0);
++        std::cout << std::endl;
++        print_values("l1:", l1);
++        std::cout << std::endl;
++        print_values("s1:", s1);
++        std::cout << std::endl;
++    }
++
++    // TODO: this can be multi-threaded, since find(...) is thread-safe.
++    std::vector<size_t> index(l1.size(), 0);
++    do
++    {
++        // Keep the offset in size_t: narrowing it to int corrupts large offsets.
++        const size_t i = std::inner_product(index.begin(), index.end(), s1.begin(), (size_t)0);
++        if(i > 0 && (i % s0 == 0))
++        {
++            // TODO: use an ordered set and binary search
++            if(verbose > 6)
++                std::cout << i << std::endl;
++            if(a0.find(i) != a0.end())
++            {
++                if(verbose > 4)
++                {
++                    std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl;
++                    print_values("l1:", l1);
++                    print_values(" s1:", s1);
++                    std::cout << std::endl;
++                    std::cout << "Found duplicate: " << i << std::endl;
++                }
++                return false;
++            }
++        }
++    } while(increment_rowmajor(index, l1));
++
++    return true;
++}
++
++// Compare a hyperface with another hyperface for collisions.
++// Returns false when a non-zero offset generated by the multi-indices of
++// (l0, s0) is also generated by the multi-indices of (l1, s1), true otherwise.
++bool valid_length_stride_multi_multi(const std::vector<size_t> l0,
++                                     const std::vector<size_t> s0,
++                                     const std::vector<size_t> l1,
++                                     const std::vector<size_t> s1)
++{
++    // Largest offset (l1, s1) can produce: the last index on every axis.
++    size_t max_offset = 0;
++    for(size_t k = 0; k < l1.size(); ++k)
++    {
++        if(l1[k] > 0)
++            max_offset += (l1[k] - 1) * s1[k];
++    }
++
++    // Record the (l0, s0) offsets that (l1, s1) could possibly reach: anything
++    // beyond max_offset can never match, and offset 0 is skipped below anyway.
++    std::unordered_set<size_t> a0{};
++    std::vector<size_t> index0(l0.size(), 0);
++    do
++    {
++        const auto i = std::inner_product(index0.begin(), index0.end(), s0.begin(), (size_t)0);
++        if(i > 0 && i <= max_offset)
++            a0.insert(i);
++    } while(increment_rowmajor(index0, l0));
++
++    // TODO: use an ordered set and binary search
++    std::vector<size_t> index1(l1.size(), 0);
++    do
++    {
++        const auto i = std::inner_product(index1.begin(), index1.end(), s1.begin(), (size_t)0);
++        if(i > 0 && a0.find(i) != a0.end())
++            return false;
++    } while(increment_rowmajor(index1, l1));
++
++    return true;
++}
++
++// Check a 3D (length, stride) layout: the three 2D faces first, then each axis
++// against the face of the other two. Returns true when nothing collides.
++bool valid_length_stride_3d(const std::vector<size_t>& l,
++                            const std::vector<size_t>& s,
++                            const int verbose)
++{
++    for(int idx0 = 0; idx0 < 2; ++idx0)
++        for(int idx1 = idx0 + 1; idx1 < 3; ++idx1)
++            if(!valid_length_stride_2d(l[idx0], l[idx1], s[idx0], s[idx1]))
++                return false;
++    // If the 2D faces are valid, check an axis vs a face for collisions:
++    bool invalid = false;
++#ifdef _OPENMP
++#pragma omp parallel for
++#endif
++    for(int idx = 0; idx < 3; ++idx)
++    {
++        if(!valid_length_stride_1d_multi(idx, l, s, verbose))
++        {
++            // Flag first: with cancellation enabled the thread leaves at the pragma.
++#ifdef _OPENMP
++#pragma omp atomic write
++#endif
++            invalid = true;
++#ifdef _OPENMP
++#pragma omp cancel for
++#endif
++        }
++    }
++    return !invalid;
++}
++
++// Check a 4D (length, stride) layout for index collisions.
++// Validates, in order: all six 2D faces, each axis against the 3D face formed
++// by the other three axes, and every split of the axes into two 2D faces.
++// Returns true when none of those checks reports a collision; throws
++// std::runtime_error when l does not hold exactly four lengths.
++bool valid_length_stride_4d(const std::vector<size_t>& l,
++                            const std::vector<size_t>& s,
++                            const int verbose)
++{
++    if(l.size() != 4)
++    {
++        throw std::runtime_error("Incorrect dimensions for valid_length_stride_4d");
++    }
++
++    // Check that 2D faces are valid:
++    for(int idx0 = 0; idx0 < 3; ++idx0)
++    {
++        for(int idx1 = idx0 + 1; idx1 < 4; ++idx1)
++        {
++            if(!valid_length_stride_2d(l[idx0], l[idx1], s[idx0], s[idx1]))
++                return false;
++        }
++    }
++
++    bool invalid = false;
++    // Check that 1D vs 3D faces are valid:
++#ifdef _OPENMP
++#pragma omp parallel for
++#endif
++    for(int idx0 = 0; idx0 < 4; ++idx0)
++    {
++        if(!valid_length_stride_1d_multi(idx0, l, s, verbose))
++        {
++            // Raise the flag before cancelling: with cancellation enabled the
++            // encountering thread leaves the loop at the cancel pragma.
++#ifdef _OPENMP
++#pragma omp atomic write
++#endif
++            invalid = true;
++#ifdef _OPENMP
++#pragma omp cancel for
++#endif
++        }
++    }
++    if(invalid)
++        return false;
++
++    // Check that 2D vs 2D faces are valid:
++
++    // Diagnostic formatter: the label, " value" per entry, then a newline.
++    const auto print_row = [](const char* label, const std::vector<size_t>& values) {
++        std::cout << label;
++        for(const auto value : values)
++        {
++            std::cout << " " << value;
++        }
++        std::cout << "\n";
++    };
++
++    // First, get all the permutations
++    std::vector<std::vector<size_t>> perms;
++    std::vector<size_t> v(l.size());
++    std::fill(v.begin(), v.begin() + 2, 0);
++    std::fill(v.begin() + 2, v.end(), 1);
++    do
++    {
++        perms.push_back(v);
++        if(verbose > 3)
++        {
++            print_row("v:", v);
++        }
++    } while(std::next_permutation(v.begin(), v.end()));
++
++    // Then loop over all of the permutations.
++#ifdef _OPENMP
++#pragma omp parallel for
++#endif
++    for(size_t iperm = 0; iperm < perms.size(); ++iperm)
++    {
++        // Start from empty vectors: pre-sizing them and then appending would
++        // leave two leading zero entries in each face.
++        std::vector<size_t> l0;
++        std::vector<size_t> s0;
++        std::vector<size_t> l1;
++        std::vector<size_t> s1;
++        for(size_t i = 0; i < l.size(); ++i)
++        {
++            if(perms[iperm][i] == 0)
++            {
++                l0.push_back(l[i]);
++                s0.push_back(s[i]);
++            }
++            else
++            {
++                l1.push_back(l[i]);
++                s1.push_back(s[i]);
++            }
++        }
++
++        if(verbose > 3)
++        {
++            print_row("\tl0:", l0);
++            print_row("\ts0:", s0);
++            print_row("\tl1:", l1);
++            print_row("\ts1:", s1);
++        }
++
++        if(!valid_length_stride_multi_multi(l0, s0, l1, s1))
++        {
++#ifdef _OPENMP
++#pragma omp atomic write
++#endif
++            invalid = true;
++#ifdef _OPENMP
++#pragma omp cancel for
++#endif
++        }
++    }
++    if(invalid)
++        return false;
++
++    return true;
++}
++
++// Check a (length, stride) layout of arbitrary dimension d for index collisions.
++//
++// Every (d-1)-dimensional hyperface is validated through array_valid, then each
++// axis is compared with the face of all other axes, and finally every split of
++// the axes into groups of dim0 and dim1 = d - dim0 axes (2 <= dim0 <= d / 2) is
++// compared face against face. Returns true when no check reports a collision.
++bool valid_length_stride_generald(const std::vector<size_t> l,
++                                  const std::vector<size_t> s,
++                                  const int verbose)
++{
++    if(verbose > 2)
++    {
++        std::cout << "checking dimension " << l.size() << std::endl;
++    }
++
++    // Diagnostic formatter: the label, " value" per entry, then a newline.
++    const auto print_row = [](const char* label, const std::vector<size_t>& values) {
++        std::cout << label;
++        for(const auto value : values)
++        {
++            std::cout << " " << value;
++        }
++        std::cout << "\n";
++    };
++
++    // Recurse on d-1 hyper-faces:
++    for(unsigned int idx = 0; idx < l.size(); ++idx)
++    {
++        std::vector<size_t> l0{};
++        std::vector<size_t> s0{};
++        for(size_t i = 0; i < l.size(); ++i)
++        {
++            if(i != idx)
++            {
++                l0.push_back(l[i]);
++                s0.push_back(s[i]);
++            }
++        }
++        if(!array_valid(l0, s0, verbose))
++            return false;
++    }
++
++    // Handle the 1D vs (N-1) case:
++    for(unsigned int idx = 0; idx < l.size(); ++idx)
++    {
++        if(!valid_length_stride_1d_multi(idx, l, s, verbose))
++            return false;
++    }
++
++    for(size_t dim0 = 2; dim0 <= l.size() / 2; ++dim0)
++    {
++        const size_t dim1 = l.size() - dim0;
++        if(verbose > 2)
++            std::cout << "dims: " << dim0 << " " << dim1 << std::endl;
++
++        // We iterate over all permutations of an array of length l.size() which contains dim0 zeros
++        // and dim1 ones. We start with {0, ..., 0, 1, ... 1} to guarantee that we hit all the
++        // possibilities.
++
++        // First, get all the permutations
++        std::vector<std::vector<size_t>> perms;
++        std::vector<size_t> v(l.size());
++        std::fill(v.begin(), v.begin() + dim0, 0);
++        std::fill(v.begin() + dim0, v.end(), 1);
++        do
++        {
++            perms.push_back(v);
++            if(verbose > 3)
++            {
++                print_row("v:", v);
++            }
++        } while(std::next_permutation(v.begin(), v.end()));
++
++        bool invalid = false;
++        // Then loop over all of the permutations.
++#ifdef _OPENMP
++#pragma omp parallel for
++#endif
++        for(size_t iperm = 0; iperm < perms.size(); ++iperm)
++        {
++            // Start from empty vectors: pre-sizing them and then appending
++            // would leave leading zero entries in each face.
++            std::vector<size_t> l0;
++            std::vector<size_t> s0;
++            std::vector<size_t> l1;
++            std::vector<size_t> s1;
++
++            // Split by this iteration's permutation; v itself is back at its
++            // first arrangement once the enumeration above has finished.
++            const auto& mask = perms[iperm];
++            for(size_t i = 0; i < l.size(); ++i)
++            {
++                if(mask[i] == 0)
++                {
++                    l0.push_back(l[i]);
++                    s0.push_back(s[i]);
++                }
++                else
++                {
++                    l1.push_back(l[i]);
++                    s1.push_back(s[i]);
++                }
++            }
++
++            if(verbose > 3)
++            {
++                print_row("\tl0:", l0);
++                print_row("\ts0:", s0);
++                print_row("\tl1:", l1);
++                print_row("\ts1:", s1);
++            }
++
++            if(!valid_length_stride_multi_multi(l0, s0, l1, s1))
++            {
++                // Raise the flag before cancelling: with cancellation enabled the
++                // encountering thread leaves the loop at the cancel pragma.
++#ifdef _OPENMP
++#pragma omp atomic write
++#endif
++                invalid = true;
++#ifdef _OPENMP
++#pragma omp cancel for
++#endif
++            }
++        }
++        if(invalid)
++            return false;
++    }
++
++    return true;
++}
++
++bool sort_by_stride(const std::pair<size_t, size_t>& ls0, const std::pair<size_t, size_t>& ls1)
++{
++ return ls0.second < ls1.second;
++}
++
++bool array_valid(const std::vector<size_t>& length,
++ const std::vector<size_t>& stride,
++ const int verbose)
++{
++ if(length.size() != stride.size())
++ return false;
++
++ // If a length is 1, then the stride is irrelevant.
++ // If a length is > 1, then the corresponding stride must be nonzero.
++ std::vector<size_t> l{}, s{};
++ for(unsigned int i = 0; i < length.size(); ++i)
++ {
++ if(length[i] > 1)
++ {
++ if(stride[i] == 0)
++ return false;
++ l.push_back(length[i]);
++ s.push_back(stride[i]);
++ }
++ }
++
++ if(length.size() > 1)
++ {
++ // Check happy path.
++ bool happy_path = true;
++ std::vector<std::pair<size_t, size_t>> ls;
++ for(size_t idx = 0; idx < length.size(); ++idx)
++ {
++ ls.push_back(std::pair(length[idx], stride[idx]));
++ }
++ std::sort(ls.begin(), ls.end(), sort_by_stride);
++
++ if(verbose > 2)
++ {
++ for(size_t idx = 0; idx < ls.size(); ++idx)
++ {
++ std::cout << ls[idx].first << "\t" << ls[idx].second << "\n";
++ }
++ }
++
++ for(size_t idx = 1; idx < ls.size(); ++idx)
++ {
++ if(ls[idx].second < ls[idx - 1].first * ls[idx - 1].second)
++ {
++ happy_path = false;
++ break;
++ }
++ }
++ if(happy_path)
++ {
++ if(verbose > 2)
++ {
++ std::cout << "happy path\n";
++ }
++ return true;
++ }
++ }
++
++ switch(l.size())
++ {
++ case 0:
++ return true;
++ break;
++ case 1:
++ return s[0] != 0;
++ break;
++ case 2:
++ {
++ return valid_length_stride_2d(l[0], l[1], s[0], s[1]);
++ break;
++ }
++ case 3:
++ {
++ return valid_length_stride_3d(l, s, verbose);
++ break;
++ }
++ case 4:
++ {
++ return valid_length_stride_4d(l, s, verbose);
++ break;
++ }
++ default:
++ return valid_length_stride_generald(l, s, verbose);
++ return true;
++ }
++
++ return true;
++}
+diff --git a/shared/array_validator.h b/shared/array_validator.h
+new file mode 100644
+index 0000000..ce85173
+--- /dev/null
++++ b/shared/array_validator.h
+@@ -0,0 +1,31 @@
++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ARRAY_VALIDATOR_H
++#define ARRAY_VALIDATOR_H
++
++#include <vector>
++
++// Returns true if the array with the given lengths and strides is valid, i.e. has no multi-index collisions.
++bool array_valid(const std::vector<size_t>& length,
++ const std::vector<size_t>& stride,
++ const int verbose = 0);
++
++#endif
+diff --git a/shared/concurrency.h b/shared/concurrency.h
+new file mode 100644
+index 0000000..a36c7c1
+--- /dev/null
++++ b/shared/concurrency.h
+@@ -0,0 +1,41 @@
++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++
++#include <thread>
++
++#ifndef WIN32
++#include <sched.h>
++#endif
++
++// Work out how many parallel tasks to run, based on available
++// resources. On non-Windows builds, this looks at the CPU affinity mask
++// (if it can be queried), which might be restricted in a container.
++// Otherwise, return std::thread::hardware_concurrency().
++static unsigned int rocfft_concurrency()
++{
++#ifndef WIN32
++ cpu_set_t cpuset;
++ if(sched_getaffinity(0, sizeof(cpuset), &cpuset) == 0)
++ return CPU_COUNT(&cpuset);
++#endif
++ return std::thread::hardware_concurrency();
++}
+diff --git a/shared/data_gen_device.h b/shared/data_gen_device.h
+new file mode 100644
+index 0000000..77fb012
+--- /dev/null
++++ b/shared/data_gen_device.h
+@@ -0,0 +1,1303 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef DATA_GEN_DEVICE_H
++#define DATA_GEN_DEVICE_H
++
++// rocRAND can generate warnings if inline asm is not available for
++// some architectures. Data generation isn't performance-critical,
++// so just disable inline asm to prevent the warnings.
++#define ROCRAND_DISABLE_INLINE_ASM
++
++#include "../shared/arithmetic.h"
++#include "../shared/device_properties.h"
++#include "../shared/gpubuf.h"
++#include "../shared/increment.h"
++#include "../shared/rocfft_complex.h"
++#include <hip/hip_runtime.h>
++#include <hip/hip_runtime_api.h>
++#include <hiprand/hiprand.h>
++#include <hiprand/hiprand_kernel.h>
++#include <limits>
++#include <vector>
++
++static const unsigned int DATA_GEN_THREADS = 8;
++static const unsigned int DATA_GEN_GRID_Y_MAX = 64;
++
++template <typename T>
++struct input_val_1D
++{
++ T val1;
++};
++
++template <typename T>
++struct input_val_2D
++{
++ T val1;
++ T val2;
++};
++
++template <typename T>
++struct input_val_3D
++{
++ T val1;
++ T val2;
++ T val3;
++};
++
++template <typename T>
++static input_val_1D<T> get_input_val(const T& val)
++{
++ return input_val_1D<T>{val};
++}
++
++template <typename T>
++static input_val_2D<T> get_input_val(const std::tuple<T, T>& val)
++{
++ return input_val_2D<T>{std::get<0>(val), std::get<1>(val)};
++}
++
++template <typename T>
++static input_val_3D<T> get_input_val(const std::tuple<T, T, T>& val)
++{
++ return input_val_3D<T>{std::get<0>(val), std::get<1>(val), std::get<2>(val)};
++}
++
++template <typename T>
++__device__ static size_t
++ compute_index(const input_val_1D<T>& length, const input_val_1D<T>& stride, size_t base)
++{
++ return (length.val1 * stride.val1) + base;
++}
++
++template <typename T>
++__device__ static size_t
++ compute_index(const input_val_2D<T>& length, const input_val_2D<T>& stride, size_t base)
++{
++ return (length.val1 * stride.val1) + (length.val2 * stride.val2) + base;
++}
++
++template <typename T>
++__device__ static size_t
++ compute_index(const input_val_3D<T>& length, const input_val_3D<T>& stride, size_t base)
++{
++ return (length.val1 * stride.val1) + (length.val2 * stride.val2) + (length.val3 * stride.val3)
++ + base;
++}
++
++template <typename T>
++static inline input_val_1D<T> make_zero_length(const input_val_1D<T>& whole_length)
++{
++ return input_val_1D<T>{0};
++}
++
++template <typename T>
++static inline input_val_2D<T> make_zero_length(const input_val_2D<T>& whole_length)
++{
++ return input_val_2D<T>{0, 0};
++}
++
++template <typename T>
++static inline input_val_3D<T> make_zero_length(const input_val_3D<T>& whole_length)
++{
++ return input_val_3D<T>{0, 0, 0};
++}
++
++template <typename T>
++static inline input_val_1D<T> make_unit_stride(const input_val_1D<T>& whole_length)
++{
++ return input_val_1D<T>{1};
++}
++
++template <typename T>
++static inline input_val_2D<T> make_unit_stride(const input_val_2D<T>& whole_length)
++{
++ return input_val_2D<T>{1, whole_length.val1};
++}
++
++template <typename T>
++static inline input_val_3D<T> make_unit_stride(const input_val_3D<T>& whole_length)
++{
++ return input_val_3D<T>{1, whole_length.val1, whole_length.val1 * whole_length.val2};
++}
++
++template <typename T>
++__device__ static input_val_1D<T> get_length(const size_t i, const input_val_1D<T>& whole_length)
++{
++ auto xlen = whole_length.val1;
++
++ auto xidx = i % xlen;
++
++ return input_val_1D<T>{xidx};
++}
++
++template <typename T>
++__device__ static input_val_2D<T> get_length(const size_t i, const input_val_2D<T>& whole_length)
++{
++ auto xlen = whole_length.val1;
++ auto ylen = whole_length.val2;
++
++ auto xidx = i % xlen;
++ auto yidx = i / xlen % ylen;
++
++ return input_val_2D<T>{xidx, yidx};
++}
++
++template <typename T>
++__device__ static input_val_3D<T> get_length(const size_t i, const input_val_3D<T>& whole_length)
++{
++ auto xlen = whole_length.val1;
++ auto ylen = whole_length.val2;
++ auto zlen = whole_length.val3;
++
++ auto xidx = i % xlen;
++ auto yidx = i / xlen % ylen;
++ auto zidx = i / xlen / ylen % zlen;
++
++ return input_val_3D<T>{xidx, yidx, zidx};
++}
++
++template <typename T>
++__device__ static size_t get_batch(const size_t i, const input_val_1D<T>& whole_length)
++{
++ auto xlen = whole_length.val1;
++
++ auto yidx = i / xlen;
++
++ return yidx;
++}
++
++template <typename T>
++__device__ static size_t get_batch(const size_t i, const input_val_2D<T>& whole_length)
++{
++ auto xlen = whole_length.val1;
++ auto ylen = whole_length.val2;
++
++ auto zidx = i / xlen / ylen;
++
++ return zidx;
++}
++
++template <typename T>
++__device__ static size_t get_batch(const size_t i, const input_val_3D<T>& length)
++{
++ auto xlen = length.val1;
++ auto ylen = length.val2;
++ auto zlen = length.val3;
++
++ auto widx = i / xlen / ylen / zlen;
++
++ return widx;
++}
++
++__device__ static double make_random_val(hiprandStatePhilox4_32_10* gen_state, double offset)
++{
++ return hiprand_uniform_double(gen_state) + offset;
++}
++
++__device__ static float make_random_val(hiprandStatePhilox4_32_10* gen_state, float offset)
++{
++ return hiprand_uniform(gen_state) + offset;
++}
++
++__device__ static _Float16 make_random_val(hiprandStatePhilox4_32_10* gen_state, _Float16 offset)
++{
++ return static_cast<_Float16>(hiprand_uniform(gen_state)) + offset;
++}
++
++template <typename Tcomplex>
++__device__ static void set_imag_zero(const size_t pos, Tcomplex* x)
++{
++ x[pos].y = 0.0;
++}
++
++template <typename Tfloat>
++__device__ static void set_imag_zero(const size_t pos, Tfloat* xreal, Tfloat* ximag)
++{
++ ximag[pos] = 0.0;
++}
++
++template <typename Tcomplex>
++__device__ static void conjugate(const size_t pos, const size_t cpos, Tcomplex* x)
++{
++ x[pos].x = x[cpos].x;
++ x[pos].y = -x[cpos].y;
++}
++
++template <typename Tfloat>
++__device__ static void conjugate(const size_t pos, const size_t cpos, Tfloat* xreal, Tfloat* ximag)
++{
++ xreal[pos] = xreal[cpos];
++ ximag[pos] = -ximag[cpos];
++}
++
++template <typename Tint, typename Treal>
++__global__ static void __launch_bounds__(DATA_GEN_THREADS)
++ generate_random_interleaved_data_kernel(const Tint whole_length,
++ const Tint zero_length,
++ const size_t idist,
++ const size_t isize,
++ const Tint istride,
++ rocfft_complex<Treal>* data)
++{
++ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
++ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
++ static_assert(sizeof(i) >= sizeof(isize));
++ if(i < isize)
++ {
++ auto i_length = get_length(i, whole_length);
++ auto i_batch = get_batch(i, whole_length);
++ auto i_base = i_batch * idist;
++
++ auto seed = compute_index(zero_length, istride, i_base);
++ auto idx = compute_index(i_length, istride, i_base);
++
++ hiprandStatePhilox4_32_10 gen_state;
++ hiprand_init(seed, idx, 0, &gen_state);
++
++ data[idx].x = make_random_val(&gen_state, static_cast<Treal>(-0.5));
++ data[idx].y = make_random_val(&gen_state, static_cast<Treal>(-0.5));
++ }
++}
++
++template <typename Tint, typename Treal>
++__global__ static void __launch_bounds__(DATA_GEN_THREADS)
++ generate_interleaved_data_kernel(const Tint whole_length,
++ const size_t idist,
++ const size_t isize,
++ const Tint istride,
++ const Tint ustride,
++ const Treal inv_scale,
++ rocfft_complex<Treal>* data)
++{
++ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
++ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
++ static_assert(sizeof(i) >= sizeof(isize));
++ if(i < isize)
++ {
++ const auto i_length = get_length(i, whole_length);
++ const auto i_batch = get_batch(i, whole_length);
++ const auto i_base = i_batch * idist;
++
++ const auto val = static_cast<Treal>(-0.5)
++ + static_cast<Treal>(
++ static_cast<unsigned long long>(compute_index(i_length, ustride, 0)))
++ * inv_scale;
++
++ const auto idx = compute_index(i_length, istride, i_base);
++
++ data[idx].x = val;
++ data[idx].y = val;
++ }
++}
++
++template <typename Tint, typename Treal>
++__global__ static void __launch_bounds__(DATA_GEN_THREADS)
++ generate_random_planar_data_kernel(const Tint whole_length,
++ const Tint zero_length,
++ const size_t idist,
++ const size_t isize,
++ const Tint istride,
++ Treal* real_data,
++ Treal* imag_data)
++{
++ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
++ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
++ static_assert(sizeof(i) >= sizeof(isize));
++ if(i < isize)
++ {
++ auto i_length = get_length(i, whole_length);
++ auto i_batch = get_batch(i, whole_length);
++ auto i_base = i_batch * idist;
++
++ auto seed = compute_index(zero_length, istride, i_base);
++ auto idx = compute_index(i_length, istride, i_base);
++
++ hiprandStatePhilox4_32_10 gen_state;
++ hiprand_init(seed, idx, 0, &gen_state);
++
++ real_data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5));
++ imag_data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5));
++ }
++}
++
++template <typename Tint, typename Treal>
++__global__ static void __launch_bounds__(DATA_GEN_THREADS)
++ generate_planar_data_kernel(const Tint whole_length,
++ const size_t idist,
++ const size_t isize,
++ const Tint istride,
++ const Tint ustride,
++ const Treal inv_scale,
++ Treal* real_data,
++ Treal* imag_data)
++{
++ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
++ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
++ static_assert(sizeof(i) >= sizeof(isize));
++ if(i < isize)
++ {
++ const auto i_length = get_length(i, whole_length);
++ const auto i_batch = get_batch(i, whole_length);
++ const auto i_base = i_batch * idist;
++
++ const auto val = static_cast<Treal>(-0.5)
++ + static_cast<Treal>(
++ static_cast<unsigned long long>(compute_index(i_length, ustride, 0)))
++ * inv_scale;
++
++ const auto idx = compute_index(i_length, istride, i_base);
++
++ real_data[idx] = val;
++ imag_data[idx] = val;
++ }
++}
++
++template <typename Tint, typename Treal>
++__global__ static void __launch_bounds__(DATA_GEN_THREADS)
++ generate_random_real_data_kernel(const Tint whole_length,
++ const Tint zero_length,
++ const size_t idist,
++ const size_t isize,
++ const Tint istride,
++ Treal* data)
++{
++ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
++ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
++ static_assert(sizeof(i) >= sizeof(isize));
++ if(i < isize)
++ {
++ auto i_length = get_length(i, whole_length);
++ auto i_batch = get_batch(i, whole_length);
++ auto i_base = i_batch * idist;
++
++ auto seed = compute_index(zero_length, istride, i_base);
++ auto idx = compute_index(i_length, istride, i_base);
++
++ hiprandStatePhilox4_32_10 gen_state;
++ hiprand_init(seed, idx, 0, &gen_state);
++
++ data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5));
++ }
++}
++
++template <typename Tint, typename Treal>
++__global__ static void __launch_bounds__(DATA_GEN_THREADS)
++ generate_real_data_kernel(const Tint whole_length,
++ const size_t idist,
++ const size_t isize,
++ const Tint istride,
++ const Tint ustride,
++ const Treal inv_scale,
++ Treal* data)
++{
++ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x
++ + blockIdx.y * gridDim.x * DATA_GEN_THREADS;
++ static_assert(sizeof(i) >= sizeof(isize));
++ if(i < isize)
++ {
++ const auto i_length = get_length(i, whole_length);
++ const auto i_batch = get_batch(i, whole_length);
++ const auto i_base = i_batch * idist;
++
++ const auto val = static_cast<Treal>(-0.5)
++ + static_cast<Treal>(
++ static_cast<unsigned long long>(compute_index(i_length, ustride, 0)))
++ * inv_scale;
++
++ const auto idx = compute_index(i_length, istride, i_base);
++
++ data[idx] = val;
++ }
++}
++
++// For complex-to-real transforms, the input data must be Hermitian-symmetric.
++// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier
++// space. For multi-dimensional data, this means that we only need to store a bit more
++// than half of the complex values; the rest are redundant. However, there are still
++// some restrictions:
++// * the origin and Nyquist value(s) must be real-valued
++// * some of the remaining values are still redundant, and you might get different results
++// than you expect if the values don't agree.
++
++template <typename Tcomplex>
++__global__ static void impose_hermitian_symmetry_interleaved_1D_kernel(Tcomplex* x,
++ const size_t Nx,
++ const size_t xstride,
++ const size_t dist,
++ const size_t batch_total,
++ const bool Nxeven)
++{
++ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
++ static_assert(sizeof(id_batch) == sizeof(size_t));
++
++ if(id_batch < batch_total)
++ {
++ id_batch *= dist;
++
++ set_imag_zero(id_batch, x);
++
++ if(Nxeven)
++ set_imag_zero(id_batch + (Nx / 2) * xstride, x);
++ }
++}
++
++template <typename Tfloat>
++__global__ static void impose_hermitian_symmetry_planar_1D_kernel(Tfloat* xreal,
++ Tfloat* ximag,
++ const size_t Nx,
++ const size_t xstride,
++ const size_t dist,
++ const size_t batch_total,
++ const bool Nxeven)
++{
++ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
++ static_assert(sizeof(id_batch) == sizeof(size_t));
++
++ if(id_batch < batch_total)
++ {
++ id_batch *= dist;
++
++ set_imag_zero(id_batch, xreal, ximag);
++
++ if(Nxeven)
++ set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag);
++ }
++}
++
++template <typename Tcomplex>
++__global__ static void impose_hermitian_symmetry_interleaved_2D_kernel(Tcomplex* x,
++ const size_t Nx,
++ const size_t Ny,
++ const size_t xstride,
++ const size_t ystride,
++ const size_t dist,
++ const size_t batch_total,
++ const size_t x_total,
++ const bool Nxeven,
++ const bool Nyeven)
++{
++ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
++ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y;
++ static_assert(sizeof(id_batch) == sizeof(size_t));
++ static_assert(sizeof(id_x) == sizeof(size_t));
++
++ if(id_batch < batch_total)
++ {
++ id_batch *= dist;
++
++ if(id_x == 0)
++ set_imag_zero(id_batch, x);
++
++ if(id_x == 0 && Nxeven)
++ set_imag_zero(id_batch + (Nx / 2) * xstride, x);
++
++ if(id_x == 0 && Nyeven)
++ set_imag_zero(id_batch + ystride * (Ny / 2), x);
++
++ if(id_x == 0 && Nxeven && Nyeven)
++ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x);
++
++ if(id_x < x_total)
++ {
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x);
++
++ if(Nyeven)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
++ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
++ x);
++ }
++ }
++}
++
++template <typename Tfloat>
++__global__ static void impose_hermitian_symmetry_planar_2D_kernel(Tfloat* xreal,
++ Tfloat* ximag,
++ const size_t Nx,
++ const size_t Ny,
++ const size_t xstride,
++ const size_t ystride,
++ const size_t dist,
++ const size_t batch_total,
++ const size_t x_total,
++ const bool Nxeven,
++ const bool Nyeven)
++{
++ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
++ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y;
++ static_assert(sizeof(id_batch) == sizeof(size_t));
++ static_assert(sizeof(id_x) == sizeof(size_t));
++
++ if(id_batch < batch_total)
++ {
++ id_batch *= dist;
++
++ if(id_x == 0)
++ set_imag_zero(id_batch, xreal, ximag);
++
++ if(id_x == 0 && Nxeven)
++ set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag);
++
++ if(id_x == 0 && Nyeven)
++ set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag);
++
++ if(id_x == 0 && Nxeven && Nyeven)
++ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag);
++
++ if(id_x < x_total)
++ {
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)),
++ id_batch + xstride * (id_x + 1),
++ xreal,
++ ximag);
++
++ if(Nyeven)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
++ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
++ xreal,
++ ximag);
++ }
++ }
++}
++
++template <typename Tcomplex>
++__global__ static void impose_hermitian_symmetry_interleaved_3D_kernel(Tcomplex* x,
++ const size_t Nx,
++ const size_t Ny,
++ const size_t Nz,
++ const size_t xstride,
++ const size_t ystride,
++ const size_t zstride,
++ const size_t dist,
++ const size_t batch_total,
++ const size_t x_total,
++ const size_t y_total,
++ const size_t y_total_half,
++ const bool Nxeven,
++ const bool Nyeven,
++ const bool Nzeven)
++{
++ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
++ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y;
++ const auto id_y = static_cast<size_t>(threadIdx.z) + blockIdx.z * blockDim.z;
++ static_assert(sizeof(id_batch) == sizeof(size_t));
++ static_assert(sizeof(id_x) == sizeof(size_t));
++ static_assert(sizeof(id_y) == sizeof(size_t));
++
++ if(id_batch < batch_total)
++ {
++ auto id_x_y_zero = (id_x == 0 && id_y == 0);
++
++ id_batch *= dist;
++
++ if(id_x_y_zero)
++ set_imag_zero(id_batch, x);
++
++ if(Nxeven && id_x_y_zero)
++ set_imag_zero(id_batch + xstride * (Nx / 2), x);
++
++ if(Nyeven && id_x_y_zero)
++ set_imag_zero(id_batch + ystride * (Ny / 2), x);
++
++ if(Nzeven && id_x_y_zero)
++ set_imag_zero(id_batch + zstride * (Nz / 2), x);
++
++ if(Nxeven && Nyeven && id_x_y_zero)
++ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x);
++
++ if(Nxeven && Nzeven && id_x_y_zero)
++ set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), x);
++
++ if(Nyeven && Nzeven && id_x_y_zero)
++ set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), x);
++
++ if(Nxeven && Nyeven && Nzeven && id_x_y_zero)
++ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2),
++ x);
++
++ if(id_x == 0 && id_y < y_total_half)
++ conjugate(id_batch + ystride * (Ny - (id_y + 1)), id_batch + ystride * (id_y + 1), x);
++
++ if(Nxeven && id_x == 0 && id_y < y_total_half)
++ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)),
++ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1),
++ x);
++
++ if(id_x < x_total && id_y == 0)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x);
++
++ if(Nyeven && id_x < x_total && id_y == 0)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
++ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
++ x);
++
++ if(id_x < x_total && id_y < y_total)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)),
++ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1),
++ x);
++
++ if(Nzeven)
++ {
++ if(id_x < x_total && id_y == 0)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
++ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
++ x);
++
++ if(Nyeven && id_x < x_total && id_y == 0)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
++ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
++ x);
++
++ if(id_x == 0 && id_y < y_total_half)
++ conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2),
++ id_batch + ystride * (id_y + 1) + zstride * (Nz / 2),
++ x);
++
++ if(Nxeven && id_x == 0 && id_y < y_total_half)
++ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1))
++ + zstride * (Nz / 2),
++ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2),
++ x);
++
++ if(id_x < x_total && id_y < y_total)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1))
++ + zstride * (Nz / 2),
++ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1)
++ + zstride * (Nz / 2),
++ x);
++ }
++ }
++}
++
++template <typename Tfloat>
++__global__ static void impose_hermitian_symmetry_planar_3D_kernel(Tfloat* xreal,
++ Tfloat* ximag,
++ const size_t Nx,
++ const size_t Ny,
++ const size_t Nz,
++ const size_t xstride,
++ const size_t ystride,
++ const size_t zstride,
++ const size_t dist,
++ const size_t batch_total,
++ const size_t x_total,
++ const size_t y_total,
++ const size_t y_total_half,
++ const bool Nxeven,
++ const bool Nyeven,
++ const bool Nzeven)
++{
++ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x;
++ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y;
++ const auto id_y = static_cast<size_t>(threadIdx.z) + blockIdx.z * blockDim.z;
++ static_assert(sizeof(id_batch) == sizeof(size_t));
++ static_assert(sizeof(id_x) == sizeof(size_t));
++ static_assert(sizeof(id_y) == sizeof(size_t));
++
++ if(id_batch < batch_total)
++ {
++ auto id_x_y_zero = (id_x == 0 && id_y == 0);
++
++ id_batch *= dist;
++
++ if(id_x_y_zero)
++ set_imag_zero(id_batch, xreal, ximag);
++
++ if(Nxeven && id_x_y_zero)
++ set_imag_zero(id_batch + xstride * (Nx / 2), xreal, ximag);
++
++ if(Nyeven && id_x_y_zero)
++ set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag);
++
++ if(Nzeven && id_x_y_zero)
++ set_imag_zero(id_batch + zstride * (Nz / 2), xreal, ximag);
++
++ if(Nxeven && Nyeven && id_x_y_zero)
++ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag);
++
++ if(Nxeven && Nzeven && id_x_y_zero)
++ set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), xreal, ximag);
++
++ if(Nyeven && Nzeven && id_x_y_zero)
++ set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), xreal, ximag);
++
++ if(Nxeven && Nyeven && Nzeven && id_x_y_zero)
++ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2),
++ xreal,
++ ximag);
++
++ if(id_x == 0 && id_y < y_total_half)
++ conjugate(id_batch + ystride * (Ny - (id_y + 1)),
++ id_batch + ystride * (id_y + 1),
++ xreal,
++ ximag);
++
++ if(Nxeven && id_x == 0 && id_y < y_total_half)
++ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)),
++ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1),
++ xreal,
++ ximag);
++
++ if(id_x < x_total && id_y == 0)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)),
++ id_batch + xstride * (id_x + 1),
++ xreal,
++ ximag);
++
++ if(Nyeven && id_x < x_total && id_y == 0)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
++ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
++ xreal,
++ ximag);
++
++ if(id_x < x_total && id_y < y_total)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)),
++ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1),
++ xreal,
++ ximag);
++
++ if(Nzeven)
++ {
++ if(id_x < x_total && id_y == 0)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
++ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
++ xreal,
++ ximag);
++
++ if(Nyeven && id_x < x_total && id_y == 0)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
++ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
++ xreal,
++ ximag);
++
++ if(id_x == 0 && id_y < y_total_half)
++ conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2),
++ id_batch + ystride * (id_y + 1) + zstride * (Nz / 2),
++ xreal,
++ ximag);
++
++ if(Nxeven && id_x == 0 && id_y < y_total_half)
++ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1))
++ + zstride * (Nz / 2),
++ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2),
++ xreal,
++ ximag);
++
++ if(id_x < x_total && id_y < y_total)
++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1))
++ + zstride * (Nz / 2),
++ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1)
++ + zstride * (Nz / 2),
++ xreal,
++ ximag);
++ }
++ }
++}
++
++// get grid dimensions for data gen kernel
++static dim3 generate_data_gridDim(const size_t isize)
++{
++ auto blockSize = DATA_GEN_THREADS;
++ // total number of blocks needed in the grid
++ auto numBlocks_setup = DivRoundingUp<size_t>(isize, blockSize);
++
++ // Total work items per dimension in the grid is counted in
++ // uint32_t. Since each thread initializes one element, very
++ // large amounts of data will overflow this total size if we do
++ // all this work in one grid dimension, causing launch failure.
++ //
++ // CUDA also generally allows for effectively unlimited grid X
++ // dim, but Y and Z are more limited.
++ auto gridDim_y = std::min<unsigned int>(DATA_GEN_GRID_Y_MAX, numBlocks_setup);
++ auto gridDim_x = DivRoundingUp<unsigned int>(numBlocks_setup, DATA_GEN_GRID_Y_MAX);
++ return {gridDim_x, gridDim_y};
++}
++
++// Grid dimensions for the Hermitian symmetrizer kernels.
++//
++// X always covers the batches.  For 2D and 3D lengths, Y covers
++// (length[0] + 1) / 2 - 1 items; for 3D lengths, Z covers length[1] - 1
++// items.  Every extent is divided by blockSize, rounding up.
++// Throws std::runtime_error when length does not have 1, 2 or 3 entries.
++static dim3 generate_hermitian_gridDim(const std::vector<size_t>& length,
++                                       const size_t               batch,
++                                       const size_t               blockSize)
++{
++    const auto rank = length.size();
++    if(rank < 1 || rank > 3)
++        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
++
++    const auto blocks_batch = DivRoundingUp<size_t>(batch, blockSize);
++    if(rank == 1)
++        return dim3(blocks_batch);
++
++    const auto blocks_x = DivRoundingUp<size_t>((length[0] + 1) / 2 - 1, blockSize);
++    if(rank == 2)
++        return dim3(blocks_batch, blocks_x);
++
++    const auto blocks_y = DivRoundingUp<size_t>(length[1] - 1, blockSize);
++    return dim3(blocks_batch, blocks_x, blocks_y);
++}
++
++// Block dimensions for the Hermitian symmetrizer kernels: blockSize threads
++// along each of the first length.size() block dimensions.
++// Throws std::runtime_error when length does not have 1, 2 or 3 entries.
++static dim3 generate_blockDim(const std::vector<size_t>& length, const size_t blockSize)
++{
++    const auto rank = length.size();
++
++    if(rank == 1)
++        return dim3(blockSize);
++    if(rank == 2)
++        return dim3(blockSize, blockSize);
++    if(rank == 3)
++        return dim3(blockSize, blockSize, blockSize);
++
++    throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
++}
++
++// Launch generate_random_interleaved_data_kernel (stream 0, no dynamic shared
++// memory) to fill input_data.
++//
++// The length and stride handed to the kernel come from get_input_val() applied
++// to whole_length / whole_stride; a value from make_zero_length() is passed
++// alongside the length.  The grid is sized from isize by
++// generate_data_gridDim().
++// Throws std::runtime_error if hipGetLastError() reports a failure afterwards.
++template <typename Tint, typename Treal>
++static void generate_random_interleaved_data(const Tint&            whole_length,
++                                             const size_t           idist,
++                                             const size_t           isize,
++                                             const Tint&            whole_stride,
++                                             rocfft_complex<Treal>* input_data,
++                                             const hipDeviceProp_t& deviceProp)
++{
++    // Deliberately non-const: decltype(lengths) picks the kernel instantiation.
++    auto lengths = get_input_val(whole_length);
++    auto zeros   = make_zero_length(lengths);
++    auto strides = get_input_val(whole_stride);
++
++    dim3 grid = generate_data_gridDim(isize);
++    dim3 block{DATA_GEN_THREADS};
++
++    launch_limits_check("generate_random_interleaved_data_kernel", grid, block, deviceProp);
++
++    hipLaunchKernelGGL(
++        HIP_KERNEL_NAME(generate_random_interleaved_data_kernel<decltype(lengths), Treal>),
++        grid,
++        block,
++        0, // sharedMemBytes
++        0, // stream
++        lengths,
++        zeros,
++        idist,
++        isize,
++        strides,
++        input_data);
++
++    const auto status = hipGetLastError();
++    if(status == hipSuccess)
++        return;
++
++    throw std::runtime_error("generate_random_interleaved_data_kernel launch failure: "
++                             + std::string(hipGetErrorName(status)));
++}
++
++// Launch generate_interleaved_data_kernel (stream 0, no dynamic shared memory)
++// to fill input_data.
++//
++// Besides the length/stride obtained through get_input_val(), the kernel gets a
++// unit stride from make_unit_stride() and inv_scale = 1 / (isize / nbatch - 1),
++// where isize / nbatch is an integer division.
++// NOTE(review): isize / nbatch == 1 makes that divisor zero - confirm callers
++// never request that.
++// Throws std::runtime_error if hipGetLastError() reports a failure afterwards.
++template <typename Tint, typename Treal>
++static void generate_interleaved_data(const Tint&            whole_length,
++                                      const size_t           idist,
++                                      const size_t           isize,
++                                      const Tint&            whole_stride,
++                                      const size_t           nbatch,
++                                      rocfft_complex<Treal>* input_data,
++                                      const hipDeviceProp_t& deviceProp)
++{
++    // const on purpose: decltype(lengths) picks the kernel instantiation.
++    const auto lengths      = get_input_val(whole_length);
++    const auto strides      = get_input_val(whole_stride);
++    const auto unit_strides = make_unit_stride(lengths);
++
++    const auto elems_per_batch = static_cast<unsigned long long>(isize) / nbatch;
++    const auto inv_scale
++        = static_cast<Treal>(1.0) / static_cast<Treal>(elems_per_batch - 1);
++
++    dim3 grid = generate_data_gridDim(isize);
++    dim3 block{DATA_GEN_THREADS};
++
++    launch_limits_check("generate_interleaved_data_kernel", grid, block, deviceProp);
++
++    hipLaunchKernelGGL(
++        HIP_KERNEL_NAME(generate_interleaved_data_kernel<decltype(lengths), Treal>),
++        grid,
++        block,
++        0, // sharedMemBytes
++        0, // stream
++        lengths,
++        idist,
++        isize,
++        strides,
++        unit_strides,
++        inv_scale,
++        input_data);
++
++    const auto status = hipGetLastError();
++    if(status == hipSuccess)
++        return;
++
++    throw std::runtime_error("generate_interleaved_data_kernel launch failure: "
++                             + std::string(hipGetErrorName(status)));
++}
++
++// Launch generate_random_planar_data_kernel (stream 0, no dynamic shared
++// memory) to fill the separate real_data / imag_data arrays.
++//
++// The length and stride handed to the kernel come from get_input_val(); a value
++// from make_zero_length() is passed alongside the length.
++// Throws std::runtime_error if hipGetLastError() reports a failure afterwards.
++template <typename Tint, typename Treal>
++static void generate_random_planar_data(const Tint&            whole_length,
++                                        const size_t           idist,
++                                        const size_t           isize,
++                                        const Tint&            whole_stride,
++                                        Treal*                 real_data,
++                                        Treal*                 imag_data,
++                                        const hipDeviceProp_t& deviceProp)
++{
++    // const on purpose: decltype(lengths) picks the kernel instantiation.
++    const auto lengths = get_input_val(whole_length);
++    const auto zeros   = make_zero_length(lengths);
++    const auto strides = get_input_val(whole_stride);
++
++    dim3 grid = generate_data_gridDim(isize);
++    dim3 block{DATA_GEN_THREADS};
++
++    launch_limits_check("generate_random_planar_data_kernel", grid, block, deviceProp);
++
++    hipLaunchKernelGGL(
++        HIP_KERNEL_NAME(generate_random_planar_data_kernel<decltype(lengths), Treal>),
++        grid,
++        block,
++        0, // sharedMemBytes
++        0, // stream
++        lengths,
++        zeros,
++        idist,
++        isize,
++        strides,
++        real_data,
++        imag_data);
++
++    const auto status = hipGetLastError();
++    if(status == hipSuccess)
++        return;
++
++    throw std::runtime_error("generate_random_planar_data_kernel launch failure: "
++                             + std::string(hipGetErrorName(status)));
++}
++
++// Launch generate_planar_data_kernel (stream 0, no dynamic shared memory) to
++// fill the separate real_data / imag_data arrays.
++//
++// Besides the length/stride obtained through get_input_val(), the kernel gets a
++// unit stride from make_unit_stride() and inv_scale = 1 / (isize / nbatch - 1),
++// where isize / nbatch is an integer division.
++// NOTE(review): isize / nbatch == 1 makes that divisor zero - confirm callers
++// never request that.
++// Throws std::runtime_error if hipGetLastError() reports a failure afterwards.
++template <typename Tint, typename Treal>
++static void generate_planar_data(const Tint&            whole_length,
++                                 const size_t           idist,
++                                 const size_t           isize,
++                                 const Tint&            whole_stride,
++                                 const size_t           nbatch,
++                                 Treal*                 real_data,
++                                 Treal*                 imag_data,
++                                 const hipDeviceProp_t& deviceProp)
++{
++    // const on purpose: decltype(lengths) picks the kernel instantiation.
++    const auto lengths      = get_input_val(whole_length);
++    const auto strides      = get_input_val(whole_stride);
++    const auto unit_strides = make_unit_stride(lengths);
++
++    const auto elems_per_batch = static_cast<unsigned long long>(isize) / nbatch;
++    const auto inv_scale
++        = static_cast<Treal>(1.0) / static_cast<Treal>(elems_per_batch - 1);
++
++    dim3 grid = generate_data_gridDim(isize);
++    dim3 block{DATA_GEN_THREADS};
++
++    launch_limits_check("generate_planar_data_kernel", grid, block, deviceProp);
++
++    hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_planar_data_kernel<decltype(lengths), Treal>),
++                       grid,
++                       block,
++                       0, // sharedMemBytes
++                       0, // stream
++                       lengths,
++                       idist,
++                       isize,
++                       strides,
++                       unit_strides,
++                       inv_scale,
++                       real_data,
++                       imag_data);
++
++    const auto status = hipGetLastError();
++    if(status == hipSuccess)
++        return;
++
++    throw std::runtime_error("generate_planar_data_kernel launch failure: "
++                             + std::string(hipGetErrorName(status)));
++}
++
++// Launch generate_random_real_data_kernel (stream 0, no dynamic shared memory)
++// to fill input_data.
++//
++// The length and stride handed to the kernel come from get_input_val(); a value
++// from make_zero_length() is passed alongside the length.
++// Throws std::runtime_error if hipGetLastError() reports a failure afterwards.
++template <typename Tint, typename Treal>
++static void generate_random_real_data(const Tint&            whole_length,
++                                      const size_t           idist,
++                                      const size_t           isize,
++                                      const Tint&            whole_stride,
++                                      Treal*                 input_data,
++                                      const hipDeviceProp_t& deviceProp)
++{
++    // const on purpose: decltype(lengths) picks the kernel instantiation.
++    const auto lengths = get_input_val(whole_length);
++    const auto zeros   = make_zero_length(lengths);
++    const auto strides = get_input_val(whole_stride);
++
++    dim3 grid = generate_data_gridDim(isize);
++    dim3 block{DATA_GEN_THREADS};
++
++    launch_limits_check("generate_random_real_data_kernel", grid, block, deviceProp);
++
++    hipLaunchKernelGGL(
++        HIP_KERNEL_NAME(generate_random_real_data_kernel<decltype(lengths), Treal>),
++        grid,
++        block,
++        0, // sharedMemBytes
++        0, // stream
++        lengths,
++        zeros,
++        idist,
++        isize,
++        strides,
++        input_data);
++
++    const auto status = hipGetLastError();
++    if(status == hipSuccess)
++        return;
++
++    throw std::runtime_error("generate_random_real_data_kernel launch failure: "
++                             + std::string(hipGetErrorName(status)));
++}
++
++// Launch generate_real_data_kernel (stream 0, no dynamic shared memory) to fill
++// input_data.
++//
++// Besides the length/stride obtained through get_input_val(), the kernel gets a
++// unit stride from make_unit_stride() and inv_scale = 1 / (isize / nbatch - 1),
++// where isize / nbatch is an integer division.
++// NOTE(review): isize / nbatch == 1 makes that divisor zero - confirm callers
++// never request that.
++// Throws std::runtime_error if hipGetLastError() reports a failure afterwards.
++template <typename Tint, typename Treal>
++static void generate_real_data(const Tint&            whole_length,
++                               const size_t           idist,
++                               const size_t           isize,
++                               const Tint&            whole_stride,
++                               const size_t           nbatch,
++                               Treal*                 input_data,
++                               const hipDeviceProp_t& deviceProp)
++{
++    // const on purpose: decltype(lengths) picks the kernel instantiation.
++    const auto lengths      = get_input_val(whole_length);
++    const auto strides      = get_input_val(whole_stride);
++    const auto unit_strides = make_unit_stride(lengths);
++
++    const auto elems_per_batch = static_cast<unsigned long long>(isize) / nbatch;
++    const auto inv_scale
++        = static_cast<Treal>(1.0) / static_cast<Treal>(elems_per_batch - 1);
++
++    dim3 grid = generate_data_gridDim(isize);
++    dim3 block{DATA_GEN_THREADS};
++
++    launch_limits_check("generate_real_data_kernel", grid, block, deviceProp);
++
++    hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_real_data_kernel<decltype(lengths), Treal>),
++                       grid,
++                       block,
++                       0, // sharedMemBytes
++                       0, // stream
++                       lengths,
++                       idist,
++                       isize,
++                       strides,
++                       unit_strides,
++                       inv_scale,
++                       input_data);
++
++    const auto status = hipGetLastError();
++    if(status == hipSuccess)
++        return;
++
++    throw std::runtime_error("generate_real_data_kernel launch failure: "
++                             + std::string(hipGetErrorName(status)));
++}
++
++// Launch the 1D, 2D or 3D impose_hermitian_symmetry_interleaved kernel,
++// selected by length.size(), on input_data (stream 0, no dynamic shared
++// memory).
++//
++// Block and grid dimensions come from generate_blockDim() and
++// generate_hermitian_gridDim() with DATA_GEN_THREADS as the block size.  After
++// `batch`, the kernels receive work-item totals derived from ilength, followed
++// by one "length is even" flag per axis.
++// Throws std::runtime_error for any other number of dimensions, or if
++// hipGetLastError() reports a failure after the launch.
++template <typename Tcomplex>
++static void impose_hermitian_symmetry_interleaved(const std::vector<size_t>& length,
++                                                  const std::vector<size_t>& ilength,
++                                                  const std::vector<size_t>& stride,
++                                                  const size_t               dist,
++                                                  const size_t               batch,
++                                                  Tcomplex*                  input_data,
++                                                  const hipDeviceProp_t&     deviceProp)
++{
++    auto threads = DATA_GEN_THREADS;
++    auto block   = generate_blockDim(length, threads);
++    auto grid    = generate_hermitian_gridDim(length, batch, threads);
++
++    const auto rank = length.size();
++    if(rank == 1)
++    {
++        launch_limits_check(
++            "impose_hermitian_symmetry_interleaved_1D_kernel", grid, block, deviceProp);
++
++        const bool x_even = length[0] % 2 == 0;
++
++        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1D_kernel<Tcomplex>,
++                           grid,
++                           block,
++                           0,
++                           0,
++                           input_data,
++                           length[0],
++                           stride[0],
++                           dist,
++                           batch,
++                           x_even);
++    }
++    else if(rank == 2)
++    {
++        launch_limits_check(
++            "impose_hermitian_symmetry_interleaved_2D_kernel", grid, block, deviceProp);
++
++        const size_t x_total = (ilength[0] + 1) / 2 - 1;
++        const bool   x_even  = length[0] % 2 == 0;
++        const bool   y_even  = length[1] % 2 == 0;
++
++        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2D_kernel<Tcomplex>,
++                           grid,
++                           block,
++                           0,
++                           0,
++                           input_data,
++                           length[0],
++                           length[1],
++                           stride[0],
++                           stride[1],
++                           dist,
++                           batch,
++                           x_total,
++                           x_even,
++                           y_even);
++    }
++    else if(rank == 3)
++    {
++        launch_limits_check(
++            "impose_hermitian_symmetry_interleaved_3D_kernel", grid, block, deviceProp);
++
++        const size_t x_total      = (ilength[0] + 1) / 2 - 1;
++        const size_t y_total      = ilength[1] - 1;
++        const size_t y_total_half = (ilength[1] + 1) / 2 - 1;
++        const bool   x_even       = length[0] % 2 == 0;
++        const bool   y_even       = length[1] % 2 == 0;
++        const bool   z_even       = length[2] % 2 == 0;
++
++        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3D_kernel<Tcomplex>,
++                           grid,
++                           block,
++                           0,
++                           0,
++                           input_data,
++                           length[0],
++                           length[1],
++                           length[2],
++                           stride[0],
++                           stride[1],
++                           stride[2],
++                           dist,
++                           batch,
++                           x_total,
++                           y_total,
++                           y_total_half,
++                           x_even,
++                           y_even,
++                           z_even);
++    }
++    else
++    {
++        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
++    }
++
++    const auto status = hipGetLastError();
++    if(status == hipSuccess)
++        return;
++
++    throw std::runtime_error("impose_hermitian_symmetry_interleaved_kernel launch failure: "
++                             + std::string(hipGetErrorName(status)));
++}
++
++// Launch the 1D, 2D or 3D impose_hermitian_symmetry_planar kernel, selected by
++// length.size(), on the separate real / imaginary arrays (stream 0, no dynamic
++// shared memory).
++//
++// Block and grid dimensions come from generate_blockDim() and
++// generate_hermitian_gridDim() with DATA_GEN_THREADS as the block size.  After
++// `batch`, the kernels receive work-item totals derived from ilength, followed
++// by one "length is even" flag per axis.
++// Throws std::runtime_error for any other number of dimensions, or if
++// hipGetLastError() reports a failure after the launch.
++template <typename Tfloat>
++static void impose_hermitian_symmetry_planar(const std::vector<size_t>& length,
++                                             const std::vector<size_t>& ilength,
++                                             const std::vector<size_t>& stride,
++                                             const size_t               dist,
++                                             const size_t               batch,
++                                             Tfloat*                    input_data_real,
++                                             Tfloat*                    input_data_imag,
++                                             const hipDeviceProp_t&     deviceProp)
++{
++    auto threads = DATA_GEN_THREADS;
++    auto block   = generate_blockDim(length, threads);
++    auto grid    = generate_hermitian_gridDim(length, batch, threads);
++
++    const auto rank = length.size();
++    if(rank == 1)
++    {
++        launch_limits_check(
++            "impose_hermitian_symmetry_planar_1D_kernel", grid, block, deviceProp);
++
++        const bool x_even = length[0] % 2 == 0;
++
++        hipLaunchKernelGGL(impose_hermitian_symmetry_planar_1D_kernel<Tfloat>,
++                           grid,
++                           block,
++                           0,
++                           0,
++                           input_data_real,
++                           input_data_imag,
++                           length[0],
++                           stride[0],
++                           dist,
++                           batch,
++                           x_even);
++    }
++    else if(rank == 2)
++    {
++        launch_limits_check(
++            "impose_hermitian_symmetry_planar_2D_kernel", grid, block, deviceProp);
++
++        const size_t x_total = (ilength[0] + 1) / 2 - 1;
++        const bool   x_even  = length[0] % 2 == 0;
++        const bool   y_even  = length[1] % 2 == 0;
++
++        hipLaunchKernelGGL(impose_hermitian_symmetry_planar_2D_kernel<Tfloat>,
++                           grid,
++                           block,
++                           0,
++                           0,
++                           input_data_real,
++                           input_data_imag,
++                           length[0],
++                           length[1],
++                           stride[0],
++                           stride[1],
++                           dist,
++                           batch,
++                           x_total,
++                           x_even,
++                           y_even);
++    }
++    else if(rank == 3)
++    {
++        launch_limits_check(
++            "impose_hermitian_symmetry_planar_3D_kernel", grid, block, deviceProp);
++
++        const size_t x_total      = (ilength[0] + 1) / 2 - 1;
++        const size_t y_total      = ilength[1] - 1;
++        const size_t y_total_half = (ilength[1] + 1) / 2 - 1;
++        const bool   x_even       = length[0] % 2 == 0;
++        const bool   y_even       = length[1] % 2 == 0;
++        const bool   z_even       = length[2] % 2 == 0;
++
++        hipLaunchKernelGGL(impose_hermitian_symmetry_planar_3D_kernel<Tfloat>,
++                           grid,
++                           block,
++                           0,
++                           0,
++                           input_data_real,
++                           input_data_imag,
++                           length[0],
++                           length[1],
++                           length[2],
++                           stride[0],
++                           stride[1],
++                           stride[2],
++                           dist,
++                           batch,
++                           x_total,
++                           y_total,
++                           y_total_half,
++                           x_even,
++                           y_even,
++                           z_even);
++    }
++    else
++    {
++        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
++    }
++
++    const auto status = hipGetLastError();
++    if(status == hipSuccess)
++        return;
++
++    throw std::runtime_error("impose_hermitian_symmetry_planar_kernel launch failure: "
++                             + std::string(hipGetErrorName(status)));
++}
++
++#endif // DATA_GEN_DEVICE_H
+diff --git a/shared/data_gen_host.h b/shared/data_gen_host.h
+new file mode 100644
+index 0000000..29d3854
+--- /dev/null
++++ b/shared/data_gen_host.h
+@@ -0,0 +1,881 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef DATA_GEN_HOST_H
++#define DATA_GEN_HOST_H
++
++#include "../shared/hostbuf.h"
++#include "../shared/increment.h"
++#include <algorithm>
++#include <complex>
++#include <cstddef>
++#include <limits>
++#include <random>
++#include <tuple>
++#include <type_traits>
++#include <utility>
++#include <vector>
++
++// Linear index for a 1-, 2- or 3-dimensional position: the sum of
++// position * stride over every dimension, plus `base`.
++
++// 1D: scalar position and stride.
++template <typename T1, typename T2>
++size_t compute_index(T1 length, T2 stride, size_t base)
++{
++    const auto offset = length * stride;
++    return offset + base;
++}
++
++// 2D: position and stride given as 2-tuples of integral values.
++template <typename T1, typename T2>
++size_t
++    compute_index(const std::tuple<T1, T1>& length, const std::tuple<T2, T2>& stride, size_t base)
++{
++    static_assert(std::is_integral<T1>::value, "Integral required.");
++    static_assert(std::is_integral<T2>::value, "Integral required.");
++
++    const auto offset0 = std::get<0>(length) * std::get<0>(stride);
++    const auto offset1 = std::get<1>(length) * std::get<1>(stride);
++    return offset0 + offset1 + base;
++}
++
++// 3D: position and stride given as 3-tuples of integral values.
++template <typename T1, typename T2>
++size_t compute_index(const std::tuple<T1, T1, T1>& length,
++                     const std::tuple<T2, T2, T2>& stride,
++                     size_t                        base)
++{
++    static_assert(std::is_integral<T1>::value, "Integral required.");
++    static_assert(std::is_integral<T2>::value, "Integral required.");
++
++    const auto offset0 = std::get<0>(length) * std::get<0>(stride);
++    const auto offset1 = std::get<1>(length) * std::get<1>(stride);
++    const auto offset2 = std::get<2>(length) * std::get<2>(stride);
++    return offset0 + offset1 + offset2 + base;
++}
++
++// Total number of iterations for a 1-, 2- or 3-dimensional length: the
++// product of the extents.
++
++// 1D: the length itself.
++template <typename T1>
++size_t count_iters(const T1& i)
++{
++    return i;
++}
++
++// 2D: product of both extents.
++template <typename T1>
++size_t count_iters(const std::tuple<T1, T1>& i)
++{
++    const auto total = std::get<0>(i) * std::get<1>(i);
++    return total;
++}
++
++// 3D: product of all three extents.
++template <typename T1>
++size_t count_iters(const std::tuple<T1, T1, T1>& i)
++{
++    const auto plane = std::get<0>(i) * std::get<1>(i);
++    return plane * std::get<2>(i);
++}
++
++// 1D: the stride is always 1; whole_length is not consulted.
++template <typename T1>
++T1 make_unit_stride(const T1& whole_length)
++{
++    return T1(1);
++}
++
++// 2D: strides are (1, length0).
++template <typename T1>
++std::tuple<T1, T1> make_unit_stride(const std::tuple<T1, T1>& whole_length)
++{
++    const T1 one = static_cast<T1>(1);
++    const T1 n0  = static_cast<T1>(std::get<0>(whole_length));
++    return std::make_tuple(one, n0);
++}
++
++// 3D: strides are (1, length0, length0 * length1).
++template <typename T1>
++std::tuple<T1, T1, T1> make_unit_stride(const std::tuple<T1, T1, T1>& whole_length)
++{
++    const T1 one = static_cast<T1>(1);
++    const T1 n0  = static_cast<T1>(std::get<0>(whole_length));
++    const T1 n1  = static_cast<T1>(std::get<1>(whole_length));
++    return std::make_tuple(one, n0, static_cast<T1>(n0 * n1));
++}
++
++// Work out how many partitions to break our iteration problem into.
++//
++// Without OpenMP the answer is always 1.  With OpenMP it is the smaller of
++//   * the processor count reported by omp_get_num_procs(), capped at
++//     MAX_PARTITIONS, and
++//   * ceil(iterations / MIN_ITERS_PER_THREAD),
++// and never less than 1.
++template <typename T1>
++static size_t compute_partition_count(T1 length)
++{
++#ifdef _OPENMP
++    // we seem to get contention from too many threads, which slows
++    // things down.  particularly noticeable with mix_3D tests
++    static const size_t MAX_PARTITIONS = 8;
++    const size_t        iters          = count_iters(length);
++    const size_t        hw_threads
++        = std::min(MAX_PARTITIONS, static_cast<size_t>(omp_get_num_procs()));
++    if(!hw_threads)
++        return 1;
++
++    // don't bother threading problem sizes that are too small.  pick
++    // an arbitrary number of iterations and ensure that each thread
++    // has at least that many iterations to process
++    static const size_t MIN_ITERS_PER_THREAD = 2048;
++
++    // ceil(iters / MIN_ITERS_PER_THREAD).  Rounding up needs "- 1" here; with
++    // "+ 1" a problem of MIN_ITERS_PER_THREAD - 1 iterations was already
++    // split in two.  Keep at least one partition so an empty problem still
++    // yields a usable count.
++    const size_t wanted = std::max<size_t>(
++        1, (iters + MIN_ITERS_PER_THREAD - 1) / MIN_ITERS_PER_THREAD);
++
++    // either use the whole CPU, or use ceil(iters/iters_per_thread)
++    return std::min(hw_threads, wanted);
++#else
++    (void)length; // only consulted when threading is available
++    return 1;
++#endif
++}
++
++// Break a scalar length into some number of pieces, returning
++// [(start0, end0), (start1, end1), ...]
++//
++// At most `length` pieces are produced and always at least one, so a zero
++// length or a zero num_parts yields the single piece (0, length) instead of
++// dividing by zero.  All pieces have size length / pieces except the last,
++// which absorbs the remainder.
++template <typename T1>
++std::vector<std::pair<T1, T1>> partition_base(const T1& length, size_t num_parts)
++{
++    static_assert(std::is_integral<T1>::value, "Integral required.");
++
++    // make sure we don't exceed the length; compare in size_t so T1 does not
++    // have to be size_t for std::min to deduce
++    if(length > 0)
++        num_parts = std::min(static_cast<size_t>(length), num_parts);
++    // callers index the result and call back(), so never hand out zero pieces
++    if(num_parts == 0 || !(length > 0))
++        num_parts = 1;
++
++    std::vector<std::pair<T1, T1>> ret(num_parts);
++    // num_parts <= length whenever length > 0, so the cast cannot truncate
++    const T1 partition_size = length / static_cast<T1>(num_parts);
++    T1       cur_partition  = 0;
++    for(size_t i = 0; i < num_parts; ++i, cur_partition += partition_size)
++    {
++        ret[i].first  = cur_partition;
++        ret[i].second = cur_partition + partition_size;
++    }
++    // last partition might not divide evenly, fix it up
++    ret.back().second = length;
++    return ret;
++}
++
++// Returns (start index, end index) pairs for 1D, 2D and 3D lengths.  The
++// number of pairs comes from compute_partition_count(); the split itself is
++// done by partition_base().
++
++// 1D: split the scalar length directly.
++template <typename T1>
++std::vector<std::pair<T1, T1>> partition_rowmajor(const T1& length)
++{
++    const auto num_parts = compute_partition_count(length);
++    return partition_base(length, num_parts);
++}
++
++// 2D: partition on the leftmost part of the tuple, for row-major indexing.
++// Every piece spans the full range [0, length1) of the second dimension.
++template <typename T1>
++std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>>
++    partition_rowmajor(const std::tuple<T1, T1>& length)
++{
++    const auto slices = partition_base(std::get<0>(length), compute_partition_count(length));
++
++    std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ret;
++    ret.reserve(slices.size());
++    for(const auto& slice : slices)
++    {
++        ret.emplace_back(std::make_tuple(slice.first, static_cast<T1>(0)),
++                         std::make_tuple(slice.second, std::get<1>(length)));
++    }
++    return ret;
++}
++
++// 3D: partition on the leftmost part of the tuple, for row-major indexing.
++// Every piece spans the full range of the second and third dimensions.
++template <typename T1>
++std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>>
++    partition_rowmajor(const std::tuple<T1, T1, T1>& length)
++{
++    const auto slices = partition_base(std::get<0>(length), compute_partition_count(length));
++
++    std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ret;
++    ret.reserve(slices.size());
++    for(const auto& slice : slices)
++    {
++        ret.emplace_back(
++            std::make_tuple(slice.first, static_cast<T1>(0), static_cast<T1>(0)),
++            std::make_tuple(slice.second, std::get<1>(length), std::get<2>(length)));
++    }
++    return ret;
++}
++
++// For complex-to-real transforms, the input data must be Hermitian-symmetric.
++// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier
++// space. For multi-dimensional data, this means that we only need to store a bit more
++// than half of the complex values; the rest are redundant. However, there are still
++// some restrictions:
++// * the origin and Nyquist value(s) must be real-valued
++// * some of the remaining values are still redundant, and you might get different results
++// than you expect if the values don't agree.
++// Below are some example kernels which impose Hermitian symmetry on a complex array
++// of the given dimensions.
++
++// Impose Hermitian symmetry on host-side interleaved complex 1D data, in
++// place.  vals[0] is read as an array of std::complex<Tfloat>; consecutive
++// batches start idist elements apart.  For each batch the imaginary part of
++// element 0 is zeroed and, when length[0] is even, so is that of the element
++// at index length[0] / 2 (scaled by istride[0]).
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_interleaved_1D(std::vector<hostbuf>&     vals,
++                                                     const std::vector<Tsize>& length,
++                                                     const std::vector<Tsize>& istride,
++                                                     const Tsize               idist,
++                                                     const Tsize               nbatch)
++{
++    for(unsigned int b = 0; b < nbatch; ++b)
++    {
++        auto batch = ((std::complex<Tfloat>*)vals[0].data()) + b * idist;
++
++        // the origin must be real
++        batch[0].imag(0.0);
++
++        // an odd length has no Nyquist element
++        if(length[0] % 2 != 0)
++            continue;
++
++        const auto nyquist = istride[0] * (length[0] / 2);
++        batch[nyquist].imag(0.0);
++    }
++}
++
++// Impose Hermitian symmetry on host-side planar complex 1D data, in place.
++// vals[1] is read as the array of Tfloat imaginary parts; consecutive batches
++// start idist elements apart.  For each batch the imaginary part of element 0
++// is zeroed and, when length[0] is even, so is that of the element at index
++// length[0] / 2 (scaled by istride[0]).
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_planar_1D(std::vector<hostbuf>&     vals,
++                                                const std::vector<Tsize>& length,
++                                                const std::vector<Tsize>& istride,
++                                                const Tsize               idist,
++                                                const Tsize               nbatch)
++{
++    for(unsigned int b = 0; b < nbatch; ++b)
++    {
++        auto imag = ((Tfloat*)vals[1].data()) + b * idist;
++
++        // the origin must be real
++        imag[0] = 0.0;
++
++        // an odd length has no Nyquist element
++        if(length[0] % 2 != 0)
++            continue;
++
++        const auto nyquist = istride[0] * (length[0] / 2);
++        imag[nyquist]      = 0.0;
++    }
++}
++
++// Impose Hermitian symmetry on host-side interleaved complex 2D data, in
++// place.  vals[0] is read as an array of std::complex<Tfloat>; consecutive
++// batches start idist elements apart.  For each batch:
++//   * the imaginary part is zeroed at the origin and at each Nyquist point
++//     that exists (an axis has one only when its length is even);
++//   * along the x axis, and along the y-Nyquist line when length[1] is even,
++//     element length[0] - i is overwritten with the conjugate of element i
++//     for 1 <= i < (length[0] + 1) / 2.
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_interleaved_2D(std::vector<hostbuf>&     vals,
++                                                     const std::vector<Tsize>& length,
++                                                     const std::vector<Tsize>& istride,
++                                                     const Tsize               idist,
++                                                     const Tsize               nbatch)
++{
++    for(unsigned int b = 0; b < nbatch; ++b)
++    {
++        auto batch = ((std::complex<Tfloat>*)vals[0].data()) + b * idist;
++
++        const bool x_even = length[0] % 2 == 0;
++        const bool y_even = length[1] % 2 == 0;
++        const auto x_nyq  = istride[0] * (length[0] / 2);
++        const auto y_nyq  = istride[1] * (length[1] / 2);
++        const auto x_half = (length[0] + 1) / 2;
++
++        // points that must be real
++        batch[0].imag(0.0);
++        if(x_even)
++            batch[x_nyq].imag(0.0);
++        if(y_even)
++            batch[y_nyq].imag(0.0);
++        if(x_even && y_even)
++            batch[x_nyq + y_nyq].imag(0.0);
++
++        // x-axis
++        for(unsigned int i = 1; i < x_half; ++i)
++            batch[istride[0] * (length[0] - i)] = std::conj(batch[istride[0] * i]);
++
++        if(!y_even)
++            continue;
++
++        // x-axis at y-nyquist
++        for(unsigned int i = 1; i < x_half; ++i)
++        {
++            batch[istride[0] * (length[0] - i) + y_nyq]
++                = std::conj(batch[istride[0] * i + y_nyq]);
++        }
++    }
++}
++
++// Impose Hermitian symmetry on host-side planar complex 2D data, in place.
++// vals[0] / vals[1] are read as the arrays of Tfloat real / imaginary parts;
++// consecutive batches start idist elements apart.  For each batch:
++//   * the imaginary part is zeroed at the origin and at each Nyquist point
++//     that exists (an axis has one only when its length is even);
++//   * along the x axis, and along the y-Nyquist line when length[1] is even,
++//     element length[0] - i receives the real part and the negated imaginary
++//     part of element i, for 1 <= i < (length[0] + 1) / 2.
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_planar_2D(std::vector<hostbuf>&     vals,
++                                                const std::vector<Tsize>& length,
++                                                const std::vector<Tsize>& istride,
++                                                const Tsize               idist,
++                                                const Tsize               nbatch)
++{
++    for(unsigned int b = 0; b < nbatch; ++b)
++    {
++        auto re = ((Tfloat*)vals[0].data()) + b * idist;
++        auto im = ((Tfloat*)vals[1].data()) + b * idist;
++
++        const bool x_even = length[0] % 2 == 0;
++        const bool y_even = length[1] % 2 == 0;
++        const auto x_nyq  = istride[0] * (length[0] / 2);
++        const auto y_nyq  = istride[1] * (length[1] / 2);
++        const auto x_half = (length[0] + 1) / 2;
++
++        // points that must be real
++        im[0] = 0.0;
++        if(x_even)
++            im[x_nyq] = 0.0;
++        if(y_even)
++            im[y_nyq] = 0.0;
++        if(x_even && y_even)
++            im[x_nyq + y_nyq] = 0.0;
++
++        // x-axis
++        for(unsigned int i = 1; i < x_half; ++i)
++        {
++            re[istride[0] * (length[0] - i)] = re[istride[0] * i];
++            im[istride[0] * (length[0] - i)] = -im[istride[0] * i];
++        }
++
++        if(!y_even)
++            continue;
++
++        // x-axis at y-nyquist
++        for(unsigned int i = 1; i < x_half; ++i)
++        {
++            re[istride[0] * (length[0] - i) + y_nyq] = re[istride[0] * i + y_nyq];
++            im[istride[0] * (length[0] - i) + y_nyq] = -im[istride[0] * i + y_nyq];
++        }
++    }
++}
++
++// Impose Hermitian symmetry on host-side interleaved complex 3D data, in
++// place.  vals[0] is read as an array of std::complex<Tfloat>; consecutive
++// batches start idist elements apart.
++//
++// For every batch, the z = 0 plane and (when length[2] is even) the z-Nyquist
++// plane get the same treatment:
++//   * the imaginary part is zeroed at the plane's origin and at each of its
++//     Nyquist points that exists (an axis has one only when its length is
++//     even);
++//   * along the y axis, the y line at x-Nyquist, the x axis, the x line at
++//     y-Nyquist and across the x-y plane, the mirrored element is overwritten
++//     with the conjugate of its source.
++//
++// Both planes share one routine so that their index arithmetic cannot drift
++// apart; the former "x-axis at yz-nyquist" loop had lost its y-Nyquist offset.
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_interleaved_3D(std::vector<hostbuf>&     vals,
++                                                     const std::vector<Tsize>& length,
++                                                     const std::vector<Tsize>& istride,
++                                                     const Tsize               idist,
++                                                     const Tsize               nbatch)
++{
++    const bool x_even = length[0] % 2 == 0;
++    const bool y_even = length[1] % 2 == 0;
++    const bool z_even = length[2] % 2 == 0;
++
++    // Offset of the Nyquist index along each axis (only used for even axes).
++    const Tsize x_nyq = istride[0] * (length[0] / 2);
++    const Tsize y_nyq = istride[1] * (length[1] / 2);
++    const Tsize z_nyq = istride[2] * (length[2] / 2);
++
++    // Axis mirroring only visits indices strictly below the midpoint.
++    const Tsize x_half = (length[0] + 1) / 2;
++    const Tsize y_half = (length[1] + 1) / 2;
++
++    for(Tsize ibatch = 0; ibatch < nbatch; ++ibatch)
++    {
++        auto data = ((std::complex<Tfloat>*)vals[0].data()) + ibatch * idist;
++
++        // data[dst] = conj(data[src])
++        const auto mirror
++            = [data](const Tsize dst, const Tsize src) { data[dst] = std::conj(data[src]); };
++
++        // Symmetrize the constant-z plane that starts at offset `plane`.
++        const auto symmetrize_plane = [&](const Tsize plane) {
++            // points that must be real
++            data[plane].imag(0.0);
++            if(x_even)
++                data[plane + x_nyq].imag(0.0);
++            if(y_even)
++                data[plane + y_nyq].imag(0.0);
++            if(x_even && y_even)
++                data[plane + x_nyq + y_nyq].imag(0.0);
++
++            // y-axis
++            for(Tsize j = 1; j < y_half; ++j)
++                mirror(plane + istride[1] * (length[1] - j), plane + istride[1] * j);
++
++            if(x_even)
++            {
++                // y-axis at x-nyquist
++                for(Tsize j = 1; j < y_half; ++j)
++                    mirror(plane + x_nyq + istride[1] * (length[1] - j),
++                           plane + x_nyq + istride[1] * j);
++            }
++
++            // x-axis
++            for(Tsize i = 1; i < x_half; ++i)
++                mirror(plane + istride[0] * (length[0] - i), plane + istride[0] * i);
++
++            if(y_even)
++            {
++                // x-axis at y-nyquist
++                for(Tsize i = 1; i < x_half; ++i)
++                    mirror(plane + istride[0] * (length[0] - i) + y_nyq,
++                           plane + istride[0] * i + y_nyq);
++            }
++
++            // x-y plane
++            for(Tsize i = 1; i < x_half; ++i)
++            {
++                for(Tsize j = 1; j < length[1]; ++j)
++                    mirror(plane + istride[0] * (length[0] - i) + istride[1] * (length[1] - j),
++                           plane + istride[0] * i + istride[1] * j);
++            }
++        };
++
++        symmetrize_plane(0);
++        if(z_even)
++            symmetrize_plane(z_nyq);
++    }
++}
++
++// Impose Hermitian symmetry on host-side planar complex 3D data, in place.
++// vals[0] / vals[1] are read as the arrays of Tfloat real / imaginary parts;
++// consecutive batches start idist elements apart.
++//
++// For every batch, the z = 0 plane and (when length[2] is even) the z-Nyquist
++// plane get the same treatment:
++//   * the imaginary part is zeroed at the plane's origin and at each of its
++//     Nyquist points that exists (an axis has one only when its length is
++//     even);
++//   * along the y axis, the y line at x-Nyquist, the x axis, the x line at
++//     y-Nyquist and across the x-y plane, the mirrored element receives the
++//     real part and the negated imaginary part of its source.
++//
++// Both planes share one routine so that their index arithmetic cannot drift
++// apart; the former "x-axis at yz-nyquist" loop had lost its y-Nyquist offset.
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_planar_3D(std::vector<hostbuf>&     vals,
++                                                const std::vector<Tsize>& length,
++                                                const std::vector<Tsize>& istride,
++                                                const Tsize               idist,
++                                                const Tsize               nbatch)
++{
++    const bool x_even = length[0] % 2 == 0;
++    const bool y_even = length[1] % 2 == 0;
++    const bool z_even = length[2] % 2 == 0;
++
++    // Offset of the Nyquist index along each axis (only used for even axes).
++    const Tsize x_nyq = istride[0] * (length[0] / 2);
++    const Tsize y_nyq = istride[1] * (length[1] / 2);
++    const Tsize z_nyq = istride[2] * (length[2] / 2);
++
++    // Axis mirroring only visits indices strictly below the midpoint.
++    const Tsize x_half = (length[0] + 1) / 2;
++    const Tsize y_half = (length[1] + 1) / 2;
++
++    for(Tsize ibatch = 0; ibatch < nbatch; ++ibatch)
++    {
++        auto data_real = ((Tfloat*)vals[0].data()) + ibatch * idist;
++        auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist;
++
++        // element dst = conjugate of element src
++        const auto mirror = [data_real, data_imag](const Tsize dst, const Tsize src) {
++            data_real[dst] = data_real[src];
++            data_imag[dst] = -data_imag[src];
++        };
++
++        // Symmetrize the constant-z plane that starts at offset `plane`.
++        const auto symmetrize_plane = [&](const Tsize plane) {
++            // points that must be real
++            data_imag[plane] = 0.0;
++            if(x_even)
++                data_imag[plane + x_nyq] = 0.0;
++            if(y_even)
++                data_imag[plane + y_nyq] = 0.0;
++            if(x_even && y_even)
++                data_imag[plane + x_nyq + y_nyq] = 0.0;
++
++            // y-axis
++            for(Tsize j = 1; j < y_half; ++j)
++                mirror(plane + istride[1] * (length[1] - j), plane + istride[1] * j);
++
++            if(x_even)
++            {
++                // y-axis at x-nyquist
++                for(Tsize j = 1; j < y_half; ++j)
++                    mirror(plane + x_nyq + istride[1] * (length[1] - j),
++                           plane + x_nyq + istride[1] * j);
++            }
++
++            // x-axis
++            for(Tsize i = 1; i < x_half; ++i)
++                mirror(plane + istride[0] * (length[0] - i), plane + istride[0] * i);
++
++            if(y_even)
++            {
++                // x-axis at y-nyquist
++                for(Tsize i = 1; i < x_half; ++i)
++                    mirror(plane + istride[0] * (length[0] - i) + y_nyq,
++                           plane + istride[0] * i + y_nyq);
++            }
++
++            // x-y plane
++            for(Tsize i = 1; i < x_half; ++i)
++            {
++                for(Tsize j = 1; j < length[1]; ++j)
++                    mirror(plane + istride[0] * (length[0] - i) + istride[1] * (length[1] - j),
++                           plane + istride[0] * i + istride[1] * j);
++            }
++        };
++
++        symmetrize_plane(0);
++        if(z_even)
++            symmetrize_plane(z_nyq);
++    }
++}
++
++template <typename Tfloat, typename Tint1>
++static void generate_random_interleaved_data(std::vector<hostbuf>& input,
++ const Tint1& whole_length,
++ const Tint1& whole_stride,
++ const size_t idist,
++ const size_t nbatch)
++{ // Host-side fill of input[0], viewed as std::complex<Tfloat>, with pseudo-random values.
++ auto idata = (std::complex<Tfloat>*)input[0].data();
++ size_t i_base = 0; // base offset of the current batch; advanced by idist after every batch
++ auto partitions = partition_rowmajor(whole_length);
++ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
++ {
++#pragma omp parallel for num_threads(partitions.size())
++ for(size_t part = 0; part < partitions.size(); ++part)
++ {
++ auto index = partitions[part].first;
++ const auto length = partitions[part].second;
++ std::mt19937 gen(compute_index(index, whole_stride, i_base)); // one generator per partition, seeded with compute_index() of its starting index
++ do
++ {
++ const auto i = compute_index(index, whole_stride, i_base);
++ const Tfloat x = (Tfloat)gen() / (Tfloat)gen.max(); // real part: raw draw divided by the generator maximum
++ const Tfloat y = (Tfloat)gen() / (Tfloat)gen.max(); // imaginary part: the next draw, scaled the same way
++ const std::complex<Tfloat> val(x, y);
++ idata[i] = val;
++ } while(increment_rowmajor(index, length)); // keep going while increment_rowmajor returns true
++ }
++ }
++}
++
++template <typename Tfloat, typename Tint1>
++static void generate_interleaved_data(std::vector<hostbuf>& input,
++ const Tint1& whole_length,
++ const Tint1& whole_stride,
++ const size_t idist,
++ const size_t nbatch)
++{ // Host-side fill of input[0], viewed as std::complex<Tfloat>, with a deterministic linear sequence (real == imag).
++ auto idata = (std::complex<Tfloat>*)input[0].data();
++ size_t i_base = 0;
++ auto partitions = partition_rowmajor(whole_length);
++ auto unit_stride = make_unit_stride(whole_length);
++ // NOTE(review): if count_iters(whole_length) returns 1 the divisor below is 0 - confirm callers never pass that.
++ const Tfloat inv_scale = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1);
++
++ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
++ {
++#pragma omp parallel for num_threads(partitions.size())
++ for(size_t part = 0; part < partitions.size(); ++part)
++ {
++ auto index = partitions[part].first;
++ const auto length = partitions[part].second;
++ do
++ {
++ const auto val_xy
++ = -0.5 + static_cast<Tfloat>(compute_index(index, unit_stride, 0)) * inv_scale;
++ // the same value is stored in the real and in the imaginary component
++ const std::complex<Tfloat> val(val_xy, val_xy);
++ // the destination offset uses the caller's strides plus the current batch base
++ const auto i = compute_index(index, whole_stride, i_base);
++
++ idata[i] = val;
++ } while(increment_rowmajor(index, length));
++ }
++ }
++}
++
++template <typename Tfloat, typename Tint1>
++static void generate_random_planar_data(std::vector<hostbuf>& input,
++ const Tint1& whole_length,
++ const Tint1& whole_stride,
++ const size_t idist,
++ const size_t nbatch)
++{ // Host-side pseudo-random fill of a planar complex array: real parts go to input[0], imaginary parts to input[1].
++ auto ireal = (Tfloat*)input[0].data();
++ auto iimag = (Tfloat*)input[1].data();
++ size_t i_base = 0; // base offset of the current batch; advanced by idist after every batch
++ auto partitions = partition_rowmajor(whole_length);
++ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
++ {
++#pragma omp parallel for num_threads(partitions.size())
++ for(size_t part = 0; part < partitions.size(); ++part)
++ {
++ auto index = partitions[part].first;
++ const auto length = partitions[part].second;
++ std::mt19937 gen(compute_index(index, whole_stride, i_base)); // one generator per partition, seeded with compute_index() of its starting index
++ do
++ {
++ const auto i = compute_index(index, whole_stride, i_base);
++ const std::complex<Tfloat> val((Tfloat)gen() / (Tfloat)gen.max(),
++ (Tfloat)gen() / (Tfloat)gen.max()); // NOTE(review): the two gen() calls are constructor arguments, so which draw becomes real vs imag is not fixed by the language
++ ireal[i] = val.real();
++ iimag[i] = val.imag();
++ } while(increment_rowmajor(index, length)); // keep going while increment_rowmajor returns true
++ }
++ }
++}
++
++template <typename Tfloat, typename Tint1>
++static void generate_planar_data(std::vector<hostbuf>& input,
++ const Tint1& whole_length,
++ const Tint1& whole_stride,
++ const size_t idist,
++ const size_t nbatch)
++{
++ // Host-side fill of a planar complex array (input[0] = real, input[1] = imag) with a deterministic linear sequence.
++ auto ireal = (Tfloat*)input[0].data();
++ auto iimag = (Tfloat*)input[1].data();
++ size_t i_base = 0;
++ auto partitions = partition_rowmajor(whole_length);
++ auto unit_stride = make_unit_stride(whole_length);
++ // NOTE(review): if count_iters(whole_length) returns 1 the divisor below is 0 - confirm callers never pass that.
++ const Tfloat inv_scale = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1);
++
++ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
++ {
++#pragma omp parallel for num_threads(partitions.size())
++ for(size_t part = 0; part < partitions.size(); ++part)
++ {
++ auto index = partitions[part].first;
++ const auto length = partitions[part].second;
++ do
++ {
++ const auto val_xy
++ = -0.5 + static_cast<Tfloat>(compute_index(index, unit_stride, 0)) * inv_scale;
++ // the destination offset uses the caller's strides plus the current batch base
++ const auto i = compute_index(index, whole_stride, i_base);
++ // the same value is written to both planes
++ ireal[i] = val_xy;
++ iimag[i] = val_xy;
++ } while(increment_rowmajor(index, length));
++ }
++ }
++}
++
++template <typename Tfloat, typename Tint1>
++static void generate_random_real_data(std::vector<hostbuf>& input,
++ const Tint1& whole_length,
++ const Tint1& whole_stride,
++ const size_t idist,
++ const size_t nbatch)
++{ // Host-side fill of input[0], viewed as Tfloat, with pseudo-random values.
++ auto idata = (Tfloat*)input[0].data();
++ size_t i_base = 0; // base offset of the current batch; advanced by idist after every batch
++ auto partitions = partition_rowmajor(whole_length);
++ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
++ {
++#pragma omp parallel for num_threads(partitions.size())
++ for(size_t part = 0; part < partitions.size(); ++part)
++ {
++ auto index = partitions[part].first;
++ const auto length = partitions[part].second;
++ std::mt19937 gen(compute_index(index, whole_stride, i_base)); // one generator per partition, seeded with compute_index() of its starting index
++ do
++ {
++ const auto i = compute_index(index, whole_stride, i_base);
++ const Tfloat val = (Tfloat)gen() / (Tfloat)gen.max(); // raw draw divided by the generator maximum
++ idata[i] = val;
++ } while(increment_rowmajor(index, length)); // keep going while increment_rowmajor returns true
++ }
++ }
++}
++
++template <typename Tfloat, typename Tint1>
++static void generate_real_data(std::vector<hostbuf>& input,
++ const Tint1& whole_length,
++ const Tint1& whole_stride,
++ const size_t idist,
++ const size_t nbatch)
++{
++ // Host-side fill of input[0], viewed as Tfloat, with a deterministic linear sequence.
++ auto idata = (Tfloat*)input[0].data();
++ size_t i_base = 0;
++ auto partitions = partition_rowmajor(whole_length);
++ auto unit_stride = make_unit_stride(whole_length);
++ // NOTE(review): if count_iters(whole_length) returns 1 the divisor below is 0 - confirm callers never pass that.
++ const Tfloat inv_scale = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1);
++
++ for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
++ {
++#pragma omp parallel for num_threads(partitions.size())
++ for(size_t part = 0; part < partitions.size(); ++part)
++ {
++ auto index = partitions[part].first;
++ const auto length = partitions[part].second;
++ do
++ {
++ const auto i = compute_index(index, whole_stride, i_base);
++ // value = -0.5 + (index computed with unit strides and base 0) * inv_scale
++ idata[i]
++ = -0.5 + static_cast<Tfloat>(compute_index(index, unit_stride, 0)) * inv_scale;
++ } while(increment_rowmajor(index, length));
++ }
++ }
++}
++
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_interleaved(std::vector<hostbuf>& vals,
++ const std::vector<Tsize>& length,
++ const std::vector<Tsize>& istride,
++ const Tsize idist,
++ const Tsize nbatch)
++{ // Forward all arguments to the 1D/2D/3D interleaved implementation selected by length.size().
++ switch(length.size())
++ {
++ case 1:
++ impose_hermitian_symmetry_interleaved_1D<Tfloat>(vals, length, istride, idist, nbatch);
++ break;
++ case 2:
++ impose_hermitian_symmetry_interleaved_2D<Tfloat>(vals, length, istride, idist, nbatch);
++ break;
++ case 3:
++ impose_hermitian_symmetry_interleaved_3D<Tfloat>(vals, length, istride, idist, nbatch);
++ break;
++ default: // any rank other than 1, 2 or 3 is rejected with std::runtime_error
++ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
++ }
++}
++
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_planar(std::vector<hostbuf>& vals,
++ const std::vector<Tsize>& length,
++ const std::vector<Tsize>& istride,
++ const Tsize idist,
++ const Tsize nbatch)
++{ // Forward all arguments to the 1D/2D/3D planar implementation selected by length.size().
++ switch(length.size())
++ {
++ case 1:
++ impose_hermitian_symmetry_planar_1D<Tfloat>(vals, length, istride, idist, nbatch);
++ break;
++ case 2:
++ impose_hermitian_symmetry_planar_2D<Tfloat>(vals, length, istride, idist, nbatch);
++ break;
++ case 3:
++ impose_hermitian_symmetry_planar_3D<Tfloat>(vals, length, istride, idist, nbatch);
++ break;
++ default: // any rank other than 1, 2 or 3 is rejected with std::runtime_error
++ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
++ }
++}
++
++#endif // DATA_GEN_HOST_H
+diff --git a/shared/device_properties.h b/shared/device_properties.h
+new file mode 100644
+index 0000000..6e2e1e1
+--- /dev/null
++++ b/shared/device_properties.h
+@@ -0,0 +1,74 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_DEVICE_PROPS_H
++#define ROCFFT_DEVICE_PROPS_H
++
++#include <cstdint>
++#include <hip/hip_runtime_api.h>
++#include <stdexcept>
++
++// Return the hipDeviceProp_t of the device reported by hipGetDevice(); throws std::runtime_error if either HIP call fails.
++static hipDeviceProp_t get_curr_device_prop()
++{
++ hipDeviceProp_t prop;
++ int deviceId = 0;
++ if(hipGetDevice(&deviceId) != hipSuccess)
++ throw std::runtime_error("hipGetDevice failed.");
++ // NOTE(review): std::to_string below needs <string>, which this header does not include directly.
++ if(hipGetDeviceProperties(&prop, deviceId) != hipSuccess)
++ throw std::runtime_error("hipGetDeviceProperties failed for deviceId "
++ + std::to_string(deviceId));
++
++ return prop;
++}
++
++// check that the given grid/block dims will fit into the limits in
++// the device properties. throws std::runtime_error if the limits
++// are exceeded; kernel_name is only used to label the exception message.
++static void launch_limits_check(const std::string& kernel_name,
++ const dim3 gridDim,
++ const dim3 blockDim,
++ const hipDeviceProp_t& deviceProp)
++{
++ // Need lots of casting here because dim3 is unsigned but device
++ // props are signed. Cast direct comparisons to fix signedness
++ // issues. Promote types to 64-bit when multiplying to try to
++ // avoid overflow.
++
++ // Block limits along each dimension (compared against maxThreadsDim[0..2])
++ if(blockDim.x > static_cast<uint32_t>(deviceProp.maxThreadsDim[0])
++ || blockDim.y > static_cast<uint32_t>(deviceProp.maxThreadsDim[1])
++ || blockDim.z > static_cast<uint32_t>(deviceProp.maxThreadsDim[2]))
++ throw std::runtime_error("max threads per dim exceeded: " + kernel_name);
++
++ // Total threads for the whole block (x * y * z, computed in 64 bits, against maxThreadsPerBlock)
++ if(static_cast<uint64_t>(blockDim.x) * blockDim.y * blockDim.z
++ > static_cast<uint64_t>(deviceProp.maxThreadsPerBlock))
++ throw std::runtime_error("max threads per block exceeded: " + kernel_name);
++
++ // Grid dimension limits (compared against maxGridSize[0..2]; the total grid size is not checked)
++ if(gridDim.x > static_cast<uint32_t>(deviceProp.maxGridSize[0])
++ || gridDim.y > static_cast<uint32_t>(deviceProp.maxGridSize[1])
++ || gridDim.z > static_cast<uint32_t>(deviceProp.maxGridSize[2]))
++ throw std::runtime_error("max grid size exceeded: " + kernel_name);
++}
++
++#endif
+diff --git a/shared/enum_to_string.h b/shared/enum_to_string.h
+new file mode 100644
+index 0000000..1c2fba0
+--- /dev/null
++++ b/shared/enum_to_string.h
+@@ -0,0 +1,81 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ENUM_TO_STRING_H
++#define ENUM_TO_STRING_H
++
++#include "fft_params.h"
++
++// Return the enumerator name of a hipError_t as a string; codes not listed below throw std::runtime_error.
++static std::string hipError_to_string(const hipError_t ret)
++{
++ switch(ret)
++ {
++ case hipSuccess:
++ return "hipSuccess";
++ case hipErrorInvalidContext:
++ return "hipErrorInvalidContext";
++ case hipErrorInvalidKernelFile:
++ return "hipErrorInvalidKernelFile";
++ case hipErrorMemoryAllocation:
++ return "hipErrorMemoryAllocation";
++ case hipErrorInitializationError:
++ return "hipErrorInitializationError";
++ case hipErrorLaunchFailure:
++ return "hipErrorLaunchFailure";
++ case hipErrorLaunchOutOfResources:
++ return "hipErrorLaunchOutOfResources";
++ case hipErrorInvalidDevice:
++ return "hipErrorInvalidDevice";
++ case hipErrorInvalidValue:
++ return "hipErrorInvalidValue";
++ case hipErrorInvalidDevicePointer:
++ return "hipErrorInvalidDevicePointer";
++ case hipErrorInvalidMemcpyDirection:
++ return "hipErrorInvalidMemcpyDirection";
++ case hipErrorUnknown:
++ return "hipErrorUnknown";
++ case hipErrorInvalidResourceHandle:
++ return "hipErrorInvalidResourceHandle";
++ case hipErrorNotReady:
++ return "hipErrorNotReady";
++ case hipErrorNoDevice:
++ return "hipErrorNoDevice";
++ case hipErrorPeerAccessAlreadyEnabled:
++ return "hipErrorPeerAccessAlreadyEnabled";
++ case hipErrorPeerAccessNotEnabled:
++ return "hipErrorPeerAccessNotEnabled";
++ case hipErrorRuntimeMemory:
++ return "hipErrorRuntimeMemory";
++ case hipErrorRuntimeOther:
++ return "hipErrorRuntimeOther";
++ case hipErrorHostMemoryAlreadyRegistered:
++ return "hipErrorHostMemoryAlreadyRegistered";
++ case hipErrorHostMemoryNotRegistered:
++ return "hipErrorHostMemoryNotRegistered";
++ case hipErrorMapBufferObjectFailed:
++ return "hipErrorMapBufferObjectFailed";
++ case hipErrorTbd:
++ return "hipErrorTbd";
++ default: // NOTE(review): an unlisted code raises instead of returning a placeholder string - confirm callers expect that
++ throw std::runtime_error("unknown hipError");
++ }
++}
++#endif
+diff --git a/shared/environment.h b/shared/environment.h
+new file mode 100644
+index 0000000..7be56a0
+--- /dev/null
++++ b/shared/environment.h
+@@ -0,0 +1,97 @@
++// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++// wrappers around environment variable routines
++
++#pragma once
++
++#include <string>
++
++// Windows provides "getenv" and "_putenv", but those modify the
++// runtime's copy of the environment. The actual environment in the
++// process control block is accessed using GetEnvironmentVariable and
++// SetEnvironmentVariable.
++
++#ifdef WIN32
++#include <windows.h>
++static void rocfft_setenv(const char* var, const char* value)
++{ // Windows build: set var to value via SetEnvironmentVariable; the call's result is ignored.
++ SetEnvironmentVariable(var, value);
++}
++static void rocfft_unsetenv(const char* var)
++{ // Windows build: passing nullptr as the value to SetEnvironmentVariable is used here to remove var.
++ SetEnvironmentVariable(var, nullptr);
++}
++static std::string rocfft_getenv(const char* var)
++{ // Windows build: return the value of var, or an empty string when the size query returns 0.
++ DWORD size = GetEnvironmentVariable(var, nullptr, 0); // first call only asks for the required size
++ std::string ret;
++ if(size)
++ {
++ ret.resize(size);
++ GetEnvironmentVariable(var, ret.data(), size); // second call copies the value into the resized string
++ // GetEnvironmentVariable counts the terminating null, so remove it
++ while(!ret.empty() && ret.back() == 0)
++ ret.pop_back();
++ }
++ return ret;
++}
++
++#else
++
++#include <stdlib.h>
++
++static void rocfft_setenv(const char* var, const char* value)
++{ // non-Windows build: setenv with overwrite flag 1, so an existing value is replaced; the result is ignored.
++ setenv(var, value, 1);
++}
++static void rocfft_unsetenv(const char* var)
++{ // non-Windows build: remove var from the environment; the result of unsetenv is ignored.
++ unsetenv(var);
++}
++static std::string rocfft_getenv(const char* var)
++{ // non-Windows build: return the value of var as a std::string.
++ auto value = getenv(var);
++ return value ? value : ""; // getenv yields a null pointer for an unset variable; map that to an empty string
++}
++#endif
++
++// RAII object to set an environment variable and restore it to its
++// previous value on destruction (a previously empty value is restored by unsetting the variable)
++struct EnvironmentSetTemp
++{
++ EnvironmentSetTemp(const char* _var, const char* val)
++ : var(_var)
++ {
++ auto val_ptr = rocfft_getenv(_var); // despite the name this is a std::string holding the current value
++ if(!val_ptr.empty())
++ oldvalue = val_ptr;
++ rocfft_setenv(_var, val);
++ }
++ ~EnvironmentSetTemp()
++ {
++ if(oldvalue.empty()) // nothing was saved: the variable was unset or empty before construction
++ rocfft_unsetenv(var.c_str());
++ else
++ rocfft_setenv(var.c_str(), oldvalue.c_str());
++ }
++ std::string var; // name of the variable being overridden
++ std::string oldvalue; // value seen at construction; empty means "unset on destruction"
++};
+diff --git a/shared/fft_params.h b/shared/fft_params.h
+new file mode 100644
+index 0000000..bf428ef
+--- /dev/null
++++ b/shared/fft_params.h
+@@ -0,0 +1,3274 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef FFT_PARAMS_H
++#define FFT_PARAMS_H
++
++#include <algorithm>
++#include <hip/hip_runtime.h>
++#include <iostream>
++#include <mutex>
++#include <numeric>
++#include <sstream>
++#ifdef _OPENMP
++#include <omp.h>
++#endif
++#include <random>
++#include <tuple>
++#include <unordered_set>
++#include <vector>
++
++#include "../shared/arithmetic.h"
++#include "../shared/array_validator.h"
++#include "../shared/data_gen_device.h"
++#include "../shared/data_gen_host.h"
++#include "../shared/device_properties.h"
++#include "../shared/printbuffer.h"
++#include "../shared/ptrdiff.h"
++
++enum fft_status // status codes; fft_status_success is the first enumerator and therefore has value 0
++{
++ fft_status_success,
++ fft_status_failure,
++ fft_status_invalid_arg_value,
++ fft_status_invalid_dimensions,
++ fft_status_invalid_array_type,
++ fft_status_invalid_strides,
++ fft_status_invalid_distance,
++ fft_status_invalid_offset,
++ fft_status_invalid_work_buffer,
++};
++
++enum fft_transform_type // kind of transform: complex or real data, forward or inverse direction
++{
++ fft_transform_type_complex_forward,
++ fft_transform_type_complex_inverse,
++ fft_transform_type_real_forward,
++ fft_transform_type_real_inverse,
++};
++
++enum fft_precision // floating-point precision; var_size() maps these to _Float16, float and double
++{
++ fft_precision_half,
++ fft_precision_single,
++ fft_precision_double,
++};
++
++static std::istream& operator>>(std::istream& str, fft_precision& precision)
++{ // Read one whitespace-delimited word and map "half" / "single" / "double" to the matching enumerator.
++ std::string word;
++ str >> word;
++ // the comparison is exact: any other word (including an empty one after a failed read) throws
++ if(word == "half")
++ precision = fft_precision_half;
++ else if(word == "single")
++ precision = fft_precision_single;
++ else if(word == "double")
++ precision = fft_precision_double;
++ else
++ throw std::runtime_error("Invalid precision specified");
++ return str;
++}
++
++// fft_input_generator_*: linearly spaced sequence in [-0.5,0.5]; the _device/_host suffix names where the data is generated
++// fft_input_random_generator_*: pseudo-random sequence in [-0.5,0.5] (NOTE(review): the set_input comment says [0,1] - confirm the range)
++enum fft_input_generator
++{
++ fft_input_random_generator_device,
++ fft_input_random_generator_host,
++ fft_input_generator_device,
++ fft_input_generator_host,
++};
++
++static std::istream& operator>>(std::istream& str, fft_input_generator& gen)
++{ // Read one whitespace-delimited word; "0".."3" select the enumerators in their declaration order.
++ std::string word;
++ str >> word;
++ // the word is compared as text, so only the exact strings "0", "1", "2" and "3" are accepted
++ if(word == "0")
++ gen = fft_input_random_generator_device;
++ else if(word == "1")
++ gen = fft_input_random_generator_host;
++ else if(word == "2")
++ gen = fft_input_generator_device;
++ else if(word == "3")
++ gen = fft_input_generator_host;
++ else
++ throw std::runtime_error("Invalid input generator specified");
++ return str;
++}
++
++enum fft_array_type // memory layout of an input/output array
++{
++ fft_array_type_complex_interleaved,
++ fft_array_type_complex_planar,
++ fft_array_type_real,
++ fft_array_type_hermitian_interleaved,
++ fft_array_type_hermitian_planar,
++ fft_array_type_unset, // default value of fft_params::itype / otype
++};
++
++enum fft_result_placement // printed by fft_params::str() as "in-place" or "out-of-place"
++{
++ fft_placement_inplace,
++ fft_placement_notinplace,
++};
++
++// Determine the size in bytes of one array element given the precision and type.
++template <typename Tsize>
++inline Tsize var_size(const fft_precision precision, const fft_array_type type)
++{
++ size_t var_size = 0; // local shadows the function name; stays 0 if precision matches no case below
++ switch(precision)
++ {
++ case fft_precision_half:
++ var_size = sizeof(_Float16);
++ break;
++ case fft_precision_single:
++ var_size = sizeof(float);
++ break;
++ case fft_precision_double:
++ var_size = sizeof(double);
++ break;
++ }
++ switch(type)
++ {
++ case fft_array_type_complex_interleaved:
++ case fft_array_type_hermitian_interleaved:
++ var_size *= 2; // interleaved layouts store real and imaginary parts in the same element
++ break;
++ default: // planar, real and unset types keep the scalar size
++ break;
++ }
++ return var_size; // converted from size_t to the caller-chosen Tsize
++}
++// Given an array type and transform length, strides, etc, fill the gpu input buffers (floats/doubles or complex
++// floats/doubles) with generated data (random floats documented as [0,1]). Only the *_device generators are handled in this overload.
++template <typename Tfloat, typename Tint1>
++inline void set_input(std::vector<gpubuf>& input,
++ const fft_input_generator igen,
++ const fft_array_type itype,
++ const std::vector<size_t>& length,
++ const std::vector<size_t>& ilength,
++ const std::vector<size_t>& istride,
++ const Tint1& whole_length,
++ const Tint1& whole_stride,
++ const size_t idist,
++ const size_t nbatch,
++ const hipDeviceProp_t& deviceProp)
++{
++ auto isize = count_iters(whole_length) * nbatch;
++ // throws std::runtime_error for any itype not listed in the switch below
++ switch(itype)
++ {
++ case fft_array_type_complex_interleaved:
++ case fft_array_type_hermitian_interleaved:
++ {
++ auto ibuffer = (rocfft_complex<Tfloat>*)input[0].data();
++ // a host generator value matches neither branch, so no data is generated for it here
++ if(igen == fft_input_generator_device)
++ generate_interleaved_data(
++ whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp);
++ else if(igen == fft_input_random_generator_device)
++ generate_random_interleaved_data(
++ whole_length, idist, isize, whole_stride, ibuffer, deviceProp);
++ // hermitian layouts additionally get the symmetry imposed, whichever generator was selected
++ if(itype == fft_array_type_hermitian_interleaved)
++ {
++ auto ibuffer_2 = (rocfft_complex<Tfloat>*)input[0].data(); // same pointer value as ibuffer
++ impose_hermitian_symmetry_interleaved(
++ length, ilength, istride, idist, nbatch, ibuffer_2, deviceProp);
++ }
++
++ break;
++ }
++ case fft_array_type_complex_planar:
++ case fft_array_type_hermitian_planar:
++ {
++ auto ibuffer_real = (Tfloat*)input[0].data();
++ auto ibuffer_imag = (Tfloat*)input[1].data();
++ // planar layouts use two buffers: input[0] holds real parts, input[1] imaginary parts
++ if(igen == fft_input_generator_device)
++ generate_planar_data(whole_length,
++ idist,
++ isize,
++ whole_stride,
++ nbatch,
++ ibuffer_real,
++ ibuffer_imag,
++ deviceProp);
++ else if(igen == fft_input_random_generator_device)
++ generate_random_planar_data(
++ whole_length, idist, isize, whole_stride, ibuffer_real, ibuffer_imag, deviceProp);
++
++ if(itype == fft_array_type_hermitian_planar)
++ impose_hermitian_symmetry_planar(
++ length, ilength, istride, idist, nbatch, ibuffer_real, ibuffer_imag, deviceProp);
++
++ break;
++ }
++ case fft_array_type_real:
++ {
++ auto ibuffer = (Tfloat*)input[0].data();
++ // real data: single buffer, no symmetry step
++ if(igen == fft_input_generator_device)
++ generate_real_data(
++ whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp);
++ else if(igen == fft_input_random_generator_device)
++ generate_random_real_data(
++ whole_length, idist, isize, whole_stride, ibuffer, deviceProp);
++
++ break;
++ }
++ default:
++ throw std::runtime_error("Input layout format not yet supported");
++ }
++}
++
++template <typename Tfloat, typename Tint1>
++inline void set_input(std::vector<hostbuf>& input,
++ const fft_input_generator igen,
++ const fft_array_type itype,
++ const std::vector<size_t>& length,
++ const std::vector<size_t>& ilength,
++ const std::vector<size_t>& istride,
++ const Tint1& whole_length,
++ const Tint1& whole_stride,
++ const size_t idist,
++ const size_t nbatch,
++ const hipDeviceProp_t& deviceProp)
++{ // Host-buffer overload: handles only the *_host generators; ilength and deviceProp are not used in this body.
++ switch(itype)
++ {
++ case fft_array_type_complex_interleaved:
++ case fft_array_type_hermitian_interleaved:
++ {
++ if(igen == fft_input_generator_host)
++ generate_interleaved_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
++ else if(igen == fft_input_random_generator_host)
++ generate_random_interleaved_data<Tfloat>(
++ input, whole_length, whole_stride, idist, nbatch);
++ // hermitian layouts additionally get the symmetry imposed, whichever generator was selected
++ if(itype == fft_array_type_hermitian_interleaved)
++ impose_hermitian_symmetry_interleaved<Tfloat>(input, length, istride, idist, nbatch);
++
++ break;
++ }
++ case fft_array_type_complex_planar:
++ case fft_array_type_hermitian_planar:
++ {
++ if(igen == fft_input_generator_host)
++ generate_planar_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
++ else if(igen == fft_input_random_generator_host)
++ generate_random_planar_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
++
++ if(itype == fft_array_type_hermitian_planar)
++ impose_hermitian_symmetry_planar<Tfloat>(input, length, istride, idist, nbatch);
++
++ break;
++ }
++ case fft_array_type_real:
++ {
++ if(igen == fft_input_generator_host)
++ generate_real_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
++ else if(igen == fft_input_random_generator_host)
++ generate_random_real_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
++
++ break;
++ }
++ default: // any other itype (e.g. fft_array_type_unset) is rejected
++ throw std::runtime_error("Input layout format not yet supported");
++ }
++}
++
++// unroll set_input for dimension 1, 2, 3: rank 1 passes scalar length/stride, ranks 2 and 3 pass std::tuple; other ranks abort()
++template <typename Tbuff, typename Tfloat>
++inline void set_input(std::vector<Tbuff>& input,
++ const fft_input_generator igen,
++ const fft_array_type itype,
++ const std::vector<size_t>& length,
++ const std::vector<size_t>& ilength,
++ const std::vector<size_t>& istride,
++ const size_t idist,
++ const size_t nbatch,
++ const hipDeviceProp_t& deviceProp)
++{
++ switch(length.size()) // the rank is taken from length; ilength/istride are indexed to match
++ {
++ case 1:
++ set_input<Tfloat>(input,
++ igen,
++ itype,
++ length,
++ ilength,
++ istride,
++ ilength[0],
++ istride[0],
++ idist,
++ nbatch,
++ deviceProp);
++ break;
++ case 2:
++ set_input<Tfloat>(input,
++ igen,
++ itype,
++ length,
++ ilength,
++ istride,
++ std::make_tuple(ilength[0], ilength[1]),
++ std::make_tuple(istride[0], istride[1]),
++ idist,
++ nbatch,
++ deviceProp);
++ break;
++ case 3:
++ set_input<Tfloat>(input,
++ igen,
++ itype,
++ length,
++ ilength,
++ istride,
++ std::make_tuple(ilength[0], ilength[1], ilength[2]),
++ std::make_tuple(istride[0], istride[1], istride[2]),
++ idist,
++ nbatch,
++ deviceProp);
++ break;
++ default: // NOTE(review): terminates the process instead of throwing like the other dispatchers in this file
++ abort();
++ }
++}
++
++// Container class for test parameters.
++class fft_params
++{
++public:
++ // All parameters are row-major.
++ std::vector<size_t> length;
++ std::vector<size_t> istride;
++ std::vector<size_t> ostride;
++ size_t nbatch = 1;
++ fft_precision precision = fft_precision_single;
++ fft_input_generator igen = fft_input_random_generator_device;
++ fft_transform_type transform_type = fft_transform_type_complex_forward;
++ fft_result_placement placement = fft_placement_inplace;
++ size_t idist = 0;
++ size_t odist = 0;
++ fft_array_type itype = fft_array_type_unset;
++ fft_array_type otype = fft_array_type_unset;
++ std::vector<size_t> ioffset = {0, 0};
++ std::vector<size_t> ooffset = {0, 0};
++
++ std::vector<size_t> isize;
++ std::vector<size_t> osize;
++
++ size_t workbuffersize = 0;
++
++ struct fft_brick
++ {
++ // all vectors here are row-major, with same length as FFT
++ // dimension + 1 (for batch dimension)
++
++ // inclusive lower bound of brick
++ std::vector<size_t> lower;
++ // exclusive upper bound of brick
++ std::vector<size_t> upper;
++ // stride of brick in memory
++ std::vector<size_t> stride;
++
++ // compute the length of this brick: upper[i] - lower[i] for every entry of lower
++ std::vector<size_t> length() const
++ {
++ std::vector<size_t> ret;
++ for(size_t i = 0; i < lower.size(); ++i)
++ ret.push_back(upper[i] - lower[i]);
++ return ret;
++ }
++
++ // compute offset of lower bound in a field with the given
++ // stride + dist (batch stride is separate); the stride parameter shadows the member
++ size_t lower_field_offset(std::vector<size_t> stride, size_t dist) const
++ {
++ // brick strides include batch, so adjust our input accordingly
++ stride.insert(stride.begin(), dist);
++ // accumulate in size_t: a plain 0 makes std::inner_product sum in int and truncate large offsets
++ return std::inner_product(lower.begin(), lower.end(), stride.begin(), size_t{0});
++ }
++
++ // location of the brick
++ int device = 0;
++ };
++
++ struct fft_field
++ { // one field of an input or output, described as a list of bricks
++ std::vector<fft_brick> bricks;
++ };
++ // optional brick decomposition of inputs/outputs
++ std::vector<fft_field> ifields;
++ std::vector<fft_field> ofields;
++
++ // run testing load/store callbacks
++ bool run_callbacks = false;
++ static constexpr double load_cb_scalar = 0.457813941;
++ static constexpr double store_cb_scalar = 0.391504938;
++
++ // Check that data outside of output strides is not overwritten.
++ // This is only set explicitly on some tests where there's space
++ // between dimensions, but the dimensions are still in-order.
++ // We're not trying to generically find holes in arbitrary data
++ // layouts.
++ //
++ // NOTE: this flag is not included in tokens, since it doesn't
++ // affect how the FFT library behaves.
++ bool check_output_strides = false;
++
++ // scaling factor - we do a pointwise multiplication of outputs by
++ // this factor
++ double scale_factor = 1.0;
++
++ fft_params(){};
++ virtual ~fft_params(){};
++
++ // Given an array type, return the name as a string: the enumerator name when verbose, otherwise a short code (CI, CP, R, HI, HP, UN).
++ static std::string array_type_name(const fft_array_type type, bool verbose = true)
++ {
++ switch(type)
++ {
++ case fft_array_type_complex_interleaved:
++ return verbose ? "fft_array_type_complex_interleaved" : "CI";
++ case fft_array_type_complex_planar:
++ return verbose ? "fft_array_type_complex_planar" : "CP";
++ case fft_array_type_real:
++ return verbose ? "fft_array_type_real" : "R";
++ case fft_array_type_hermitian_interleaved:
++ return verbose ? "fft_array_type_hermitian_interleaved" : "HI";
++ case fft_array_type_hermitian_planar:
++ return verbose ? "fft_array_type_hermitian_planar" : "HP";
++ case fft_array_type_unset:
++ return verbose ? "fft_array_type_unset" : "UN";
++ }
++ return ""; // reached only for a value outside the enumerators above
++ }
++
++ std::string transform_type_name() const
++ { // Return the enumerator name of this object's transform_type member as a string.
++ switch(transform_type)
++ {
++ case fft_transform_type_complex_forward:
++ return "fft_transform_type_complex_forward";
++ case fft_transform_type_complex_inverse:
++ return "fft_transform_type_complex_inverse";
++ case fft_transform_type_real_forward:
++ return "fft_transform_type_real_forward";
++ case fft_transform_type_real_inverse:
++ return "fft_transform_type_real_inverse";
++ default: // a value outside the four enumerators throws std::runtime_error
++ throw std::runtime_error("Invalid transform type");
++ }
++ }
++
++ // Convert to string for output: a readable listing of the parameters, each entry followed by `separator`.
++ std::string str(const std::string& separator = ", ") const
++ {
++ // top-level stride/dist are not used when fields are specified.
++ const bool have_ifields = !ifields.empty();
++ const bool have_ofields = !ofields.empty();
++
++ std::stringstream ss;
++ auto print_size_vec = [&](const char* description, const std::vector<size_t>& vec) { // writes "description: v0 v1 ..." then the separator
++ ss << description << ":";
++ for(auto i : vec)
++ ss << " " << i;
++ ss << separator;
++ };
++ auto print_fields = [&](const char* description, const std::vector<fft_field>& fields) { // one heading per field, then lower/upper/stride/device for each of its bricks
++ for(unsigned int fidx = 0; fidx < fields.size(); ++fidx)
++ {
++ const auto& f = fields[fidx];
++ ss << description << " " << fidx << ":" << separator;
++ for(unsigned int bidx = 0; bidx < f.bricks.size(); ++bidx)
++ {
++ const auto& b = f.bricks[bidx];
++ ss << " brick " << bidx << ":" << separator;
++ print_size_vec(" lower", b.lower);
++ print_size_vec(" upper", b.upper);
++ print_size_vec(" stride", b.stride);
++ ss << " device: " << b.device << separator;
++ }
++ }
++ };
++
++ print_size_vec("length", length);
++ if(have_ifields)
++ {
++ print_fields("ifield", ifields);
++ }
++ else
++ {
++ print_size_vec("istride", istride);
++ ss << "idist: " << idist << separator;
++ }
++
++ if(have_ofields)
++ {
++ print_fields("ofield", ofields);
++ }
++ else
++ {
++ print_size_vec("ostride", ostride);
++ ss << "odist: " << odist << separator;
++ }
++
++ ss << "batch: " << nbatch << separator;
++ print_size_vec("isize", isize);
++ print_size_vec("osize", osize);
++
++ print_size_vec("ioffset", ioffset);
++ print_size_vec("ooffset", ooffset);
++
++ if(placement == fft_placement_inplace)
++ ss << "in-place";
++ else
++ ss << "out-of-place";
++ ss << separator;
++ ss << "transform_type: " << transform_type_name() << separator;
++ ss << array_type_name(itype) << " -> " << array_type_name(otype) << separator;
++ switch(precision) // no default case: any other precision value writes nothing before the separator
++ {
++ case fft_precision_half:
++ ss << "half-precision";
++ break;
++ case fft_precision_single:
++ ss << "single-precision";
++ break;
++ case fft_precision_double:
++ ss << "double-precision";
++ break;
++ }
++ ss << separator;
++
++ print_size_vec("ilength", ilength());
++ print_size_vec("olength", olength());
++
++ print_size_vec("ibuffer_size", ibuffer_sizes());
++ print_size_vec("obuffer_size", obuffer_sizes());
++
++ if(scale_factor != 1.0) // the scale factor is only listed when it differs from 1.0
++ ss << "scale factor: " << scale_factor << separator;
++
++ return ss.str();
++ }
++
++ // Produce a stringified token of the test fft params: underscore-separated words and numbers, in the layout that from_token() parses.
++ std::string token() const
++ {
++ std::string ret;
++
++ switch(transform_type) // no default case: any other transform_type contributes no prefix
++ {
++ case fft_transform_type_complex_forward:
++ ret += "complex_forward_";
++ break;
++ case fft_transform_type_complex_inverse:
++ ret += "complex_inverse_";
++ break;
++ case fft_transform_type_real_forward:
++ ret += "real_forward_";
++ break;
++ case fft_transform_type_real_inverse:
++ ret += "real_inverse_";
++ break;
++ }
++
++ auto append_size_vec = [&ret](const std::vector<size_t>& vec) { // appends "_<value>" for every element
++ for(auto s : vec)
++ {
++ ret += "_";
++ ret += std::to_string(s);
++ }
++ };
++
++ ret += "len";
++ append_size_vec(length);
++
++ switch(precision)
++ {
++ case fft_precision_half:
++ ret += "_half_";
++ break;
++ case fft_precision_single:
++ ret += "_single_";
++ break;
++ case fft_precision_double:
++ ret += "_double_";
++ break;
++ }
++
++ switch(placement)
++ {
++ case fft_placement_inplace:
++ ret += "ip_";
++ break;
++ case fft_placement_notinplace:
++ ret += "op_";
++ break;
++ }
++
++ ret += "batch_";
++ ret += std::to_string(nbatch);
++
++ auto append_array_type = [&ret](fft_array_type type) { // appends the short code: CI, CP, R, HI, HP, or UN for anything else
++ switch(type)
++ {
++ case fft_array_type_complex_interleaved:
++ ret += "CI";
++ break;
++ case fft_array_type_complex_planar:
++ ret += "CP";
++ break;
++ case fft_array_type_real:
++ ret += "R";
++ break;
++ case fft_array_type_hermitian_interleaved:
++ ret += "HI";
++ break;
++ case fft_array_type_hermitian_planar:
++ ret += "HP";
++ break;
++ default:
++ ret += "UN";
++ break;
++ }
++ };
++
++ auto append_brick_info = [&ret, &append_size_vec](const fft_brick& b) { // appends "_brick_lower..._upper..._stride..._dev_<device>"
++ ret += "_brick";
++
++ ret += "_lower";
++ append_size_vec(b.lower);
++ ret += "_upper";
++ append_size_vec(b.upper);
++ ret += "_stride";
++ append_size_vec(b.stride);
++ ret += "_dev_";
++ ret += std::to_string(b.device);
++ };
++
++ const bool have_ifields = !ifields.empty();
++ const bool have_ofields = !ofields.empty();
++
++ if(have_ifields) // fields replace the stride/type entry for that side
++ {
++ for(const auto& f : ifields)
++ {
++ ret += "_ifield";
++ for(const auto& b : f.bricks)
++ append_brick_info(b);
++ }
++ }
++ else
++ {
++ ret += "_istride";
++ append_size_vec(istride);
++ ret += "_";
++ append_array_type(itype);
++ }
++
++ if(have_ofields)
++ {
++ for(const auto& f : ofields)
++ {
++ ret += "_ofield";
++ for(const auto& b : f.bricks)
++ append_brick_info(b);
++ }
++ }
++ else
++ {
++ ret += "_ostride";
++ append_size_vec(ostride);
++ ret += "_";
++ append_array_type(otype);
++ }
++
++ if(!have_ifields) // dist and offset entries are likewise emitted only when that side has no fields
++ {
++ ret += "_idist_";
++ ret += std::to_string(idist);
++ }
++ if(!have_ofields)
++ {
++ ret += "_odist_";
++ ret += std::to_string(odist);
++ }
++
++ if(!have_ifields)
++ {
++ ret += "_ioffset";
++ append_size_vec(ioffset);
++ }
++
++ if(!have_ofields)
++ {
++ ret += "_ooffset";
++ append_size_vec(ooffset);
++ }
++
++ if(run_callbacks)
++ ret += "_CB";
++
++ if(scale_factor != 1.0)
++ ret += "_scale"; // records only that a scale factor is set, not its value
++
++ return ret;
++ }
++
++ // Set all params from a stringified token (the layout written by token()). Throws std::runtime_error on an unexpected keyword and std::out_of_range when the token ends early.
++ void from_token(std::string token)
++ {
++ std::vector<std::string> vals;
++
++ std::string delimiter = "_";
++ {
++ size_t pos = 0;
++ while((pos = token.find(delimiter)) != std::string::npos) // split on "_", consuming the token from the front
++ {
++ auto val = token.substr(0, pos);
++ vals.push_back(val);
++ token.erase(0, pos + delimiter.length());
++ }
++ vals.push_back(token); // remainder after the last delimiter, so vals is never empty
++ }
++
++ auto size_parser
++ = [](const std::vector<std::string>& vals, const std::string& token, size_t& pos) { // expects keyword `token` followed by one number
++ if(vals.at(pos++) != token) // at(): a truncated token throws instead of reading past the end
++ throw std::runtime_error("Unable to parse token");
++ return std::stoull(vals.at(pos++));
++ };
++
++ auto vector_parser
++ = [](const std::vector<std::string>& vals, const std::string& token, size_t& pos) { // expects keyword `token` followed by zero or more numbers
++ if(vals.at(pos++) != token)
++ throw std::runtime_error("Unable to parse token");
++ std::vector<size_t> vec;
++
++ while(pos < vals.size())
++ {
++ if(std::all_of(vals[pos].begin(), vals[pos].end(), [](unsigned char c) { return ::isdigit(c) != 0; })) // unsigned char: isdigit is undefined for negative char values
++ {
++ vec.push_back(std::stoull(vals[pos++])); // an empty word passes all_of and makes stoull throw std::invalid_argument
++ }
++ else
++ {
++ break; // first non-numeric word ends the vector
++ }
++ }
++ return vec;
++ };
++
++ auto type_parser = [](const std::string& val) { // short code to array type; anything unrecognized maps to unset
++ if(val == "CI")
++ return fft_array_type_complex_interleaved;
++ else if(val == "CP")
++ return fft_array_type_complex_planar;
++ else if(val == "R")
++ return fft_array_type_real;
++ else if(val == "HI")
++ return fft_array_type_hermitian_interleaved;
++ else if(val == "HP")
++ return fft_array_type_hermitian_planar;
++ return fft_array_type_unset;
++ };
++
++ auto field_parser = [&vector_parser, &size_parser](const std::vector<std::string>& vals,
++ size_t& pos,
++ std::vector<fft_field>& output) {
++ // skip over ifield/ofield word
++ pos++;
++ fft_field& f = output.emplace_back(); // appended to the existing fields; nothing is cleared first
++ while(pos < vals.size() && vals[pos] == "brick")
++ {
++ fft_brick& b = f.bricks.emplace_back();
++ pos++;
++ b.lower = vector_parser(vals, "lower", pos);
++ b.upper = vector_parser(vals, "upper", pos);
++ b.stride = vector_parser(vals, "stride", pos);
++ b.device = size_parser(vals, "dev", pos);
++ }
++ };
++
++ size_t pos = 0;
++
++ bool complex = vals.at(pos++) == "complex"; // any other first word is treated as real
++ bool forward = vals.at(pos++) == "forward"; // any other second word is treated as inverse
++
++ if(complex && forward)
++ transform_type = fft_transform_type_complex_forward;
++ if(complex && !forward)
++ transform_type = fft_transform_type_complex_inverse;
++ if(!complex && forward)
++ transform_type = fft_transform_type_real_forward;
++ if(!complex && !forward)
++ transform_type = fft_transform_type_real_inverse;
++
++ length = vector_parser(vals, "len", pos);
++
++ if(vals.at(pos) == "half") // an unrecognized precision word leaves precision unchanged
++ precision = fft_precision_half;
++ else if(vals.at(pos) == "single")
++ precision = fft_precision_single;
++ else if(vals.at(pos) == "double")
++ precision = fft_precision_double;
++ pos++;
++
++ placement = (vals.at(pos++) == "ip") ? fft_placement_inplace : fft_placement_notinplace;
++
++ nbatch = size_parser(vals, "batch", pos);
++
++ // strides, bricks etc are mixed in from here, so just keep
++ // looking at the next token to decide what to do
++ while(pos < vals.size())
++ {
++ const auto& next_token = vals[pos];
++ if(next_token == "istride")
++ {
++ istride = vector_parser(vals, "istride", pos);
++ itype = type_parser(vals.at(pos)); // the array-type code follows the stride values
++ pos++;
++ }
++ else if(next_token == "ostride")
++ {
++ ostride = vector_parser(vals, "ostride", pos);
++ otype = type_parser(vals.at(pos));
++ pos++;
++ }
++ else if(next_token == "idist")
++ idist = size_parser(vals, "idist", pos);
++ else if(next_token == "odist")
++ odist = size_parser(vals, "odist", pos);
++ else if(next_token == "ioffset")
++ ioffset = vector_parser(vals, "ioffset", pos);
++ else if(next_token == "ooffset")
++ ooffset = vector_parser(vals, "ooffset", pos);
++ else if(next_token == "ifield")
++ field_parser(vals, pos, ifields);
++ else if(next_token == "ofield")
++ field_parser(vals, pos, ofields);
++ else
++ break; // unknown word: stop here; only the CB/scale suffixes are checked afterwards
++ }
++
++ if(pos < vals.size() && vals[pos] == "CB")
++ {
++ run_callbacks = true;
++ ++pos;
++ }
++
++ if(pos < vals.size() && vals[pos] == "scale")
++ {
++ // just pick some factor that's not zero or one
++ scale_factor = 0.1239;
++ ++pos;
++ }
++ }
++
++ // Stream output operator (for gtest, etc); writes params.str() with its default separator.
++ friend std::ostream& operator<<(std::ostream& stream, const fft_params& params)
++ {
++ stream << params.str();
++ return stream;
++ }
++
++ // Dimension of the transform, i.e. the number of entries in `length`.
++ size_t dim() const
++ {
++ return length.size();
++ }
++
++ virtual std::vector<size_t> ilength() const // Input lengths: a copy of `length`, with the last entry replaced by n / 2 + 1 for real-inverse transforms.
++ {
++ auto ilength = length;
++ if(transform_type == fft_transform_type_real_inverse)
++ ilength[dim() - 1] = ilength[dim() - 1] / 2 + 1; // NOTE(review): indexes dim() - 1 without checking that length is non-empty
++ return ilength;
++ }
++
++ virtual std::vector<size_t> olength() const // Output lengths: a copy of `length`, with the last entry replaced by n / 2 + 1 for real-forward transforms.
++ {
++ auto olength = length;
++ if(transform_type == fft_transform_type_real_forward)
++ olength[dim() - 1] = olength[dim() - 1] / 2 + 1; // NOTE(review): indexes dim() - 1 without checking that length is non-empty
++ return olength;
++ }
++
++ static size_t nbuffer(const fft_array_type type) // Buffer count for an array type: 1 for real/interleaved, 2 for planar, 0 for unset.
++ {
++ switch(type)
++ {
++ case fft_array_type_real:
++ case fft_array_type_complex_interleaved:
++ case fft_array_type_hermitian_interleaved:
++ return 1;
++ case fft_array_type_complex_planar:
++ case fft_array_type_hermitian_planar:
++ return 2;
++ case fft_array_type_unset:
++ return 0;
++ }
++ return 0; // reached only when type matches none of the cases above
++ }
++
++ // Number of input buffers, as given by nbuffer(itype)
++ size_t nibuffer() const
++ {
++ return nbuffer(itype);
++ }
++
++ // Number of output buffers, as given by nbuffer(otype)
++ size_t nobuffer() const
++ {
++ return nbuffer(otype);
++ }
++
++ void set_iotypes() // Fills in itype/otype from transform_type where they are still unset; types already set are left alone.
++ {
++ if(itype == fft_array_type_unset)
++ {
++ switch(transform_type)
++ {
++ case fft_transform_type_complex_forward:
++ case fft_transform_type_complex_inverse:
++ itype = fft_array_type_complex_interleaved;
++ break;
++ case fft_transform_type_real_forward:
++ itype = fft_array_type_real;
++ break;
++ case fft_transform_type_real_inverse:
++ itype = fft_array_type_hermitian_interleaved;
++ break;
++ default:
++ throw std::runtime_error("Invalid transform type");
++ }
++ }
++ if(otype == fft_array_type_unset)
++ {
++ switch(transform_type)
++ {
++ case fft_transform_type_complex_forward:
++ case fft_transform_type_complex_inverse:
++ otype = fft_array_type_complex_interleaved;
++ break;
++ case fft_transform_type_real_forward:
++ otype = fft_array_type_hermitian_interleaved; // mirror image of the input defaults: real in, hermitian out
++ break;
++ case fft_transform_type_real_inverse:
++ otype = fft_array_type_real;
++ break;
++ default:
++ throw std::runtime_error("Invalid transform type");
++ }
++ }
++ }
++
++ // Check that the input and output types are consistent. Returns whether otype is an accepted partner for itype; throws std::runtime_error on an unknown type or a mismatched in-place complex transform.
++ bool check_iotypes() const
++ {
++ switch(itype) // reject unset or unknown input types up front
++ {
++ case fft_array_type_complex_interleaved:
++ case fft_array_type_complex_planar:
++ case fft_array_type_hermitian_interleaved:
++ case fft_array_type_hermitian_planar:
++ case fft_array_type_real:
++ break;
++ default:
++ throw std::runtime_error("Invalid Input array type format");
++ }
++
++ switch(otype) // same check for the output type
++ {
++ case fft_array_type_complex_interleaved:
++ case fft_array_type_complex_planar:
++ case fft_array_type_hermitian_interleaved:
++ case fft_array_type_hermitian_planar:
++ case fft_array_type_real:
++ break;
++ default:
++ throw std::runtime_error("Invalid Output array type format"); // message now names the output type, which is what this switch tests
++ }
++
++ // Check that format choices are supported
++ if(transform_type != fft_transform_type_real_forward
++ && transform_type != fft_transform_type_real_inverse)
++ {
++ if(placement == fft_placement_inplace && itype != otype)
++ {
++ throw std::runtime_error(
++ "In-place transforms must have identical input and output types");
++ }
++ }
++
++ bool okformat = true;
++ switch(itype)
++ {
++ case fft_array_type_complex_interleaved:
++ case fft_array_type_complex_planar:
++ okformat = (otype == fft_array_type_complex_interleaved
++ || otype == fft_array_type_complex_planar);
++ break;
++ case fft_array_type_hermitian_interleaved:
++ case fft_array_type_hermitian_planar:
++ okformat = otype == fft_array_type_real;
++ break;
++ case fft_array_type_real:
++ okformat = (otype == fft_array_type_hermitian_interleaved
++ || otype == fft_array_type_hermitian_planar);
++ break;
++ default:
++ throw std::runtime_error("Invalid Input array type format"); // not reachable in practice: the first switch already threw for these values
++ }
++
++ return okformat;
++ }
++
++ // Given a length vector, set the rest of the strides.
++ // The optional argument stride0 sets the stride for the contiguous dimension.
++ // The optional rcpadding argument sets the stride correctly for in-place
++ // multi-dimensional real/complex transforms.
++ // Format is row-major.
++ template <typename T1>
++ std::vector<T1> compute_stride(const std::vector<T1>& length,
++ const std::vector<size_t>& stride0 = std::vector<size_t>(),
++ const bool rcpadding = false) const
++ {
++ std::vector<T1> stride(dim()); // sized from this object's dim(), not from the `length` argument
++
++ size_t dimoffset = 0;
++
++ if(stride0.size() == 0) // NOTE(review): assumes dim() >= 1 and stride0.size() <= dim(); neither is checked here
++ {
++ // Set the contiguous stride:
++ stride[dim() - 1] = 1;
++ dimoffset = 1;
++ }
++ else
++ {
++ // Copy the input values to the end of the stride array:
++ for(size_t i = 0; i < stride0.size(); ++i)
++ {
++ stride[dim() - stride0.size() + i] = stride0[i];
++ }
++ }
++
++ if(stride0.size() < dim())
++ {
++ // Compute any remaining values via recursion.
++ for(size_t i = dim() - dimoffset - stride0.size(); i-- > 0;) // walks down over the dimensions not filled in above
++ {
++ auto lengthip1 = length[i + 1];
++ if(rcpadding && i == dim() - 2)
++ {
++ lengthip1 = 2 * (lengthip1 / 2 + 1); // use the padded extent 2 * (n / 2 + 1) for the last dimension
++ }
++ stride[i] = stride[i + 1] * lengthip1;
++ }
++ }
++
++ return stride;
++ }
++
++ void compute_istride() // Completes istride via compute_stride(), keeping the entries already present; padding is requested for in-place real-forward transforms.
++ {
++ istride = compute_stride(ilength(),
++ istride,
++ placement == fft_placement_inplace
++ && transform_type == fft_transform_type_real_forward);
++ }
++
++ void compute_ostride() // Completes ostride via compute_stride(), keeping the entries already present; padding is requested for in-place real-inverse transforms.
++ {
++ ostride = compute_stride(olength(),
++ ostride,
++ placement == fft_placement_inplace
++ && transform_type == fft_transform_type_real_inverse);
++ }
++
++ virtual void compute_isize() // Sets isize to one entry per input buffer: the compute_ptrdiff() result plus that buffer's offset.
++ {
++ auto il = ilength();
++ size_t val = compute_ptrdiff(il, istride, nbatch, idist);
++ isize.resize(nibuffer());
++ for(unsigned int i = 0; i < isize.size(); ++i)
++ {
++ isize[i] = val + ioffset[i]; // reads ioffset[i] for every input buffer; valid() rejects params whose ioffset is shorter than nibuffer()
++ }
++ }
++
++ virtual void compute_osize() // Sets osize to one entry per output buffer: the compute_ptrdiff() result plus that buffer's offset.
++ {
++ auto ol = olength();
++ size_t val = compute_ptrdiff(ol, ostride, nbatch, odist);
++ osize.resize(nobuffer());
++ for(unsigned int i = 0; i < osize.size(); ++i)
++ {
++ osize[i] = val + ooffset[i]; // reads ooffset[i] for every output buffer; valid() rejects params whose ooffset is shorter than nobuffer()
++ }
++ }
++
++ std::vector<size_t> ibuffer_sizes() const // Per-buffer input sizes: isize[i] * var_size<size_t>(precision, itype) (presumably bytes - confirm against var_size).
++ {
++ std::vector<size_t> ibuffer_sizes;
++
++ // In-place real-to-complex transforms need to have enough space in the input buffer to
++ // accommodate the output, which is slightly larger.
++ if(placement == fft_placement_inplace && transform_type == fft_transform_type_real_forward)
++ {
++ return obuffer_sizes();
++ }
++
++ if(isize.empty()) // nothing to report while isize is empty
++ return ibuffer_sizes;
++
++ switch(itype)
++ {
++ case fft_array_type_complex_planar:
++ case fft_array_type_hermitian_planar:
++ ibuffer_sizes.resize(2);
++ break;
++ default:
++ ibuffer_sizes.resize(1);
++ }
++ for(unsigned i = 0; i < ibuffer_sizes.size(); i++)
++ {
++ ibuffer_sizes[i] = isize[i] * var_size<size_t>(precision, itype);
++ }
++ return ibuffer_sizes;
++ }
++
++ virtual std::vector<size_t> obuffer_sizes() const // Per-buffer output sizes: osize[i] * var_size<size_t>(precision, otype) (presumably bytes - confirm against var_size).
++ {
++ std::vector<size_t> obuffer_sizes;
++
++ if(osize.empty()) // nothing to report while osize is empty
++ return obuffer_sizes;
++
++ switch(otype)
++ {
++ case fft_array_type_complex_planar:
++ case fft_array_type_hermitian_planar:
++ obuffer_sizes.resize(2);
++ break;
++ default:
++ obuffer_sizes.resize(1);
++ }
++ for(unsigned i = 0; i < obuffer_sizes.size(); i++)
++ {
++ obuffer_sizes[i] = osize[i] * var_size<size_t>(precision, otype);
++ }
++ return obuffer_sizes;
++ }
++
++ // Compute the idist for a given transform based on the placeness, transform type, and data
++ // layout. Reads length and istride; does not modify the object.
++ size_t compute_idist() const
++ {
++ size_t dist = 0;
++ // In-place 1D transforms need extra dist.
++ if(transform_type == fft_transform_type_real_forward && dim() == 1
++ && placement == fft_placement_inplace)
++ {
++ dist = 2 * (length[0] / 2 + 1) * istride[0];
++ return dist;
++ }
++
++ if(transform_type == fft_transform_type_real_inverse && dim() == 1)
++ {
++ dist = (length[0] / 2 + 1) * istride[0];
++ return dist;
++ }
++
++ dist = (transform_type == fft_transform_type_real_inverse)
++ ? (length[dim() - 1] / 2 + 1) * istride[dim() - 1]
++ : length[dim() - 1] * istride[dim() - 1];
++ for(unsigned int i = 0; i < dim() - 1; ++i)
++ {
++ dist = std::max(length[i] * istride[i], dist); // keep the largest length * stride product seen so far
++ }
++ return dist;
++ }
++ void set_idist() // Sets idist from compute_idist() unless idist is already non-zero.
++ {
++ if(idist != 0)
++ return;
++ idist = compute_idist();
++ }
++
++ // Compute the odist for a given transform based on the placeness, transform type, and data
++ // layout. Row-major. Reads length and ostride; does not modify the object.
++ size_t compute_odist() const
++ {
++ size_t dist = 0;
++ // In-place 1D transforms need extra dist.
++ if(transform_type == fft_transform_type_real_inverse && dim() == 1
++ && placement == fft_placement_inplace)
++ {
++ dist = 2 * (length[0] / 2 + 1) * ostride[0];
++ return dist;
++ }
++
++ if(transform_type == fft_transform_type_real_forward && dim() == 1)
++ {
++ dist = (length[0] / 2 + 1) * ostride[0];
++ return dist;
++ }
++
++ dist = (transform_type == fft_transform_type_real_forward)
++ ? (length[dim() - 1] / 2 + 1) * ostride[dim() - 1]
++ : length[dim() - 1] * ostride[dim() - 1];
++ for(unsigned int i = 0; i < dim() - 1; ++i)
++ {
++ dist = std::max(length[i] * ostride[i], dist); // keep the largest length * stride product seen so far
++ }
++ return dist;
++ }
++ void set_odist() // Sets odist from compute_odist() unless odist is already non-zero.
++ {
++ if(odist != 0)
++ return;
++ odist = compute_odist();
++ }
++
++ // Put the length, stride, batch, and dist into a single length/stride array and pass off to the
++ // validity checker. Returns false early on mismatched sizes or a zero stride/dist that is in use.
++ bool valid_length_stride_batch_dist(const std::vector<size_t>& l0,
++ const std::vector<size_t>& s0,
++ const size_t n,
++ const size_t dist,
++ const int verbose = 0) const
++ {
++ if(l0.size() != s0.size())
++ return false;
++
++ // Length and stride vectors, including batches:
++ std::vector<size_t> l{}, s{};
++ for(unsigned int i = 0; i < l0.size(); ++i)
++ {
++ if(l0[i] > 1) // dimensions of length 0 or 1 are left out of the check
++ {
++ if(s0[i] == 0)
++ return false;
++ l.push_back(l0[i]);
++ s.push_back(s0[i]);
++ }
++ }
++ if(n > 1) // the batch is appended as one more dimension, with dist as its stride
++ {
++ if(dist == 0)
++ return false;
++ l.push_back(n);
++ s.push_back(dist);
++ }
++
++ return array_valid(l, s, verbose);
++ }
++
++ // Return true if the given GPU parameters would produce a valid transform. A non-zero `verbose` prints the reason for some rejections to std::cout.
++ bool valid(const int verbose) const
++ {
++ if(ioffset.size() < nibuffer() || ooffset.size() < nobuffer()) // every buffer needs an offset entry
++ return false;
++
++ // Check that in-place transforms have the same input and output stride:
++ if(placement == fft_placement_inplace)
++ {
++ const auto stridesize = std::min(istride.size(), ostride.size()); // only the entries present in both stride vectors are compared
++ bool samestride = true;
++ for(unsigned int i = 0; i < stridesize; ++i)
++ {
++ if(istride[i] != ostride[i])
++ samestride = false;
++ }
++ if((transform_type == fft_transform_type_complex_forward
++ || transform_type == fft_transform_type_complex_inverse)
++ && !samestride)
++ {
++ // In-place transforms require identical input and output strides.
++ if(verbose)
++ {
++ std::cout << "istride:";
++ for(const auto& i : istride)
++ std::cout << " " << i;
++ std::cout << " ostride0:";
++ for(const auto& i : ostride)
++ std::cout << " " << i;
++ std::cout << " differ; skipped for in-place transforms: skipping test"
++ << std::endl;
++ }
++ return false;
++ }
++
++ if((transform_type == fft_transform_type_complex_forward
++ || transform_type == fft_transform_type_complex_inverse)
++ && (idist != odist) && nbatch > 1)
++ {
++ // In-place transforms require identical distance, if
++ // batch > 1. If batch is 1 then dist is ignored and
++ // the FFT should still work.
++ if(verbose)
++ {
++ std::cout << "idist:" << idist << " odist:" << odist
++ << " differ; skipped for in-place transforms: skipping test"
++ << std::endl;
++ }
++ return false;
++ }
++
++ if((transform_type == fft_transform_type_real_forward
++ || transform_type == fft_transform_type_real_inverse)
++ && (istride.back() != 1 || ostride.back() != 1)) // NOTE(review): back() assumes both stride vectors are non-empty
++ {
++ // In-place real/complex transforms require unit strides.
++ if(verbose)
++ {
++ std::cout
++ << "istride.back(): " << istride.back()
++ << " ostride.back(): " << ostride.back()
++ << " must be unitary for in-place real/complex transforms: skipping test"
++ << std::endl;
++ }
++ return false;
++ }
++
++ if((itype == fft_array_type_complex_interleaved
++ && otype == fft_array_type_complex_planar)
++ || (itype == fft_array_type_complex_planar
++ && otype == fft_array_type_complex_interleaved))
++ {
++ if(verbose)
++ {
++ std::cout << "In-place c2c transforms require identical io types; skipped.\n";
++ }
++ return false;
++ }
++
++ // Check offsets
++ switch(transform_type)
++ {
++ case fft_transform_type_complex_forward:
++ case fft_transform_type_complex_inverse:
++ for(unsigned int i = 0; i < nibuffer(); ++i)
++ {
++ if(ioffset[i] != ooffset[i])
++ return false;
++ }
++ break;
++ case fft_transform_type_real_forward:
++ if(ioffset[0] != 2 * ooffset[0]) // presumably because one complex output element spans two real input elements - confirm
++ return false;
++ break;
++ case fft_transform_type_real_inverse:
++ if(2 * ioffset[0] != ooffset[0])
++ return false;
++ break;
++ }
++ }
++
++ if(!check_iotypes()) // may also throw std::runtime_error for unknown types
++ return false;
++
++ // we can only check output strides on out-of-place
++ // transforms, since we need to initialize output to a known
++ // pattern
++ if(placement == fft_placement_inplace && check_output_strides)
++ return false;
++
++ // Check input and output strides
++ if(valid_length_stride_batch_dist(ilength(), istride, nbatch, idist, verbose) != true)
++ {
++ if(verbose)
++ std::cout << "Invalid input data format.\n";
++ return false;
++ }
++ if(!(ilength() == olength() && istride == ostride && idist == odist))
++ {
++ // Only check if different
++ if(valid_length_stride_batch_dist(olength(), ostride, nbatch, odist, verbose) != true)
++ {
++ if(verbose)
++ std::cout << "Invalid output data format.\n";
++ return false;
++ }
++ }
++
++ // The parameters are valid.
++ return true;
++ }
++
++ // Fill in any missing parameters: types first, then strides, dists and sizes, each step using the results of the earlier ones.
++ void validate()
++ {
++ set_iotypes();
++ compute_istride();
++ compute_ostride();
++ set_idist();
++ set_odist();
++ compute_isize();
++ compute_osize();
++
++ validate_fields(); // throws in this base implementation when any fields are set
++ }
++
++ virtual void validate_fields() const // This implementation throws std::runtime_error when ifields or ofields is non-empty; virtual, so it can be overridden.
++ {
++ if(!ifields.empty() || !ofields.empty())
++ throw std::runtime_error("input/output fields are unsupported");
++ }
++
++ // Column-major getters: each returns a reversed copy of the corresponding row-major vector.
++ std::vector<size_t> length_cm() const
++ {
++ auto length_cm = length;
++ std::reverse(std::begin(length_cm), std::end(length_cm));
++ return length_cm;
++ }
++ std::vector<size_t> ilength_cm() const // ilength() in reversed order
++ {
++ auto ilength_cm = ilength();
++ std::reverse(std::begin(ilength_cm), std::end(ilength_cm));
++ return ilength_cm;
++ }
++ std::vector<size_t> olength_cm() const // olength() in reversed order
++ {
++ auto olength_cm = olength();
++ std::reverse(std::begin(olength_cm), std::end(olength_cm));
++ return olength_cm;
++ }
++ std::vector<size_t> istride_cm() const // istride in reversed order
++ {
++ auto istride_cm = istride;
++ std::reverse(std::begin(istride_cm), std::end(istride_cm));
++ return istride_cm;
++ }
++ std::vector<size_t> ostride_cm() const // ostride in reversed order
++ {
++ auto ostride_cm = ostride;
++ std::reverse(std::begin(ostride_cm), std::end(ostride_cm));
++ return ostride_cm;
++ }
++ bool is_planar() const // True when either the input or the output array type is a planar one.
++ {
++ if(itype == fft_array_type_complex_planar || itype == fft_array_type_hermitian_planar)
++ return true;
++ if(otype == fft_array_type_complex_planar || otype == fft_array_type_hermitian_planar)
++ return true;
++ return false;
++ }
++
++ // Given a data type and dimensions, fill the buffer, imposing Hermitian symmetry if necessary.
++ template <typename Tbuff>
++ inline void compute_input(std::vector<Tbuff>& input)
++ {
++ auto deviceProp = get_curr_device_prop();
++
++ switch(precision) // same set_input call in every case; only the element type differs (_Float16, double, float)
++ {
++ case fft_precision_half:
++ set_input<Tbuff, _Float16>(
++ input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp);
++ break;
++ case fft_precision_double:
++ set_input<Tbuff, double>(
++ input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp);
++ break;
++ case fft_precision_single:
++ set_input<Tbuff, float>(
++ input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp);
++ break;
++ }
++ }
++
++ template <typename Tstream = std::ostream>
++ void print_ibuffer(const std::vector<hostbuf>& buf, Tstream& stream = std::cout) const // Prints buf using the input layout. NOTE(review): `stream` is not referenced in this body - confirm whether it should be forwarded to print_buffer.
++ {
++ switch(itype)
++ {
++ case fft_array_type_complex_interleaved:
++ case fft_array_type_hermitian_interleaved:
++ { // interleaved types print as rocfft_complex<T> elements
++ switch(precision)
++ {
++ case fft_precision_half:
++ {
++ buffer_printer<rocfft_complex<_Float16>> s;
++ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
++ break;
++ }
++ case fft_precision_single:
++ {
++ buffer_printer<rocfft_complex<float>> s;
++ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
++ break;
++ }
++ case fft_precision_double:
++ {
++ buffer_printer<rocfft_complex<double>> s;
++ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
++ break;
++ }
++ }
++ break;
++ }
++ case fft_array_type_complex_planar:
++ case fft_array_type_hermitian_planar:
++ case fft_array_type_real:
++ { // planar and real types print as plain scalar elements
++ switch(precision)
++ {
++ case fft_precision_half:
++ {
++ buffer_printer<_Float16> s;
++ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
++ break;
++ }
++ case fft_precision_single:
++ {
++ buffer_printer<float> s;
++ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
++ break;
++ }
++ case fft_precision_double:
++ {
++ buffer_printer<double> s;
++ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
++ break;
++ }
++ }
++ break;
++ }
++ default:
++ throw std::runtime_error("Invalid itype in print_ibuffer");
++ }
++ }
++
++ template <typename Tstream = std::ostream>
++ void print_obuffer(const std::vector<hostbuf>& buf, Tstream& stream = std::cout) const // Prints buf using the output layout; throws std::runtime_error for an unhandled otype. NOTE(review): `stream` is not referenced in this body - confirm whether it should be forwarded to print_buffer.
++ {
++ switch(otype)
++ {
++ case fft_array_type_complex_interleaved:
++ case fft_array_type_hermitian_interleaved:
++ { // interleaved types print as rocfft_complex<T> elements
++ switch(precision)
++ {
++ case fft_precision_half:
++ {
++ buffer_printer<rocfft_complex<_Float16>> s;
++ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
++ break;
++ }
++ case fft_precision_single:
++ {
++ buffer_printer<rocfft_complex<float>> s;
++ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
++ break;
++ }
++ case fft_precision_double:
++ buffer_printer<rocfft_complex<double>> s;
++ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
++ break;
++ }
++ break;
++ }
++ case fft_array_type_complex_planar:
++ case fft_array_type_hermitian_planar:
++ case fft_array_type_real:
++ { // planar and real types print as plain scalar elements
++ switch(precision)
++ {
++ case fft_precision_half:
++ {
++ buffer_printer<_Float16> s;
++ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
++ break;
++ }
++ case fft_precision_single:
++ {
++ buffer_printer<float> s;
++ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
++ break;
++ }
++ case fft_precision_double:
++ {
++ buffer_printer<double> s;
++ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
++ break;
++ }
++ }
++ break;
++ }
++
++ default:
++ throw std::runtime_error("Invalid otype in print_obuffer"); // message names otype, the value this switch tests
++ }
++ }
++
++ void print_ibuffer_flat(const std::vector<hostbuf>& buf) const // Prints the input buffers flat, using the input sizes and offsets; throws std::runtime_error for an unhandled itype.
++ {
++ switch(itype)
++ {
++ case fft_array_type_complex_interleaved:
++ case fft_array_type_hermitian_interleaved:
++ { // interleaved types print as rocfft_complex<T> elements
++ switch(precision)
++ {
++ case fft_precision_half:
++ {
++ buffer_printer<rocfft_complex<_Float16>> s;
++ s.print_buffer_flat(buf, isize, ioffset); // input-side size/offset, matching the itype dispatch above
++ break;
++ }
++ case fft_precision_single:
++ {
++ buffer_printer<rocfft_complex<float>> s;
++ s.print_buffer_flat(buf, isize, ioffset);
++ break;
++ }
++ case fft_precision_double:
++ buffer_printer<rocfft_complex<double>> s;
++ s.print_buffer_flat(buf, isize, ioffset);
++ break;
++ }
++ break;
++ }
++ case fft_array_type_complex_planar:
++ case fft_array_type_hermitian_planar:
++ case fft_array_type_real:
++ { // planar and real types print as plain scalar elements
++ switch(precision)
++ {
++ case fft_precision_half:
++ {
++ buffer_printer<_Float16> s;
++ s.print_buffer_flat(buf, isize, ioffset);
++ break;
++ }
++ case fft_precision_single:
++ {
++ buffer_printer<float> s;
++ s.print_buffer_flat(buf, isize, ioffset);
++ break;
++ }
++ case fft_precision_double:
++ {
++ buffer_printer<double> s;
++ s.print_buffer_flat(buf, isize, ioffset);
++ break;
++ }
++ }
++ break;
++ }
++ default: // label sits directly in the itype switch, as in print_ibuffer
++ throw std::runtime_error("Invalid itype in print_ibuffer_flat");
++ }
++ }
++
++ void print_obuffer_flat(const std::vector<hostbuf>& buf) const // Prints the output buffers flat, using the output sizes and offsets; throws std::runtime_error for an unhandled otype.
++ {
++ switch(otype)
++ {
++ case fft_array_type_complex_interleaved:
++ case fft_array_type_hermitian_interleaved:
++ { // interleaved types print as rocfft_complex<T> elements
++ switch(precision)
++ {
++ case fft_precision_half:
++ {
++ buffer_printer<rocfft_complex<_Float16>> s;
++ s.print_buffer_flat(buf, osize, ooffset);
++ break;
++ }
++ case fft_precision_single:
++ {
++ buffer_printer<rocfft_complex<float>> s;
++ s.print_buffer_flat(buf, osize, ooffset);
++ break;
++ }
++ case fft_precision_double:
++ buffer_printer<rocfft_complex<double>> s;
++ s.print_buffer_flat(buf, osize, ooffset);
++ break;
++ }
++ break;
++ }
++ case fft_array_type_complex_planar:
++ case fft_array_type_hermitian_planar:
++ case fft_array_type_real:
++ { // planar and real types print as plain scalar elements
++ switch(precision)
++ {
++ case fft_precision_half:
++ {
++ buffer_printer<_Float16> s;
++ s.print_buffer_flat(buf, osize, ooffset);
++ break;
++ }
++ case fft_precision_single:
++ {
++ buffer_printer<float> s;
++ s.print_buffer_flat(buf, osize, ooffset);
++ break;
++ }
++
++ case fft_precision_double:
++ {
++ buffer_printer<double> s;
++ s.print_buffer_flat(buf, osize, ooffset);
++ break;
++ }
++ }
++ break;
++ }
++ default: // label sits directly in the otype switch, as in print_obuffer
++ throw std::runtime_error("Invalid otype in print_obuffer_flat"); // message names this function and the value it tests
++ }
++ }
++
++ virtual fft_status set_callbacks(void* load_cb_host, // This implementation ignores all four arguments and reports success; virtual, so it can be overridden.
++ void* load_cb_data,
++ void* store_cb_host,
++ void* store_cb_data)
++ {
++ return fft_status_success;
++ }
++
++ virtual fft_status execute(void** in, void** out) // This implementation does no work and reports success; virtual, so it can be overridden.
++ {
++ return fft_status_success;
++ };
++
++ size_t fft_params_vram_footprint() // Returns the base-class footprint regardless of any override.
++ {
++ return fft_params::vram_footprint(); // qualified call: bypasses virtual dispatch
++ }
++
++ virtual size_t vram_footprint() // Sum of the input buffer sizes, plus the output buffer sizes for out-of-place transforms.
++ {
++ const auto ibuf_size = ibuffer_sizes();
++ size_t val = std::accumulate(ibuf_size.begin(), ibuf_size.end(), (size_t)1); // NOTE(review): the sum is seeded with 1 rather than 0 - confirm this is intended
++ if(placement == fft_placement_notinplace)
++ {
++ const auto obuf_size = obuffer_sizes();
++ val += std::accumulate(obuf_size.begin(), obuf_size.end(), (size_t)1); // same seed of 1 here
++ }
++ return val;
++ }
++
++ // Specific exception type for work buffer allocation failure.
++ // Tests that hit this can't fit on the GPU and should be skipped.
++ struct work_buffer_alloc_failure : public std::runtime_error
++ {
++ work_buffer_alloc_failure(const std::string& s) // s is passed on as the std::runtime_error message
++ : std::runtime_error(s)
++ {
++ }
++ };
++
++ virtual fft_status create_plan() // Virtual plan-creation hook; this default does no work
++ {
++ return fft_status_success; // and unconditionally reports success.
++ }
++
++ // Configure this object as the inverse of the forward transform described by
++ // params_forward: the transform type flips to its inverse, and every input-side
++ // setting (stride, dist, array type, offset) is taken from the forward
++ // transform's output side and vice versa.
++ // Throws std::runtime_error if params_forward is not a forward transform.
++ void inverse_from_forward(fft_params& params_forward)
++ {
++     const auto forward_type = params_forward.transform_type;
++     if(forward_type == fft_transform_type_complex_forward)
++         transform_type = fft_transform_type_complex_inverse;
++     else if(forward_type == fft_transform_type_real_forward)
++         transform_type = fft_transform_type_real_inverse;
++     else
++         throw std::runtime_error("Transform type not forward.");
++
++     // settings copied unchanged
++     length    = params_forward.length;
++     nbatch    = params_forward.nbatch;
++     precision = params_forward.precision;
++     placement = params_forward.placement;
++
++     // input side of the inverse <- output side of the forward, and vice versa
++     istride = params_forward.ostride;
++     ostride = params_forward.istride;
++     idist   = params_forward.odist;
++     odist   = params_forward.idist;
++     itype   = params_forward.otype;
++     otype   = params_forward.itype;
++     ioffset = params_forward.ooffset;
++     ooffset = params_forward.ioffset;
++
++     run_callbacks        = params_forward.run_callbacks;
++     check_output_strides = params_forward.check_output_strides;
++
++     // the inverse uses the reciprocal of the forward scale factor
++     scale_factor = 1 / params_forward.scale_factor;
++ }
++
++ // Prepare for a multi-GPU transform. Generated input is in ibuffer.
++ // pibuffer and pobuffer are the pointers that will be passed to the
++ // FFT library's "execute" API. This default implementation is an empty no-op.
++ virtual void multi_gpu_prepare(std::vector<gpubuf>& ibuffer,
++ std::vector<void*>& pibuffer,
++ std::vector<void*>& pobuffer)
++ {
++ }
++
++ // Finalize a multi-GPU transform. pobuffer holds the pointers that were
++ // provided to the FFT library's "execute" API. obuffer is the
++ // buffer where transform output needs to go for validation. This default is an empty no-op.
++ virtual void multi_gpu_finalize(std::vector<gpubuf>& obuffer, std::vector<void*>& pobuffer) {}
++
++ // Append one new field to `fields` and split it into deviceCount bricks, one
++ // per device. The split is along the highest (slowest) FFT dimension.
++ // field_length holds only the FFT lengths; the batch dimension is added here
++ // as the leading coordinate of every brick.
++ // Throws std::runtime_error when there are more devices than elements along
++ // the slowest dimension.
++ void distribute_field(int deviceCount,
++                       std::vector<fft_field>& fields,
++                       const std::vector<size_t>& field_length)
++ {
++     const size_t slowLen = field_length.front();
++     if(static_cast<size_t>(deviceCount) > slowLen)
++         throw std::runtime_error("too many devices to distribute length "
++                                  + std::to_string(slowLen));
++
++     auto& field = fields.emplace_back();
++     const size_t fft_rank = field_length.size();
++
++     for(int device = 0; device < deviceCount; ++device)
++     {
++         // nominal brick extent along the slowest dimension
++         const size_t chunk = slowLen / deviceCount;
++
++         // coordinates start at the origin; the slowest FFT dim is index 0 here
++         std::vector<size_t> lower(fft_rank);
++         std::vector<size_t> upper(fft_rank);
++         lower[0] = chunk * device;
++
++         // the last brick needs to include the whole remaining slow length
++         const bool is_last = (device == deviceCount - 1);
++         upper[0] = is_last ? slowLen : std::min(slowLen, lower[0] + chunk);
++
++         // every other dimension spans the full field
++         for(size_t dim = 1; dim < fft_rank; ++dim)
++             upper[dim] = field_length[dim];
++
++         // field coordinates also need to include batch, as the leading dimension
++         lower.insert(lower.begin(), 0);
++         upper.insert(upper.begin(), nbatch);
++
++         // bricks have contiguous strides: fill them from fastest to slowest
++         std::vector<size_t> stride(lower.size());
++         size_t running = 1;
++         auto lo_it = lower.rbegin();
++         auto up_it = upper.rbegin();
++         for(auto st_it = stride.rbegin(); st_it != stride.rend(); ++st_it, ++lo_it, ++up_it)
++         {
++             *st_it = running;
++             running *= *up_it - *lo_it;
++         }
++         field.bricks.push_back(fft_params::fft_brick{lower, upper, stride, device});
++     }
++ }
++
++ void distribute_input(int deviceCount) // Split the input side across deviceCount devices:
++ {
++ distribute_field(deviceCount, ifields, length); // adds a bricked field to ifields, sized by length.
++ }
++
++ void distribute_output(int deviceCount) // Split the output side across deviceCount devices:
++ {
++ distribute_field(deviceCount, ofields, olength()); // adds a bricked field to ofields, sized by olength().
++ }
++};
++
++// This is used with the program_options class so that the user can type an integer on the
++// command line and we store it into an enum variable.
++// The temporary starts at zero so that atype receives a defined value even when the stream
++// is already in a failed state and the extraction never writes to the temporary.
++template <typename _Elem, typename _Traits>
++std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream,
++                                                fft_array_type& atype)
++{
++    unsigned tmp = 0;
++    stream >> tmp;
++    atype = fft_array_type(tmp);
++    return stream;
++}
++
++// Similarly for transform type: read an unsigned integer from the stream and store it as
++// an fft_transform_type. The temporary starts at zero so that ttype receives a defined
++// value even when the stream is already in a failed state.
++template <typename _Elem, typename _Traits>
++std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream,
++                                                fft_transform_type& ttype)
++{
++    unsigned tmp = 0;
++    stream >> tmp;
++    ttype = fft_transform_type(tmp);
++    return stream;
++}
++
++// 1D case: returns (start index, end index) pairs covering `length`, one pair per
++// partition, with the partition count chosen by compute_partition_count.
++template <typename T1>
++std::vector<std::pair<T1, T1>> partition_colmajor(const T1& length)
++{
++    const auto num_parts = compute_partition_count(length);
++    return partition_base(length, num_parts);
++}
++
++// 2D case: partition on the rightmost element of the tuple, for col-major indexing.
++// Each result is a (lower corner, upper corner) pair; the leftmost element always
++// spans 0 .. std::get<0>(length).
++template <typename T1>
++std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>>
++    partition_colmajor(const std::tuple<T1, T1>& length)
++{
++    using corner = std::tuple<T1, T1>;
++
++    const auto slices = partition_base(std::get<1>(length), compute_partition_count(length));
++    std::vector<std::pair<corner, corner>> result;
++    result.reserve(slices.size());
++    for(const auto& slice : slices)
++    {
++        result.emplace_back(corner(0, slice.first), corner(std::get<0>(length), slice.second));
++    }
++    return result;
++}
++// 3D case: partition on the rightmost element of the tuple, for col-major indexing.
++// Each result is a (lower corner, upper corner) pair; the two leftmost elements
++// always span 0 .. the corresponding element of `length`.
++template <typename T1>
++std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>>
++    partition_colmajor(const std::tuple<T1, T1, T1>& length)
++{
++    using corner = std::tuple<T1, T1, T1>;
++
++    const auto slices = partition_base(std::get<2>(length), compute_partition_count(length));
++    std::vector<std::pair<corner, corner>> result;
++    result.reserve(slices.size());
++    for(const auto& slice : slices)
++    {
++        result.emplace_back(corner(0, 0, slice.first),
++                            corner(std::get<0>(length), std::get<1>(length), slice.second));
++    }
++    return result;
++}
++
++// Copy data of dimensions whole_length, laid out with strides istride and distance idist
++// between batches, into a buffer laid out with strides ostride and distance odist between
++// batches. Input and output hold the same element type. ioffset[0] and ooffset[0] are
++// added to every input and output index respectively.
++template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
++inline void copy_buffers_1to1(const Tval* input,
++                              Tval* output,
++                              const Tint1& whole_length,
++                              const size_t nbatch,
++                              const Tint2& istride,
++                              const size_t idist,
++                              const Tint3& ostride,
++                              const size_t odist,
++                              const std::vector<size_t>& ioffset,
++                              const std::vector<size_t>& ooffset)
++{
++    // when both layouts agree, the source index doubles as the destination index
++    const bool same_layout = istride == ostride && idist == odist;
++    auto slices = partition_rowmajor(whole_length);
++    for(size_t batch = 0; batch < nbatch; ++batch)
++    {
++        size_t src_base = batch * idist;
++        size_t dst_base = batch * odist;
++#ifdef _OPENMP
++#pragma omp parallel for num_threads(slices.size())
++#endif
++        for(size_t s = 0; s < slices.size(); ++s)
++        {
++            auto cursor = slices[s].first;
++            const auto limit = slices[s].second;
++            for(;;)
++            {
++                const auto src = compute_index(cursor, istride, src_base);
++                auto dst = src;
++                if(!same_layout)
++                    dst = compute_index(cursor, ostride, dst_base);
++                output[dst + ooffset[0]] = input[src + ioffset[0]];
++                if(!increment_rowmajor(cursor, limit))
++                    break;
++            }
++        }
++    }
++}
++
++// Copy data of dimensions whole_length, laid out with strides istride and distance idist
++// between batches, into a buffer laid out with strides ostride and distance odist between
++// batches. The input is planar (input0 supplies real parts, input1 imaginary parts) and
++// the output is complex interleaved. ioffset[0]/ioffset[1] are added to the indices into
++// input0/input1, and ooffset[0] to the output index.
++template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
++inline void copy_buffers_2to1(const Tval* input0,
++                              const Tval* input1,
++                              rocfft_complex<Tval>* output,
++                              const Tint1& whole_length,
++                              const size_t nbatch,
++                              const Tint2& istride,
++                              const size_t idist,
++                              const Tint3& ostride,
++                              const size_t odist,
++                              const std::vector<size_t>& ioffset,
++                              const std::vector<size_t>& ooffset)
++{
++    // when both layouts agree, the source index doubles as the destination index
++    const bool same_layout = istride == ostride && idist == odist;
++    auto slices = partition_rowmajor(whole_length);
++    for(size_t batch = 0; batch < nbatch; ++batch)
++    {
++        size_t src_base = batch * idist;
++        size_t dst_base = batch * odist;
++#ifdef _OPENMP
++#pragma omp parallel for num_threads(slices.size())
++#endif
++        for(size_t s = 0; s < slices.size(); ++s)
++        {
++            auto cursor = slices[s].first;
++            const auto limit = slices[s].second;
++            for(;;)
++            {
++                const auto src = compute_index(cursor, istride, src_base);
++                auto dst = src;
++                if(!same_layout)
++                    dst = compute_index(cursor, ostride, dst_base);
++                output[dst + ooffset[0]]
++                    = rocfft_complex<Tval>(input0[src + ioffset[0]], input1[src + ioffset[1]]);
++                if(!increment_rowmajor(cursor, limit))
++                    break;
++            }
++        }
++    }
++}
++
++// Copy data of dimensions whole_length, laid out with strides istride and distance idist
++// between batches, into buffers laid out with strides ostride and distance odist between
++// batches. The input is complex interleaved and the output is planar (output0 receives
++// real parts, output1 imaginary parts). ioffset[0] is added to the input index, and
++// ooffset[0]/ooffset[1] to the indices into output0/output1.
++template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
++inline void copy_buffers_1to2(const rocfft_complex<Tval>* input,
++                              Tval* output0,
++                              Tval* output1,
++                              const Tint1& whole_length,
++                              const size_t nbatch,
++                              const Tint2& istride,
++                              const size_t idist,
++                              const Tint3& ostride,
++                              const size_t odist,
++                              const std::vector<size_t>& ioffset,
++                              const std::vector<size_t>& ooffset)
++{
++    // when both layouts agree, the source index doubles as the destination index
++    const bool same_layout = istride == ostride && idist == odist;
++    auto slices = partition_rowmajor(whole_length);
++    for(size_t batch = 0; batch < nbatch; ++batch)
++    {
++        size_t src_base = batch * idist;
++        size_t dst_base = batch * odist;
++#ifdef _OPENMP
++#pragma omp parallel for num_threads(slices.size())
++#endif
++        for(size_t s = 0; s < slices.size(); ++s)
++        {
++            auto cursor = slices[s].first;
++            const auto limit = slices[s].second;
++            for(;;)
++            {
++                const auto src = compute_index(cursor, istride, src_base);
++                auto dst = src;
++                if(!same_layout)
++                    dst = compute_index(cursor, ostride, dst_base);
++                output0[dst + ooffset[0]] = input[src + ioffset[0]].real();
++                output1[dst + ooffset[1]] = input[src + ioffset[0]].imag();
++                if(!increment_rowmajor(cursor, limit))
++                    break;
++            }
++        }
++    }
++}
++
++// Copy data of dimensions length, laid out with strides istride and distance idist between
++// batches, into a buffer laid out with strides ostride and distance odist between batches.
++// The input type is given by itype and the output type by otype. Handled combinations:
++// identical types, interleaved -> planar, and planar -> interleaved (complex or hermitian).
++// Any other combination throws std::runtime_error.
++template <typename Tint1, typename Tint2, typename Tint3>
++inline void copy_buffers(const std::vector<hostbuf>& input,
++                         std::vector<hostbuf>& output,
++                         const Tint1& length,
++                         const size_t nbatch,
++                         const fft_precision precision,
++                         const fft_array_type itype,
++                         const Tint2& istride,
++                         const size_t idist,
++                         const fft_array_type otype,
++                         const Tint3& ostride,
++                         const size_t odist,
++                         const std::vector<size_t>& ioffset,
++                         const std::vector<size_t>& ooffset)
++{
++    // Calls fn with a scalar whose type is the real element type selected by `precision`
++    // (the value is only a type tag). Any other precision value results in no call,
++    // exactly like a switch without a default label.
++    const auto with_precision = [precision](auto&& fn) {
++        switch(precision)
++        {
++        case fft_precision_half:
++            fn(static_cast<_Float16>(0));
++            break;
++        case fft_precision_single:
++            fn(0.0f);
++            break;
++        case fft_precision_double:
++            fn(0.0);
++            break;
++        }
++    };
++
++    const bool interleaved_to_planar
++        = (itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar)
++          || (itype == fft_array_type_hermitian_interleaved
++              && otype == fft_array_type_hermitian_planar);
++    const bool planar_to_interleaved
++        = (itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved)
++          || (itype == fft_array_type_hermitian_planar
++              && otype == fft_array_type_hermitian_interleaved);
++
++    if(itype == otype)
++    {
++        switch(itype)
++        {
++        case fft_array_type_complex_interleaved:
++        case fft_array_type_hermitian_interleaved:
++            // a single buffer of interleaved complex values
++            with_precision([&](auto tag) {
++                using complex_t = rocfft_complex<decltype(tag)>;
++                copy_buffers_1to1(reinterpret_cast<const complex_t*>(input[0].data()),
++                                  reinterpret_cast<complex_t*>(output[0].data()),
++                                  length,
++                                  nbatch,
++                                  istride,
++                                  idist,
++                                  ostride,
++                                  odist,
++                                  ioffset,
++                                  ooffset);
++            });
++            break;
++        case fft_array_type_real:
++        case fft_array_type_complex_planar:
++        case fft_array_type_hermitian_planar:
++            // each buffer in `input` is copied to the matching output buffer as real scalars
++            for(unsigned int buf = 0; buf < input.size(); ++buf)
++            {
++                with_precision([&](auto tag) {
++                    using real_t = decltype(tag);
++                    copy_buffers_1to1(reinterpret_cast<const real_t*>(input[buf].data()),
++                                      reinterpret_cast<real_t*>(output[buf].data()),
++                                      length,
++                                      nbatch,
++                                      istride,
++                                      idist,
++                                      ostride,
++                                      odist,
++                                      ioffset,
++                                      ooffset);
++                });
++            }
++            break;
++        default:
++            throw std::runtime_error("Invalid data type");
++        }
++    }
++    else if(interleaved_to_planar)
++    {
++        // copy 1to2
++        with_precision([&](auto tag) {
++            using real_t = decltype(tag);
++            copy_buffers_1to2(reinterpret_cast<const rocfft_complex<real_t>*>(input[0].data()),
++                              reinterpret_cast<real_t*>(output[0].data()),
++                              reinterpret_cast<real_t*>(output[1].data()),
++                              length,
++                              nbatch,
++                              istride,
++                              idist,
++                              ostride,
++                              odist,
++                              ioffset,
++                              ooffset);
++        });
++    }
++    else if(planar_to_interleaved)
++    {
++        // copy 2 to 1
++        with_precision([&](auto tag) {
++            using real_t = decltype(tag);
++            copy_buffers_2to1(reinterpret_cast<const real_t*>(input[0].data()),
++                              reinterpret_cast<const real_t*>(input[1].data()),
++                              reinterpret_cast<rocfft_complex<real_t>*>(output[0].data()),
++                              length,
++                              nbatch,
++                              istride,
++                              idist,
++                              ostride,
++                              odist,
++                              ioffset,
++                              ooffset);
++        });
++    }
++    else
++    {
++        throw std::runtime_error("Invalid input and output types.");
++    }
++}
++
++// Unroll arbitrary-dimension copy_buffers into specializations for 1-, 2-, 3-dimensions.
++// 1D forwards the scalar length/strides; 2D and 3D pack them into tuples. Any other
++// number of dimensions calls abort().
++template <typename Tint1, typename Tint2, typename Tint3>
++inline void copy_buffers(const std::vector<hostbuf>& input,
++                         std::vector<hostbuf>& output,
++                         const std::vector<Tint1>& length,
++                         const size_t nbatch,
++                         const fft_precision precision,
++                         const fft_array_type itype,
++                         const std::vector<Tint2>& istride,
++                         const size_t idist,
++                         const fft_array_type otype,
++                         const std::vector<Tint3>& ostride,
++                         const size_t odist,
++                         const std::vector<size_t>& ioffset,
++                         const std::vector<size_t>& ooffset)
++{
++    const auto rank = length.size();
++    if(rank == 1)
++    {
++        copy_buffers(input,
++                     output,
++                     length[0],
++                     nbatch,
++                     precision,
++                     itype,
++                     istride[0],
++                     idist,
++                     otype,
++                     ostride[0],
++                     odist,
++                     ioffset,
++                     ooffset);
++    }
++    else if(rank == 2)
++    {
++        copy_buffers(input,
++                     output,
++                     std::make_tuple(length[0], length[1]),
++                     nbatch,
++                     precision,
++                     itype,
++                     std::make_tuple(istride[0], istride[1]),
++                     idist,
++                     otype,
++                     std::make_tuple(ostride[0], ostride[1]),
++                     odist,
++                     ioffset,
++                     ooffset);
++    }
++    else if(rank == 3)
++    {
++        copy_buffers(input,
++                     output,
++                     std::make_tuple(length[0], length[1], length[2]),
++                     nbatch,
++                     precision,
++                     itype,
++                     std::make_tuple(istride[0], istride[1], istride[2]),
++                     idist,
++                     otype,
++                     std::make_tuple(ostride[0], ostride[1], ostride[2]),
++                     odist,
++                     ioffset,
++                     ooffset);
++    }
++    else
++    {
++        abort();
++    }
++}
++
++// Compute the L-infinity and L-2 distance between two buffers with strides istride and
++// length idist between batches to a buffer with strides ostride and length odist between
++// batches. Both buffers are of complex type.
++
++struct VectorNorms // Result pair returned by the distance routines in this file.
++{
++ double l_2 = 0.0, l_inf = 0.0; // L-2 and L-infinity distances; both default to zero
++};
++
++// Compute the L-infinity and L-2 distance between two complex buffers. `input` is walked
++// with strides istride and batch distance idist, `output` with ostride and odist. Output
++// values are multiplied by output_scalar before being compared.
++// When linf_failures is non-null, (batch, input index) pairs are appended to it for every
++// component visited while the partition's running maximum exceeds linf_cutoff.
++// NOTE(review): the cutoff test uses the running maximum rather than the current
++// difference, so every entry after the first failure in a partition is recorded too -
++// confirm this is intended.
++// NOTE(review): this routine partitions with partition_colmajor but steps with
++// increment_rowmajor, unlike its siblings which use partition_rowmajor - confirm intended.
++template <typename Tcomplex, typename Tint1, typename Tint2, typename Tint3>
++inline VectorNorms distance_1to1_complex(const Tcomplex* input,
++                                         const Tcomplex* output,
++                                         const Tint1& whole_length,
++                                         const size_t nbatch,
++                                         const Tint2& istride,
++                                         const size_t idist,
++                                         const Tint3& ostride,
++                                         const size_t odist,
++                                         std::vector<std::pair<size_t, size_t>>* linf_failures,
++                                         const double linf_cutoff,
++                                         const std::vector<size_t>& ioffset,
++                                         const std::vector<size_t>& ooffset,
++                                         const double output_scalar = 1.0)
++{
++    double linf = 0.0;
++    double l2 = 0.0;
++
++    // guards appends to *linf_failures from concurrently running partitions
++    std::mutex linf_failure_lock;
++
++    const bool idx_equals_odx = istride == ostride && idist == odist;
++    size_t idx_base = 0;
++    size_t odx_base = 0;
++    auto partitions = partition_colmajor(whole_length);
++    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
++    {
++#ifdef _OPENMP
++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
++#endif
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            // Failures of this partition only. Keeping the list local to the iteration
++            // means entries already merged into *linf_failures are never merged again.
++            std::vector<std::pair<size_t, size_t>> linf_failures_private;
++
++            double cur_linf = 0.0;
++            double cur_l2 = 0.0;
++            auto index = partitions[part].first;
++            const auto length = partitions[part].second;
++
++            do
++            {
++                const auto idx = compute_index(index, istride, idx_base);
++                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
++
++                // real component
++                const double rdiff
++                    = std::abs(static_cast<double>(output[odx + ooffset[0]].real()) * output_scalar
++                               - static_cast<double>(input[idx + ioffset[0]].real()));
++                cur_linf = std::max(rdiff, cur_linf);
++                if(linf_failures && cur_linf > linf_cutoff)
++                    linf_failures_private.emplace_back(b, idx);
++                cur_l2 += rdiff * rdiff;
++
++                // imaginary component
++                const double idiff
++                    = std::abs(static_cast<double>(output[odx + ooffset[0]].imag()) * output_scalar
++                               - static_cast<double>(input[idx + ioffset[0]].imag()));
++                cur_linf = std::max(idiff, cur_linf);
++                if(linf_failures && cur_linf > linf_cutoff)
++                    linf_failures_private.emplace_back(b, idx);
++                cur_l2 += idiff * idiff;
++
++            } while(increment_rowmajor(index, length));
++            linf = std::max(linf, cur_linf);
++            l2 += cur_l2;
++
++            if(linf_failures)
++            {
++                // RAII lock: released even if the append throws
++                const std::lock_guard<std::mutex> guard(linf_failure_lock);
++                linf_failures->insert(linf_failures->end(),
++                                      linf_failures_private.begin(),
++                                      linf_failures_private.end());
++            }
++        }
++    }
++    return {.l_2 = sqrt(l2), .l_inf = linf};
++}
++
++// Compute the L-infinity and L-2 distance between two real buffers. `input` is walked with
++// strides istride and batch distance idist, `output` with ostride and odist. Output values
++// are multiplied by output_scalar before being compared.
++// When linf_failures is non-null, (batch, input index) pairs are appended to it for every
++// element visited while the partition's running maximum exceeds linf_cutoff.
++// NOTE(review): the cutoff test uses the running maximum rather than the current
++// difference, so every entry after the first failure in a partition is recorded too -
++// confirm this is intended.
++template <typename Tfloat, typename Tint1, typename Tint2, typename Tint3>
++inline VectorNorms distance_1to1_real(const Tfloat* input,
++                                      const Tfloat* output,
++                                      const Tint1& whole_length,
++                                      const size_t nbatch,
++                                      const Tint2& istride,
++                                      const size_t idist,
++                                      const Tint3& ostride,
++                                      const size_t odist,
++                                      std::vector<std::pair<size_t, size_t>>* linf_failures,
++                                      const double linf_cutoff,
++                                      const std::vector<size_t>& ioffset,
++                                      const std::vector<size_t>& ooffset,
++                                      const double output_scalar = 1.0)
++{
++    double linf = 0.0;
++    double l2 = 0.0;
++
++    // guards appends to *linf_failures from concurrently running partitions
++    std::mutex linf_failure_lock;
++
++    const bool idx_equals_odx = istride == ostride && idist == odist;
++    size_t idx_base = 0;
++    size_t odx_base = 0;
++    auto partitions = partition_rowmajor(whole_length);
++    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
++    {
++#ifdef _OPENMP
++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
++#endif
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            // Failures of this partition only. Keeping the list local to the iteration
++            // means entries already merged into *linf_failures are never merged again.
++            std::vector<std::pair<size_t, size_t>> linf_failures_private;
++
++            double cur_linf = 0.0;
++            double cur_l2 = 0.0;
++            auto index = partitions[part].first;
++            const auto length = partitions[part].second;
++            do
++            {
++                const auto idx = compute_index(index, istride, idx_base);
++                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
++                const double diff
++                    = std::abs(static_cast<double>(output[odx + ooffset[0]]) * output_scalar
++                               - static_cast<double>(input[idx + ioffset[0]]));
++                cur_linf = std::max(diff, cur_linf);
++                if(linf_failures && cur_linf > linf_cutoff)
++                    linf_failures_private.emplace_back(b, idx);
++                cur_l2 += diff * diff;
++
++            } while(increment_rowmajor(index, length));
++            linf = std::max(linf, cur_linf);
++            l2 += cur_l2;
++
++            if(linf_failures)
++            {
++                // RAII lock: released even if the append throws
++                const std::lock_guard<std::mutex> guard(linf_failure_lock);
++                linf_failures->insert(linf_failures->end(),
++                                      linf_failures_private.begin(),
++                                      linf_failures_private.end());
++            }
++        }
++    }
++    return {.l_2 = sqrt(l2), .l_inf = linf};
++}
++
++// Compute the L-infinity and L-2 distance between a complex-interleaved buffer (`input`)
++// and a complex-planar pair (output0 holds real parts, output1 imaginary parts). `input`
++// is walked with strides istride and batch distance idist, the planar buffers with
++// ostride and odist. Planar values are multiplied by output_scalar before being compared.
++// ioffset[0] is added to the input index, ooffset[0]/ooffset[1] to the output0/output1 indices.
++// When linf_failures is non-null, (batch, input index) pairs are appended to it for every
++// component visited while the partition's running maximum exceeds linf_cutoff.
++// NOTE(review): the cutoff test uses the running maximum rather than the current
++// difference, so every entry after the first failure in a partition is recorded too -
++// confirm this is intended.
++template <typename Tval, typename Tint1, typename T2, typename T3>
++inline VectorNorms distance_1to2(const rocfft_complex<Tval>* input,
++                                 const Tval* output0,
++                                 const Tval* output1,
++                                 const Tint1& whole_length,
++                                 const size_t nbatch,
++                                 const T2& istride,
++                                 const size_t idist,
++                                 const T3& ostride,
++                                 const size_t odist,
++                                 std::vector<std::pair<size_t, size_t>>* linf_failures,
++                                 const double linf_cutoff,
++                                 const std::vector<size_t>& ioffset,
++                                 const std::vector<size_t>& ooffset,
++                                 const double output_scalar = 1.0)
++{
++    double linf = 0.0;
++    double l2 = 0.0;
++
++    // guards appends to *linf_failures from concurrently running partitions
++    std::mutex linf_failure_lock;
++
++    const bool idx_equals_odx = istride == ostride && idist == odist;
++    size_t idx_base = 0;
++    size_t odx_base = 0;
++    auto partitions = partition_rowmajor(whole_length);
++    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
++    {
++#ifdef _OPENMP
++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
++#endif
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            // Failures of this partition only. Keeping the list local to the iteration
++            // means entries already merged into *linf_failures are never merged again.
++            std::vector<std::pair<size_t, size_t>> linf_failures_private;
++
++            double cur_linf = 0.0;
++            double cur_l2 = 0.0;
++            auto index = partitions[part].first;
++            const auto length = partitions[part].second;
++            do
++            {
++                const auto idx = compute_index(index, istride, idx_base);
++                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
++
++                // real component
++                const double rdiff
++                    = std::abs(static_cast<double>(output0[odx + ooffset[0]]) * output_scalar
++                               - static_cast<double>(input[idx + ioffset[0]].real()));
++                cur_linf = std::max(rdiff, cur_linf);
++                if(linf_failures && cur_linf > linf_cutoff)
++                    linf_failures_private.emplace_back(b, idx);
++                cur_l2 += rdiff * rdiff;
++
++                // imaginary component
++                const double idiff
++                    = std::abs(static_cast<double>(output1[odx + ooffset[1]]) * output_scalar
++                               - static_cast<double>(input[idx + ioffset[0]].imag()));
++                cur_linf = std::max(idiff, cur_linf);
++                if(linf_failures && cur_linf > linf_cutoff)
++                    linf_failures_private.emplace_back(b, idx);
++                cur_l2 += idiff * idiff;
++
++            } while(increment_rowmajor(index, length));
++            linf = std::max(linf, cur_linf);
++            l2 += cur_l2;
++
++            if(linf_failures)
++            {
++                // RAII lock: released even if the append throws
++                const std::lock_guard<std::mutex> guard(linf_failure_lock);
++                linf_failures->insert(linf_failures->end(),
++                                      linf_failures_private.begin(),
++                                      linf_failures_private.end());
++            }
++        }
++    }
++    return {.l_2 = sqrt(l2), .l_inf = linf};
++}
++
++// Compute the L-infinity and L-2 distance between two buffers of dimension length and
++// with types given by itype, otype, and precision. Handled combinations: identical types,
++// interleaved -> planar, and planar -> interleaved (complex or hermitian). Any other
++// combination throws std::runtime_error. Squared L-2 contributions are accumulated in
++// dist.l_2 and the square root is taken once at the end.
++template <typename Tint1, typename Tint2, typename Tint3>
++inline VectorNorms distance(const std::vector<hostbuf>& input,
++                            const std::vector<hostbuf>& output,
++                            const Tint1& length,
++                            const size_t nbatch,
++                            const fft_precision precision,
++                            const fft_array_type itype,
++                            const Tint2& istride,
++                            const size_t idist,
++                            const fft_array_type otype,
++                            const Tint3& ostride,
++                            const size_t odist,
++                            std::vector<std::pair<size_t, size_t>>* linf_failures,
++                            const double linf_cutoff,
++                            const std::vector<size_t>& ioffset,
++                            const std::vector<size_t>& ooffset,
++                            const double output_scalar = 1.0)
++{
++    // Calls fn with a scalar whose type is the real element type selected by `precision`
++    // (the value is only a type tag). Any other precision value results in no call,
++    // exactly like a switch without a default label.
++    const auto with_precision = [precision](auto&& fn) {
++        switch(precision)
++        {
++        case fft_precision_half:
++            fn(static_cast<_Float16>(0));
++            break;
++        case fft_precision_single:
++            fn(0.0f);
++            break;
++        case fft_precision_double:
++            fn(0.0);
++            break;
++        }
++    };
++
++    const bool interleaved_to_planar
++        = (itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar)
++          || (itype == fft_array_type_hermitian_interleaved
++              && otype == fft_array_type_hermitian_planar);
++    const bool planar_to_interleaved
++        = (itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved)
++          || (itype == fft_array_type_hermitian_planar
++              && otype == fft_array_type_hermitian_interleaved);
++
++    VectorNorms dist;
++
++    if(itype == otype)
++    {
++        switch(itype)
++        {
++        case fft_array_type_complex_interleaved:
++        case fft_array_type_hermitian_interleaved:
++            with_precision([&](auto tag) {
++                using complex_t = rocfft_complex<decltype(tag)>;
++                dist = distance_1to1_complex(reinterpret_cast<const complex_t*>(input[0].data()),
++                                             reinterpret_cast<const complex_t*>(output[0].data()),
++                                             length,
++                                             nbatch,
++                                             istride,
++                                             idist,
++                                             ostride,
++                                             odist,
++                                             linf_failures,
++                                             linf_cutoff,
++                                             ioffset,
++                                             ooffset,
++                                             output_scalar);
++            });
++            // back to the squared norm; the square root is taken once at the end
++            dist.l_2 *= dist.l_2;
++            break;
++        case fft_array_type_real:
++        case fft_array_type_complex_planar:
++        case fft_array_type_hermitian_planar:
++            // each buffer is compared as real scalars and the results are combined
++            for(unsigned int buf = 0; buf < input.size(); ++buf)
++            {
++                VectorNorms part;
++                with_precision([&](auto tag) {
++                    using real_t = decltype(tag);
++                    part = distance_1to1_real(reinterpret_cast<const real_t*>(input[buf].data()),
++                                              reinterpret_cast<const real_t*>(output[buf].data()),
++                                              length,
++                                              nbatch,
++                                              istride,
++                                              idist,
++                                              ostride,
++                                              odist,
++                                              linf_failures,
++                                              linf_cutoff,
++                                              ioffset,
++                                              ooffset,
++                                              output_scalar);
++                });
++                dist.l_inf = std::max(part.l_inf, dist.l_inf);
++                dist.l_2 += part.l_2 * part.l_2;
++            }
++            break;
++        default:
++            throw std::runtime_error("Invalid input and output types.");
++        }
++    }
++    else if(interleaved_to_planar)
++    {
++        with_precision([&](auto tag) {
++            using real_t = decltype(tag);
++            dist = distance_1to2(reinterpret_cast<const rocfft_complex<real_t>*>(input[0].data()),
++                                 reinterpret_cast<const real_t*>(output[0].data()),
++                                 reinterpret_cast<const real_t*>(output[1].data()),
++                                 length,
++                                 nbatch,
++                                 istride,
++                                 idist,
++                                 ostride,
++                                 odist,
++                                 linf_failures,
++                                 linf_cutoff,
++                                 ioffset,
++                                 ooffset,
++                                 output_scalar);
++        });
++        dist.l_2 *= dist.l_2;
++    }
++    else if(planar_to_interleaved)
++    {
++        // Same helper with the roles reversed: the interleaved output takes the "input"
++        // slot and the planar input buffers take the "output" slots, with strides and
++        // dists swapped accordingly.
++        // NOTE(review): ioffset/ooffset are passed in their original order even though
++        // the buffers are swapped - confirm this is intended.
++        with_precision([&](auto tag) {
++            using real_t = decltype(tag);
++            dist = distance_1to2(reinterpret_cast<const rocfft_complex<real_t>*>(output[0].data()),
++                                 reinterpret_cast<const real_t*>(input[0].data()),
++                                 reinterpret_cast<const real_t*>(input[1].data()),
++                                 length,
++                                 nbatch,
++                                 ostride,
++                                 odist,
++                                 istride,
++                                 idist,
++                                 linf_failures,
++                                 linf_cutoff,
++                                 ioffset,
++                                 ooffset,
++                                 output_scalar);
++        });
++        dist.l_2 *= dist.l_2;
++    }
++    else
++    {
++        throw std::runtime_error("Invalid input and output types.");
++    }
++    dist.l_2 = sqrt(dist.l_2);
++    return dist;
++}
++
++// Check whether the given length + stride/dist describe a contiguous row-major layout:
++// walking from the last (fastest) dimension backwards, each stride must equal the product
++// of the lengths of all faster dimensions, and that product over all visited dimensions
++// must equal dist. Only as many dimensions as both vectors have are examined.
++template <typename Tint1, typename Tint2>
++bool is_contiguous_rowmajor(const std::vector<Tint1>& length,
++                            const std::vector<Tint2>& stride,
++                            size_t dist)
++{
++    const size_t dims = std::min(length.size(), stride.size());
++    size_t next_stride = 1;
++    for(size_t k = 1; k <= dims; ++k)
++    {
++        // k-th dimension counted from the back of each vector
++        if(stride[stride.size() - k] != next_stride)
++            return false;
++        next_stride *= length[length.size() - k];
++    }
++    return next_stride == dist;
++}
++
++// Unroll arbitrary-dimension distance into the 1-, 2- and 3-dimensional overloads; any other rank calls abort().
++template <typename Tint1, typename Tint2, typename Tint3>
++inline VectorNorms distance(const std::vector<hostbuf>& input,
++ const std::vector<hostbuf>& output,
++ std::vector<Tint1> length, // by value: may be collapsed to a single dimension below
++ size_t nbatch, // by value: reset to 1 when the dimensions are collapsed
++ const fft_precision precision,
++ const fft_array_type itype,
++ std::vector<Tint2> istride, // by value: replaced by a unit stride when collapsed
++ const size_t idist,
++ const fft_array_type otype,
++ std::vector<Tint3> ostride, // by value: replaced by a unit stride when collapsed
++ const size_t odist,
++ std::vector<std::pair<size_t, size_t>>* linf_failures,
++ const double linf_cutoff,
++ const std::vector<size_t>& ioffset,
++ const std::vector<size_t>& ooffset,
++ const double output_scalar = 1.0)
++{
++ // If istride and ostride are both contiguous (per is_contiguous_rowmajor),
++ // fold every dimension and the batch count into one 1D length with unit
++ // strides. Index calculation is simpler (and faster) in the 1D case.
++ if(is_contiguous_rowmajor(length, istride, idist)
++ && is_contiguous_rowmajor(length, ostride, odist))
++ {
++ length = {product(length.begin(), length.end()) * nbatch};
++ istride = {static_cast<Tint2>(1)};
++ ostride = {static_cast<Tint3>(1)};
++ nbatch = 1;
++ }
++
++ switch(length.size())
++ {
++ case 1: // scalar length and strides
++ return distance(input,
++ output,
++ length[0],
++ nbatch,
++ precision,
++ itype,
++ istride[0],
++ idist,
++ otype,
++ ostride[0],
++ odist,
++ linf_failures,
++ linf_cutoff,
++ ioffset,
++ ooffset,
++ output_scalar);
++ case 2: // length and strides packed into 2-tuples
++ return distance(input,
++ output,
++ std::make_tuple(length[0], length[1]),
++ nbatch,
++ precision,
++ itype,
++ std::make_tuple(istride[0], istride[1]),
++ idist,
++ otype,
++ std::make_tuple(ostride[0], ostride[1]),
++ odist,
++ linf_failures,
++ linf_cutoff,
++ ioffset,
++ ooffset,
++ output_scalar);
++ case 3: // length and strides packed into 3-tuples
++ return distance(input,
++ output,
++ std::make_tuple(length[0], length[1], length[2]),
++ nbatch,
++ precision,
++ itype,
++ std::make_tuple(istride[0], istride[1], istride[2]),
++ idist,
++ otype,
++ std::make_tuple(ostride[0], ostride[1], ostride[2]),
++ odist,
++ linf_failures,
++ linf_cutoff,
++ ioffset,
++ ooffset,
++ output_scalar);
++ default:
++ abort(); // length.size() is not 1, 2 or 3
++ }
++}
++
++// Compute the L-infinity and L-2 norm of a buffer with strides istride and
++// batch distance idist.  Data is rocfft_complex.
++template <typename Tcomplex, typename T1, typename T2>
++inline VectorNorms norm_complex(const Tcomplex*            input,
++                                const T1&                  whole_length,
++                                const size_t               nbatch,
++                                const T2&                  istride,
++                                const size_t               idist,
++                                const std::vector<size_t>& offset)
++{
++    double linf = 0.0;
++    double l2   = 0.0;
++
++    auto partitions = partition_rowmajor(whole_length);
++    for(size_t batch = 0; batch < nbatch; ++batch)
++    {
++        size_t idx_base = batch * idist;
++#ifdef _OPENMP
++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
++#endif
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            double     part_linf = 0.0;
++            double     part_l2   = 0.0;
++            auto       index     = partitions[part].first;
++            const auto length    = partitions[part].second;
++            do
++            {
++                const auto  pos  = compute_index(index, istride, idx_base) + offset[0];
++                const auto& elem = input[pos];
++                // real component
++                const double re = std::abs(static_cast<double>(elem.real()));
++                part_linf       = std::max(re, part_linf);
++                part_l2 += re * re;
++                // imaginary component
++                const double im = std::abs(static_cast<double>(elem.imag()));
++                part_linf       = std::max(im, part_linf);
++                part_l2 += im * im;
++            } while(increment_rowmajor(index, length));
++            linf = std::max(linf, part_linf);
++            l2 += part_l2;
++        }
++    }
++    return {.l_2 = sqrt(l2), .l_inf = linf};
++}
++
++// Compute the L-infinity and L-2 norm of a buffer with strides istride and
++// batch distance idist.  Data is real-valued.
++template <typename Tfloat, typename T1, typename T2>
++inline VectorNorms norm_real(const Tfloat*              input,
++                             const T1&                  whole_length,
++                             const size_t               nbatch,
++                             const T2&                  istride,
++                             const size_t               idist,
++                             const std::vector<size_t>& offset)
++{
++    double linf = 0.0;
++    double l2   = 0.0;
++
++    auto partitions = partition_rowmajor(whole_length);
++    for(size_t batch = 0; batch < nbatch; ++batch)
++    {
++        size_t idx_base = batch * idist;
++#ifdef _OPENMP
++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
++#endif
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            double     part_linf = 0.0;
++            double     part_l2   = 0.0;
++            auto       index     = partitions[part].first;
++            const auto length    = partitions[part].second;
++            do
++            {
++                // every element is read at its computed index plus offset[0]
++                const auto   pos = compute_index(index, istride, idx_base) + offset[0];
++                const double mag = std::abs(static_cast<double>(input[pos]));
++                part_linf        = std::max(mag, part_linf);
++                part_l2 += mag * mag;
++            } while(increment_rowmajor(index, length));
++            linf = std::max(linf, part_linf);
++            l2 += part_l2;
++        }
++    }
++    return {.l_2 = sqrt(l2), .l_inf = linf};
++}
++
++// Compute the L-infinity and L-2 norm of a buffer with strides istride and
++// batch distance idist. Data format is given by precision and itype.
++template <typename T1, typename T2>
++inline VectorNorms norm(const std::vector<hostbuf>& input,
++ const T1& length,
++ const size_t nbatch,
++ const fft_precision precision,
++ const fft_array_type itype,
++ const T2& istride,
++ const size_t idist,
++ const std::vector<size_t>& offset)
++{
++ VectorNorms norm; // l_2 holds a squared value until the sqrt just before returning
++
++ switch(itype)
++ {
++ case fft_array_type_complex_interleaved:
++ case fft_array_type_hermitian_interleaved:
++ switch(precision) // interleaved data: only input[0] is read
++ {
++ case fft_precision_half:
++ norm = norm_complex(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
++ length,
++ nbatch,
++ istride,
++ idist,
++ offset);
++ break;
++ case fft_precision_single:
++ norm = norm_complex(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
++ length,
++ nbatch,
++ istride,
++ idist,
++ offset);
++ break;
++ case fft_precision_double:
++ norm = norm_complex(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
++ length,
++ nbatch,
++ istride,
++ idist,
++ offset);
++ break;
++ }
++ norm.l_2 *= norm.l_2; // square it so the shared sqrt below restores the value
++ break;
++ case fft_array_type_real:
++ case fft_array_type_complex_planar:
++ case fft_array_type_hermitian_planar:
++ for(unsigned int idx = 0; idx < input.size(); ++idx) // one pass per buffer in input
++ {
++ VectorNorms n;
++ switch(precision) // NOTE(review): norm_real reads only offset[0], so all buffers share it - confirm
++ {
++ case fft_precision_half:
++ n = norm_real(reinterpret_cast<const _Float16*>(input[idx].data()),
++ length,
++ nbatch,
++ istride,
++ idist,
++ offset);
++ break;
++ case fft_precision_single:
++ n = norm_real(reinterpret_cast<const float*>(input[idx].data()),
++ length,
++ nbatch,
++ istride,
++ idist,
++ offset);
++ break;
++ case fft_precision_double:
++ n = norm_real(reinterpret_cast<const double*>(input[idx].data()),
++ length,
++ nbatch,
++ istride,
++ idist,
++ offset);
++ break;
++ }
++ norm.l_inf = std::max(n.l_inf, norm.l_inf); // largest magnitude over all buffers
++ norm.l_2 += n.l_2 * n.l_2; // accumulate squared per-buffer L-2 norms
++ }
++ break;
++ default:
++ throw std::runtime_error("Invalid data type");
++ }
++
++ norm.l_2 = sqrt(norm.l_2);
++ return norm;
++}
++
++// Unroll arbitrary-dimension norm into the 1-, 2- and 3-dimensional overloads; any other rank calls abort().
++template <typename T1, typename T2>
++inline VectorNorms norm(const std::vector<hostbuf>& input,
++ std::vector<T1> length, // by value: may be collapsed to a single dimension below
++ size_t nbatch, // by value: reset to 1 when the dimensions are collapsed
++ const fft_precision precision,
++ const fft_array_type type,
++ std::vector<T2> stride, // by value: replaced by a unit stride when collapsed
++ const size_t dist,
++ const std::vector<size_t>& offset)
++{
++ // If stride is contiguous (per is_contiguous_rowmajor), fold every dimension and
++ // the batch count into one 1D length. Index calculation is simpler (and faster) in the 1D case.
++ if(is_contiguous_rowmajor(length, stride, dist))
++ {
++ length = {product(length.begin(), length.end()) * nbatch};
++ stride = {static_cast<T2>(1)};
++ nbatch = 1;
++ }
++
++ switch(length.size())
++ {
++ case 1: // scalar length and stride
++ return norm(input, length[0], nbatch, precision, type, stride[0], dist, offset);
++ case 2: // length and stride packed into 2-tuples
++ return norm(input,
++ std::make_tuple(length[0], length[1]),
++ nbatch,
++ precision,
++ type,
++ std::make_tuple(stride[0], stride[1]),
++ dist,
++ offset);
++ case 3: // length and stride packed into 3-tuples
++ return norm(input,
++ std::make_tuple(length[0], length[1], length[2]),
++ nbatch,
++ precision,
++ type,
++ std::make_tuple(stride[0], stride[1], stride[2]),
++ dist,
++ offset);
++ default:
++ abort(); // length.size() is not 1, 2 or 3
++ }
++}
++
++// Allocate one host buffer per entry of size: buffer i is allocated with
++// size[i] * var_size<size_t>(precision, type).
++static std::vector<hostbuf> allocate_host_buffer(const fft_precision        precision,
++                                                 const fft_array_type       type,
++                                                 const std::vector<size_t>& size)
++{
++    std::vector<hostbuf> result(size.size());
++    // Walk the requested sizes and the freshly created buffers in lockstep.
++    auto dst = result.begin();
++    for(auto src = size.begin(); src != size.end(); ++src, ++dst)
++        dst->alloc(*src * var_size<size_t>(precision, type));
++    return result;
++}
++
++// Check if the required buffers fit in the device vram.  deviceId is accepted but not used.
++inline bool vram_fits_problem(const size_t prob_size, const size_t vram_avail, int deviceId = 0)
++{
++    // We keep a small margin of error for fitting the problem into vram:
++    const size_t extra = size_t(1) << 27;
++    // Subtract instead of adding: prob_size + extra can wrap for huge sizes and fake a fit.
++    return vram_avail > extra && vram_avail - extra > prob_size;
++}
++
++// Computes the twiddle table VRAM footprint for r2c/c2r transforms.
++// Returns 0 for every other transform type, since the VRAM footprint
++// in rocFFT is negligible for those cases.
++inline size_t twiddle_table_vram_footprint(const fft_params& params)
++{
++    // Only real-forward and real-inverse transforms are counted.
++    const bool real_transform = params.transform_type == fft_transform_type_real_forward
++                                || params.transform_type == fft_transform_type_real_inverse;
++    if(!real_transform)
++        return 0;
++
++    // Only an even last dimension adds the real/complex even twiddle buffer.
++    const auto realdim = params.length.back();
++    if(realdim % 2 != 0)
++        return 0;
++
++    // even length twiddle size is 1/4 of the real size, but
++    // in complex elements (sized 8 for single precision, 16 otherwise)
++    const auto complex_size = params.precision == fft_precision_single ? 8 : 16;
++    size_t     footprint    = 0;
++    footprint += realdim * complex_size / 4;
++    return footprint;
++}
++
++#endif
+diff --git a/shared/fftw_transform.h b/shared/fftw_transform.h
+new file mode 100644
+index 0000000..873a373
+--- /dev/null
++++ b/shared/fftw_transform.h
+@@ -0,0 +1,493 @@
++// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++#ifndef FFTWTRANSFORM_H
++#define FFTWTRANSFORM_H
++
++#include "hostbuf.h"
++#include "rocfft_complex.h"
++#include "test_params.h"
++#include <fftw3.h>
++#include <vector>
++
++// Function to return maximum error for float and double types.
++//
++// Following Schatzman (1996; Accuracy of the Discrete Fourier
++// Transform and the Fast Fourier Transform), the shape of relative
++// l_2 error vs length should look like
++//
++// epsilon * sqrt(log2(length)).
++//
++// The magic epsilon constants below were chosen so that we get a
++// reasonable upper bound for (all of) our tests.
++//
++// For rocFFT, prime lengths result in the highest error. As such,
++// the epsilons below are perhaps too loose for pow2 lengths; but they
++// are appropriate for prime lengths.
++template <typename Tfloat>
++inline double type_epsilon(); // declared only; each supported type is specialized below
++template <>
++inline double type_epsilon<_Float16>()
++{
++ return half_epsilon; // epsilon used for half precision
++}
++template <>
++inline double type_epsilon<float>()
++{
++ return single_epsilon; // epsilon used for single precision
++}
++template <>
++inline double type_epsilon<double>()
++{
++ return double_epsilon; // epsilon used for double precision
++}
++
++// C++ traits to translate _Float16/float->fftwf_complex and
++// double->fftw_complex, together with the matching FFTW plan type.
++// The correct FFTW complex type can be accessed via, for example,
++// using complex_t = typename fftw_trait<Tfloat>::fftw_complex_type;
++template <typename Tfloat>
++struct fftw_trait;
++template <>
++struct fftw_trait<_Float16>
++{
++ // fftw does not support half precision, so use single precision and convert
++ using fftw_complex_type = fftwf_complex;
++ using fftw_plan_type = fftwf_plan;
++};
++template <>
++struct fftw_trait<float>
++{
++ using fftw_complex_type = fftwf_complex;
++ using fftw_plan_type = fftwf_plan;
++};
++template <>
++struct fftw_trait<double>
++{
++ using fftw_complex_type = fftw_complex;
++ using fftw_plan_type = fftw_plan;
++};
++
++// Widens a half-precision input buffer into a new single-precision buffer.
++// The input is already sized like a single-precision buffer (only half of it
++// is filled), because we allocate a single-precision buffer for FFTW to plan with.
++static hostbuf half_to_single_copy(const hostbuf& in)
++{
++    auto       widened    = in.copy();
++    const auto half_count = in.size() / sizeof(_Float16) / 2;
++    std::copy_n(
++        reinterpret_cast<const _Float16*>(in.data()), half_count, reinterpret_cast<float*>(widened.data()));
++    return widened;
++}
++
++// Convert a buffer of TfloatIn values to TfloatOut in place, then shrink the buffer to match.
++template <typename TfloatIn, typename TfloatOut>
++void narrow_precision_inplace(hostbuf& in)
++{
++    // only valid when the destination type is strictly smaller
++    static_assert(sizeof(TfloatIn) > sizeof(TfloatOut));
++
++    const auto  count = in.size() / sizeof(TfloatIn);
++    const auto* src   = reinterpret_cast<const TfloatIn*>(in.data());
++    std::copy_n(src, count, reinterpret_cast<TfloatOut*>(in.data()));
++    in.shrink(in.size() / (sizeof(TfloatIn) / sizeof(TfloatOut)));
++}
++
++static void single_to_half_inplace(hostbuf& in) // narrows float data to _Float16 within the same buffer
++{
++ narrow_precision_inplace<float, _Float16>(in);
++}
++
++// Template wrappers for real-valued FFTW allocators; only float and double are specialized here:
++template <typename Tfloat>
++inline Tfloat* fftw_alloc_real_type(size_t n);
++template <>
++inline float* fftw_alloc_real_type<float>(size_t n)
++{
++ return fftwf_alloc_real(n); // single-precision FFTW allocator
++}
++template <>
++inline double* fftw_alloc_real_type<double>(size_t n)
++{
++ return fftw_alloc_real(n); // double-precision FFTW allocator
++}
++
++// FFTW complex-array allocators selected by floating-point type (float and double only):
++template <typename Tfloat>
++inline typename fftw_trait<Tfloat>::fftw_complex_type* fftw_alloc_complex_type(size_t n);
++template <>
++inline fftwf_complex* fftw_alloc_complex_type<float>(size_t n)
++{
++    return fftwf_alloc_complex(n);
++}
++template <>
++inline fftw_complex* fftw_alloc_complex_type<double>(size_t n)
++{
++    return fftw_alloc_complex(n);
++}
++
++template <typename fftw_type>
++inline fftw_type* fftw_alloc_type(size_t n); // allocator keyed on the element type itself
++template <>
++inline float* fftw_alloc_type<float>(size_t n)
++{
++    return fftwf_alloc_real(n);
++}
++template <>
++inline double* fftw_alloc_type<double>(size_t n)
++{
++    return fftw_alloc_real(n);
++}
++template <>
++inline fftwf_complex* fftw_alloc_type<fftwf_complex>(size_t n)
++{
++    return fftwf_alloc_complex(n);
++}
++template <>
++inline fftw_complex* fftw_alloc_type<fftw_complex>(size_t n)
++{
++    return fftw_alloc_complex(n);
++}
++template <>
++inline rocfft_complex<float>* fftw_alloc_type<rocfft_complex<float>>(size_t n)
++{
++    return reinterpret_cast<rocfft_complex<float>*>(fftwf_alloc_complex(n));
++}
++template <>
++inline rocfft_complex<double>* fftw_alloc_type<rocfft_complex<double>>(size_t n)
++{
++    return reinterpret_cast<rocfft_complex<double>*>(fftw_alloc_complex(n));
++}
++
++// Run an FFTW plan through the executor matching the floating-point type:
++template <typename Tfloat>
++inline void fftw_execute_type(typename fftw_trait<Tfloat>::fftw_plan_type plan);
++template <>
++inline void fftw_execute_type<float>(fftwf_plan plan)
++{
++    fftwf_execute(plan);
++}
++template <>
++inline void fftw_execute_type<double>(fftw_plan plan)
++{
++    fftw_execute(plan);
++}
++
++// Destroy an FFTW plan with the destroyer matching its plan type:
++template <typename Tfftw_plan>
++inline void fftw_destroy_plan_type(Tfftw_plan plan);
++template <>
++inline void fftw_destroy_plan_type<fftwf_plan>(fftwf_plan plan)
++{
++    fftwf_destroy_plan(plan);
++}
++template <>
++inline void fftw_destroy_plan_type<fftw_plan>(fftw_plan plan)
++{
++    fftw_destroy_plan(plan);
++}
++
++// Template wrappers for FFTW c2c planners (guru64 interface); every argument is forwarded unchanged:
++template <typename Tfloat>
++inline typename fftw_trait<Tfloat>::fftw_plan_type
++ fftw_plan_guru64_dft(int rank,
++ const fftw_iodim64* dims,
++ int howmany_rank,
++ const fftw_iodim64* howmany_dims,
++ typename fftw_trait<Tfloat>::fftw_complex_type* in,
++ typename fftw_trait<Tfloat>::fftw_complex_type* out,
++ int sign,
++ unsigned flags);
++
++template <>
++inline typename fftw_trait<_Float16>::fftw_plan_type
++ fftw_plan_guru64_dft<_Float16>(int rank,
++ const fftw_iodim64* dims,
++ int howmany_rank,
++ const fftw_iodim64* howmany_dims,
++ typename fftw_trait<_Float16>::fftw_complex_type* in,
++ typename fftw_trait<_Float16>::fftw_complex_type* out,
++ int sign,
++ unsigned flags)
++{
++ return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); // half is planned with the single-precision API
++}
++
++template <>
++inline typename fftw_trait<float>::fftw_plan_type
++ fftw_plan_guru64_dft<float>(int rank,
++ const fftw_iodim64* dims,
++ int howmany_rank,
++ const fftw_iodim64* howmany_dims,
++ typename fftw_trait<float>::fftw_complex_type* in,
++ typename fftw_trait<float>::fftw_complex_type* out,
++ int sign,
++ unsigned flags)
++{
++ return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags);
++}
++
++template <>
++inline typename fftw_trait<double>::fftw_plan_type
++ fftw_plan_guru64_dft<double>(int rank,
++ const fftw_iodim64* dims,
++ int howmany_rank,
++ const fftw_iodim64* howmany_dims,
++ typename fftw_trait<double>::fftw_complex_type* in,
++ typename fftw_trait<double>::fftw_complex_type* out,
++ int sign,
++ unsigned flags)
++{
++ return fftw_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags);
++}
++
++// Execute a c2c FFTW plan on the first buffer of in, writing to the first buffer of out:
++template <typename Tfloat>
++inline void fftw_plan_execute_c2c(typename fftw_trait<Tfloat>::fftw_plan_type plan,
++                                  std::vector<hostbuf>&                       in,
++                                  std::vector<hostbuf>&                       out);
++
++template <>
++inline void fftw_plan_execute_c2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan,
++                                            std::vector<hostbuf>&                         in,
++                                            std::vector<hostbuf>&                         out)
++{
++    // FFTW has no native half precision: widen the input to single,
++    // execute in single, then narrow the output back to half.
++    auto  widened = half_to_single_copy(in.front());
++    auto& result  = out.front();
++    fftwf_execute_dft(plan, reinterpret_cast<fftwf_complex*>(widened.data()),
++                      reinterpret_cast<fftwf_complex*>(result.data()));
++    single_to_half_inplace(result);
++}
++
++template <>
++inline void fftw_plan_execute_c2c<float>(typename fftw_trait<float>::fftw_plan_type plan,
++                                         std::vector<hostbuf>&                      in,
++                                         std::vector<hostbuf>&                      out)
++{
++    auto* src = reinterpret_cast<fftwf_complex*>(in.front().data());
++    auto* dst = reinterpret_cast<fftwf_complex*>(out.front().data());
++    fftwf_execute_dft(plan, src, dst);
++}
++
++template <>
++inline void fftw_plan_execute_c2c<double>(typename fftw_trait<double>::fftw_plan_type plan,
++                                          std::vector<hostbuf>&                       in,
++                                          std::vector<hostbuf>&                       out)
++{
++    auto* src = reinterpret_cast<fftw_complex*>(in.front().data());
++    auto* dst = reinterpret_cast<fftw_complex*>(out.front().data());
++    fftw_execute_dft(plan, src, dst);
++}
++
++// Template wrappers for FFTW r2c planners (guru64 interface):
++template <typename Tfloat>
++inline typename fftw_trait<Tfloat>::fftw_plan_type
++ fftw_plan_guru64_r2c(int rank,
++ const fftw_iodim64* dims,
++ int howmany_rank,
++ const fftw_iodim64* howmany_dims,
++ Tfloat* in,
++ typename fftw_trait<Tfloat>::fftw_complex_type* out,
++ unsigned flags);
++template <>
++inline typename fftw_trait<_Float16>::fftw_plan_type
++ fftw_plan_guru64_r2c<_Float16>(int rank,
++ const fftw_iodim64* dims,
++ int howmany_rank,
++ const fftw_iodim64* howmany_dims,
++ _Float16* in,
++ typename fftw_trait<_Float16>::fftw_complex_type* out,
++ unsigned flags)
++{
++ return fftwf_plan_guru64_dft_r2c( // half is planned with the single-precision API; in is reinterpreted as float*
++ rank, dims, howmany_rank, howmany_dims, reinterpret_cast<float*>(in), out, flags);
++}
++template <>
++inline typename fftw_trait<float>::fftw_plan_type
++ fftw_plan_guru64_r2c<float>(int rank,
++ const fftw_iodim64* dims,
++ int howmany_rank,
++ const fftw_iodim64* howmany_dims,
++ float* in,
++ typename fftw_trait<float>::fftw_complex_type* out,
++ unsigned flags)
++{
++ return fftwf_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags);
++}
++template <>
++inline typename fftw_trait<double>::fftw_plan_type
++ fftw_plan_guru64_r2c<double>(int rank,
++ const fftw_iodim64* dims,
++ int howmany_rank,
++ const fftw_iodim64* howmany_dims,
++ double* in,
++ typename fftw_trait<double>::fftw_complex_type* out,
++ unsigned flags)
++{
++ return fftw_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags);
++}
++
++// Execute an r2c FFTW plan: real data from in.front(), complex result into out.front().
++template <typename Tfloat>
++inline void fftw_plan_execute_r2c(typename fftw_trait<Tfloat>::fftw_plan_type plan,
++                                  std::vector<hostbuf>&                       in,
++                                  std::vector<hostbuf>&                       out);
++template <>
++inline void fftw_plan_execute_r2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan,
++                                            std::vector<hostbuf>&                         in,
++                                            std::vector<hostbuf>&                         out)
++{
++    // since FFTW does not natively support half precision, convert
++    // input to single, execute, then convert output back to half
++    auto in_single = half_to_single_copy(in.front());
++    fftwf_execute_dft_r2c(plan,
++                          reinterpret_cast<float*>(in_single.data()),
++                          reinterpret_cast<fftwf_complex*>(out.front().data()));
++    single_to_half_inplace(out.front());
++}
++template <>
++inline void fftw_plan_execute_r2c<float>(typename fftw_trait<float>::fftw_plan_type plan,
++                                         std::vector<hostbuf>&                      in,
++                                         std::vector<hostbuf>&                      out)
++{
++    fftwf_execute_dft_r2c(plan,
++                          reinterpret_cast<float*>(in.front().data()),
++                          reinterpret_cast<fftwf_complex*>(out.front().data()));
++}
++template <>
++inline void fftw_plan_execute_r2c<double>(typename fftw_trait<double>::fftw_plan_type plan,
++                                          std::vector<hostbuf>&                       in,
++                                          std::vector<hostbuf>&                       out)
++{
++    fftw_execute_dft_r2c(plan,
++                         reinterpret_cast<double*>(in.front().data()),
++                         reinterpret_cast<fftw_complex*>(out.front().data()));
++}
++
++// Template wrappers for FFTW c2r planners (guru64 interface):
++template <typename Tfloat>
++inline typename fftw_trait<Tfloat>::fftw_plan_type
++ fftw_plan_guru64_c2r(int rank,
++ const fftw_iodim64* dims,
++ int howmany_rank,
++ const fftw_iodim64* howmany_dims,
++ typename fftw_trait<Tfloat>::fftw_complex_type* in,
++ Tfloat* out,
++ unsigned flags);
++template <>
++inline typename fftw_trait<_Float16>::fftw_plan_type
++ fftw_plan_guru64_c2r<_Float16>(int rank,
++ const fftw_iodim64* dims,
++ int howmany_rank,
++ const fftw_iodim64* howmany_dims,
++ typename fftw_trait<_Float16>::fftw_complex_type* in,
++ _Float16* out,
++ unsigned flags)
++{
++ return fftwf_plan_guru64_dft_c2r( // half is planned with the single-precision API; out is reinterpreted as float*
++ rank, dims, howmany_rank, howmany_dims, in, reinterpret_cast<float*>(out), flags);
++}
++template <>
++inline typename fftw_trait<float>::fftw_plan_type
++ fftw_plan_guru64_c2r<float>(int rank,
++ const fftw_iodim64* dims,
++ int howmany_rank,
++ const fftw_iodim64* howmany_dims,
++ typename fftw_trait<float>::fftw_complex_type* in,
++ float* out,
++ unsigned flags)
++{
++ return fftwf_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags);
++}
++template <>
++inline typename fftw_trait<double>::fftw_plan_type
++ fftw_plan_guru64_c2r<double>(int rank,
++ const fftw_iodim64* dims,
++ int howmany_rank,
++ const fftw_iodim64* howmany_dims,
++ typename fftw_trait<double>::fftw_complex_type* in,
++ double* out,
++ unsigned flags)
++{
++ return fftw_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags);
++}
++
++// Execute a c2r FFTW plan: complex data from in.front(), real result into out.front().
++template <typename Tfloat>
++inline void fftw_plan_execute_c2r(typename fftw_trait<Tfloat>::fftw_plan_type plan,
++                                  std::vector<hostbuf>&                       in,
++                                  std::vector<hostbuf>&                       out);
++template <>
++inline void fftw_plan_execute_c2r<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan,
++                                            std::vector<hostbuf>&                         in,
++                                            std::vector<hostbuf>&                         out)
++{
++    // since FFTW does not natively support half precision, convert
++    // input to single, execute, then convert output back to half
++    auto in_single = half_to_single_copy(in.front());
++    fftwf_execute_dft_c2r(plan,
++                          reinterpret_cast<fftwf_complex*>(in_single.data()),
++                          reinterpret_cast<float*>(out.front().data()));
++    single_to_half_inplace(out.front());
++}
++template <>
++inline void fftw_plan_execute_c2r<float>(typename fftw_trait<float>::fftw_plan_type plan,
++                                         std::vector<hostbuf>&                      in,
++                                         std::vector<hostbuf>&                      out)
++{
++    fftwf_execute_dft_c2r(plan, // NOTE(review): FFTW c2r may overwrite its input - confirm callers allow it
++                          reinterpret_cast<fftwf_complex*>(in.front().data()),
++                          reinterpret_cast<float*>(out.front().data()));
++}
++template <>
++inline void fftw_plan_execute_c2r<double>(typename fftw_trait<double>::fftw_plan_type plan,
++                                          std::vector<hostbuf>&                       in,
++                                          std::vector<hostbuf>&                       out)
++{
++    fftw_execute_dft_c2r(plan, // NOTE(review): FFTW c2r may overwrite its input - confirm callers allow it
++                         reinterpret_cast<fftw_complex*>(in.front().data()),
++                         reinterpret_cast<double*>(out.front().data()));
++}
++
++#ifdef FFTW_HAVE_SPRINT_PLAN
++// Template wrappers for FFTW print plan (compiled only when FFTW_HAVE_SPRINT_PLAN is defined):
++template <typename Tfloat>
++inline char* fftw_sprint_plan(const typename fftw_trait<Tfloat>::fftw_plan_type plan);
++template <>
++inline char* fftw_sprint_plan<_Float16>(const typename fftw_trait<_Float16>::fftw_plan_type plan)
++{
++ return fftwf_sprint_plan(plan); // half precision uses the single-precision plan type
++}
++template <>
++inline char* fftw_sprint_plan<float>(const typename fftw_trait<float>::fftw_plan_type plan)
++{
++ return fftwf_sprint_plan(plan);
++}
++template <>
++inline char* fftw_sprint_plan<double>(const typename fftw_trait<double>::fftw_plan_type plan)
++{
++ return fftw_sprint_plan(plan);
++}
++#endif // FFTW_HAVE_SPRINT_PLAN
++
++#endif
+diff --git a/shared/gpubuf.h b/shared/gpubuf.h
+new file mode 100644
+index 0000000..993fa95
+--- /dev/null
++++ b/shared/gpubuf.h
+@@ -0,0 +1,134 @@
++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_GPUBUF_H
++#define ROCFFT_GPUBUF_H
++
++#include "rocfft_hip.h"
++#include <cstdlib>
++
++// Simple RAII class for GPU buffers.  T is the type of pointer that
++// data() and data_offset() return.
++template <class T = void>
++class gpubuf_t
++{
++public:
++    gpubuf_t() {}
++    // buffers are movable but not copyable; moves swap state with the source
++    gpubuf_t(gpubuf_t&& other) noexcept
++    {
++        std::swap(buf, other.buf);
++        std::swap(bsize, other.bsize);
++        std::swap(device, other.device);
++    }
++    gpubuf_t& operator=(gpubuf_t&& other) noexcept
++    {
++        std::swap(buf, other.buf);
++        std::swap(bsize, other.bsize);
++        std::swap(device, other.device);
++        return *this;
++    }
++    gpubuf_t(const gpubuf_t&) = delete;
++    gpubuf_t& operator=(const gpubuf_t&) = delete;
++
++    ~gpubuf_t()
++    {
++        free();
++    }
++
++    static bool use_alloc_managed()
++    {
++        return std::getenv("ROCFFT_MALLOC_MANAGED") != nullptr;
++    }
++
++    hipError_t alloc(const size_t size)
++    {
++        // Release any previous allocation first: free() uses the recorded
++        // device and resets bsize, so it must run before both are updated.
++        free();
++        auto ret = hipGetDevice(&device);
++        if(ret != hipSuccess)
++            return ret;
++
++        bsize                     = size;
++        static bool alloc_managed = use_alloc_managed();
++        ret = alloc_managed ? hipMallocManaged(&buf, bsize) : hipMalloc(&buf, bsize);
++        if(ret != hipSuccess)
++        {
++            buf   = nullptr;
++            bsize = 0;
++        }
++        return ret;
++    }
++
++    size_t size() const
++    {
++        return bsize;
++    }
++
++    void free()
++    {
++        if(buf != nullptr)
++        {
++            // free on the device we allocated on
++            rocfft_scoped_device dev(device);
++            (void)hipFree(buf);
++            buf   = nullptr;
++            bsize = 0;
++        }
++    }
++
++    // return a pointer to the allocated memory, offset by the
++    // specified number of bytes
++    T* data_offset(size_t offset_bytes = 0) const
++    {
++        void* ptr = static_cast<char*>(buf) + offset_bytes;
++        return static_cast<T*>(ptr);
++    }
++
++    T* data() const
++    {
++        return static_cast<T*>(buf);
++    }
++
++    // equality/bool tests
++    bool operator==(std::nullptr_t n) const
++    {
++        return buf == n;
++    }
++    bool operator!=(std::nullptr_t n) const
++    {
++        return buf != n;
++    }
++    operator bool() const
++    {
++        return buf != nullptr;
++    }
++
++private:
++    // The GPU buffer, its size as passed to alloc(), and the device current at alloc() time
++    void*  buf    = nullptr;
++    size_t bsize  = 0;
++    int    device = 0;
++};
++
++// gpubuf: the default instantiation, whose data() yields void* pointers
++using gpubuf = gpubuf_t<>;
++#endif
+diff --git a/shared/hip_object_wrapper.h b/shared/hip_object_wrapper.h
+new file mode 100644
+index 0000000..54083ab
+--- /dev/null
++++ b/shared/hip_object_wrapper.h
+@@ -0,0 +1,86 @@
++/******************************************************************************
++* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a copy
++* of this software and associated documentation files (the "Software"), to deal
++* in the Software without restriction, including without limitation the rights
++* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++* copies of the Software, and to permit persons to whom the Software is
++* furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice shall be included in
++* all copies or substantial portions of the Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++* THE SOFTWARE.
++*******************************************************************************/
++
++#ifndef ROCFFT_HIP_OBJ_WRAPPER_H
++#define ROCFFT_HIP_OBJ_WRAPPER_H
++
++#include "rocfft_hip.h"
++
++// RAII wrapper around a HIP object of handle type T.  alloc() creates
++// the object with TCreate(&obj); free() and the destructor release it
++// with TDestroy(obj).  Wrappers are movable but not copyable.
++template <typename T, auto TCreate, auto TDestroy>
++struct hip_object_wrapper_t
++{
++    // Construct an empty wrapper; no HIP object exists until alloc().
++    hip_object_wrapper_t()
++        : obj(nullptr)
++    {
++    }
++
++    // Create the HIP object if the wrapper is currently empty.
++    // Throws std::runtime_error when TCreate does not return hipSuccess.
++    void alloc()
++    {
++        if(obj == nullptr && TCreate(&obj) != hipSuccess)
++            throw std::runtime_error("hip create failure");
++    }
++
++    // Destroy the held object, if any, and return to the empty state.
++    // The status returned by TDestroy is intentionally discarded.
++    void free()
++    {
++        if(obj)
++        {
++            (void)TDestroy(obj);
++            obj = nullptr;
++        }
++    }
++
++    // Implicit access to the underlying handle, so the wrapper can be
++    // passed wherever a T is expected.
++    operator const T&() const
++    {
++        return obj;
++    }
++    operator T&()
++    {
++        return obj;
++    }
++
++    // True when a HIP object is currently held.
++    operator bool() const
++    {
++        return obj != nullptr;
++    }
++
++    ~hip_object_wrapper_t()
++    {
++        free();
++    }
++
++    hip_object_wrapper_t(const hip_object_wrapper_t&) = delete;
++    hip_object_wrapper_t& operator=(const hip_object_wrapper_t&) = delete;
++
++    // Moving transfers ownership and leaves the source empty.
++    hip_object_wrapper_t(hip_object_wrapper_t&& other) noexcept
++        : obj(other.obj)
++    {
++        other.obj = nullptr;
++    }
++
++    // Move assignment releases whatever this wrapper currently holds
++    // before taking over other's object; self-assignment is a no-op.
++    hip_object_wrapper_t& operator=(hip_object_wrapper_t&& other) noexcept
++    {
++        if(this != &other)
++        {
++            free();
++            obj       = other.obj;
++            other.obj = nullptr;
++        }
++        return *this;
++    }
++
++private:
++    T obj;
++};
++
++typedef hip_object_wrapper_t<hipStream_t, hipStreamCreate, hipStreamDestroy> hipStream_wrapper_t;
++typedef hip_object_wrapper_t<hipEvent_t, hipEventCreate, hipEventDestroy> hipEvent_wrapper_t;
++
++#endif // ROCFFT_HIP_OBJ_WRAPPER_H
+diff --git a/shared/hostbuf.h b/shared/hostbuf.h
+new file mode 100644
+index 0000000..0a96c7d
+--- /dev/null
++++ b/shared/hostbuf.h
+@@ -0,0 +1,158 @@
++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_HOSTBUF_H
++#define ROCFFT_HOSTBUF_H
++
++#include "arithmetic.h"
++#include <cstdlib>
++#include <cstring>
++
++#ifndef WIN32
++#include <stdlib.h>
++#include <sys/mman.h>
++#endif
++
++// Simple RAII class for host buffers. T is the type of pointer that
++// data() returns
++template <class T = void>
++class hostbuf_t
++{
++public:
++    hostbuf_t() {}
++    // buffers are movable but not copyable.  Both move operations swap
++    // state with other, so whatever this buffer held beforehand is
++    // released when other is destroyed.
++    hostbuf_t(hostbuf_t&& other) noexcept
++    {
++        std::swap(buf, other.buf);
++        std::swap(bsize, other.bsize);
++    }
++    hostbuf_t& operator=(hostbuf_t&& other) noexcept
++    {
++        std::swap(buf, other.buf);
++        std::swap(bsize, other.bsize);
++        return *this;
++    }
++    hostbuf_t(const hostbuf_t&) = delete;
++    hostbuf_t& operator=(const hostbuf_t&) = delete;
++
++    ~hostbuf_t()
++    {
++        free();
++    }
++
++    // Replace the current contents with a fresh allocation of at least
++    // size bytes.  size() reports the requested size afterwards, not
++    // the rounded-up amount that was actually allocated.
++    void alloc(size_t size)
++    {
++        // Release any previous allocation first.  free() resets bsize
++        // to 0 when a buffer was held, so the requested size may only
++        // be recorded after that call.
++        free();
++        bsize = size;
++
++        // we're aligning to multiples of 64 bytes, so round the
++        // allocation size up to the nearest 64 to keep ASAN happy
++        if(size % 64)
++        {
++            size += 64 - size % 64;
++        }
++
++        // FFTW requires aligned allocations to use faster SIMD instructions.
++        // If enabling hugepages, align to 2 MiB. Otherwise, aligning to
++        // 64 bytes is enough for AVX instructions up to AVX512.
++#ifdef WIN32
++        buf = _aligned_malloc(size, 64);
++#else
++        // On Linux, ask for hugepages to reduce TLB pressure and
++        // improve performance. Allocations need to be aligned to
++        // the hugepage size, and rounded up to the next whole
++        // hugepage.
++        static const size_t TWO_MiB = 2 * 1024 * 1024;
++        if(size >= TWO_MiB)
++        {
++            size_t rounded_size = DivRoundingUp(size, TWO_MiB) * TWO_MiB;
++            buf                 = aligned_alloc(TWO_MiB, rounded_size);
++            // the hugepage request is only a hint; its result is ignored
++            madvise(buf, rounded_size, MADV_HUGEPAGE);
++        }
++        else
++            buf = aligned_alloc(64, size);
++#endif
++    }
++
++    // Number of bytes the buffer is considered to hold.
++    size_t size() const
++    {
++        return bsize;
++    }
++
++    // Release the allocation, if any, and reset the size to 0.
++    void free()
++    {
++        if(buf != nullptr)
++        {
++#ifdef WIN32
++            _aligned_free(buf);
++#else
++            std::free(buf);
++#endif
++            buf   = nullptr;
++            bsize = 0;
++        }
++    }
++
++    // Pointer to the start of the buffer, typed as T*.
++    T* data() const
++    {
++        return static_cast<T*>(buf);
++    }
++
++    // Copy method: returns a new buffer holding the first size() bytes
++    // of this one.
++    hostbuf_t copy() const
++    {
++        hostbuf_t copy;
++        copy.alloc(bsize);
++        // an empty buffer has nothing to transfer, and memcpy must not
++        // be handed a null pointer even for a zero-byte copy
++        if(bsize > 0)
++            memcpy(copy.buf, buf, bsize);
++        return copy;
++    }
++
++    // shrink the buffer to fit the new size.  Throws
++    // std::runtime_error if new_size is larger than the current size.
++    void shrink(size_t new_size)
++    {
++        if(new_size > bsize)
++            throw std::runtime_error("can't shrink hostbuf to larger size");
++        // just pretend the buffer is now that size
++        bsize = new_size;
++    }
++
++    // equality/bool tests
++    bool operator==(std::nullptr_t n) const
++    {
++        return buf == n;
++    }
++    bool operator!=(std::nullptr_t n) const
++    {
++        return buf != n;
++    }
++    operator bool() const
++    {
++        return buf;
++    }
++
++private:
++    // The host buffer
++    void*  buf   = nullptr;
++    size_t bsize = 0;
++};
++
++// default hostbuf that gives out void* pointers
++typedef hostbuf_t<> hostbuf;
++#endif
+diff --git a/shared/increment.h b/shared/increment.h
+new file mode 100644
+index 0000000..90bba1d
+--- /dev/null
++++ b/shared/increment.h
+@@ -0,0 +1,100 @@
++// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_INCREMENT_H
++#define ROCFFT_INCREMENT_H
++
++#include <algorithm>
++#include <tuple>
++#include <vector>
++
++// Helper functions to iterate over a buffer in row-major order.
++// Indexes may be given as either a tuple or vector of sizes. They
++// return true if the index was successfully incremented to move to
++// the next element in the buffer.
++
++// Advance a single integral index within [0, length).  Returns true
++// when the index moved forward; otherwise resets it to 0 and returns
++// false to signal a roll-over.
++template <typename T1, typename T2>
++static bool increment_base(T1& index, const T2& length)
++{
++    static_assert(std::is_integral<T1>::value, "Integral required.");
++    static_assert(std::is_integral<T2>::value, "Integral required.");
++    const bool has_room = index < length - 1;
++    if(!has_room)
++    {
++        // already at the last position: wrap around
++        index = 0;
++        return false;
++    }
++    ++index;
++    return true;
++}
++
++// Row-major increment for 1, 2 and 3 dimensions.  This overload
++// handles a single dimension and simply defers to increment_base().
++template <typename T1, typename T2>
++static bool increment_rowmajor(T1& index, const T2& length)
++{
++    static_assert(std::is_integral<T1>::value, "Integral required.");
++    static_assert(std::is_integral<T2>::value, "Integral required.");
++    const bool advanced = increment_base(index, length);
++    return advanced;
++}
++
++// Two-dimensional overload: element 1 of the tuple is advanced first;
++// element 0 is only touched when element 1 rolled over.
++template <typename T1, typename T2>
++static bool increment_rowmajor(std::tuple<T1, T1>& index, const std::tuple<T2, T2>& length)
++{
++    // short-circuit evaluation stops at the first dimension that
++    // could be incremented without wrapping
++    return increment_base(std::get<1>(index), std::get<1>(length))
++           || increment_base(std::get<0>(index), std::get<0>(length));
++}
++
++// Three-dimensional overload: dimensions are advanced from element 2
++// down to element 0, moving to the next one only after a roll-over.
++template <typename T1, typename T2>
++static bool increment_rowmajor(std::tuple<T1, T1, T1>& index, const std::tuple<T2, T2, T2>& length)
++{
++    // short-circuit evaluation stops at the first dimension that
++    // could be incremented without wrapping
++    return increment_base(std::get<2>(index), std::get<2>(length))
++           || increment_base(std::get<1>(index), std::get<1>(length))
++           || increment_base(std::get<0>(index), std::get<0>(length));
++}
++
++// Increment row-major index over arbitrary dimension length.
++// Dimensions are visited from the last to the first.  Returns true as
++// soon as one dimension was advanced without wrapping; dimensions
++// that wrap are reset to 0 and the next one is tried.
++template <typename T1, typename T2>
++bool increment_rowmajor(std::vector<T1>& index, const std::vector<T2>& length)
++{
++    // the counter keeps the container's own size type so that the
++    // dimension count is not narrowed to int
++    for(auto idim = length.size(); idim-- > 0;)
++    {
++        if(index[idim] < length[idim])
++        {
++            if((++index[idim]) == length[idim])
++            {
++                index[idim] = 0;
++                continue;
++            }
++            // we know we were able to increment something and didn't hit the end
++            return true;
++        }
++    }
++    // End the loop when we get back to the start.  Elements are
++    // compared as T1 so that large values are not truncated before
++    // the zero test.
++    return !std::all_of(index.begin(), index.end(), [](const T1& i) { return i == 0; });
++}
++
++#endif
+diff --git a/shared/precision_type.h b/shared/precision_type.h
+new file mode 100644
+index 0000000..526fc9a
+--- /dev/null
++++ b/shared/precision_type.h
+@@ -0,0 +1,70 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_PRECISION_TYPE_H
++#define ROCFFT_PRECISION_TYPE_H
++
++#include "array_predicate.h"
++#include "rocfft/rocfft.h"
++#include <stdexcept>
++
++// Size in bytes of one real value of the given precision.
++// Throws std::runtime_error for a value that matches none of the
++// enumerators handled below, instead of running off the end of the
++// function.
++static size_t real_type_size(rocfft_precision precision)
++{
++    switch(precision)
++    {
++    case rocfft_precision_half:
++        return 2;
++    case rocfft_precision_single:
++        return 4;
++    case rocfft_precision_double:
++        return 8;
++    }
++    // placed after the switch rather than under a default: label so
++    // that compilers can still warn about unhandled enumerators
++    throw std::runtime_error("Invalid precision");
++}
++
++// Size in bytes of one complex value: two reals of the given
++// precision.
++static size_t complex_type_size(rocfft_precision precision)
++{
++    const size_t real_bytes = real_type_size(precision);
++    return real_bytes * 2;
++}
++
++// Human-readable name of the given precision.
++// Throws std::runtime_error for a value that matches none of the
++// enumerators handled below, instead of running off the end of the
++// function.
++static const char* precision_name(rocfft_precision precision)
++{
++    switch(precision)
++    {
++    case rocfft_precision_half:
++        return "half";
++    case rocfft_precision_single:
++        return "single";
++    case rocfft_precision_double:
++        return "double";
++    }
++    // placed after the switch rather than under a default: label so
++    // that compilers can still warn about unhandled enumerators
++    throw std::runtime_error("Invalid precision");
++}
++
++// Size in bytes of one element: the complex size when
++// array_type_is_complex() reports a complex array type, otherwise
++// the real size.
++static size_t element_size(rocfft_precision precision, rocfft_array_type array_type)
++{
++    if(array_type_is_complex(array_type))
++        return complex_type_size(precision);
++    return real_type_size(precision);
++}
++
++// Advance pointer p by elems elements, where the size of one element
++// follows from its precision and type (complex or not).
++static void* ptr_offset(void* p, size_t elems, rocfft_precision precision, rocfft_array_type type)
++{
++    // pointer arithmetic is done on char* so the step is in bytes
++    char* const  base        = static_cast<char*>(p);
++    const size_t byte_offset = elems * element_size(precision, type);
++    return base + byte_offset;
++}
++#endif
+diff --git a/shared/printbuffer.h b/shared/printbuffer.h
+new file mode 100644
+index 0000000..5ae0b64
+--- /dev/null
++++ b/shared/printbuffer.h
+@@ -0,0 +1,108 @@
++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef PRINTBUFFER_H
++#define PRINTBUFFER_H
++
++#include "hostbuf.h"
++#include "increment.h"
++#include <algorithm>
++#include <vector>
++
++// Output a formatted general-dimensional array with given length and stride in batches
++// separated by dist.
++//
++// output: first element of the data to print
++// length: extent of each dimension
++// stride: element stride of each dimension
++// nbatch: number of batches to print
++// dist:   element distance between the starts of consecutive batches
++// offset: element offset added to every index
++// stream: destination; elements are written with operator<<
++template <typename Toutput, typename T1, typename T2, typename Tsize, typename Tstream>
++inline void printbuffer(const Toutput*         output,
++                        const std::vector<T1>& length,
++                        const std::vector<T2>& stride,
++                        const Tsize            nbatch,
++                        const Tsize            dist,
++                        const size_t           offset,
++                        Tstream&               stream)
++{
++    // The flat index is kept in size_t throughout, so it is not
++    // truncated to int before it is used to address output.
++    size_t i_base = 0;
++    for(unsigned int b = 0; b < nbatch; b++, i_base += dist)
++    {
++        // the vector constructor value-initializes every entry to 0
++        std::vector<size_t> index(length.size());
++        do
++        {
++            const size_t i
++                = std::inner_product(index.begin(), index.end(), stride.begin(), i_base + offset);
++            stream << output[i] << " ";
++            // emit one line break for every trailing dimension that
++            // has just reached its last position
++            for(auto li = index.size(); li-- > 0;)
++            {
++                if(index[li] == (length[li] - 1))
++                {
++                    stream << "\n";
++                }
++                else
++                {
++                    break;
++                }
++            }
++        } while(increment_rowmajor(index, length));
++        stream << std::endl;
++    }
++}
++
++// Static helpers that print host buffers whose contents are
++// interpreted as elements of type Telem.
++template <typename Telem>
++class buffer_printer
++{
++    // The scalar versions might be part of a planar format.
++public:
++    // Print each buffer in buf through printbuffer() using the given
++    // layout.  Every buffer is printed with offset[0].
++    template <typename Tint1, typename Tint2, typename Tsize, typename Tstream = std::ostream>
++    static void print_buffer(const std::vector<hostbuf>& buf,
++                             const std::vector<Tint1>&   length,
++                             const std::vector<Tint2>&   stride,
++                             const Tsize                 nbatch,
++                             const Tsize                 dist,
++                             const std::vector<size_t>&  offset,
++                             Tstream&                    stream = std::cout)
++    {
++        for(size_t ibuf = 0; ibuf != buf.size(); ++ibuf)
++        {
++            const Telem* elems = reinterpret_cast<const Telem*>(buf[ibuf].data());
++            // offset is only read once there is a buffer to print
++            printbuffer(elems, length, stride, nbatch, dist, offset[0], stream);
++        }
++    }
++
++    // Print the first size[0] elements of each buffer on one line,
++    // prefixed with "idx 0".
++    template <typename Tstream = std::ostream>
++    static void print_buffer_flat(const std::vector<hostbuf>& buf,
++                                  const std::vector<size_t>&  size,
++                                  const std::vector<size_t>&  offset,
++                                  Tstream&                    stream = std::cout)
++    {
++        for(size_t ibuf = 0; ibuf != buf.size(); ++ibuf)
++        {
++            const Telem* elems = reinterpret_cast<const Telem*>(buf[ibuf].data());
++            stream << "idx " << 0;
++            // size is only read once there is a buffer to print
++            size_t pos = 0;
++            while(pos < size[0])
++            {
++                stream << " " << elems[pos];
++                ++pos;
++            }
++            stream << std::endl;
++        }
++    }
++};
++
++#endif
+diff --git a/shared/ptrdiff.h b/shared/ptrdiff.h
+new file mode 100644
+index 0000000..3bd15de
+--- /dev/null
++++ b/shared/ptrdiff.h
+@@ -0,0 +1,40 @@
++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++
++// Compute the farthest point from the original pointer: one past the
++// largest element offset reached by the given lengths, strides,
++// batch count and batch distance.  An empty length yields 0.
++static size_t compute_ptrdiff(const std::vector<size_t>& length,
++                              const std::vector<size_t>& stride,
++                              const size_t               nbatch,
++                              const size_t               dist)
++{
++    if(length.empty())
++        return 0;
++
++    // start at 1 for the element at the origin itself
++    size_t extent = 1;
++    for(size_t dim = 0; dim != length.size(); ++dim)
++        extent += (length[dim] - 1) * stride[dim];
++    return extent + (nbatch - 1) * dist;
++}
+diff --git a/shared/rocfft_accuracy_test.h b/shared/rocfft_accuracy_test.h
+new file mode 100644
+index 0000000..4ce3059
+--- /dev/null
++++ b/shared/rocfft_accuracy_test.h
+@@ -0,0 +1,29 @@
++// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_ACCURACY_TEST
++#define ROCFFT_ACCURACY_TEST
++
++#include "accuracy_test.h"
++#include "rocfft_params.h"
++
++void fft_vs_reference(rocfft_params& params, bool round_trip = false); // declaration only: no body in this header; round_trip may be omitted and then defaults to false
++
++#endif
+diff --git a/shared/rocfft_against_fftw.h b/shared/rocfft_against_fftw.h
+new file mode 100644
+index 0000000..d03754c
+--- /dev/null
++++ b/shared/rocfft_against_fftw.h
+@@ -0,0 +1,231 @@
++// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++#ifndef ROCFFT_AGAINST_FFTW
++#define ROCFFT_AGAINST_FFTW
++
++#include <gtest/gtest.h>
++#include <math.h>
++#include <stdexcept>
++#include <vector>
++
++#include "fftw_transform.h"
++
++// Map a floating-point type to the corresponding precision
++// enumerator.  Only the float and double specializations below are
++// defined.
++template <typename Tfloat>
++inline fft_precision precision_selector();
++
++// float selects single precision
++template <>
++inline fft_precision precision_selector<float>()
++{
++    const fft_precision selected = fft_precision_single;
++    return selected;
++}
++
++// double selects double precision
++template <>
++inline fft_precision precision_selector<double>()
++{
++    const fft_precision selected = fft_precision_double;
++    return selected;
++}
++
++extern bool use_fftw_wisdom;
++
++// Create an FFTW plan in the precision given by Tfloat for the
++// requested transform type and guru64 dimensions.  cpu_out is needed
++// when planning with wisdom, which runs actual FFTs to work out the
++// best plan.  Throws std::runtime_error for an unrecognized transform
++// type.  isize is accepted but not used in this function.
++template <typename Tfloat>
++static typename fftw_trait<Tfloat>::fftw_plan_type
++    fftw_plan_with_precision(const std::vector<fftw_iodim64>& dims,
++                             const std::vector<fftw_iodim64>& howmany_dims,
++                             const fft_transform_type         transformType,
++                             const size_t                     isize,
++                             void*                            cpu_in,
++                             void*                            cpu_out)
++{
++    using complex_t = typename fftw_trait<Tfloat>::fftw_complex_type;
++
++    // FFTW_MEASURE may destroy the input buffer's data while the plan
++    // is created.  That is acceptable here: a caller that wants to
++    // run FFTW has only just created an uninitialized input buffer.
++    const auto planner_flags = use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE;
++
++    switch(transformType)
++    {
++    case fft_transform_type_complex_forward:
++    case fft_transform_type_complex_inverse:
++    {
++        // the forward transform is planned with sign -1, the inverse with +1
++        const int sign = (transformType == fft_transform_type_complex_forward) ? -1 : 1;
++        return fftw_plan_guru64_dft<Tfloat>(dims.size(),
++                                            dims.data(),
++                                            howmany_dims.size(),
++                                            howmany_dims.data(),
++                                            reinterpret_cast<complex_t*>(cpu_in),
++                                            reinterpret_cast<complex_t*>(cpu_out),
++                                            sign,
++                                            planner_flags);
++    }
++    case fft_transform_type_real_forward:
++        return fftw_plan_guru64_r2c<Tfloat>(dims.size(),
++                                            dims.data(),
++                                            howmany_dims.size(),
++                                            howmany_dims.data(),
++                                            reinterpret_cast<Tfloat*>(cpu_in),
++                                            reinterpret_cast<complex_t*>(cpu_out),
++                                            planner_flags);
++    case fft_transform_type_real_inverse:
++        return fftw_plan_guru64_c2r<Tfloat>(dims.size(),
++                                            dims.data(),
++                                            howmany_dims.size(),
++                                            howmany_dims.data(),
++                                            reinterpret_cast<complex_t*>(cpu_in),
++                                            reinterpret_cast<Tfloat*>(cpu_out),
++                                            planner_flags);
++    default:
++        throw std::runtime_error("Invalid transform type");
++    }
++}
++
++// Build an FFTW plan from rocFFT-style parameters by translating the
++// lengths, strides and batch layout into guru64 dimensions.  output
++// is required when planning with wisdom; an empty output vector is
++// passed on as a null pointer.
++template <typename Tfloat>
++static typename fftw_trait<Tfloat>::fftw_plan_type
++    fftw_plan_via_rocfft(const std::vector<size_t>& length,
++                         const std::vector<size_t>& istride,
++                         const std::vector<size_t>& ostride,
++                         const size_t               nbatch,
++                         const size_t               idist,
++                         const size_t               odist,
++                         const fft_transform_type   transformType,
++                         std::vector<hostbuf>&      input,
++                         std::vector<hostbuf>&      output)
++{
++    // One guru64 entry per transform dimension.
++    const size_t              rank = length.size();
++    std::vector<fftw_iodim64> dims(rank);
++    for(size_t d = 0; d < rank; ++d)
++    {
++        fftw_iodim64& dim = dims[d];
++        dim.n             = length[d];
++        dim.is            = istride[d];
++        dim.os            = ostride[d];
++    }
++
++    // A single guru64 entry describes the batch.
++    std::vector<fftw_iodim64> howmany_dims(1);
++    fftw_iodim64&             batch = howmany_dims.front();
++    batch.n                         = nbatch;
++    batch.is                        = idist;
++    batch.os                        = odist;
++
++    void* cpu_out = nullptr;
++    if(!output.empty())
++        cpu_out = output.front().data();
++
++    return fftw_plan_with_precision<Tfloat>(
++        dims, howmany_dims, transformType, idist * nbatch, input.front().data(), cpu_out);
++}
++
++// Execute an existing FFTW plan on the given host buffers, picking
++// the execute helper that matches the transform type.  Transform
++// types not listed below are ignored.
++template <typename Tfloat>
++void fftw_run(fft_transform_type                          transformType,
++              typename fftw_trait<Tfloat>::fftw_plan_type cpu_plan,
++              std::vector<hostbuf>&                       cpu_in,
++              std::vector<hostbuf>&                       cpu_out)
++{
++    switch(transformType)
++    {
++    // both complex directions go through the same c2c helper
++    case fft_transform_type_complex_forward:
++    case fft_transform_type_complex_inverse:
++        fftw_plan_execute_c2c<Tfloat>(cpu_plan, cpu_in, cpu_out);
++        return;
++    case fft_transform_type_real_forward:
++        fftw_plan_execute_r2c<Tfloat>(cpu_plan, cpu_in, cpu_out);
++        return;
++    case fft_transform_type_real_inverse:
++        fftw_plan_execute_c2r<Tfloat>(cpu_plan, cpu_in, cpu_out);
++        return;
++    }
++}
++
++// Contiguous input array type for a transform type.  Throws
++// std::runtime_error for an unrecognized transform type.
++inline fft_array_type contiguous_itype(const fft_transform_type transformType)
++{
++    if(transformType == fft_transform_type_complex_forward
++       || transformType == fft_transform_type_complex_inverse)
++        return fft_array_type_complex_interleaved;
++
++    if(transformType == fft_transform_type_real_forward)
++        return fft_array_type_real;
++
++    if(transformType == fft_transform_type_real_inverse)
++        return fft_array_type_hermitian_interleaved;
++
++    throw std::runtime_error("Invalid transform type");
++}
++
++// Contiguous output array type for a transform type.  Throws
++// std::runtime_error for an unrecognized transform type.
++inline fft_array_type contiguous_otype(const fft_transform_type transformType)
++{
++    if(transformType == fft_transform_type_complex_forward
++       || transformType == fft_transform_type_complex_inverse)
++        return fft_array_type_complex_interleaved;
++
++    if(transformType == fft_transform_type_real_forward)
++        return fft_array_type_hermitian_interleaved;
++
++    if(transformType == fft_transform_type_real_inverse)
++        return fft_array_type_real;
++
++    throw std::runtime_error("Invalid transform type");
++}
++
++// Acceptable tolerance for a precision, taken from the type_epsilon
++// template of the matching floating-point type.  Throws
++// std::runtime_error for an unrecognized precision.
++inline double type_epsilon(const fft_precision precision)
++{
++    if(precision == fft_precision_half)
++        return type_epsilon<_Float16>();
++
++    if(precision == fft_precision_single)
++        return type_epsilon<float>();
++
++    if(precision == fft_precision_double)
++        return type_epsilon<double>();
++
++    throw std::runtime_error("Invalid precision");
++}
++
++#endif
+diff --git a/shared/rocfft_complex.h b/shared/rocfft_complex.h
+new file mode 100644
+index 0000000..efa0290
+--- /dev/null
++++ b/shared/rocfft_complex.h
+@@ -0,0 +1,346 @@
++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_COMPLEX_H
++#define ROCFFT_COMPLEX_H
++
++#include <hip/hip_fp16.h>
++#if !defined(__HIPCC_RTC__)
++#include <iostream>
++#endif
++#include <math.h>
++#include <type_traits>
++
++#ifdef __HIP_PLATFORM_NVIDIA__
++typedef __half _Float16;
++#endif
++
++template <typename Treal>
++struct rocfft_complex
++{
++
++ Treal x; // Real part
++ Treal y; // Imaginary part
++
++ // Constructors
++ // Do not initialize the members x or y by default, to ensure that it can
++ // be used in __shared__ and that it is a trivial class compatible with C.
++ __device__ __host__ rocfft_complex() = default;
++ __device__ __host__ rocfft_complex(const rocfft_complex&) = default;
++ __device__ __host__ rocfft_complex(rocfft_complex&&) = default;
++ __device__ __host__ rocfft_complex& operator=(const rocfft_complex& rhs) & = default;
++ __device__ __host__ rocfft_complex& operator=(rocfft_complex&& rhs) & = default;
++ __device__ __host__ ~rocfft_complex() = default;
++
++ // Constructor from real and imaginary parts
++ __device__ __host__ constexpr rocfft_complex(Treal real, Treal imag)
++ : x{real}
++ , y{imag}
++ {
++ }
++
++ // Conversion from different precision
++ template <typename U>
++ __device__ __host__ explicit constexpr rocfft_complex(const rocfft_complex<U>& z)
++ : x(z.x)
++ , y(z.y)
++ {
++ }
++
++ // Accessors
++ __device__ __host__ constexpr Treal real() const
++ {
++ return x;
++ }
++
++ __device__ __host__ constexpr Treal imag() const
++ {
++ return y;
++ }
++
++ // Unary operations
++ __forceinline__ __device__ __host__ rocfft_complex operator-() const
++ {
++ return {-x, -y};
++ }
++
++ __forceinline__ __device__ __host__ rocfft_complex operator+() const
++ {
++ return *this;
++ }
++
++ __device__ __host__ Treal asum(const rocfft_complex& z)
++ {
++ return abs(z.x) + abs(z.y);
++ }
++
++ // Internal real functions
++ static __forceinline__ __device__ __host__ Treal abs(Treal x)
++ {
++ return x < 0 ? -x : x;
++ }
++
++ static __forceinline__ __device__ __host__ float sqrt(float x)
++ {
++ return ::sqrtf(x);
++ }
++
++ static __forceinline__ __device__ __host__ double sqrt(double x)
++ {
++ return ::sqrt(x);
++ }
++
++ // Addition operators
++ __device__ __host__ auto& operator+=(const rocfft_complex& rhs)
++ {
++ return *this = {x + rhs.x, y + rhs.y};
++ }
++
++ __device__ __host__ auto operator+(const rocfft_complex& rhs) const
++ {
++ auto lhs = *this;
++ return lhs += rhs;
++ }
++
++ // Subtraction operators
++ __device__ __host__ auto& operator-=(const rocfft_complex& rhs)
++ {
++ return *this = {x - rhs.x, y - rhs.y};
++ }
++
++ __device__ __host__ auto operator-(const rocfft_complex& rhs) const
++ {
++ auto lhs = *this;
++ return lhs -= rhs;
++ }
++
++ // Multiplication operators
++ __device__ __host__ auto& operator*=(const rocfft_complex& rhs)
++ {
++ return *this = {x * rhs.x - y * rhs.y, y * rhs.x + x * rhs.y};
++ }
++
++ __device__ __host__ auto operator*(const rocfft_complex& rhs) const
++ {
++ auto lhs = *this;
++ return lhs *= rhs;
++ }
++
++ // Division operators
++ __device__ __host__ auto& operator/=(const rocfft_complex& rhs)
++ {
++ // Form of Robert L. Smith's Algorithm 116
++ if(abs(rhs.x) > abs(rhs.y))
++ {
++ Treal ratio = rhs.y / rhs.x;
++ Treal scale = 1 / (rhs.x + rhs.y * ratio);
++ *this = {(x + y * ratio) * scale, (y - x * ratio) * scale};
++ }
++ else
++ {
++ Treal ratio = rhs.x / rhs.y;
++ Treal scale = 1 / (rhs.x * ratio + rhs.y);
++ *this = {(y + x * ratio) * scale, (y * ratio - x) * scale};
++ }
++ return *this;
++ }
++
++ __device__ __host__ auto operator/(const rocfft_complex& rhs) const
++ {
++ auto lhs = *this;
++ return lhs /= rhs;
++ }
++
++ // Comparison operators
++ __device__ __host__ constexpr bool operator==(const rocfft_complex& rhs) const
++ {
++ return x == rhs.x && y == rhs.y;
++ }
++
++ __device__ __host__ constexpr bool operator!=(const rocfft_complex& rhs) const
++ {
++ return !(*this == rhs);
++ }
++
++ // Operators for complex-real computations
++ template <typename U>
++ __device__ __host__ auto& operator+=(const U& rhs)
++ {
++ return (x += Treal(rhs)), *this;
++ }
++
++ template <typename U>
++ __device__ __host__ auto& operator-=(const U& rhs)
++ {
++ return (x -= Treal(rhs)), *this;
++ }
++
++ __device__ __host__ auto operator+(const Treal& rhs) const // complex + real; const like the other binary operators
++ {
++ auto lhs = *this; // work on a copy so *this is not modified
++ return lhs += rhs; // the real-valued += adds to the real part only; returned by value
++ }
++
++ __device__ __host__ auto operator-(const Treal& rhs) const // complex - real; const like the other binary operators
++ {
++ auto lhs = *this; // work on a copy so *this is not modified
++ return lhs -= rhs; // the real-valued -= subtracts from the real part only; returned by value
++ }
++
++ template <typename U>
++ __device__ __host__ auto& operator*=(const U& rhs)
++ {
++ return (x *= Treal(rhs)), (y *= Treal(rhs)), *this;
++ }
++
++ template <typename U>
++ __device__ __host__ auto operator*(const U& rhs) const
++ {
++ auto lhs = *this;
++ return lhs *= Treal(rhs);
++ }
++
++ template <typename U>
++ __device__ __host__ auto& operator/=(const U& rhs)
++ {
++ return (x /= Treal(rhs)), (y /= Treal(rhs)), *this;
++ }
++
++ template <typename U>
++ __device__ __host__ auto operator/(const U& rhs) const
++ {
++ auto lhs = *this;
++ return lhs /= Treal(rhs);
++ }
++
++ template <typename U>
++ __device__ __host__ constexpr bool operator==(const U& rhs) const
++ {
++ return x == Treal(rhs) && y == 0;
++ }
++
++ template <typename U>
++ __device__ __host__ constexpr bool operator!=(const U& rhs) const
++ {
++ return !(*this == rhs);
++ }
++};
++
++// Stream operators
++#if !defined(__HIPCC_RTC__)
++static std::ostream& operator<<(std::ostream& stream, const _Float16& f)
++{
++ return stream << static_cast<double>(f);
++}
++
++template <typename Treal>
++std::ostream& operator<<(std::ostream& out, const rocfft_complex<Treal>& z)
++{
++ return out << '(' << static_cast<double>(z.x) << ',' << static_cast<double>(z.y) << ')';
++}
++#endif
++
++// Operators for real-complex computations
++template <typename U, typename Treal>
++__device__ __host__ rocfft_complex<Treal> operator+(const U& lhs, const rocfft_complex<Treal>& rhs)
++{
++ return {Treal(lhs) + rhs.x, rhs.y};
++}
++
++template <typename U, typename Treal>
++__device__ __host__ rocfft_complex<Treal> operator-(const U& lhs, const rocfft_complex<Treal>& rhs)
++{
++ return {Treal(lhs) - rhs.x, -rhs.y};
++}
++
++template <typename U, typename Treal>
++__device__ __host__ rocfft_complex<Treal> operator*(const U& lhs, const rocfft_complex<Treal>& rhs)
++{
++ return {Treal(lhs) * rhs.x, Treal(lhs) * rhs.y};
++}
++
++template <typename U, typename Treal>
++__device__ __host__ rocfft_complex<Treal> operator/(const U& lhs, const rocfft_complex<Treal>& rhs)
++{
++ // Real / complex: lhs is converted to Treal and divided by rhs using a form of Robert L. Smith's Algorithm 116
++ if(rocfft_complex<Treal>::abs(rhs.x) > rocfft_complex<Treal>::abs(rhs.y))
++ {
++ Treal ratio = rhs.y / rhs.x;
++ Treal scale = Treal(lhs) / (rhs.x + rhs.y * ratio); // lhs * rhs.x / (rhs.x^2 + rhs.y^2)
++ return {scale, -scale * ratio};
++ }
++ else
++ {
++ Treal ratio = rhs.x / rhs.y;
++ Treal scale = Treal(lhs) / (rhs.x * ratio + rhs.y); // lhs * rhs.y / (rhs.x^2 + rhs.y^2)
++ return {ratio * scale, -scale};
++ }
++}
++
++template <typename U, typename Treal>
++__device__ __host__ constexpr bool operator==(const U& lhs, const rocfft_complex<Treal>& rhs)
++{
++ return Treal(lhs) == rhs.x && 0 == rhs.y;
++}
++
++template <typename U, typename Treal>
++__device__ __host__ constexpr bool operator!=(const U& lhs, const rocfft_complex<Treal>& rhs)
++{
++ return !(lhs == rhs);
++}
++
++// Extending std namespace to handle rocfft_complex datatype
++namespace std
++{
++ template <typename Treal>
++ __device__ __host__ constexpr Treal real(const rocfft_complex<Treal>& z)
++ {
++ return z.x;
++ }
++
++ template <typename Treal>
++ __device__ __host__ constexpr Treal imag(const rocfft_complex<Treal>& z)
++ {
++ return z.y;
++ }
++
++ template <typename Treal>
++ __device__ __host__ constexpr rocfft_complex<Treal> conj(const rocfft_complex<Treal>& z)
++ {
++ return {z.x, -z.y};
++ }
++
++ template <typename Treal>
++ __device__ __host__ inline Treal norm(const rocfft_complex<Treal>& z)
++ {
++ return (z.x * z.x) + (z.y * z.y);
++ }
++
++ template <typename Treal>
++ __device__ __host__ inline Treal abs(const rocfft_complex<Treal>& z) // magnitude of z
++ {
++ Treal tr = rocfft_complex<Treal>::abs(z.x), ti = rocfft_complex<Treal>::abs(z.y);
++ return tr > ti ? (ti /= tr, tr * rocfft_complex<Treal>::sqrt(ti * ti + 1)) // tr * sqrt((ti/tr)^2 + 1)
++ : ti ? (tr /= ti, ti * rocfft_complex<Treal>::sqrt(tr * tr + 1)) // ti * sqrt((tr/ti)^2 + 1)
++ : 0; // reached only when ti is zero and tr is not greater than ti
++ }
++}
++
++#endif // ROCFFT_COMPLEX_H
+diff --git a/shared/rocfft_hip.h b/shared/rocfft_hip.h
+new file mode 100644
+index 0000000..e086cab
+--- /dev/null
++++ b/shared/rocfft_hip.h
+@@ -0,0 +1,52 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef __ROCFFT_HIP_H__
++#define __ROCFFT_HIP_H__
++
++#include <hip/hip_runtime_api.h>
++#include <stdexcept>
++
++class rocfft_scoped_device // switches the current HIP device for the lifetime of the object
++{
++public:
++ rocfft_scoped_device(int device) // records the current device, then selects `device`
++ {
++ if(hipGetDevice(&orig_device) != hipSuccess)
++ throw std::runtime_error("hipGetDevice failure");
++
++ if(hipSetDevice(device) != hipSuccess)
++ throw std::runtime_error("hipSetDevice failure");
++ }
++ ~rocfft_scoped_device()
++ {
++ (void)hipSetDevice(orig_device); // restore the recorded device; the result is deliberately discarded
++ }
++
++ // not copyable or movable
++ rocfft_scoped_device(const rocfft_scoped_device&) = delete;
++ rocfft_scoped_device(rocfft_scoped_device&&) = delete;
++ rocfft_scoped_device& operator=(const rocfft_scoped_device&) = delete;
++
++private:
++ int orig_device; // device that was current when the constructor ran
++};
++
++#endif // __ROCFFT_HIP_H__
+diff --git a/shared/rocfft_params.h b/shared/rocfft_params.h
+new file mode 100644
+index 0000000..bf9b728
+--- /dev/null
++++ b/shared/rocfft_params.h
+@@ -0,0 +1,585 @@
++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_PARAMS_H
++#define ROCFFT_PARAMS_H
++
++#include "../shared/fft_params.h"
++#include "../shared/gpubuf.h"
++#include "rocfft/rocfft.h"
++
++// Return the name of the rocfft_status code as a string; throws std::runtime_error for a value not listed below
++inline std::string rocfft_status_to_string(const rocfft_status ret)
++{
++ switch(ret)
++ {
++ case rocfft_status_success:
++ return "rocfft_status_success";
++ case rocfft_status_failure:
++ return "rocfft_status_failure";
++ case rocfft_status_invalid_arg_value:
++ return "rocfft_status_invalid_arg_value";
++ case rocfft_status_invalid_dimensions:
++ return "rocfft_status_invalid_dimensions";
++ case rocfft_status_invalid_array_type:
++ return "rocfft_status_invalid_array_type";
++ case rocfft_status_invalid_strides:
++ return "rocfft_status_invalid_strides";
++ case rocfft_status_invalid_distance:
++ return "rocfft_status_invalid_distance";
++ case rocfft_status_invalid_offset:
++ return "rocfft_status_invalid_offset";
++ case rocfft_status_invalid_work_buffer:
++ return "rocfft_status_invalid_work_buffer";
++ default:
++ throw std::runtime_error("unknown rocfft_status");
++ }
++}
++
++inline fft_status fft_status_from_rocfftparams(const rocfft_status val)
++{
++ switch(val)
++ {
++ case rocfft_status_success:
++ return fft_status_success;
++ case rocfft_status_failure:
++ return fft_status_failure;
++ case rocfft_status_invalid_arg_value:
++ return fft_status_invalid_arg_value;
++ case rocfft_status_invalid_dimensions:
++ return fft_status_invalid_dimensions;
++ case rocfft_status_invalid_array_type:
++ return fft_status_invalid_array_type;
++ case rocfft_status_invalid_strides:
++ return fft_status_invalid_strides;
++ case rocfft_status_invalid_distance:
++ return fft_status_invalid_distance;
++ case rocfft_status_invalid_offset:
++ return fft_status_invalid_offset;
++ case rocfft_status_invalid_work_buffer:
++ return fft_status_invalid_work_buffer;
++ default:
++ throw std::runtime_error("Invalid status");
++ }
++}
++
++inline rocfft_precision rocfft_precision_from_fftparams(const fft_precision val)
++{
++ switch(val)
++ {
++ case fft_precision_single:
++ return rocfft_precision_single;
++ case fft_precision_double:
++ return rocfft_precision_double;
++ case fft_precision_half:
++ return rocfft_precision_half;
++ default:
++ throw std::runtime_error("Invalid precision");
++ }
++}
++
++inline rocfft_array_type rocfft_array_type_from_fftparams(const fft_array_type val) // map fft_array_type to the rocFFT enum
++{
++ switch(val)
++ {
++ case fft_array_type_complex_interleaved:
++ return rocfft_array_type_complex_interleaved;
++ case fft_array_type_complex_planar:
++ return rocfft_array_type_complex_planar;
++ case fft_array_type_real:
++ return rocfft_array_type_real;
++ case fft_array_type_hermitian_interleaved:
++ return rocfft_array_type_hermitian_interleaved;
++ case fft_array_type_hermitian_planar:
++ return rocfft_array_type_hermitian_planar;
++ case fft_array_type_unset:
++ return rocfft_array_type_unset;
++ } // no default label: unlike the sibling converters, an unlisted value does not throw
++ return rocfft_array_type_unset; // fallback for any value not matched above
++}
++
++inline rocfft_transform_type rocfft_transform_type_from_fftparams(const fft_transform_type val)
++{
++ switch(val)
++ {
++ case fft_transform_type_complex_forward:
++ return rocfft_transform_type_complex_forward;
++ case fft_transform_type_complex_inverse:
++ return rocfft_transform_type_complex_inverse;
++ case fft_transform_type_real_forward:
++ return rocfft_transform_type_real_forward;
++ case fft_transform_type_real_inverse:
++ return rocfft_transform_type_real_inverse;
++ default:
++ throw std::runtime_error("Invalid transform type");
++ }
++}
++
++inline rocfft_result_placement
++ rocfft_result_placement_from_fftparams(const fft_result_placement val)
++{
++ switch(val)
++ {
++ case fft_placement_inplace:
++ return rocfft_placement_inplace;
++ case fft_placement_notinplace:
++ return rocfft_placement_notinplace;
++ default:
++ throw std::runtime_error("Invalid result placement");
++ }
++}
++
++class rocfft_params : public fft_params
++{
++public:
++ rocfft_plan plan = nullptr;
++ rocfft_execution_info info = nullptr;
++ rocfft_plan_description desc = nullptr;
++ gpubuf_t<void> wbuffer;
++
++ explicit rocfft_params(){};
++
++ explicit rocfft_params(const fft_params& p)
++ : fft_params(p){};
++
++ rocfft_params(const rocfft_params&) = delete;
++ rocfft_params& operator=(const rocfft_params&) = delete;
++
++ ~rocfft_params()
++ {
++ free();
++ };
++
++ void free()
++ {
++ if(plan != nullptr)
++ {
++ rocfft_plan_destroy(plan);
++ plan = nullptr;
++ }
++ if(info != nullptr)
++ {
++ rocfft_execution_info_destroy(info);
++ info = nullptr;
++ }
++ if(desc != nullptr)
++ {
++ rocfft_plan_description_destroy(desc);
++ desc = nullptr;
++ }
++ wbuffer.free();
++ }
++
++ void validate_fields() const override // throws std::runtime_error if any brick of ifields/ofields is inconsistent
++ {
++ // row-major lengths including batch (i.e. batch is at the front)
++ std::vector<size_t> length_with_batch{nbatch};
++ std::copy(length.begin(), length.end(), std::back_inserter(length_with_batch));
++
++ auto validate_field = [&](const fft_field& f) {
++ for(const auto& b : f.bricks)
++ {
++ // bricks must have same dim as FFT, including batch
++ if(b.lower.size() != length.size() + 1 || b.upper.size() != length.size() + 1
++ || b.stride.size() != length.size() + 1)
++ throw std::runtime_error(
++ "brick dimension does not match FFT + batch dimension");
++
++ // ensure lower < upper, and that both fit in the FFT + batch dims (NOTE(review): the order is lexicographic, not per-dimension)
++ if(!std::lexicographical_compare(
++ b.lower.begin(), b.lower.end(), b.upper.begin(), b.upper.end()))
++ throw std::runtime_error("brick lower index is not less than upper index");
++
++ if(!std::lexicographical_compare(b.lower.begin(),
++ b.lower.end(),
++ length_with_batch.begin(),
++ length_with_batch.end()))
++ throw std::runtime_error(
++ "brick lower index is not less than FFT + batch length");
++
++ if(!std::lexicographical_compare(b.upper.begin(),
++ b.upper.end(),
++ length_with_batch.begin(),
++ length_with_batch.end())
++ && b.upper != length_with_batch) // upper may equal the full extent
++ throw std::runtime_error("brick upper index is not <= FFT + batch length");
++ }
++ };
++
++ for(const auto& ifield : ifields)
++ validate_field(ifield);
++ for(const auto& ofield : ofields)
++ validate_field(ofield);
++ }
++
++ rocfft_precision get_rocfft_precision()
++ {
++ return rocfft_precision_from_fftparams(precision);
++ }
++
++ size_t vram_footprint() override
++ {
++ size_t val = fft_params::vram_footprint();
++ if(setup_structs() != fft_status_success)
++ {
++ throw std::runtime_error("Struct setup failed");
++ }
++ val += workbuffersize;
++
++ return val;
++ }
++
++ // Convert the generic fft_field structure to a rocfft_field
++ // structure that can be passed to rocFFT. In particular, we need
++ // to convert from row-major to column-major. Returns nullptr when the field has no bricks.
++ static rocfft_field fft_field_to_rocfft_field(const fft_field& f)
++ {
++ rocfft_field rfield = nullptr;
++ if(f.bricks.empty())
++ return rfield;
++
++ if(rocfft_field_create(&rfield) != rocfft_status_success)
++ throw std::runtime_error("rocfft_field_create failed");
++ for(const auto& b : f.bricks)
++ {
++ // rocFFT wants column-major bricks and fft_params stores
++ // row-major, so each index vector is copied in reverse order
++ std::vector<size_t> lower_cm;
++ std::copy(b.lower.rbegin(), b.lower.rend(), std::back_inserter(lower_cm));
++ std::vector<size_t> upper_cm;
++ std::copy(b.upper.rbegin(), b.upper.rend(), std::back_inserter(upper_cm));
++ std::vector<size_t> stride_cm;
++ std::copy(b.stride.rbegin(), b.stride.rend(), std::back_inserter(stride_cm));
++
++ rocfft_brick rbrick = nullptr;
++ if(rocfft_brick_create(&rbrick,
++ lower_cm.data(), // field_lower
++ upper_cm.data(), // field_upper
++ stride_cm.data(), // brick_stride
++ lower_cm.size(), // dim
++ b.device) // deviceID
++ != rocfft_status_success)
++ throw std::runtime_error("rocfft_brick_create failed"); // NOTE(review): rfield is not destroyed on this path
++
++ if(rocfft_field_add_brick(rfield, rbrick) != rocfft_status_success)
++ throw std::runtime_error("rocfft_field_add_brick failed"); // NOTE(review): rfield and rbrick are not destroyed on this path
++
++ rocfft_brick_destroy(rbrick); // the brick handle is released once it has been added to the field
++ }
++ return rfield; // not destroyed here; the handle is handed to the caller
++ }
++
++ fft_status setup_structs() // create desc, plan and info (each only if still null) and refresh workbuffersize
++ {
++ rocfft_status fft_status = rocfft_status_success;
++ if(desc == nullptr)
++ {
++ fft_status = rocfft_plan_description_create(&desc); // capture the status so the check below can actually fail
++ if(fft_status != rocfft_status_success)
++ return fft_status_from_rocfftparams(fft_status);
++
++ fft_status
++ = rocfft_plan_description_set_data_layout(desc,
++ rocfft_array_type_from_fftparams(itype),
++ rocfft_array_type_from_fftparams(otype),
++ ioffset.data(),
++ ooffset.data(),
++ istride_cm().size(),
++ istride_cm().data(),
++ idist,
++ ostride_cm().size(),
++ ostride_cm().data(),
++ odist);
++ if(fft_status != rocfft_status_success)
++ {
++ throw std::runtime_error("rocfft_plan_description_set_data_layout failed");
++ }
++
++ if(scale_factor != 1.0) // the scale factor is only set when it differs from 1.0
++ {
++ fft_status = rocfft_plan_description_set_scale_factor(desc, scale_factor);
++ if(fft_status != rocfft_status_success)
++ {
++ throw std::runtime_error("rocfft_plan_description_set_scale_factor failed");
++ }
++ }
++
++ for(const auto& ifield : ifields)
++ {
++ rocfft_field infield = fft_field_to_rocfft_field(ifield);
++ if(rocfft_plan_description_add_infield(desc, infield) != rocfft_status_success)
++ throw std::runtime_error("rocfft_description_add_infield failed");
++ rocfft_field_destroy(infield); // the temporary field handle is released after being added
++ }
++
++ for(const auto& ofield : ofields)
++ {
++ rocfft_field outfield = fft_field_to_rocfft_field(ofield);
++ if(rocfft_plan_description_add_outfield(desc, outfield) != rocfft_status_success)
++ throw std::runtime_error("rocfft_description_add_outfield failed");
++ rocfft_field_destroy(outfield); // the temporary field handle is released after being added
++ }
++ }
++
++ if(plan == nullptr)
++ {
++ fft_status = rocfft_plan_create(&plan,
++ rocfft_result_placement_from_fftparams(placement),
++ rocfft_transform_type_from_fftparams(transform_type),
++ get_rocfft_precision(),
++ length_cm().size(),
++ length_cm().data(),
++ nbatch,
++ desc);
++ if(fft_status != rocfft_status_success)
++ {
++ throw std::runtime_error("rocfft_plan_create failed");
++ }
++ }
++
++ if(info == nullptr)
++ {
++ fft_status = rocfft_execution_info_create(&info);
++ if(fft_status != rocfft_status_success)
++ {
++ throw std::runtime_error("rocfft_execution_info_create failed");
++ }
++ }
++
++ fft_status = rocfft_plan_get_work_buffer_size(plan, &workbuffersize); // queried on every call, even when plan already existed
++ if(fft_status != rocfft_status_success)
++ {
++ throw std::runtime_error("rocfft_plan_get_work_buffer_size failed");
++ }
++
++ return fft_status_from_rocfftparams(fft_status);
++ }
++
++ fft_status create_plan() override // set up the rocFFT structs, then allocate and register the work buffer if one is needed
++ {
++ fft_status ret = setup_structs();
++ if(ret != fft_status_success)
++ {
++ return ret;
++ }
++ if(workbuffersize > 0)
++ {
++ hipError_t hip_status = hipSuccess;
++ hip_status = wbuffer.alloc(workbuffersize);
++ if(hip_status != hipSuccess)
++ {
++ std::ostringstream oss;
++ oss << "work buffer allocation failed (" << workbuffersize << " requested), "; // separator keeps the next fragment readable
++ size_t mem_free = 0;
++ size_t mem_total = 0;
++ hip_status = hipMemGetInfo(&mem_free, &mem_total);
++ if(hip_status == hipSuccess)
++ {
++ oss << "free vram: " << mem_free << " total vram: " << mem_total;
++ }
++ else
++ {
++ oss << "hipMemGetInfo also failed";
++ }
++ throw work_buffer_alloc_failure(oss.str()); // distinct exception type for allocation failure
++ }
++
++ auto rocret
++ = rocfft_execution_info_set_work_buffer(info, wbuffer.data(), workbuffersize);
++ if(rocret != rocfft_status_success)
++ {
++ throw std::runtime_error("rocfft_execution_info_set_work_buffer failed");
++ }
++ }
++
++ return ret; // fft_status_success whenever this point is reached
++ }
++
++ fft_status set_callbacks(void* load_cb_host, // registers load/store callbacks on `info`; does nothing unless run_callbacks is set
++ void* load_cb_data,
++ void* store_cb_host,
++ void* store_cb_data) override
++ {
++ if(run_callbacks)
++ {
++ auto roc_status // the address of each pointer parameter is passed, i.e. a pointer to a single entry
++ = rocfft_execution_info_set_load_callback(info, &load_cb_host, &load_cb_data, 0);
++ if(roc_status != rocfft_status_success)
++ return fft_status_from_rocfftparams(roc_status);
++
++ roc_status
++ = rocfft_execution_info_set_store_callback(info, &store_cb_host, &store_cb_data, 0);
++ if(roc_status != rocfft_status_success)
++ return fft_status_from_rocfftparams(roc_status);
++ }
++ return fft_status_success; // also returned when run_callbacks is false
++ }
++
++ fft_status execute(void** in, void** out) override
++ {
++ auto ret = rocfft_execute(plan, in, out, info);
++ return fft_status_from_rocfftparams(ret);
++ }
++
++ // scatter data to multiple GPUs and adjust I/O buffers to match; only the first input and first output field are used
++ void multi_gpu_prepare(std::vector<gpubuf>& ibuffer,
++ std::vector<void*>& pibuffer,
++ std::vector<void*>& pobuffer) override
++ {
++ auto alloc_fields = [&](const fft_params::fft_field& field,
++ fft_array_type array_type,
++ std::vector<void*>& pbuffer,
++ bool copy_input) {
++ if(field.bricks.empty())
++ return;
++
++ // we have a field defined, clear the list of buffers as
++ // we'll be allocating new ones for each brick
++ pbuffer.clear();
++
++ for(const auto& b : field.bricks)
++ {
++ // get brick's length - note that this includes batch
++ // dimension
++ const auto brick_len = b.length();
++ const auto brick_stride = b.stride; // not referenced again in this lambda
++
++ const size_t brick_size_elems = product(brick_len.begin(), brick_len.end());
++ const size_t elem_size_bytes = var_size<size_t>(precision, array_type);
++ const size_t brick_size_bytes = brick_size_elems * elem_size_bytes;
++
++ // set device for the alloc, but we want to return to the
++ // default device as the source of a following memcpy
++ {
++ rocfft_scoped_device dev(b.device);
++ multi_gpu_data.emplace_back(); // the allocation is kept in the multi_gpu_data member
++ if(multi_gpu_data.back().alloc(brick_size_bytes) != hipSuccess)
++ throw std::runtime_error("device allocation failure");
++ pbuffer.push_back(multi_gpu_data.back().data());
++ }
++
++ if(copy_input)
++ {
++ // For now, assume we're only splitting on highest FFT
++ // dimension, lower-dimensional FFT data is all
++ // contiguous, and batches are contiguous in each brick.
++ //
++ // That means we can express this as a 2D memcpy.
++ const size_t unbatched_elems_per_brick
++ = product(brick_len.begin() + 1, brick_len.end());
++ const size_t unbatched_elems_per_fft = product(length.begin(), length.end());
++
++ // get this brick's starting offset in the field
++ const size_t brick_offset
++ = b.lower_field_offset(istride, idist) * elem_size_bytes;
++
++ // copy from original input - note that we're
++ // assuming interleaved data so ibuffer has only one
++ // gpubuf (NOTE(review): the copy kind below is HostToDevice although the source is a gpubuf - confirm intended)
++ if(hipMemcpy2D(pbuffer.back(),
++ unbatched_elems_per_brick * elem_size_bytes,
++ ibuffer.front().data_offset(brick_offset),
++ unbatched_elems_per_fft * elem_size_bytes,
++ unbatched_elems_per_brick * elem_size_bytes,
++ brick_len.front(),
++ hipMemcpyHostToDevice)
++ != hipSuccess)
++ throw std::runtime_error("hipMemcpy failure");
++ }
++ }
++
++ // if we copied the input to all the other devices, and
++ // this is an out-of-place transform, we no longer
++ // need the original input
++ if(copy_input && placement == fft_placement_notinplace)
++ ibuffer.clear();
++ };
++
++ // assume one input, one output field for simple cases
++ if(!ifields.empty())
++ alloc_fields(ifields.front(), itype, pibuffer, true);
++ if(!ofields.empty())
++ {
++ if(!ifields.empty() && placement == fft_placement_inplace)
++ pobuffer = pibuffer; // in-place: output pointers alias the per-brick input buffers
++ else
++ alloc_fields(ofields.front(), otype, pobuffer, false);
++ }
++ }
++
++ // when preparing for multi-GPU transform, we need to allocate data
++ // on each GPU. This vector remembers all of those allocations.
++ std::vector<gpubuf> multi_gpu_data;
++
++ // gather data after multi-GPU FFT for verification; copies every brick of the first output field into obuffer.front()
++ void multi_gpu_finalize(std::vector<gpubuf>& obuffer, std::vector<void*>& pobuffer) override
++ {
++ if(ofields.empty())
++ return;
++
++ for(size_t i = 0; i < ofields.front().bricks.size(); ++i)
++ {
++ const auto& b = ofields.front().bricks[i];
++ const auto& brick_ptr = pobuffer[i]; // assumes pobuffer holds one pointer per brick, in brick order
++
++ const auto brick_len = b.length();
++
++ const size_t elem_size_bytes = var_size<size_t>(precision, otype);
++
++ // get this brick's starting offset in the field
++ const size_t brick_offset = b.lower_field_offset(ostride, odist) * elem_size_bytes;
++
++ // switch device to where we're copying from
++ rocfft_scoped_device dev(b.device);
++
++ // For now, assume we're only splitting on highest FFT
++ // dimension, lower-dimensional FFT data is all
++ // contiguous, and batches are contiguous in each brick.
++ //
++ // That means we can express this as a 2D memcpy.
++ const size_t unbatched_elems_per_brick
++ = product(brick_len.begin() + 1, brick_len.end());
++ const auto output_length = olength();
++ const size_t unbatched_elems_per_fft
++ = product(output_length.begin(), output_length.end());
++
++ // copy to original output buffer - note that
++ // we're assuming interleaved data so obuffer
++ // has only one gpubuf
++ if(hipMemcpy2D(obuffer.front().data_offset(brick_offset),
++ unbatched_elems_per_fft * elem_size_bytes,
++ brick_ptr,
++ unbatched_elems_per_brick * elem_size_bytes,
++ unbatched_elems_per_brick * elem_size_bytes,
++ brick_len.front(),
++ hipMemcpyDeviceToDevice)
++ != hipSuccess)
++ throw std::runtime_error("hipMemcpy failure");
++
++ // device-to-device transfers don't synchronize with the
++ // host, add explicit sync
++ (void)hipDeviceSynchronize(); // the result is deliberately discarded
++ }
++ pobuffer.clear(); // afterwards pobuffer refers only to the gathered output buffer
++ pobuffer.push_back(obuffer.front().data());
++ }
++};
++
++#endif
+diff --git a/shared/test_params.h b/shared/test_params.h
+new file mode 100644
+index 0000000..8d8f6f7
+--- /dev/null
++++ b/shared/test_params.h
+@@ -0,0 +1,51 @@
++// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++#ifndef TESTCONSTANTS_H
++#define TESTCONSTANTS_H
++
++#include <stdexcept>
++
++extern int verbose;
++extern size_t ramgb;
++extern size_t vramgb;
++
++extern size_t n_random_tests;
++
++extern size_t random_seed;
++extern double planar_prob;
++extern double callback_prob;
++
++extern double half_epsilon;
++extern double single_epsilon;
++extern double double_epsilon;
++extern bool skip_runtime_fails;
++
++extern double max_linf_eps_double;
++extern double max_l2_eps_double;
++extern double max_linf_eps_single;
++extern double max_l2_eps_single;
++extern double max_linf_eps_half;
++extern double max_l2_eps_half;
++
++extern int n_hip_failures;
++
++#endif
+diff --git a/shared/work_queue.h b/shared/work_queue.h
+new file mode 100644
+index 0000000..e13fc41
+--- /dev/null
++++ b/shared/work_queue.h
+@@ -0,0 +1,49 @@
++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++
++#include <condition_variable>
++#include <mutex>
++#include <queue>
++template <typename _WorkItem> // mutex-guarded FIFO of work items; pop() blocks while the queue is empty
++struct WorkQueue
++{
++ void push(_WorkItem&& i) // enqueue by move, then wake all waiting consumers
++ {
++ std::unique_lock<std::mutex> lock(queueMutex);
++ items.emplace(std::move(i));
++ emptyWait.notify_all();
++ }
++ _WorkItem pop() // wait until an item is available, then remove and return the oldest one
++ {
++ std::unique_lock<std::mutex> lock(queueMutex);
++ while(items.empty()) // loop guards against spurious wakeups
++ emptyWait.wait(lock);
++ _WorkItem item(std::move(items.front())); // move out instead of copying; the moved-from slot is popped next
++ items.pop();
++ return item;
++ }
++
++private:
++ std::queue<_WorkItem> items;
++ std::mutex queueMutex; // protects items
++ std::condition_variable emptyWait; // signalled by push()
++};
diff --git a/var/spack/repos/builtin/packages/hipfft/package.py b/var/spack/repos/builtin/packages/hipfft/package.py
index 818a9c4935..f5749749ac 100644
--- a/var/spack/repos/builtin/packages/hipfft/package.py
+++ b/var/spack/repos/builtin/packages/hipfft/package.py
@@ -14,9 +14,9 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage):
It sits between the application and the backend FFT library, marshalling
inputs into the backend and results back to the application."""
- homepage = "https://github.com/ROCmSoftwarePlatform/hipFFT"
- git = "https://github.com/ROCmSoftwarePlatform/hipFFT.git"
- url = "https://github.com/ROCmSoftwarePlatform/hipfft/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/hipFFT"
+ git = "https://github.com/ROCm/hipFFT.git"
+ url = "https://github.com/ROCm/hipfft/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("renjithravindrankannath", "srekolam")
@@ -24,6 +24,7 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage):
license("MIT")
version("master", branch="master")
+ version("6.0.0", sha256="44f328b7862c066459089dfe62833cb7d626c6ceb71c57d8c7d6bba45dad491e")
version("5.7.1", sha256="33452576649df479f084076c47d0b30f6f1da34864094bce767dd9bf609f04aa")
version("5.7.0", sha256="daa5dc44580145e85ff8ffa7eb40a3d1ef41f3217549c01281715ff696a31588")
version("5.6.1", sha256="d2ae36b8eacd39b865e8a7972b8eb86bcea2de4ac90711bba7e29b39b01eaa74")
@@ -125,6 +126,7 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
depends_on("rocfft@" + ver, when="+rocm @" + ver)
@@ -133,6 +135,8 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage):
depends_on(
"rocfft amdgpu_target={0}".format(tgt), when="+rocm amdgpu_target={0}".format(tgt)
)
+ # https://github.com/ROCm/rocFFT/pull/85
+ patch("001-remove-submodule-and-sync-shared-files-from-rocFFT.patch", when="@6.0.0")
def cmake_args(self):
args = [self.define("BUILD_CLIENTS_SAMPLES", "OFF")]
diff --git a/var/spack/repos/builtin/packages/hipfort/package.py b/var/spack/repos/builtin/packages/hipfort/package.py
index be1819bf50..8e8ea5a0a6 100644
--- a/var/spack/repos/builtin/packages/hipfort/package.py
+++ b/var/spack/repos/builtin/packages/hipfort/package.py
@@ -9,14 +9,15 @@ from spack.package import *
class Hipfort(CMakePackage):
"""Radeon Open Compute Parallel Primitives Library"""
- homepage = "https://github.com/ROCmSoftwarePlatform/hipfort"
- git = "https://github.com/ROCmSoftwarePlatform/hipfort.git"
- url = "https://github.com/ROCmSoftwarePlatform/hipfort/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/hipfort"
+ git = "https://github.com/ROCm/hipfort.git"
+ url = "https://github.com/ROCm/hipfort/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
license("MIT")
maintainers("cgmb", "srekolam", "renjithravindrankannath")
+ version("6.0.0", sha256="151cf11648885db799aade0d00a7882589e7195643b02beaa251f1b2a43aceed")
version("5.7.1", sha256="859fac509e195f3ab97c555b5f63afea325a61aae0f281cb19a970a1b533dead")
version("5.7.0", sha256="57b04d59f61683a1b141d6d831d10c9fdecea483991ec02d14c14e441e935c05")
version("5.6.1", sha256="a55345cc9ccaf0cd69d306b8eb9ec2a02c220a57e9c396443cc7273aa3377adc")
@@ -127,6 +128,7 @@ class Hipfort(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hip@" + ver, type="build", when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/hipify-clang/package.py b/var/spack/repos/builtin/packages/hipify-clang/package.py
index ab15e479d4..b1c5f2a7fb 100644
--- a/var/spack/repos/builtin/packages/hipify-clang/package.py
+++ b/var/spack/repos/builtin/packages/hipify-clang/package.py
@@ -10,9 +10,9 @@ class HipifyClang(CMakePackage):
"""hipify-clang is a clang-based tool for translation CUDA
sources into HIP sources"""
- homepage = "https://github.com/ROCm-Developer-Tools/HIPIFY"
- git = "https://github.com/ROCm-Developer-Tools/HIPIFY.git"
- url = "https://github.com/ROCm-Developer-Tools/HIPIFY/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/HIPIFY"
+ git = "https://github.com/ROCm/HIPIFY.git"
+ url = "https://github.com/ROCm/HIPIFY/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
@@ -20,6 +20,7 @@ class HipifyClang(CMakePackage):
license("MIT")
version("master", branch="master")
+ version("6.0.0", sha256="91bed2b72a6684a04e078e50b12b36b93f64ff96523283f4e5d9a33c11e6b967")
version("5.7.1", sha256="43121e62233dab010ab686d6805bc2d3163f0dc5e89cc503d50c4bcd59eeb394")
version("5.7.0", sha256="10e4386727e102fba166f012147120a6ec776e8d95fbcac3af93e243205d80a6")
version("5.6.1", sha256="ec3a4f276556f9fd924ea3c89be11b6c6ddf999cdd4387f669e38e41ee0042e8")
@@ -143,11 +144,12 @@ class HipifyClang(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
"master",
]:
depends_on("llvm-amdgpu@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
def setup_run_environment(self, env):
diff --git a/var/spack/repos/builtin/packages/hiprand/package.py b/var/spack/repos/builtin/packages/hiprand/package.py
index 0d8666f884..acc3629762 100644
--- a/var/spack/repos/builtin/packages/hiprand/package.py
+++ b/var/spack/repos/builtin/packages/hiprand/package.py
@@ -12,9 +12,9 @@ class Hiprand(CMakePackage, CudaPackage, ROCmPackage):
"""The hipRAND project provides an interface for generating pseudo-random
and quasi-random numbers with either cuRAND or rocRAND backends."""
- homepage = "https://github.com/ROCmSoftwarePlatform/hipRAND"
- git = "https://github.com/ROCmSoftwarePlatform/hipRAND.git"
- url = "https://github.com/ROCmSoftwarePlatform/hipRAND/archive/rocm-5.7.1.tar.gz"
+ homepage = "https://github.com/ROCm/hipRAND"
+ git = "https://github.com/ROCm/hipRAND.git"
+ url = "https://github.com/ROCm/hipRAND/archive/rocm-5.7.1.tar.gz"
tags = ["rocm"]
maintainers("cgmb", "srekolam", "renjithravindrankannath")
@@ -24,6 +24,7 @@ class Hiprand(CMakePackage, CudaPackage, ROCmPackage):
version("develop", branch="develop")
version("master", branch="master")
+ version("6.0.0", sha256="7e06c98f9da7c0b20b55b2106cf3a48b9ef6577a79549a455667ae97bd15b61d")
version("5.7.1", sha256="81a9f5f0960dce125ce1ab1c7eb58bb07c8756346f9e46a1cc65aa61d5a114f8")
version("5.7.0", sha256="4dee76719839503b02ce7d38e1c61bbdb2da18da7f63a7ef7012c84c71aa0a9d")
version("5.6.1", sha256="a73d5578bc7f8dff0b8960e4bff97bc4fc28f508a19ed6acd1cfd4d3e76b47ee")
@@ -88,6 +89,7 @@ class Hiprand(CMakePackage, CudaPackage, ROCmPackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
"master",
"develop",
]:
diff --git a/var/spack/repos/builtin/packages/hipsolver/package.py b/var/spack/repos/builtin/packages/hipsolver/package.py
index f39755d03d..81c956334c 100644
--- a/var/spack/repos/builtin/packages/hipsolver/package.py
+++ b/var/spack/repos/builtin/packages/hipsolver/package.py
@@ -16,9 +16,9 @@ class Hipsolver(CMakePackage, CudaPackage, ROCmPackage):
regardless of the chosen backend. Currently, hipSOLVER supports rocSOLVER
and cuSOLVER as backends."""
- homepage = "https://github.com/ROCmSoftwarePlatform/hipSOLVER"
- git = "https://github.com/ROCmSoftwarePlatform/hipSOLVER.git"
- url = "https://github.com/ROCmSoftwarePlatform/hipSOLVER/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/hipSOLVER"
+ git = "https://github.com/ROCm/hipSOLVER.git"
+ url = "https://github.com/ROCm/hipSOLVER/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("cgmb", "srekolam", "renjithravindrankannath")
@@ -28,6 +28,7 @@ class Hipsolver(CMakePackage, CudaPackage, ROCmPackage):
version("develop", branch="develop")
version("master", branch="master")
+ version("6.0.0", sha256="385849db02189d5e62096457e52ae899ae5c1ae7d409dc1da61f904d8861b48c")
version("5.7.1", sha256="5592e965c0dc5722931302289643d1ece370220af2c7afc58af97b3395295658")
version("5.7.0", sha256="0e35795bfbcb57ed8e8437471209fb7d230babcc31d9a4a0b3640c3ee639f4a7")
version("5.6.1", sha256="2e546bc7771f7bf0aa7892b69cded725941573e8b70614759c3d03c21eb78dde")
@@ -115,6 +116,7 @@ class Hipsolver(CMakePackage, CudaPackage, ROCmPackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
"master",
"develop",
]:
diff --git a/var/spack/repos/builtin/packages/hipsparse/package.py b/var/spack/repos/builtin/packages/hipsparse/package.py
index a195356fa4..0473a3ea3d 100644
--- a/var/spack/repos/builtin/packages/hipsparse/package.py
+++ b/var/spack/repos/builtin/packages/hipsparse/package.py
@@ -12,9 +12,9 @@ class Hipsparse(CMakePackage, CudaPackage, ROCmPackage):
"""hipSPARSE is a SPARSE marshalling library, with
multiple supported backends"""
- homepage = "https://github.com/ROCmSoftwarePlatform/hipSPARSE"
- git = "https://github.com/ROCmSoftwarePlatform/hipSPARSE.git"
- url = "https://github.com/ROCmSoftwarePlatform/hipSPARSE/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/hipSPARSE"
+ git = "https://github.com/ROCm/hipSPARSE.git"
+ url = "https://github.com/ROCm/hipSPARSE/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie")
@@ -22,6 +22,7 @@ class Hipsparse(CMakePackage, CudaPackage, ROCmPackage):
license("MIT")
+ version("6.0.0", sha256="718a5f03b6a579c0542a60d00f5688bec53a181b429b7ee8ce3c8b6c4a78d754")
version("5.7.1", sha256="16c3818260611226c3576d8d55ad8f51e0890d2473503edf2c9313250ae65ca7")
version("5.7.0", sha256="729b749b5340034639873a99e6091963374f6f0456c8f36d076c96f03fe43888")
version("5.6.1", sha256="d636d0c5d1e38cc0c09b1e95380199ec82bd465b94bd6661f0c8d9374d9b565d")
@@ -160,6 +161,7 @@ class Hipsparse(CMakePackage, CudaPackage, ROCmPackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
depends_on("rocsparse@" + ver, when="+rocm @" + ver)
diff --git a/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py b/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py
index d0a153a595..6b64a0129b 100644
--- a/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py
+++ b/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py
@@ -15,15 +15,16 @@ class HsaRocrDev(CMakePackage):
HSA ROCm kernel agents.AMD Heterogeneous System Architecture HSA -
Linux HSA Runtime for Boltzmann (ROCm) platforms."""
- homepage = "https://github.com/RadeonOpenCompute/ROCR-Runtime"
- git = "https://github.com/RadeonOpenCompute/ROCR-Runtime.git"
- url = "https://github.com/RadeonOpenCompute/ROCR-Runtime/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/ROCR-Runtime"
+ git = "https://github.com/ROCm/ROCR-Runtime.git"
+ url = "https://github.com/ROCm/ROCR-Runtime/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath", "haampie")
libraries = ["libhsa-runtime64"]
version("master", branch="master")
+ version("6.0.0", sha256="99e8fa1af52d0bf382f28468e1a345af1ff3452c35914a6a7b5eeaf69fc568db")
version("5.7.1", sha256="655e9bfef4b0b6ad3f9b89c934dc0a8377273bb0bccbda6c399ac5d5d2c1c04c")
version("5.7.0", sha256="2c56ec5c78a36f2b847afd4632cb25dbf6ecc58661eb2ae038c2552342e6ce23")
version("5.6.1", sha256="4de9a57c2092edf9398d671c8a2c60626eb7daf358caf710da70d9c105490221")
@@ -154,6 +155,7 @@ class HsaRocrDev(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
"master",
]:
depends_on("hsakmt-roct@" + ver, when="@" + ver)
@@ -163,7 +165,7 @@ class HsaRocrDev(CMakePackage):
"rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver)
)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
# Both 3.5.0 and 3.7.0 force INSTALL_RPATH in different ways
@@ -210,4 +212,7 @@ class HsaRocrDev(CMakePackage):
if self.spec.satisfies("@5.6:"):
args.append("-DCMAKE_INSTALL_LIBDIR=lib")
+ if self.spec.satisfies("@6.0:"):
+ args.append(self.define("ROCM_PATCH_VERSION", "60000"))
+
return args
diff --git a/var/spack/repos/builtin/packages/hsakmt-roct/package.py b/var/spack/repos/builtin/packages/hsakmt-roct/package.py
index e087ea6519..89be71a9ea 100644
--- a/var/spack/repos/builtin/packages/hsakmt-roct/package.py
+++ b/var/spack/repos/builtin/packages/hsakmt-roct/package.py
@@ -14,14 +14,15 @@ class HsakmtRoct(CMakePackage):
Thunk Interface is a user-mode API interfaces used to interact
with the ROCk driver."""
- homepage = "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface"
- git = "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface.git"
- url = "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/ROCT-Thunk-Interface"
+ git = "https://github.com/ROCm/ROCT-Thunk-Interface.git"
+ url = "https://github.com/ROCm/ROCT-Thunk-Interface/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
version("master", branch="master")
+ version("6.0.0", sha256="9f4e80bd0a714ce45326941b906a62298c62025eff186dc6c48282ce84c787c7")
version("5.7.1", sha256="38bc3732886a52ca9cd477ec6fcde3ab17a0ba5dc8e2f7ac34c4de597bd00e8b")
version("5.7.0", sha256="52293e40c4ba0c653d796e2f6109f5fb4c79f5fb82310ecbfd9a5432acf9da43")
version("5.6.1", sha256="d60b355bfd21a08e0e36270fd56f98d052c3c6edca47da887fa32bf32759c29b")
@@ -119,11 +120,11 @@ class HsakmtRoct(CMakePackage):
for ver in ["5.3.0", "5.4.0", "5.4.3"]:
depends_on("llvm-amdgpu@" + ver, type="test", when="@" + ver)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
depends_on("llvm-amdgpu@" + ver, type="test", when="@" + ver)
- # See https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/issues/72
+ # See https://github.com/ROCm/ROCT-Thunk-Interface/issues/72
# and https://github.com/spack/spack/issues/28398
patch("0001-Remove-compiler-support-libraries-and-libudev-as-req.patch", when="@4.5.0:5.2")
patch("0002-Remove-compiler-support-libraries-and-libudev-as-req-5.3.patch", when="@5.3.0:5.4")
diff --git a/var/spack/repos/builtin/packages/legion/package.py b/var/spack/repos/builtin/packages/legion/package.py
index 2840d577de..7cc446cded 100644
--- a/var/spack/repos/builtin/packages/legion/package.py
+++ b/var/spack/repos/builtin/packages/legion/package.py
@@ -74,6 +74,7 @@ class Legion(CMakePackage, ROCmPackage):
# https://github.com/spack/spack/issues/37232#issuecomment-1553376552
patch("hip-offload-arch.patch", when="@23.03.0 +rocm")
+ patch("update-hip-path-legion-23.06.0.patch", when="@23.06.0 ^hip@6.0.0 +rocm")
def patch(self):
if "network=gasnet conduit=ofi-slingshot11 ^cray-mpich+wrappers" in self.spec:
@@ -349,6 +350,10 @@ class Legion(CMakePackage, ROCmPackage):
options.append(from_variant("Legion_HIP_ARCH", "amdgpu_target"))
options.append(from_variant("Legion_HIJACK_HIP", "hip_hijack"))
options.append(self.define("HIP_PATH", "{0}/hip".format(spec["hip"].prefix)))
+ if "^hip@:5.7" in spec:
+ options.append(self.define("HIP_PATH", "{0}/hip".format(spec["hip"].prefix)))
+ elif "^hip@6.0:" in spec:
+ options.append(self.define("HIP_PATH", "{0}".format(spec["hip"].prefix)))
if "+fortran" in spec:
# default is off.
diff --git a/var/spack/repos/builtin/packages/legion/update-hip-path-legion-23.06.0.patch b/var/spack/repos/builtin/packages/legion/update-hip-path-legion-23.06.0.patch
new file mode 100644
index 0000000000..9f7f6a7a86
--- /dev/null
+++ b/var/spack/repos/builtin/packages/legion/update-hip-path-legion-23.06.0.patch
@@ -0,0 +1,13 @@
+diff --git a/cmake/FindHIP.cmake b/cmake/FindHIP.cmake
+index f86edd2..24492ad 100644
+--- a/cmake/FindHIP.cmake
++++ b/cmake/FindHIP.cmake
+@@ -22,7 +22,7 @@ if(NOT DEFINED HIP_PATH)
+ set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to where HIP has been installed")
+ endif()
+ endif()
+-include(${HIP_PATH}/cmake/FindHIP.cmake)
++include(${HIP_PATH}/lib/cmake/hip/FindHIP.cmake)
+
+ if(NOT HIP_INCLUDE_DIRS)
+ list(APPEND HIP_INCLUDE_DIRS
diff --git a/var/spack/repos/builtin/packages/llvm-amdgpu/package.py b/var/spack/repos/builtin/packages/llvm-amdgpu/package.py
index f8cddebf84..99a2e67488 100644
--- a/var/spack/repos/builtin/packages/llvm-amdgpu/package.py
+++ b/var/spack/repos/builtin/packages/llvm-amdgpu/package.py
@@ -14,9 +14,9 @@ class LlvmAmdgpu(CMakePackage):
"""Toolkit for the construction of highly optimized compilers,
optimizers, and run-time environments."""
- homepage = "https://github.com/RadeonOpenCompute/llvm-project"
- git = "https://github.com/RadeonOpenCompute/llvm-project.git"
- url = "https://github.com/RadeonOpenCompute/llvm-project/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/llvm-project"
+ git = "https://github.com/ROCm/llvm-project.git"
+ url = "https://github.com/ROCm/llvm-project/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
executables = [r"amdclang", r"amdclang\+\+", r"amdflang", r"clang.*", r"flang.*", "llvm-.*"]
generator("ninja")
@@ -26,6 +26,7 @@ class LlvmAmdgpu(CMakePackage):
license("Apache-2.0")
version("master", branch="amd-stg-open")
+ version("6.0.0", sha256="c673708d413d60ca8606ee75c77e9871b6953c59029c987b92f2f6e85f683626")
version("5.7.1", sha256="6b54c422e45ad19c9bf5ab090ec21753e7f7d854ca78132c30eb146657b168eb")
version("5.7.0", sha256="4abdf00b297a77c5886cedb37e63acda2ba11cb9f4c0a64e133b05800aadfcf0")
version("5.6.1", sha256="045e43c0c4a3f4f2f1db9fb603a4f1ea3d56e128147e19ba17909eb57d7f08e5")
@@ -167,12 +168,12 @@ class LlvmAmdgpu(CMakePackage):
# as per 5.2.0 llvm code. It used to be llvm/bin/../lib/libdevice.
# Below patch is to look in the old path.
patch("adjust-openmp-bitcode-directory-for-llvm-link.patch", when="@5.2.0:5.6")
- patch("0001-update-HIP_PATH-deduction-for-5.7.0.patch", when="@5.7.0:5.7")
+ patch("0001-update-HIP_PATH-deduction-for-5.7.0.patch", when="@5.7.0:6.0")
# Below patch is to set the flag -mcode-object-version=none until
# the below fix is available in device-libs release code.
- # https://github.com/RadeonOpenCompute/ROCm-Device-Libs/commit/f0356159dbdc93ea9e545f9b61a7842f9c881fdf
- patch("patch-llvm-5.5.0.patch", when="@5.5: +rocm-device-libs")
+ # https://github.com/ROCm/ROCm-Device-Libs/commit/f0356159dbdc93ea9e545f9b61a7842f9c881fdf
+ patch("patch-llvm-5.5.0.patch", when="@5.5:5.7 +rocm-device-libs")
# i1 muls can sometimes happen after SCEV.
# They resulted in ISel failures because we were missing the patterns for them.
@@ -188,6 +189,7 @@ class LlvmAmdgpu(CMakePackage):
# Add device libs sources so they can be an external LLVM project
for d_version, d_shasum in [
+ ("6.0.0", "198df4550d4560537ba60ac7af9bde31d59779c8ec5d6309627f77a43ab6ef6f"),
("5.7.1", "703de8403c0bd0d80f37c970a698f10f148daf144d34f982e4484d04f7c7bbef"),
("5.7.0", "0f8780b9098573f1c456bdc84358de924dcf00604330770a383983e1775bf61e"),
("5.6.1", "f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c"),
@@ -221,7 +223,7 @@ class LlvmAmdgpu(CMakePackage):
resource(
name="rocm-device-libs",
placement="rocm-device-libs",
- url="https://github.com/RadeonOpenCompute/ROCm-Device-Libs/archive/rocm-{0}.tar.gz".format(
+ url="https://github.com/ROCm/ROCm-Device-Libs/archive/rocm-{0}.tar.gz".format(
d_version
),
sha256=d_shasum,
@@ -231,11 +233,12 @@ class LlvmAmdgpu(CMakePackage):
resource(
name="rocm-device-libs",
placement="rocm-device-libs",
- git="https://github.com/RadeonOpenCompute/ROCm-Device-Libs.git",
+ git="https://github.com/ROCm/ROCm-Device-Libs.git",
branch="amd-stg-open",
when="@master +rocm-device-libs",
)
for d_version, d_shasum in [
+ ("6.0.0", "99e8fa1af52d0bf382f28468e1a345af1ff3452c35914a6a7b5eeaf69fc568db"),
("5.7.1", "655e9bfef4b0b6ad3f9b89c934dc0a8377273bb0bccbda6c399ac5d5d2c1c04c"),
("5.7.0", "2c56ec5c78a36f2b847afd4632cb25dbf6ecc58661eb2ae038c2552342e6ce23"),
("5.6.1", "4de9a57c2092edf9398d671c8a2c60626eb7daf358caf710da70d9c105490221"),
@@ -244,19 +247,20 @@ class LlvmAmdgpu(CMakePackage):
resource(
name="hsa-runtime",
placement="hsa-runtime",
- url=f"https://github.com/RadeonOpenCompute/ROCR-Runtime/archive/rocm-{d_version}.tar.gz",
+ url=f"https://github.com/ROCm/ROCR-Runtime/archive/rocm-{d_version}.tar.gz",
sha256=d_shasum,
when="@{0}".format(d_version),
)
resource(
name="hsa-runtime",
placement="hsa-runtime",
- git="https://github.com/RadeonOpenCompute/ROCR-Runtime.git",
+ git="https://github.com/ROCm/ROCR-Runtime.git",
branch="master",
when="@master",
)
for d_version, d_shasum in [
+ ("6.0.0", "04353d27a512642a5e5339532a39d0aabe44e0964985de37b150a2550385800a"),
("5.7.1", "3b9433b4a0527167c3e9dfc37a3c54e0550744b8d4a8e1be298c8d4bcedfee7c"),
("5.7.0", "e234bcb93d602377cfaaacb59aeac5796edcd842a618162867b7e670c3a2c42c"),
("5.6.1", "0a85d84619f98be26ca7a32c71f94ed3c4e9866133789eabb451be64ce739300"),
@@ -265,14 +269,14 @@ class LlvmAmdgpu(CMakePackage):
resource(
name="comgr",
placement="comgr",
- url=f"https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/archive/rocm-{d_version}.tar.gz",
+ url=f"https://github.com/ROCm/ROCm-CompilerSupport/archive/rocm-{d_version}.tar.gz",
sha256=d_shasum,
when="@{0}".format(d_version),
)
resource(
name="comgr",
placement="comgr",
- git="https://github.com/RadeonOpenCompute/ROCm-CompilerSupport.git",
+ git="https://github.com/ROCm/ROCm-CompilerSupport.git",
branch="amd-stg-open",
when="@master",
)
diff --git a/var/spack/repos/builtin/packages/magma/0001-fix-magma-build-error-with-rocm-6.0.0.patch b/var/spack/repos/builtin/packages/magma/0001-fix-magma-build-error-with-rocm-6.0.0.patch
new file mode 100644
index 0000000000..accc271419
--- /dev/null
+++ b/var/spack/repos/builtin/packages/magma/0001-fix-magma-build-error-with-rocm-6.0.0.patch
@@ -0,0 +1,99 @@
+From 4f7d9ff22996ba3000ee344a0f84f73c27257f47 Mon Sep 17 00:00:00 2001
+From: sreenivasa murthy kolam <sreenivasamurthy.kolam@amd.com>
+Date: Wed, 17 Jan 2024 11:44:32 +0000
+Subject: [PATCH] Fix Build Failure with rocm-6.0.0 . Add extra parameter for
+ hipblasZtrmm(),hipblasCtrmm()etc
+
+---
+ interface_hip/blas_c_v2.cpp | 3 ++-
+ interface_hip/blas_d_v2.cpp | 3 ++-
+ interface_hip/blas_s_v2.cpp | 3 ++-
+ interface_hip/blas_z_v2.cpp | 3 ++-
+ interface_hip/interface.cpp | 5 ++---
+ 5 files changed, 10 insertions(+), 7 deletions(-)
+
+diff --git a/interface_hip/blas_c_v2.cpp b/interface_hip/blas_c_v2.cpp
+index 6147857..a406faf 100644
+--- a/interface_hip/blas_c_v2.cpp
++++ b/interface_hip/blas_c_v2.cpp
+@@ -1858,7 +1858,8 @@ magma_ctrmm(
+ hipblas_diag_const( diag ),
+ int(m), int(n),
+ (hipblasComplex*)&alpha, (const hipblasComplex*)dA, int(ldda),
+- (hipblasComplex*)dB, int(lddb) );
++ (hipblasComplex*)dB, int(lddb),
++ (hipblasComplex*)dB, int(lddb) ); /* C same as B; less efficient */
+ #else
+ hipblasCtrmm(
+ queue->hipblas_handle(),
+diff --git a/interface_hip/blas_d_v2.cpp b/interface_hip/blas_d_v2.cpp
+index 340f0b2..8c1ecd4 100644
+--- a/interface_hip/blas_d_v2.cpp
++++ b/interface_hip/blas_d_v2.cpp
+@@ -1858,7 +1858,8 @@ magma_dtrmm(
+ hipblas_diag_const( diag ),
+ int(m), int(n),
+ (double*)&alpha, (const double*)dA, int(ldda),
+- (double*)dB, int(lddb) );
++ (double*)dB, int(lddb),
++ (double*)dB, int(lddb) ); /* C same as B; less efficient */
+ #else
+ hipblasDtrmm(
+ queue->hipblas_handle(),
+diff --git a/interface_hip/blas_s_v2.cpp b/interface_hip/blas_s_v2.cpp
+index 87aeba3..a2cfc02 100644
+--- a/interface_hip/blas_s_v2.cpp
++++ b/interface_hip/blas_s_v2.cpp
+@@ -1858,7 +1858,8 @@ magma_strmm(
+ hipblas_diag_const( diag ),
+ int(m), int(n),
+ (float*)&alpha, (const float*)dA, int(ldda),
+- (float*)dB, int(lddb) );
++ (float*)dB, int(lddb),
++ (float*)dB, int(lddb) ); /* C same as B; less efficient */
+ #else
+ hipblasStrmm(
+ queue->hipblas_handle(),
+diff --git a/interface_hip/blas_z_v2.cpp b/interface_hip/blas_z_v2.cpp
+index 3c7e87a..eb9e2e6 100644
+--- a/interface_hip/blas_z_v2.cpp
++++ b/interface_hip/blas_z_v2.cpp
+@@ -1858,7 +1858,8 @@ magma_ztrmm(
+ hipblas_diag_const( diag ),
+ int(m), int(n),
+ (hipblasDoubleComplex*)&alpha, (const hipblasDoubleComplex*)dA, int(ldda),
+- (hipblasDoubleComplex*)dB, int(lddb) );
++ (hipblasDoubleComplex*)dB, int(lddb),
++ (hipblasDoubleComplex*)dB, int(lddb) ); /* C same as B; less efficient */
+ #else
+ hipblasZtrmm(
+ queue->hipblas_handle(),
+diff --git a/interface_hip/interface.cpp b/interface_hip/interface.cpp
+index 2b35b34..7c76426 100644
+--- a/interface_hip/interface.cpp
++++ b/interface_hip/interface.cpp
+@@ -209,11 +209,10 @@ magma_init()
+ else {
+ g_magma_devices[dev].memory = prop.totalGlobalMem;
+ g_magma_devices[dev].shmem_block = prop.sharedMemPerBlock;
+- #ifdef MAGMA_HAVE_CUDA
+ g_magma_devices[dev].cuda_arch = prop.major*100 + prop.minor*10;
++ #ifdef MAGMA_HAVE_CUDA
+ g_magma_devices[dev].shmem_multiproc = prop.sharedMemPerMultiprocessor;
+ #elif defined(MAGMA_HAVE_HIP)
+- g_magma_devices[dev].cuda_arch = prop.gcnArch;
+ g_magma_devices[dev].shmem_multiproc = prop.maxSharedMemoryPerMultiProcessor;
+ #endif
+
+@@ -464,7 +463,7 @@ magma_print_environment()
+ prop.name,
+ prop.clockRate / 1000.,
+ prop.totalGlobalMem / (1024.*1024.),
+- prop.gcnArch );
++ prop.gcnArchName );
+ #endif
+ }
+
+--
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/magma/package.py b/var/spack/repos/builtin/packages/magma/package.py
index 26ed916e72..585f360a7b 100644
--- a/var/spack/repos/builtin/packages/magma/package.py
+++ b/var/spack/repos/builtin/packages/magma/package.py
@@ -78,6 +78,7 @@ class Magma(CMakePackage, CudaPackage, ROCmPackage):
patch("magma-2.5.0.patch", when="@2.5.0")
patch("magma-2.5.0-cmake.patch", when="@2.5.0")
patch("cmake-W.patch", when="@2.5.0:%nvhpc")
+ patch("0001-fix-magma-build-error-with-rocm-6.0.0.patch", when="@2.7.2 ^hip@6.0.0 + rocm")
@run_before("cmake")
def generate_gpu_config(self):
@@ -146,7 +147,7 @@ class Magma(CMakePackage, CudaPackage, ROCmPackage):
if "+rocm" in spec:
options.append(define("MAGMA_ENABLE_HIP", True))
options.append(define("CMAKE_CXX_COMPILER", spec["hip"].hipcc))
- # See https://github.com/ROCmSoftwarePlatform/rocFFT/issues/322
+ # See https://github.com/ROCm/rocFFT/issues/322
if spec.satisfies("^cmake@3.21.0:3.21.2"):
options.append(define("__skip_rocmclang", True))
else:
diff --git a/var/spack/repos/builtin/packages/mfem/mfem-hip.patch b/var/spack/repos/builtin/packages/mfem/mfem-hip.patch
new file mode 100644
index 0000000000..565bae348c
--- /dev/null
+++ b/var/spack/repos/builtin/packages/mfem/mfem-hip.patch
@@ -0,0 +1,24 @@
+From 93ab69cac72cc2d13cfd4b7efcc235bdbca2b9f5 Mon Sep 17 00:00:00 2001
+From: Afzal Patel <afzal.patel@amd.com>
+Date: Wed, 17 Jan 2024 11:44:18 -0800
+Subject: [PATCH] Add hip library path to ghv flags so libamdhip64 can be found
+
+---
+ config/makefile | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/config/makefile b/config/makefile
+index 627d117..a453865 100644
+--- a/config/makefile
++++ b/config/makefile
+@@ -38,7 +38,7 @@ all: header config-mk
+ MPI = $(MFEM_USE_MPI:NO=)
+ GHV_CXX ?= $(MFEM_CXX)
+ GHV = get_hypre_version
+-GHV_FLAGS = $(subst @MFEM_DIR@,$(if $(MFEM_DIR),$(MFEM_DIR),..),$(HYPRE_OPT))
++GHV_FLAGS = $(subst @MFEM_DIR@,$(if $(MFEM_DIR),$(MFEM_DIR),..),$(HYPRE_OPT)) $(HIP_LIB)
+ SMX = $(if $(MFEM_USE_PUMI:NO=),MFEM_USE_SIMMETRIX)
+ SMX_PATH = $(PUMI_DIR)/include/gmi_sim.h
+ SMX_FILE = $(subst @MFEM_DIR@,$(if $(MFEM_DIR),$(MFEM_DIR),..),$(SMX_PATH))
+--
+2.25.1 \ No newline at end of file
diff --git a/var/spack/repos/builtin/packages/mfem/package.py b/var/spack/repos/builtin/packages/mfem/package.py
index 618b397181..ddd7be363d 100644
--- a/var/spack/repos/builtin/packages/mfem/package.py
+++ b/var/spack/repos/builtin/packages/mfem/package.py
@@ -480,6 +480,7 @@ class Mfem(Package, CudaPackage, ROCmPackage):
when="@4.6.0 +gslib+shared+miniapps",
sha256="2a31682d876626529e2778a216d403648b83b90997873659a505d982d0e65beb",
)
+ patch("mfem-hip.patch", when="+rocm ^hip@6.0:")
phases = ["configure", "build", "install"]
@@ -954,6 +955,7 @@ class Mfem(Package, CudaPackage, ROCmPackage):
options += ["HIP_CXX=%s" % spec["hip"].hipcc, "HIP_ARCH=%s" % amdgpu_target]
hip_headers = HeaderList([])
hip_libs = LibraryList([])
+ hip_libs += find_libraries("libamdhip64", spec["hip"].prefix.lib)
# To use a C++ compiler that supports -xhip flag one can use
# something like this:
# options += [
diff --git a/var/spack/repos/builtin/packages/migraphx/package.py b/var/spack/repos/builtin/packages/migraphx/package.py
index 1245a48109..efc4280521 100644
--- a/var/spack/repos/builtin/packages/migraphx/package.py
+++ b/var/spack/repos/builtin/packages/migraphx/package.py
@@ -11,9 +11,9 @@ from spack.package import *
class Migraphx(CMakePackage):
"""AMD's graph optimization engine."""
- homepage = "https://github.com/ROCmSoftwarePlatform/AMDMIGraphX"
- git = "https://github.com/ROCmSoftwarePlatform/AMDMIGraphX.git"
- url = "https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/AMDMIGraphX"
+ git = "https://github.com/ROCm/AMDMIGraphX.git"
+ url = "https://github.com/ROCm/AMDMIGraphX/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
@@ -21,6 +21,7 @@ class Migraphx(CMakePackage):
license("MIT")
+ version("6.0.0", sha256="7bb3f5011da9b1f3b79707b06118c523c1259215f650c2ffa5622a7e1d88868f")
version("5.7.1", sha256="3e58c043a5a7d1357ee05725fd6cd41e190b070f1ba57f61300128429902089c")
version("5.7.0", sha256="14f13554367d2d6490d66f8b5b739203225e7acce25085559e7c4acf29e2a4d5")
version("5.6.1", sha256="b108c33f07572ffd880b20f6de06f1934ab2a1b41ae69095612322ac412fa91c")
@@ -108,7 +109,7 @@ class Migraphx(CMakePackage):
)
def url_for_version(self, version):
- url = "https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/archive/"
+ url = "https://github.com/ROCm/AMDMIGraphX/archive/"
if version <= Version("3.5.0"):
url += "{0}.tar.gz".format(version)
else:
@@ -168,6 +169,7 @@ class Migraphx(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
depends_on("hip@" + ver, when="@" + ver)
@@ -175,7 +177,7 @@ class Migraphx(CMakePackage):
depends_on("rocblas@" + ver, when="@" + ver)
depends_on("miopen-hip@" + ver, when="@" + ver)
- for ver in ["5.7.0", "5.7.1"]:
+ for ver in ["5.7.0", "5.7.1", "6.0.0"]:
depends_on("composable-kernel@" + ver, when="@" + ver)
@property
diff --git a/var/spack/repos/builtin/packages/miopen-hip/package.py b/var/spack/repos/builtin/packages/miopen-hip/package.py
index ee3b78a5ff..8bafc28701 100644
--- a/var/spack/repos/builtin/packages/miopen-hip/package.py
+++ b/var/spack/repos/builtin/packages/miopen-hip/package.py
@@ -12,9 +12,9 @@ from spack.pkg.builtin.boost import Boost
class MiopenHip(CMakePackage):
"""AMD's library for high performance machine learning primitives."""
- homepage = "https://github.com/ROCmSoftwarePlatform/MIOpen"
- git = "https://github.com/ROCmSoftwarePlatform/MIOpen.git"
- url = "https://github.com/ROCmSoftwarePlatform/MIOpen/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/MIOpen"
+ git = "https://github.com/ROCm/MIOpen.git"
+ url = "https://github.com/ROCm/MIOpen/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
@@ -22,6 +22,7 @@ class MiopenHip(CMakePackage):
license("MIT")
+ version("6.0.0", sha256="a0718a48353be30ff98118ade511f0c1b454e394d8f934aefe7dd6946562b2e9")
version("5.7.1", sha256="912a658fe21ce6f1982b0f2ff251c3f7bb618f2e7e9876d983bcb54e3cd7129e")
version("5.7.0", sha256="5cd0b62254469e1c246d5890d2b78f8aedcf42cf8a327eabc1a391b83bcd14e1")
version("5.6.1", sha256="ff627d68ed9e52433a3c808b5d3ff179a398b77ce81b00cfea7b2c4da5162c6c")
@@ -124,7 +125,7 @@ class MiopenHip(CMakePackage):
patch("0001-Add-rocm-path-and-rocm-device-lib-path-flags.patch", when="@3.9.0:5.0.2")
patch("miopen-hip-include-nlohmann-include-directory.patch", when="@5.4.0:")
patch(
- "https://github.com/ROCmSoftwarePlatform/MIOpen/pull/2276/commits/f60aa1ff89f8fb596b4a6a4c70aa7d557803db87.patch?full_index=1",
+ "https://github.com/ROCm/MIOpen/pull/2276/commits/f60aa1ff89f8fb596b4a6a4c70aa7d557803db87.patch?full_index=1",
sha256="c777d9f4cd2bbfec632b38620c0f70bb0cce8da1",
when="@5.7:",
)
@@ -159,6 +160,7 @@ class MiopenHip(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
depends_on("hip@" + ver, when="@" + ver)
@@ -168,7 +170,7 @@ class MiopenHip(CMakePackage):
for ver in ["5.1.0", "5.1.3", "5.2.0", "5.2.1", "5.2.3", "5.3.0", "5.3.3"]:
depends_on("mlirmiopen@" + ver, when="@" + ver)
- for ver in ["5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("nlohmann-json", type="link")
depends_on("composable-kernel@" + ver, when="@" + ver)
for ver in ["5.4.0", "5.4.3", "5.5.0"]:
diff --git a/var/spack/repos/builtin/packages/miopen-opencl/package.py b/var/spack/repos/builtin/packages/miopen-opencl/package.py
index ec5eac8a96..5ec89b243d 100644
--- a/var/spack/repos/builtin/packages/miopen-opencl/package.py
+++ b/var/spack/repos/builtin/packages/miopen-opencl/package.py
@@ -12,9 +12,9 @@ from spack.pkg.builtin.boost import Boost
class MiopenOpencl(CMakePackage):
"""AMD's library for high performance machine learning primitives."""
- homepage = "https://github.com/ROCmSoftwarePlatform/MIOpen"
- git = "https://github.com/ROCmSoftwarePlatform/MIOpen.git"
- url = "https://github.com/ROCmSoftwarePlatform/MIOpen/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/MIOpen"
+ git = "https://github.com/ROCm/MIOpen.git"
+ url = "https://github.com/ROCm/MIOpen/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
diff --git a/var/spack/repos/builtin/packages/miopen-tensile/package.py b/var/spack/repos/builtin/packages/miopen-tensile/package.py
index 11dece2143..1d64b792d4 100644
--- a/var/spack/repos/builtin/packages/miopen-tensile/package.py
+++ b/var/spack/repos/builtin/packages/miopen-tensile/package.py
@@ -12,9 +12,9 @@ class MiopenTensile(CMakePackage):
"""MIOpenTensile provides host-callable interfaces to Tensile library.
MIOpenTensile supports one programming model: HIP"""
- homepage = "https://github.com/ROCmSoftwarePlatform/MIOpenTensile"
- git = "https://github.com/ROCmSoftwarePlatform/MIOpenTensile.git"
- url = "https://github.com/ROCmSoftwarePlatform/MIOpentensile/archive/rocm-5.0.0.tar.gz"
+ homepage = "https://github.com/ROCm/MIOpenTensile"
+ git = "https://github.com/ROCm/MIOpenTensile.git"
+ url = "https://github.com/ROCm/MIOpentensile/archive/rocm-5.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam")
@@ -72,7 +72,7 @@ class MiopenTensile(CMakePackage):
resource(
name="Tensile",
- git="https://github.com/ROCmSoftwarePlatform/Tensile.git",
+ git="https://github.com/ROCm/Tensile.git",
commit="9cbabb07f81e932b9c98bf5ae48fbd7fcef615cf",
when="@4.5.0:",
)
diff --git a/var/spack/repos/builtin/packages/miopengemm/package.py b/var/spack/repos/builtin/packages/miopengemm/package.py
index 937210ec77..e67185563e 100644
--- a/var/spack/repos/builtin/packages/miopengemm/package.py
+++ b/var/spack/repos/builtin/packages/miopengemm/package.py
@@ -12,9 +12,9 @@ class Miopengemm(CMakePackage):
"""An OpenCL general matrix multiplication (GEMM) API
and kernel generator"""
- homepage = "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM"
- git = "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM.git"
- url = "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/MIOpenGEMM"
+ git = "https://github.com/ROCm/MIOpenGEMM.git"
+ url = "https://github.com/ROCm/MIOpenGEMM/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
@@ -22,8 +22,8 @@ class Miopengemm(CMakePackage):
def url_for_version(self, version):
if version == Version("1.1.6"):
- return "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/archive/1.1.6.tar.gz"
- url = "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/archive/rocm-{0}.tar.gz"
+ return "https://github.com/ROCm/MIOpenGEMM/archive/1.1.6.tar.gz"
+ url = "https://github.com/ROCm/MIOpenGEMM/archive/rocm-{0}.tar.gz"
return url.format(version)
license("MIT")
diff --git a/var/spack/repos/builtin/packages/mivisionx/package.py b/var/spack/repos/builtin/packages/mivisionx/package.py
index 153469f16e..5e2549631f 100644
--- a/var/spack/repos/builtin/packages/mivisionx/package.py
+++ b/var/spack/repos/builtin/packages/mivisionx/package.py
@@ -13,7 +13,7 @@ class Mivisionx(CMakePackage):
homepage = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX"
git = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX.git"
- url = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/archive/rocm-5.5.0.tar.gz"
+ url = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/archive/rocm-6.0.0.tar.gz"
maintainers("srekolam", "renjithravindrankannath")
tags = ["rocm"]
@@ -27,6 +27,7 @@ class Mivisionx(CMakePackage):
license("MIT")
+ version("6.0.0", sha256="01324a12f21ea0e29a4d7d7c60498ba9231723569fedcdd90f28ddffb5e0570e")
version("5.7.1", sha256="bfc074bc32ebe84c72149ee6abb30b5b6499023d5b98269232de82e35d0505a8")
version("5.7.0", sha256="07e4ec8a8c06a9a8bb6394a043c9c3e7176acd3b462a16de91ef9518a64df9ba")
version("5.6.1", sha256="b2ff95c1488e244f379482631dae4f9ab92d94a513d180e03607aa1e184b5b0a")
@@ -369,6 +370,7 @@ class Mivisionx(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("miopen-hip@" + ver, when="@" + ver)
for ver in [
@@ -381,11 +383,12 @@ class Mivisionx(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("migraphx@" + ver, when="@" + ver)
depends_on("hip@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
depends_on("python@3.5:", type="build")
diff --git a/var/spack/repos/builtin/packages/mlirmiopen/package.py b/var/spack/repos/builtin/packages/mlirmiopen/package.py
index eeed27450d..7cfe466a83 100644
--- a/var/spack/repos/builtin/packages/mlirmiopen/package.py
+++ b/var/spack/repos/builtin/packages/mlirmiopen/package.py
@@ -10,9 +10,9 @@ from spack.package import *
class Mlirmiopen(CMakePackage):
"""Multi-Level Intermediate Representation for rocm miopen project."""
- homepage = "https://github.com/ROCmSoftwarePlatform/llvm-project-mlir"
- url = "https://github.com/ROCmSoftwarePlatform/llvm-project-mlir/archive/refs/tags/rocm-5.4.0.tar.gz"
- git = "https://github.com/ROCmSoftwarePlatform/llvm-project-mlir.git"
+ homepage = "https://github.com/ROCm/llvm-project-mlir"
+ url = "https://github.com/ROCm/llvm-project-mlir/archive/refs/tags/rocm-5.4.0.tar.gz"
+ git = "https://github.com/ROCm/llvm-project-mlir.git"
tags = ["rocm"]
maintainers("srekolam")
diff --git a/var/spack/repos/builtin/packages/petsc/Handle-hipsparse-api-changes-for-rocm-6.0.patch b/var/spack/repos/builtin/packages/petsc/Handle-hipsparse-api-changes-for-rocm-6.0.patch
new file mode 100644
index 0000000000..674c083f51
--- /dev/null
+++ b/var/spack/repos/builtin/packages/petsc/Handle-hipsparse-api-changes-for-rocm-6.0.patch
@@ -0,0 +1,70 @@
+From 3c9aaca12a1ae6000ff3cfd0564f7b2ab45396d2 Mon Sep 17 00:00:00 2001
+From: Renjith Ravindran <Renjith.RavindranKannath@amd.com>
+Date: Thu, 18 Jan 2024 07:38:25 +0000
+Subject: [PATCH] Handle the hipsparse api changes for rocm 6.0
+
+---
+ .../impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/src/mat/impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp b/src/mat/impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp
+index e6f878f..4bf52cd 100644
+--- a/src/mat/impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp
++++ b/src/mat/impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp
+@@ -1258,7 +1258,7 @@ static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
+ /* Solve L*y = b */
+ PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
+ PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
+- #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061) // i.e., 5.6.0
++ #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830)) // i.e., 5.6.0
+ PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
+ fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()!
+ #else
+@@ -1267,7 +1267,7 @@ static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
+ #endif
+ /* Solve U*x = y */
+ PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
+- #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061) // i.e., 5.6.0
++ #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830)) // i.e., 5.6.0
+ PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
+ fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
+ #else
+@@ -1316,7 +1316,7 @@ static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Ve
+ /* Solve Ut*y = b */
+ PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
+ PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
+- #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061) // i.e., 5.6.0
++ #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830)) // i.e., 5.6.0
+ PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
+ fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
+ #else
+@@ -1325,7 +1325,7 @@ static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Ve
+ #endif
+ /* Solve Lt*x = y */
+ PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
+- #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061) // i.e., 5.6.0
++ #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830)) // i.e., 5.6.0
+ PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
+ fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
+ #else
+@@ -1559,7 +1559,7 @@ static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
+ /* Solve L*y = b */
+ PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
+ PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
+- #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061) // i.e., 5.6.0
++ #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830)) // i.e., 5.6.0
+ PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
+ fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
+ #else
+@@ -1568,7 +1568,7 @@ static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
+ #endif
+ /* Solve Lt*x = y */
+ PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
+- #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061) // i.e., 5.6.0
++ #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830)) // i.e., 5.6.0
+ PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
+ fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
+ #else
+--
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/petsc/package.py b/var/spack/repos/builtin/packages/petsc/package.py
index 5a4c011002..67a872ea8b 100644
--- a/var/spack/repos/builtin/packages/petsc/package.py
+++ b/var/spack/repos/builtin/packages/petsc/package.py
@@ -21,7 +21,7 @@ class Petsc(Package, CudaPackage, ROCmPackage):
tags = ["e4s"]
version("main", branch="main")
-
+ version("3.20.3", sha256="75a94fb44df0512f51ad093fa784e56b61f51b7ead5956fbe49185c203f8c245")
version("3.20.2", sha256="2a2d08b5f0e3d0198dae2c42ce1fd036f25c153ef2bb4a2d320ca141ac7cd30b")
version("3.20.1", sha256="3d54f13000c9c8ceb13ca4f24f93d838319019d29e6de5244551a3ec22704f32")
version("3.20.0", sha256="c152ccb12cb2353369d27a65470d4044a0c67e0b69814368249976f5bb232bd4")
@@ -172,6 +172,9 @@ class Petsc(Package, CudaPackage, ROCmPackage):
)
patch("hip-5.6.0-for-3.18.diff", when="@3.18:3.19 ^hipsparse@5.6.0")
patch("hip-5.7-plus-for-3.18.diff", when="@3.18:3.19 ^hipsparse@5.7:")
+ patch(
+ "Handle-hipsparse-api-changes-for-rocm-6.0.patch", when="@3.20.2:3.20.3 ^hipsparse@6.0"
+ )
# 3.8.0 has a build issue with MKL - so list this conflict explicitly
conflicts("^intel-mkl", when="@3.8.0")
diff --git a/var/spack/repos/builtin/packages/raja/package.py b/var/spack/repos/builtin/packages/raja/package.py
index fb67631779..9bb463412f 100644
--- a/var/spack/repos/builtin/packages/raja/package.py
+++ b/var/spack/repos/builtin/packages/raja/package.py
@@ -114,6 +114,14 @@ class Raja(CachedCMakePackage, CudaPackage, ROCmPackage):
when="@:0.13.0 ^blt@0.4:",
)
+ # Backward compatibility is stopped from ROCm 6.0
+ # A future release will include the change from PR https://github.com/LLNL/RAJA/pull/1568
+ patch(
+ "https://github.com/LLNL/RAJA/commit/406eb8dee05a41eb32c421c375688a4863b60642.patch?full_index=1",
+ sha256="d9ce5ef038555cbccb330a9016b7be77e56ae0660583cba955dab9d0297a4b07",
+ when="^hip@6.0.0",
+ )
+
variant("openmp", default=True, description="Build OpenMP backend")
variant("shared", default=True, description="Build Shared Libs")
variant("plugins", default=False, description="Enable runtime plugins")
diff --git a/var/spack/repos/builtin/packages/rccl-tests/package.py b/var/spack/repos/builtin/packages/rccl-tests/package.py
index 18131077e4..a27bebac07 100644
--- a/var/spack/repos/builtin/packages/rccl-tests/package.py
+++ b/var/spack/repos/builtin/packages/rccl-tests/package.py
@@ -10,9 +10,9 @@ class RcclTests(MakefilePackage):
"""These tests check both the performance and the correctness of RCCL
operations. They can be compiled against RCCL."""
- homepage = "https://github.com/ROCmSoftwarePlatform/rccl-tests"
- git = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git"
- url = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git"
+ homepage = "https://github.com/ROCm/rccl-tests"
+ git = "https://github.com/ROCm/rccl-tests.git"
+ url = "https://github.com/ROCm/rccl-tests.git"
tags = ["rocm"]
maintainers("bvanessen")
diff --git a/var/spack/repos/builtin/packages/rccl/0004-Set-rocm-core-path-for-version-file.patch b/var/spack/repos/builtin/packages/rccl/0004-Set-rocm-core-path-for-version-file.patch
new file mode 100644
index 0000000000..fd03def3ee
--- /dev/null
+++ b/var/spack/repos/builtin/packages/rccl/0004-Set-rocm-core-path-for-version-file.patch
@@ -0,0 +1,13 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 5384287..ea6fd4b 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -121,7 +121,7 @@ message(STATUS "hipcc version: ${hipcc_version_string}")
+
+ ## Check for ROCm version
+ execute_process(
+- COMMAND bash "-c" "cat ${ROCM_PATH}/.info/version"
++ COMMAND bash "-c" "cat $ENV{ROCMCORE_PATH}/.info/version"
+ OUTPUT_VARIABLE rocm_version_string
+ )
+ string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" rocm_version_matches ${rocm_version_string})
diff --git a/var/spack/repos/builtin/packages/rccl/package.py b/var/spack/repos/builtin/packages/rccl/package.py
index 9b388d1a27..52519c0194 100644
--- a/var/spack/repos/builtin/packages/rccl/package.py
+++ b/var/spack/repos/builtin/packages/rccl/package.py
@@ -14,13 +14,14 @@ class Rccl(CMakePackage):
implementing all-reduce, all-gather, reduce, broadcast,
and reduce-scatter."""
- homepage = "https://github.com/ROCmSoftwarePlatform/rccl"
- git = "https://github.com/ROCmSoftwarePlatform/rccl.git"
- url = "https://github.com/ROCmSoftwarePlatform/rccl/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rccl"
+ git = "https://github.com/ROCm/rccl.git"
+ url = "https://github.com/ROCm/rccl/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
libraries = ["librccl"]
+ version("6.0.0", sha256="0496d5a5f2e48c92cd390ab318df31a53cf7ec590988c2574c9f3d99c38b0fa7")
version("5.7.1", sha256="fb4c1f0084196d1226ce8a726d0f012d3890b54508a06ca87bbda619be8b90b1")
version("5.7.0", sha256="4c2825a3e4323ef3c2f8855ef445c1a81cf1992fb37e3e8a07a50db354aa3954")
version("5.6.1", sha256="27ec6b86a1a329684d808f728c1fce134517ac8e6e7047689f95dbf8386c077e")
@@ -119,6 +120,7 @@ class Rccl(CMakePackage):
patch("0001-Fix-numactl-path-issue.patch", when="@3.7.0:4.3.2")
patch("0002-Fix-numactl-rocm-smi-path-issue.patch", when="@4.5.0:5.2.1")
patch("0003-Fix-numactl-rocm-smi-path-issue.patch", when="@5.2.3:5.6")
+ patch("0004-Set-rocm-core-path-for-version-file.patch", when="@6.0:")
depends_on("cmake@3.5:", type="build")
for ver in [
@@ -151,6 +153,7 @@ class Rccl(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
depends_on("hip@" + ver, when="@" + ver)
@@ -186,6 +189,7 @@ class Rccl(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("numactl@2:", when="@" + ver)
for ver in [
@@ -208,12 +212,14 @@ class Rccl(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocm-smi-lib@" + ver, when="@" + ver)
depends_on("chrpath", when="@5.3.0:")
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
+
depends_on("googletest@1.11.0:", when="@5.3:")
@classmethod
@@ -229,6 +235,7 @@ class Rccl(CMakePackage):
def setup_build_environment(self, env):
env.set("CXX", self.spec["hip"].hipcc)
+ env.set("ROCMCORE_PATH", self.spec["rocm-core"].prefix)
def cmake_args(self):
args = []
diff --git a/var/spack/repos/builtin/packages/rdc/package.py b/var/spack/repos/builtin/packages/rdc/package.py
index fbcb130fb2..f4466bc991 100644
--- a/var/spack/repos/builtin/packages/rdc/package.py
+++ b/var/spack/repos/builtin/packages/rdc/package.py
@@ -12,8 +12,8 @@ from spack.package import *
class Rdc(CMakePackage):
"""ROCm Data Center Tool"""
- homepage = "https://github.com/RadeonOpenCompute/rdc"
- url = "https://github.com/RadeonOpenCompute/rdc/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rdc"
+ url = "https://github.com/ROCm/rdc/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
@@ -21,13 +21,14 @@ class Rdc(CMakePackage):
def url_for_version(self, version):
if version == Version("3.9.0"):
- return "https://github.com/RadeonOpenCompute/rdc/archive/rdc_so_ver-0.3.tar.gz"
+ return "https://github.com/ROCm/rdc/archive/rdc_so_ver-0.3.tar.gz"
- url = "https://github.com/RadeonOpenCompute/rdc/archive/rocm-{0}.tar.gz"
+ url = "https://github.com/ROCm/rdc/archive/rocm-{0}.tar.gz"
return url.format(version)
license("MIT")
+ version("6.0.0", sha256="5e3847a919d5f7efe99d8d76c96e78401659eccd1fb234b1b8cb4304096d6e89")
version("5.7.1", sha256="5251eb3085f2019246b332e9552dfae1572cf64ddf58306b81cbe7108019ffee")
version("5.7.0", sha256="924e94f14f6390d7a6ff7863fb4e2085c1ff5f9c12b8bd46471eb31f001c4f14")
version("5.6.1", sha256="9e9f57cebbc5ae386a405957ed2c17344cdb42db5e1a71285f2c9bc09eea6519")
@@ -140,6 +141,7 @@ class Rdc(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocm-smi-lib@" + ver, type=("build", "link"), when="@" + ver)
@@ -161,10 +163,11 @@ class Rdc(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
def patch(self):
diff --git a/var/spack/repos/builtin/packages/rocalution/package.py b/var/spack/repos/builtin/packages/rocalution/package.py
index 103fcd7373..d04530e77b 100644
--- a/var/spack/repos/builtin/packages/rocalution/package.py
+++ b/var/spack/repos/builtin/packages/rocalution/package.py
@@ -17,9 +17,9 @@ class Rocalution(CMakePackage):
generic and flexible design that allows seamless integration with
other scientific software packages."""
- homepage = "https://github.com/ROCmSoftwarePlatform/rocALUTION"
- git = "https://github.com/ROCmSoftwarePlatform/rocALUTION.git"
- url = "https://github.com/ROCmSoftwarePlatform/rocALUTION/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocALUTION"
+ git = "https://github.com/ROCm/rocALUTION.git"
+ url = "https://github.com/ROCm/rocALUTION/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("cgmb", "srekolam", "renjithravindrankannath")
@@ -27,6 +27,7 @@ class Rocalution(CMakePackage):
license("MIT")
+ version("6.0.0", sha256="cabf37691b8db00c82bda49c7dcfaefd9b9067b7d097afa43b7a5f86c45bff99")
version("5.7.1", sha256="b95afa1285759843c5fea1ad6e1c1edf283922e0d448db03a3e1f42b6942bc24")
version("5.7.0", sha256="48232a0d1250debce89e39a233bd0b5d52324a2454c078b99c9d44965cbbc0e9")
version("5.6.1", sha256="7197b3617a0c91e90adaa32003c04d247a5f585d216e77493d20984ba215addb")
@@ -165,6 +166,7 @@ class Rocalution(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocprim@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocblas/package.py b/var/spack/repos/builtin/packages/rocblas/package.py
index 1012b89a17..854d897e5d 100644
--- a/var/spack/repos/builtin/packages/rocblas/package.py
+++ b/var/spack/repos/builtin/packages/rocblas/package.py
@@ -11,9 +11,9 @@ from spack.package import *
class Rocblas(CMakePackage):
"""Radeon Open Compute BLAS library"""
- homepage = "https://github.com/ROCmSoftwarePlatform/rocBLAS/"
- git = "https://github.com/ROCmSoftwarePlatform/rocBLAS.git"
- url = "https://github.com/ROCmSoftwarePlatform/rocBLAS/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocBLAS/"
+ git = "https://github.com/ROCm/rocBLAS.git"
+ url = "https://github.com/ROCm/rocBLAS/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie")
@@ -23,6 +23,7 @@ class Rocblas(CMakePackage):
version("develop", branch="develop")
version("master", branch="master")
+ version("6.0.0", sha256="befa4a75f1de0ea37f2358d4c2de5406d7bce671ca9936e2294b64d3b3bafb60")
version("5.7.1", sha256="2984a5ed0ea5a05d40996ee3fddecb24399cbe8ea3e4921fc254e54d8f52fe4f")
version("5.7.0", sha256="024edd98de9687ee5394badc4dd4c543eef4eb3f71c96ff64100705d851e1744")
version("5.6.1", sha256="73896ebd445162a69af97f9fd462684609b4e0cf617eab450cd4558b4a23941e")
@@ -131,8 +132,8 @@ class Rocblas(CMakePackage):
conflicts("amdgpu_target=gfx1012", when="@:4.2.1")
conflicts("amdgpu_target=gfx1030", when="@:4.2.1")
# https://reviews.llvm.org/D124866
- # https://github.com/ROCm-Developer-Tools/HIP/issues/2678
- # https://github.com/ROCm-Developer-Tools/hipamd/blob/rocm-5.2.x/include/hip/amd_detail/host_defines.h#L50
+ # https://github.com/ROCm/HIP/issues/2678
+ # https://github.com/ROCm/hipamd/blob/rocm-5.2.x/include/hip/amd_detail/host_defines.h#L50
conflicts("%gcc@12", when="@5.2")
depends_on("cmake@3.16.8:", type="build", when="@4.2.0:")
@@ -182,6 +183,7 @@ class Rocblas(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("llvm-amdgpu@" + ver, type="build", when="@" + ver)
@@ -232,10 +234,11 @@ class Rocblas(CMakePackage):
("@5.6.1", "7d0a9d040c3bbae893df7ecef6a19d9cd1c304aa"),
("@5.7.0", "97e0cfc2c8cb87a1e38901d99c39090dc4181652"),
("@5.7.1", "97e0cfc2c8cb87a1e38901d99c39090dc4181652"),
+ ("@6.0.0", "17df881bde80fc20f997dfb290f4bb4b0e05a7e9"),
]:
resource(
name="Tensile",
- git="https://github.com/ROCmSoftwarePlatform/Tensile.git",
+ git="https://github.com/ROCm/Tensile.git",
commit=t_commit,
when="{} +tensile".format(t_version),
)
@@ -243,12 +246,12 @@ class Rocblas(CMakePackage):
for ver in ["master", "develop"]:
resource(
name="Tensile",
- git="https://github.com/ROCmSoftwarePlatform/Tensile.git",
+ git="https://github.com/ROCm/Tensile.git",
branch=ver,
when="@{} +tensile".format(ver),
)
- # Status: https://github.com/ROCmSoftwarePlatform/Tensile/commit/a488f7dadba34f84b9658ba92ce9ec5a0615a087
+ # Status: https://github.com/ROCm/Tensile/commit/a488f7dadba34f84b9658ba92ce9ec5a0615a087
# Not yet landed in 3.7.0, nor 3.8.0.
patch("0001-Fix-compilation-error-with-StringRef-to-basic-string.patch", when="@:3.8")
patch("0002-Fix-rocblas-clients-blas.patch", when="@4.2.0:4.3.1")
@@ -256,7 +259,7 @@ class Rocblas(CMakePackage):
# Finding Python package and set command python as python3
patch("0004-Find-python.patch", when="@5.2.0:5.4")
patch("0006-Guard-use-of-OpenMP-to-make-it-optional-5.4.patch", when="@5.4")
- patch("0007-add-rocm-openmp-extras-include-dir.patch", when="@5.6:")
+ patch("0007-add-rocm-openmp-extras-include-dir.patch", when="@5.6:5.7")
def setup_build_environment(self, env):
env.set("CXX", self.spec["hip"].hipcc)
@@ -309,14 +312,14 @@ class Rocblas(CMakePackage):
# Restrict the number of jobs Tensile can spawn.
# If we don't specify otherwise, Tensile creates a job per available core,
# and that consumes a lot of system memory.
- # https://github.com/ROCmSoftwarePlatform/Tensile/blob/93e10678a0ced7843d9332b80bc17ebf9a166e8e/Tensile/Parallel.py#L38
+ # https://github.com/ROCm/Tensile/blob/93e10678a0ced7843d9332b80bc17ebf9a166e8e/Tensile/Parallel.py#L38
args.append(self.define("Tensile_CPU_THREADS", min(16, make_jobs)))
- # See https://github.com/ROCmSoftwarePlatform/rocBLAS/commit/c1895ba4bb3f4f5947f3818ebd155cf71a27b634
+ # See https://github.com/ROCm/rocBLAS/commit/c1895ba4bb3f4f5947f3818ebd155cf71a27b634
if "auto" not in self.spec.variants["amdgpu_target"]:
args.append(self.define_from_variant(arch_define_name, "amdgpu_target"))
- # See https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1196
+ # See https://github.com/ROCm/rocBLAS/issues/1196
if self.spec.satisfies("^cmake@3.21.0:3.21.2"):
args.append(self.define("__skip_rocmclang", "ON"))
diff --git a/var/spack/repos/builtin/packages/rocfft/package.py b/var/spack/repos/builtin/packages/rocfft/package.py
index 229dd4bdb0..815bb03132 100644
--- a/var/spack/repos/builtin/packages/rocfft/package.py
+++ b/var/spack/repos/builtin/packages/rocfft/package.py
@@ -11,16 +11,16 @@ from spack.package import *
class Rocfft(CMakePackage):
"""Radeon Open Compute FFT library"""
- homepage = "https://github.com/ROCmSoftwarePlatform/rocFFT/"
- git = "https://github.com/ROCmSoftwarePlatform/rocFFT.git"
- url = "https://github.com/ROCmSoftwarePlatform/rocfft/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocFFT/"
+ git = "https://github.com/ROCm/rocFFT.git"
+ url = "https://github.com/ROCm/rocfft/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie")
libraries = ["librocfft"]
license("MIT")
-
+ version("6.0.0", sha256="fb8ba56572702e77e4383d922cd1fee4ad3fa5f63a5ebdb3d9c354439a446992")
version("5.7.1", sha256="202f11f60dc8738e29bbd1b397d419e032794f8bffb7f48f2b31f09cc5f08bc2")
version("5.7.0", sha256="3c4a1537a6ec76dc9b622644fe3890647306bf9f28f61c5d2028259c31bb964f")
version("5.6.1", sha256="a65861e453587c3e6393da75b0b1976508c61f968aecda77fbec920fea48489e")
@@ -167,6 +167,7 @@ class Rocfft(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
@@ -178,6 +179,14 @@ class Rocfft(CMakePackage):
# Patch to add install prefix header location for sqlite for 5.4
patch("0004-fix-missing-sqlite-include-paths.patch", when="@5.4.0:5.5")
+ # Set LD_LIBRARY_PATH for executing the binaries from the build directory; fix missing type
+ # (https://github.com/ROCm/rocFFT/pull/449)
+ patch(
+ "https://github.com/ROCm/rocFFT/commit/0ec78f1daac2d7fa1415f4deff0d129252c1c9de.patch?full_index=1",
+ sha256="bac7873185ac60f2aaa50e278f0b8d52b4d79d586bf7f52db1da33559569ba54",
+ when="@6.0.0",
+ )
+
def setup_build_environment(self, env):
env.set("CXX", self.spec["hip"].hipcc)
@@ -214,7 +223,7 @@ class Rocfft(CMakePackage):
self.define_from_variant("AMDGPU_TARGETS_SRAM_ECC", "amdgpu_target_sram_ecc")
)
- # See https://github.com/ROCmSoftwarePlatform/rocFFT/issues/322
+ # See https://github.com/ROCm/rocFFT/issues/322
if self.spec.satisfies("^cmake@3.21.0:3.21.2"):
args.append(self.define("__skip_rocmclang", "ON"))
diff --git a/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py b/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py
index 27806866a4..ffb8f927f0 100644
--- a/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py
+++ b/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py
@@ -10,14 +10,15 @@ from spack.package import *
class RocmBandwidthTest(CMakePackage):
"""Test to measure PciE bandwidth on ROCm platforms"""
- homepage = "https://github.com/RadeonOpenCompute/rocm_bandwidth_test"
- git = "https://github.com/RadeonOpenCompute/rocm_bandwidth_test.git"
- url = "https://github.com/RadeonOpenCompute/rocm_bandwidth_test/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocm_bandwidth_test"
+ git = "https://github.com/ROCm/rocm_bandwidth_test.git"
+ url = "https://github.com/ROCm/rocm_bandwidth_test/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
version("master", branch="master")
+ version("6.0.0", sha256="9023401bd6a896059545b8e6263c6730afd89d7d45c0f5866261c300415532a6")
version("5.7.1", sha256="7426ef1e317b8293e4d6389673cfa8c63efb3f7d061e2f50a6f0b1b706e2a2a7")
version("5.7.0", sha256="fa95c28488ab4bb6d920b9f3c316554ca340f44c87ec2efb4cf8fa488e63ddd9")
version("5.6.1", sha256="849af715d08dfd89e7aa5e4453b624151db1cafaa567ab5fa36a77948b90bf0d")
@@ -136,12 +137,13 @@ class RocmBandwidthTest(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
"master",
]:
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
depends_on("hsakmt-roct@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
build_targets = ["package"]
diff --git a/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py b/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py
index 6961c15b80..aeca0c39a2 100644
--- a/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py
+++ b/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py
@@ -9,15 +9,16 @@ from spack.package import *
class RocmClangOcl(CMakePackage):
"""OpenCL compilation with clang compiler"""
- homepage = "https://github.com/RadeonOpenCompute/clang-ocl"
- git = "https://github.com/RadeonOpenCompute/clang-ocl.git"
- url = "https://github.com/RadeonOpenCompute/clang-ocl/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/clang-ocl"
+ git = "https://github.com/ROCm/clang-ocl.git"
+ url = "https://github.com/ROCm/clang-ocl/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
license("MIT")
maintainers("srekolam", "renjithravindrankannath")
version("master", branch="master")
+ version("6.0.0", sha256="74b5a64c32f3c57e7e4de638fffabbf448ecdb3dd8e65678b7ba0633352b4ca3")
version("5.7.1", sha256="32e4430d009cbbf5404ca9cbbb549b36897fa1826bc2285372e293cfe7531bf8")
version("5.7.0", sha256="c9ca80bfee674e740039256a846107373f1cf6554dc28398599976d8646a0392")
version("5.6.1", sha256="c41deb1b564d939fc897b2bbdb13570b2234fa4c052a39783f5ad2dd1052f901")
@@ -136,6 +137,7 @@ class RocmClangOcl(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
"master",
]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
@@ -145,7 +147,7 @@ class RocmClangOcl(CMakePackage):
depends_on(
"rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver)
)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
test_src_dir = "test"
diff --git a/var/spack/repos/builtin/packages/rocm-cmake/package.py b/var/spack/repos/builtin/packages/rocm-cmake/package.py
index c14999a989..a5cbb03c5b 100644
--- a/var/spack/repos/builtin/packages/rocm-cmake/package.py
+++ b/var/spack/repos/builtin/packages/rocm-cmake/package.py
@@ -11,9 +11,9 @@ class RocmCmake(CMakePackage):
"""rocm-cmake provides CMake modules for common build tasks
in the ROCm software stack"""
- homepage = "https://github.com/RadeonOpenCompute/rocm-cmake"
- git = "https://github.com/RadeonOpenCompute/rocm-cmake.git"
- url = "https://github.com/RadeonOpenCompute/rocm-cmake/archive/rocm-5.6.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocm-cmake"
+ git = "https://github.com/ROCm/rocm-cmake.git"
+ url = "https://github.com/ROCm/rocm-cmake/archive/rocm-5.6.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
@@ -21,6 +21,7 @@ class RocmCmake(CMakePackage):
license("MIT")
version("master", branch="master")
+ version("6.0.0", sha256="82bd97ba23d1883ef38bb667e92f7367fedc50d6c11c82f54cced4ab04b0412d")
version("5.7.1", sha256="4a4c6aa09576ccb834f869bdcb49e98cc0f0bac3678b802358065d1179a9d6f1")
version("5.7.0", sha256="93b98144201a1143eeca32744a9927d063f4685189f132ba52a6f3bba158a86b")
version("5.6.1", sha256="98bf5fe2e6e12f55d122807d0060f1bb19c80d63d2c2f6fee579c40bfd244fa6")
@@ -110,7 +111,7 @@ class RocmCmake(CMakePackage):
depends_on("cmake@3:", type="build")
depends_on("cmake@3.6:", type="build", when="@4.1.0:")
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
test_src_dir = "test"
diff --git a/var/spack/repos/builtin/packages/rocm-core/package.py b/var/spack/repos/builtin/packages/rocm-core/package.py
index 9d6bca46b6..54c1a526dd 100644
--- a/var/spack/repos/builtin/packages/rocm-core/package.py
+++ b/var/spack/repos/builtin/packages/rocm-core/package.py
@@ -12,8 +12,8 @@ class RocmCore(CMakePackage):
It also provides the Lmod modules files for the ROCm release.
getROCmVersion function provides the ROCm version."""
- homepage = "https://github.com/RadeonOpenCompute/rocm-core"
- url = "https://github.com/RadeonOpenCompute/rocm-core/archive/refs/tags/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocm-core"
+ url = "https://github.com/ROCm/rocm-core/archive/refs/tags/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
@@ -21,6 +21,7 @@ class RocmCore(CMakePackage):
license("MIT")
+ version("6.0.0", sha256="d950ee4b63336f34579b6e1dda2d05966b7afa9c84bcdc13874991d1147dc788")
version("5.7.1", sha256="fc4915019ddfd126e8ef6a15006bce3aa7bd5fd11dc8eb04ce2ee6bdf9c6ae7f")
version("5.7.0", sha256="722689bfec46c35f5428a41c5aacfc31efec2294fc3b0112861c562f8a71ac93")
version("5.6.1", sha256="eeef75e16e05380ccbc8df17a02dc141a66dddaadb444a97f7278f78067c498c")
diff --git a/var/spack/repos/builtin/packages/rocm-dbgapi/package.py b/var/spack/repos/builtin/packages/rocm-dbgapi/package.py
index 92b4ec72a9..d068de3456 100644
--- a/var/spack/repos/builtin/packages/rocm-dbgapi/package.py
+++ b/var/spack/repos/builtin/packages/rocm-dbgapi/package.py
@@ -14,9 +14,9 @@ class RocmDbgapi(CMakePackage):
control of the execution and inspection of execution state of
AMD's commercially available GPU architectures."""
- homepage = "https://github.com/ROCm-Developer-Tools/ROCdbgapi"
- git = "https://github.com/ROCm-Developer-Tools/ROCdbgapi.git"
- url = "https://github.com/ROCm-Developer-Tools/ROCdbgapi/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/ROCdbgapi"
+ git = "https://github.com/ROCm/ROCdbgapi.git"
+ url = "https://github.com/ROCm/ROCdbgapi/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
@@ -25,6 +25,7 @@ class RocmDbgapi(CMakePackage):
license("MIT")
version("master", branch="amd-master")
+ version("6.0.0", sha256="4e823eba255e46b93aff05fd5938ef2a51693ffd74debebffc1aabfce613805c")
version("5.7.1", sha256="0ee9c2f083868849f2ea0cec7010e0270c27e7679ccbbadd12072cc0ef6c8a6f")
version("5.7.0", sha256="285ddded8e7f1981d8861ffc1cd7770b78129e4955da08ad55a4779945699716")
version("5.6.1", sha256="c7241bf94bdb97a4cf1befbf25b8c35720797710da6f6b5b9d6a4094c1bc9c8b")
@@ -144,12 +145,13 @@ class RocmDbgapi(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
"master",
]:
depends_on("hsa-rocr-dev@" + ver, type="build", when="@" + ver)
depends_on("comgr@" + ver, type=("build", "link"), when="@" + ver)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
@classmethod
diff --git a/var/spack/repos/builtin/packages/rocm-debug-agent/package.py b/var/spack/repos/builtin/packages/rocm-debug-agent/package.py
index 5f95ebf8e4..a397fb6f56 100644
--- a/var/spack/repos/builtin/packages/rocm-debug-agent/package.py
+++ b/var/spack/repos/builtin/packages/rocm-debug-agent/package.py
@@ -11,13 +11,14 @@ from spack.package import *
class RocmDebugAgent(CMakePackage):
"""Radeon Open Compute (ROCm) debug agent"""
- homepage = "https://github.com/ROCm-Developer-Tools/rocr_debug_agent"
- git = "https://github.com/ROCm-Developer-Tools/rocr_debug_agent.git"
- url = "https://github.com/ROCm-Developer-Tools/rocr_debug_agent/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocr_debug_agent"
+ git = "https://github.com/ROCm/rocr_debug_agent.git"
+ url = "https://github.com/ROCm/rocr_debug_agent/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
libraries = ["librocm-debug-agent"]
+ version("6.0.0", sha256="705be2c2bd0f5c7d1e286eb9b94045b2bd017ff323f07bca9aa7c81f2d168524")
version("5.7.1", sha256="3b8d2835935da98f41e7cfc5b808c596ac06dd705b9a07bb70283e002f8dea6a")
version("5.7.0", sha256="d9344ed02e82a01140f2162e901e6a519e5fee6b498e2f49417730ee2660c5c1")
version("5.6.1", sha256="d3b1d5d757489ed3cc66d351cec56b7b850aaa7ecf6a55b0350b89c3dee3153a")
@@ -105,7 +106,7 @@ class RocmDebugAgent(CMakePackage):
)
def url_for_version(self, version):
- url = "https://github.com/ROCm-Developer-Tools/rocr_debug_agent/archive/"
+ url = "https://github.com/ROCm/rocr_debug_agent/archive/"
if version <= Version("3.7.0"):
url += "roc-{0}.tar.gz".format(version)
else:
@@ -146,6 +147,7 @@ class RocmDebugAgent(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
depends_on("hsakmt-roct@" + ver, when="@" + ver)
@@ -179,14 +181,15 @@ class RocmDebugAgent(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocm-dbgapi@" + ver, when="@" + ver)
depends_on("hip@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
- # https://github.com/ROCm-Developer-Tools/rocr_debug_agent/pull/4
+ # https://github.com/ROCm/rocr_debug_agent/pull/4
patch("0001-Drop-overly-strict-Werror-flag.patch", when="@3.7.0:")
patch("0002-add-hip-architecture.patch", when="@3.9.0:")
diff --git a/var/spack/repos/builtin/packages/rocm-device-libs/package.py b/var/spack/repos/builtin/packages/rocm-device-libs/package.py
index b83682d120..6ba87f4dab 100644
--- a/var/spack/repos/builtin/packages/rocm-device-libs/package.py
+++ b/var/spack/repos/builtin/packages/rocm-device-libs/package.py
@@ -10,14 +10,15 @@ from spack.package import *
class RocmDeviceLibs(CMakePackage):
"""set of AMD specific device-side language runtime libraries"""
- homepage = "https://github.com/RadeonOpenCompute/ROCm-Device-Libs"
- git = "https://github.com/RadeonOpenCompute/ROCm-Device-Libs.git"
- url = "https://github.com/RadeonOpenCompute/ROCm-Device-Libs/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/ROCm-Device-Libs"
+ git = "https://github.com/ROCm/ROCm-Device-Libs.git"
+ url = "https://github.com/ROCm/ROCm-Device-Libs/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath", "haampie")
version("master", branch="amd-stg-open")
+ version("6.0.0", sha256="198df4550d4560537ba60ac7af9bde31d59779c8ec5d6309627f77a43ab6ef6f")
version("5.7.1", sha256="703de8403c0bd0d80f37c970a698f10f148daf144d34f982e4484d04f7c7bbef")
version("5.7.0", sha256="0f8780b9098573f1c456bdc84358de924dcf00604330770a383983e1775bf61e")
version("5.6.1", sha256="f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c")
@@ -146,11 +147,12 @@ class RocmDeviceLibs(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
"master",
]:
depends_on("llvm-amdgpu@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
def cmake_args(self):
diff --git a/var/spack/repos/builtin/packages/rocm-gdb/package.py b/var/spack/repos/builtin/packages/rocm-gdb/package.py
index 8c29704b29..5a7c06d8eb 100644
--- a/var/spack/repos/builtin/packages/rocm-gdb/package.py
+++ b/var/spack/repos/builtin/packages/rocm-gdb/package.py
@@ -11,13 +11,14 @@ class RocmGdb(AutotoolsPackage):
"""This is ROCmgdb, the ROCm source-level debugger for Linux,
based on GDB, the GNU source-level debugger."""
- homepage = "https://github.com/ROCm-Developer-Tools/ROCgdb/"
- url = "https://github.com/ROCm-Developer-Tools/ROCgdb/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/ROCgdb"
+ url = "https://github.com/ROCm/ROCgdb/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
license("LGPL-2.0-or-later")
maintainers("srekolam", "renjithravindrankannath")
+ version("6.0.0", sha256="0db4ab32ca729e69688cdb238df274ce5cf58b5cb2538584662cca4358708c2b")
version("5.7.1", sha256="5cd150b5796aea9d77efd43b89d30a34fa4125338179eb87c6053abcac9f3c62")
version("5.7.0", sha256="94fba57b2f17b593de61f7593b404fabc00b054d38567be57d12cf7654b7969a")
version("5.6.1", sha256="d2b40d4c5aa41a6ce2a84307627b30d16a458672e03e13f9d27c12f2dc3f21d6")
@@ -145,11 +146,12 @@ class RocmGdb(AutotoolsPackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocm-dbgapi@" + ver, type="link", when="@" + ver)
depends_on("comgr@" + ver, type="link", when="@" + ver)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
build_directory = "spack-build"
@@ -160,7 +162,7 @@ class RocmGdb(AutotoolsPackage):
# Distributor options
"--program-prefix=roc",
"--enable-64-bit-bfd",
- "--with-bugurl=https://github.com/ROCm-Developer-Tools/ROCgdb/issues",
+ "--with-bugurl=https://github.com/ROCm/ROCgdb/issues",
"--with-pkgversion=-ROCm",
"--enable-targets=x86_64-linux-gnu,amdgcn-amd-amdhsa",
"--disable-ld",
diff --git a/var/spack/repos/builtin/packages/rocm-opencl/package.py b/var/spack/repos/builtin/packages/rocm-opencl/package.py
index 9435c1a8ec..8aa0b0a391 100644
--- a/var/spack/repos/builtin/packages/rocm-opencl/package.py
+++ b/var/spack/repos/builtin/packages/rocm-opencl/package.py
@@ -12,8 +12,8 @@ from spack.package import *
class RocmOpencl(CMakePackage):
"""OpenCL: Open Computing Language on ROCclr"""
- homepage = "https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime"
- git = "https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git"
+ homepage = "https://github.com/ROCm/ROCm-OpenCL-Runtime"
+ git = "https://github.com/ROCm/ROCm-OpenCL-Runtime.git"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
@@ -146,9 +146,7 @@ class RocmOpencl(CMakePackage):
]:
resource(
name="rocclr",
- url="https://github.com/ROCm-Developer-Tools/ROCclr/archive/rocm-{0}.tar.gz".format(
- d_version
- ),
+ url="https://github.com/ROCm/ROCclr/archive/rocm-{0}.tar.gz".format(d_version),
sha256=d_shasum,
expand=True,
destination="",
diff --git a/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py b/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py
index 836698b92b..d23a487914 100644
--- a/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py
+++ b/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py
@@ -8,8 +8,8 @@ import re
from spack.package import *
-tools_url = "https://github.com/ROCm-Developer-Tools"
-compute_url = "https://github.com/RadeonOpenCompute"
+tools_url = "https://github.com/ROCm"
+compute_url = "https://github.com/ROCm"
# Arrays of hashes are in order of the versions array below
# For example array[0] = 3.9.0, array[1] = 3.10.0, etc.
@@ -41,6 +41,7 @@ aomp = [
"6c051bf7625f682ba3d2ea80b46a38ca2cbcd20f5d89ae3433602d3e7ef0403a",
"4f34fa02db410808c5e629f30f8804210b42c4ff7d31aa80606deaed43054c3c",
"ed7bbf92230b6535a353ed032a39a9f16e9987397798100392fc25e40c8a1a4e",
+ "1b2c0934ef16e17b2377944fae8c9b3db6dc64b7e43932ddfe2eeefdf6821410",
]
devlib = [
@@ -70,6 +71,7 @@ devlib = [
"f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c",
"0f8780b9098573f1c456bdc84358de924dcf00604330770a383983e1775bf61e",
"703de8403c0bd0d80f37c970a698f10f148daf144d34f982e4484d04f7c7bbef",
+ "198df4550d4560537ba60ac7af9bde31d59779c8ec5d6309627f77a43ab6ef6f",
]
llvm = [
@@ -99,6 +101,7 @@ llvm = [
"045e43c0c4a3f4f2f1db9fb603a4f1ea3d56e128147e19ba17909eb57d7f08e5",
"4abdf00b297a77c5886cedb37e63acda2ba11cb9f4c0a64e133b05800aadfcf0",
"6b54c422e45ad19c9bf5ab090ec21753e7f7d854ca78132c30eb146657b168eb",
+ "c673708d413d60ca8606ee75c77e9871b6953c59029c987b92f2f6e85f683626",
]
flang = [
@@ -128,6 +131,7 @@ flang = [
"5ebcbca2e03bd0686e677f44ea551e97bd9395c6b119f832fa784818733aa652",
"cc4f1973b1b8e7bcc4f09e3381bae4e1a2e51ea4e2598fc1b520ccb8bf24d28c",
"8fd618d81af092416b267c4d00c801731f7a00c0f8d4aedb795e52a4ec1bf183",
+ "fcb319ddb2aa3004a6ae60370ab4425f529336b1cee50f29200e697e61b53586",
]
extras = [
@@ -157,6 +161,7 @@ extras = [
"437e2017cfe2ab73b15ada0fc1ea88f794f0b108cc5410f457268ae7e4e8985a",
"be59433dd85d4b8f0eaff87e0cc424a814152c67f3a682d1343c4bd61dd49a0f",
"8060c6879708faf5f7d417b19a479dec9b7b9583a1b885f12d247faf831f7f0b",
+ "f37e1107e4da5b083e794244f3d0c9fd073ccb6fd6015e635349d8f0d679c4b8",
]
versions = [
@@ -186,6 +191,7 @@ versions = [
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]
versions_dict = dict() # type: Dict[str,Dict[str,str]]
components = ["aomp", "devlib", "llvm", "flang", "extras"]
@@ -203,12 +209,13 @@ class RocmOpenmpExtras(Package):
"""OpenMP support for ROCm LLVM."""
homepage = tools_url + "/aomp"
- url = tools_url + "/aomp/archive/rocm-5.5.0.tar.gz"
+ url = tools_url + "/aomp/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
license("Apache-2.0")
maintainers("srekolam", "renjithravindrankannath", "estewart08")
+ version("6.0.0", sha256=versions_dict["6.0.0"]["aomp"])
version("5.7.1", sha256=versions_dict["5.7.1"]["aomp"])
version("5.7.0", sha256=versions_dict["5.7.0"]["aomp"])
version("5.6.1", sha256=versions_dict["5.6.1"]["aomp"])
@@ -243,8 +250,8 @@ class RocmOpenmpExtras(Package):
depends_on("awk", type="build")
depends_on("elfutils", type=("build", "link"))
depends_on("libffi", type=("build", "link"))
- depends_on("libdrm", when="@5.7")
- depends_on("numactl", when="@5.7")
+ depends_on("libdrm", when="@5.7:6.0")
+ depends_on("numactl", when="@5.7:6.0")
for ver in [
"3.9.0",
@@ -273,13 +280,14 @@ class RocmOpenmpExtras(Package):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hsakmt-roct@" + ver, when="@" + ver)
depends_on("comgr@" + ver, when="@" + ver)
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
depends_on("llvm-amdgpu@{0} ~openmp".format(ver), when="@" + ver)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
# tag changed to 'rocm-' in 4.0.0
@@ -327,7 +335,7 @@ class RocmOpenmpExtras(Package):
placement="llvm-project",
when="@" + ver,
)
- patch("0001-Linking-hsakmt-libdrm-and-numactl-libraries.patch", when="@5.7")
+ patch("0001-Linking-hsakmt-libdrm-and-numactl-libraries.patch", when="@5.7:6.0")
def setup_run_environment(self, env):
devlibs_prefix = self.spec["llvm-amdgpu"].prefix
@@ -497,7 +505,7 @@ class RocmOpenmpExtras(Package):
devlibs_src = "{0}/rocm-openmp-extras/rocm-device-libs".format(src)
hsa_prefix = self.spec["hsa-rocr-dev"].prefix
hsakmt_prefix = self.spec["hsakmt-roct"].prefix
- if self.spec.satisfies("@5.7"):
+ if self.spec.satisfies("@5.7:6.0"):
libdrm_prefix = self.spec["libdrm"].prefix
numactl_prefix = self.spec["numactl"].prefix
comgr_prefix = self.spec["comgr"].prefix
@@ -576,7 +584,7 @@ class RocmOpenmpExtras(Package):
"-DCMAKE_CXX_FLAGS=-isystem{0} -I{1}".format(elfutils_inc, ffi_inc),
"-DNEW_BC_PATH=1",
]
- if self.spec.satisfies("@5.7"):
+ if self.spec.satisfies("@5.7:6.0"):
openmp_common_args += [
"-DLIBDRM_LIB={0}/lib".format(libdrm_prefix),
"-DHSAKMT_INC_PATH={0}/include".format(hsakmt_prefix),
diff --git a/var/spack/repos/builtin/packages/rocm-smi-lib/package.py b/var/spack/repos/builtin/packages/rocm-smi-lib/package.py
index fdd2bf216c..23af4a7653 100644
--- a/var/spack/repos/builtin/packages/rocm-smi-lib/package.py
+++ b/var/spack/repos/builtin/packages/rocm-smi-lib/package.py
@@ -16,15 +16,16 @@ class RocmSmiLib(CMakePackage):
"""It is a C library for Linux that provides a user space interface
for applications to monitor and control GPU applications."""
- homepage = "https://github.com/RadeonOpenCompute/rocm_smi_lib"
- git = "https://github.com/RadeonOpenCompute/rocm_smi_lib.git"
- url = "https://github.com/RadeonOpenCompute/rocm_smi_lib/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocm_smi_lib"
+ git = "https://github.com/ROCm/rocm_smi_lib.git"
+ url = "https://github.com/ROCm/rocm_smi_lib/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
libraries = ["librocm_smi64"]
version("master", branch="master")
+ version("6.0.0", sha256="0053b42402fd007e5ca9b3186c70f2c6f1b3026558f328722adadc2838c51309")
version("5.7.1", sha256="4d79cb0482b2f801cc7824172743e3dd2b44b9f6784d1ca2e5067f2fbb4ef803")
version("5.7.0", sha256="a399db3d9fc113ce2dd1ab5608a1cf9129ec4b6a2a79ab7922b1d9f43c454640")
version("5.6.1", sha256="9e94f9a941202c3d7ce917fd1cd78c4e0f06f48d6c929f3aa916378ccef1e02c")
@@ -116,7 +117,7 @@ class RocmSmiLib(CMakePackage):
depends_on("cmake@3:", type="build")
depends_on("python@3:", type=("build", "run"), when="@3.9.0:")
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
patch("disable_pdf_generation_with_doxygen_and_latex.patch", when="@4.5.2:5.6")
diff --git a/var/spack/repos/builtin/packages/rocm-smi/package.py b/var/spack/repos/builtin/packages/rocm-smi/package.py
index 0cc265c849..4e927b1f01 100644
--- a/var/spack/repos/builtin/packages/rocm-smi/package.py
+++ b/var/spack/repos/builtin/packages/rocm-smi/package.py
@@ -14,11 +14,11 @@ class RocmSmi(MakefilePackage):
management of your ROCm enabled system
Note: After ROCm 3.9, this project moved to
- https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/master/python_smi_tools
+ https://github.com/ROCm/rocm_smi_lib/tree/master/python_smi_tools
The spack package is called: rocm-smi-lib"""
- homepage = "https://github.com/RadeonOpenCompute/ROC-smi"
- url = "https://github.com/RadeonOpenCompute/ROC-smi/archive/rocm-4.1.0.tar.gz"
+ homepage = "https://github.com/ROCm/ROC-smi"
+ url = "https://github.com/ROCm/ROC-smi/archive/rocm-4.1.0.tar.gz"
maintainers("srekolam", "renjithravindrankannath")
tags = ["rocm"]
diff --git a/var/spack/repos/builtin/packages/rocm-tensile/package.py b/var/spack/repos/builtin/packages/rocm-tensile/package.py
index c92e4b34d6..8b869452cc 100644
--- a/var/spack/repos/builtin/packages/rocm-tensile/package.py
+++ b/var/spack/repos/builtin/packages/rocm-tensile/package.py
@@ -11,14 +11,15 @@ from spack.pkg.builtin.boost import Boost
class RocmTensile(CMakePackage):
"""Radeon Open Compute Tensile library"""
- homepage = "https://github.com/ROCmSoftwarePlatform/Tensile/"
- git = "https://github.com/ROCmSoftwarePlatform/Tensile.git"
- url = "https://github.com/ROCmSoftwarePlatform/Tensile/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/Tensile/"
+ git = "https://github.com/ROCm/Tensile.git"
+ url = "https://github.com/ROCm/Tensile/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
license("MIT")
maintainers("srekolam", "renjithravindrankannath", "haampie")
+ version("6.0.0", sha256="5d90add62d1439b7daf0527316e950e454e5d8beefb4f723865fe9ab26c7aa42")
version("5.7.1", sha256="9211a51b23c22b7a79e4e494e8ff3c31e90bf21adb8cce260acc57891fb2c917")
version("5.7.0", sha256="fe2ae067c1c579f33d7a1e26da3fe6b4ed44befa08f9dfce2ceae586f184b816")
version("5.6.1", sha256="3e78c933563fade8781a1dca2079bff135af2f5d2c6eb0147797d2c1f24d006c")
@@ -166,6 +167,7 @@ class RocmTensile(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocm-cmake@" + ver, type="build", when="@" + ver)
depends_on("hip@" + ver, when="@" + ver)
@@ -187,6 +189,7 @@ class RocmTensile(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocm-openmp-extras@" + ver, when="@" + ver)
@@ -218,11 +221,12 @@ class RocmTensile(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocm-smi-lib@" + ver, type="build", when="@" + ver)
root_cmakelists_dir = "Tensile/Source"
- # Status: https://github.com/ROCmSoftwarePlatform/Tensile/commit/a488f7dadba34f84b9658ba92ce9ec5a0615a087
+ # Status: https://github.com/ROCm/Tensile/commit/a488f7dadba34f84b9658ba92ce9ec5a0615a087
# Not yet landed in 3.7.0, nor 3.8.0.
patch("0001-fix-compile-error.patch", when="@3.7.0:3.8.0")
patch("0002-require-openmp-when-tensile-use-openmp-is-on.patch", when="@3.9.0:4.0.0")
diff --git a/var/spack/repos/builtin/packages/rocm-validation-suite/009-replacing-rocm-path-with-package-path.patch b/var/spack/repos/builtin/packages/rocm-validation-suite/009-replacing-rocm-path-with-package-path.patch
new file mode 100644
index 0000000000..ae21de8c82
--- /dev/null
+++ b/var/spack/repos/builtin/packages/rocm-validation-suite/009-replacing-rocm-path-with-package-path.patch
@@ -0,0 +1,636 @@
+From 7bb26280b6da667573a581780f97856985b44e4e Mon Sep 17 00:00:00 2001
+From: Renjith Ravindran <Renjith.RavindranKannath@amd.com>
+Date: Fri, 12 Jan 2024 09:31:21 +0000
+Subject: [PATCH] Updating cmake with include and library path for spack
+
+---
+ CMakeLists.txt | 21 +++++++++++----------
+ babel.so/CMakeLists.txt | 18 +++++++++---------
+ cmake_modules/tests_unit.cmake | 3 ++-
+ edp.so/CMakeLists.txt | 6 +++---
+ gm.so/CMakeLists.txt | 6 +++---
+ gpup.so/CMakeLists.txt | 8 ++++----
+ gst.so/CMakeLists.txt | 10 +++++-----
+ iet.so/CMakeLists.txt | 6 +++---
+ mem.so/CMakeLists.txt | 6 +++---
+ pbqt.so/CMakeLists.txt | 6 +++---
+ pebb.so/CMakeLists.txt | 4 ++--
+ peqt.so/CMakeLists.txt | 6 +++---
+ perf.so/CMakeLists.txt | 8 ++++----
+ pesm.so/CMakeLists.txt | 8 ++++----
+ rcqt.so/CMakeLists.txt | 6 +++---
+ rvs/CMakeLists.txt | 15 ++++++++-------
+ rvs/tests.cmake | 6 ++++--
+ rvslib/CMakeLists.txt | 2 +-
+ smqt.so/CMakeLists.txt | 6 +++---
+ testif.so/CMakeLists.txt | 20 ++++++++++----------
+ 20 files changed, 88 insertions(+), 83 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index b25eca4..eeee55d 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -70,13 +70,14 @@ endif(rocblas_FOUND)
+ # variables since we will pass them as cmake params appropriately, and
+ # all find_packages relevant to this build will be in ROCM path hence appending it to CMAKE_PREFIX_PATH
+ set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCM install path")
+-set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "CMAKE installation directory")
+-set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Prefix used in built packages")
++set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
++set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
++set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+ list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}")
+-set(ROCR_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Runtime" FORCE)
+-set(ROCR_LIB_DIR "${ROCM_PATH}/lib" CACHE PATH "Contains library files exported by ROC Runtime" FORCE)
+-set(HIP_INC_DIR "${ROCM_PATH}" CACHE PATH "Contains header files exported by ROC Runtime")
+-set(ROCT_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Trunk" FORCE)
++set(ROCR_INC_DIR "${HSA_PATH}/include" CACHE PATH "Contains header files exported by ROC Runtime")
++set(ROCR_LIB_DIR "${HSA_PATH}/lib" CACHE PATH "Contains library files exported by ROC Runtime")
++set(HIP_INC_DIR "${HIP_PATH}" CACHE PATH "Contains header files exported by ROC Runtime")
++set(ROCT_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Trunk")
+
+ add_definitions(-DROCM_PATH="${ROCM_PATH}")
+ add_definitions(-DRVS_LIB_PATH="${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/rvs")
+@@ -420,8 +421,8 @@ if (RVS_ROCBLAS EQUAL 1)
+ set(ROCBLAS_INC_DIR "${CMAKE_BINARY_DIR}/rvs_rblas-src/build/release/rocblas-install")
+ set(ROCBLAS_LIB_DIR "${CMAKE_BINARY_DIR}/rvs_rblas-src/build/release/rocblas-install/lib/")
+ else()
+- set(ROCBLAS_INC_DIR "${ROCM_PATH}/include")
+- set(ROCBLAS_LIB_DIR "${ROCM_PATH}/lib")
++ set(ROCBLAS_INC_DIR "${ROCBLAS_DIR}/include")
++ set(ROCBLAS_LIB_DIR "${ROCBLAS_DIR}/lib")
+ endif()
+
+ if (RVS_ROCMSMI EQUAL 1)
+@@ -436,8 +437,8 @@ else()
+ set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
+ else()
+ message( STATUS "ROCBLAS REORG Enabled Version: ${RVS_ROCBLAS_VERSION_FLAT}" )
+- set(ROCM_SMI_INC_DIR "${ROCM_PATH}/include")
+- set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/lib")
++ set(ROCM_SMI_INC_DIR "${ROCM_SMI_DIR}/include")
++ set(ROCM_SMI_LIB_DIR "${ROCM_SMI_DIR}/lib")
+ endif()
+ endif()
+ set(ROCM_SMI_LIB "rocm_smi64" CACHE STRING "rocm_smi library name")
+diff --git a/babel.so/CMakeLists.txt b/babel.so/CMakeLists.txt
+index f163dae..fa85b38 100644
+--- a/babel.so/CMakeLists.txt
++++ b/babel.so/CMakeLists.txt
+@@ -107,13 +107,13 @@ set(HIP_HCC_LIB "amdhip64")
+ add_compile_options(-DRVS_ROCBLAS_VERSION_FLAT=${RVS_ROCBLAS_VERSION_FLAT})
+
+ # Determine Roc Runtime header files are accessible
+-if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime.h)
+- message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR})
++if(NOT EXISTS ${HIP_PATH}/include/hip/hip_runtime.h)
++ message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH})
+ RETURN()
+ endif()
+
+-if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime_api.h)
+- message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR})
++if(NOT EXISTS ${HIP_PATH}/include/hip/hip_runtime_api.h)
++ message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH})
+ RETURN()
+ endif()
+
+@@ -133,16 +133,16 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+
+
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
+- message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
++if(NOT EXISTS "${HIP_PATH}/lib/lib${HIP_HCC_LIB}.so")
++ message("ERROR: ROC Runtime libraries can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH})
+ RETURN()
+ endif()
+
+ ## define include directories
+-include_directories(./ ../ ${ROCR_INC_DIR} ${HIP_INC_DIR})
++include_directories(./ ../ ${HIP_PATH})
+
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${HIP_PATH}/lib/ ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
+
+@@ -154,7 +154,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB})
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB} ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/cmake_modules/tests_unit.cmake b/cmake_modules/tests_unit.cmake
+index e0e9f88..7321e0a 100644
+--- a/cmake_modules/tests_unit.cmake
++++ b/cmake_modules/tests_unit.cmake
+@@ -27,7 +27,7 @@
+ ## define additional unit testing include directories
+ include_directories(${UT_INC})
+ ## define additional unit testing lib directories
+-link_directories(${UT_LIB} ${RVS_LIB_DIR})
++link_directories(${UT_LIB} ${RVS_LIB_DIR} ${ROCM_SMI_LIB_DIR})
+
+ file(GLOB TESTSOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} test/test*.cpp )
+ #message ( "TESTSOURCES: ${TESTSOURCES}" )
+@@ -45,6 +45,7 @@ FOREACH(SINGLE_TEST ${TESTSOURCES})
+ )
+ target_link_libraries(${TEST_NAME}
+ ${UT_LINK_LIBS} rvslibut rvslib gtest_main gtest pthread pci
++ ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so
+ )
+ target_compile_definitions(${TEST_NAME} PUBLIC RVS_UNIT_TEST)
+ if(DEFINED tcd.${TEST_NAME})
+diff --git a/edp.so/CMakeLists.txt b/edp.so/CMakeLists.txt
+index 7dd34ea..7978abe 100644
+--- a/edp.so/CMakeLists.txt
++++ b/edp.so/CMakeLists.txt
+@@ -134,11 +134,11 @@ if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
+ endif()
+
+ ## define include directories
+-include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR})
++include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR} ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpciaccess.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpciaccess.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ ## define source files
+ set (SOURCES src/rvs_module.cpp src/action.cpp src/edp_worker.cpp )
+diff --git a/gm.so/CMakeLists.txt b/gm.so/CMakeLists.txt
+index d3caa84..73b83ce 100644
+--- a/gm.so/CMakeLists.txt
++++ b/gm.so/CMakeLists.txt
+@@ -118,11 +118,11 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+
+ ## define include directories
+-include_directories(./ ../ ${ROCM_SMI_INC_DIR})
++include_directories(./ ../ ${ROCM_SMI_INC_DIR} ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+ link_directories(${RVS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so librocm_smi64.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/worker.cpp)
+@@ -133,7 +133,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${ROCM_SMI_LIB})
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/gpup.so/CMakeLists.txt b/gpup.so/CMakeLists.txt
+index 43d337a..a234feb 100644
+--- a/gpup.so/CMakeLists.txt
++++ b/gpup.so/CMakeLists.txt
+@@ -109,11 +109,11 @@ else()
+ endif()
+
+ ## define include directories
+-include_directories(./ ../ include ../include)
++include_directories(./ ../ include ../include ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp)
+@@ -124,7 +124,7 @@ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/gst.so/CMakeLists.txt b/gst.so/CMakeLists.txt
+index fd346ce..cb8c4b6 100644
+--- a/gst.so/CMakeLists.txt
++++ b/gst.so/CMakeLists.txt
+@@ -137,17 +137,17 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+
+
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so")
+ message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+ RETURN()
+ endif()
+
+ ## define include directories
+-include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR})
++include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR} ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib/ ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/gst_worker.cpp)
+@@ -157,7 +157,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB})
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB} )
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/iet.so/CMakeLists.txt b/iet.so/CMakeLists.txt
+index a85ca98..252e565 100644
+--- a/iet.so/CMakeLists.txt
++++ b/iet.so/CMakeLists.txt
+@@ -140,7 +140,7 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+ endif()
+
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so")
+ message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+ RETURN()
+ endif()
+@@ -159,7 +159,7 @@ include_directories(./ ../ ${ROCM_SMI_INC_DIR} ${ROCBLAS_INC_DIR} ${ROCR_INC_DIR
+ # Add directories to look for library files to link
+ link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so librocm_smi64.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/iet_worker.cpp )
+
+@@ -168,7 +168,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB} ${ROCM_SMI_LIB})
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_INC_DIR}/lib/ ${HIP_HCC_LIB} ${ROCBLAS_LIB})
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/mem.so/CMakeLists.txt b/mem.so/CMakeLists.txt
+index 5133337..2462bbc 100644
+--- a/mem.so/CMakeLists.txt
++++ b/mem.so/CMakeLists.txt
+@@ -134,7 +134,7 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+
+
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so")
+ message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+ RETURN()
+ endif()
+@@ -143,9 +143,9 @@ endif()
+ include_directories(./ ../ ${ROCR_INC_DIR} ${HIP_INC_DIR})
+
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/rvs_memtest.cpp src/rvs_memworker.cpp)
+diff --git a/pbqt.so/CMakeLists.txt b/pbqt.so/CMakeLists.txt
+index 5ae675a..892b6ac 100644
+--- a/pbqt.so/CMakeLists.txt
++++ b/pbqt.so/CMakeLists.txt
+@@ -136,11 +136,11 @@ if(NOT EXISTS ${ROCR_LIB_DIR}/${CORE_RUNTIME_LIBRARY}.so)
+ endif()
+
+ ## define include directories
+-include_directories(./ ../ pci ${ROCR_INC_DIR})
++include_directories(./ ../ pci ${ROCR_INC_DIR} ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HSAKMT_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/action_run.cpp
+diff --git a/pebb.so/CMakeLists.txt b/pebb.so/CMakeLists.txt
+index c4e2964..7a6b368 100644
+--- a/pebb.so/CMakeLists.txt
++++ b/pebb.so/CMakeLists.txt
+@@ -139,9 +139,9 @@ endif()
+ ## define include directories
+ include_directories(./ ../ pci ${ROCR_INC_DIR})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HSAKMT_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR} )
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/action_run.cpp
+diff --git a/peqt.so/CMakeLists.txt b/peqt.so/CMakeLists.txt
+index ead507d..567358b 100644
+--- a/peqt.so/CMakeLists.txt
++++ b/peqt.so/CMakeLists.txt
+@@ -107,9 +107,9 @@ else()
+ endif()
+
+ ## define include directories
+-include_directories(./ ../)
++include_directories(./ ../ ${HSA_PATH})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${HSA_PATH}/lib/ ${HSAKMT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR} ${YAML_CPP_INCLUDE_DIRS})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslib libpci.so libm.so)
+
+@@ -121,7 +121,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/perf.so/CMakeLists.txt b/perf.so/CMakeLists.txt
+index 518dac9..02d2245 100644
+--- a/perf.so/CMakeLists.txt
++++ b/perf.so/CMakeLists.txt
+@@ -137,7 +137,7 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+
+
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so")
+ message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+ RETURN()
+ endif()
+@@ -145,9 +145,9 @@ endif()
+ ## define include directories
+ include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/perf_worker.cpp)
+@@ -157,7 +157,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB})
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/pesm.so/CMakeLists.txt b/pesm.so/CMakeLists.txt
+index 1f27f34..20a8bed 100644
+--- a/pesm.so/CMakeLists.txt
++++ b/pesm.so/CMakeLists.txt
+@@ -107,11 +107,11 @@ else()
+ endif()
+
+ ## define include directories
+-include_directories(./ ../ pci)
++include_directories(./ ../ pci ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so ${PROJECT_LINK_LIBS} ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/worker.cpp)
+@@ -121,7 +121,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/rcqt.so/CMakeLists.txt b/rcqt.so/CMakeLists.txt
+index c0099ab..8d92982 100644
+--- a/rcqt.so/CMakeLists.txt
++++ b/rcqt.so/CMakeLists.txt
+@@ -108,11 +108,11 @@ else()
+ endif()
+
+ ## define include directories
+-include_directories(./ ../)
++include_directories(./ ../ ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ASAN_LIB_PATH} ${HSAKMT_LIB_DIR} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib)
++set (PROJECT_LINK_LIBS rvslib ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ ## define source files
+ set(SOURCES
+diff --git a/rvs/CMakeLists.txt b/rvs/CMakeLists.txt
+index 527d474..76a5efd 100644
+--- a/rvs/CMakeLists.txt
++++ b/rvs/CMakeLists.txt
+@@ -113,21 +113,22 @@ else()
+ endif()
+
+ ## define include directories
+-include_directories(./ ../ ${YAML_CPP_INCLUDE_DIRS})
++include_directories(./ ../ ${YAML_INC_DIR})
+ ## define lib directories
+-link_directories(${CMAKE_CURRENT_BINARY_DIR} ${RVS_LIB_DIR} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${CMAKE_CURRENT_BINARY_DIR} ${RVS_LIB_DIR} ${RVS_LIB_DIR}/.. ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR} ${YAML_CPP_LIBRARIES} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR} )
+
+ ## additional libraries
+-set(ROCBLAS_LIB "rocblas")
+-set(ROC_THUNK_NAME "hsakmt")
+-set(CORE_RUNTIME_NAME "hsa-runtime")
++set(ROCBLAS_LIB "${ROCBLAS_LIB_DIR}/librocblas.so")
++set(ROC_THUNK_NAME "${HSAKMT_LIB_DIR}/libhsakmt.a")
++set(CORE_RUNTIME_NAME "${HSA_PATH}/lib/libhsa-runtime64.so")
++set(YAML_CPP_LIB "${YAML_INC_DIR}/../lib64/libyaml-cpp.a")
+ set(CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64")
+-set(PROJECT_LINK_LIBS libdl.so libpthread.so libpci.so ${YAML_CPP_LIBRARIES})
++set(PROJECT_LINK_LIBS libdl.so libpthread.so libpci.so)
+
+ ## define target
+ add_executable(${RVS_TARGET} src/rvs.cpp)
+ target_link_libraries(${RVS_TARGET} rvslib
+- ${ROCBLAS_LIB} ${ROCM_SMI_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${PROJECT_LINK_LIBS})
++ ${ROCBLAS_LIB} ${ROCM_SMI_LIB} ${ROC_THUNK_NAME} ${PROJECT_LINK_LIBS} ${CORE_RUNTIME_NAME} ${YAML_CPP_LIB})
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ install(TARGETS ${RVS_TARGET}
+diff --git a/rvs/tests.cmake b/rvs/tests.cmake
+index 38ae3fb..0d62675 100644
+--- a/rvs/tests.cmake
++++ b/rvs/tests.cmake
+@@ -41,7 +41,8 @@ link_directories(${RVS_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ROCT_LI
+ ## define target for "test-to-fail"
+ add_executable(${RVS_TARGET}fail src/rvs.cpp)
+ target_link_libraries(${RVS_TARGET}fail rvslib rvslibut ${PROJECT_LINK_LIBS}
+- ${ROCM_SMI_LIB} ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET})
++ ${ROCM_SMI_LIB} ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET}
++ ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ target_compile_definitions(${RVS_TARGET}fail PRIVATE RVS_INVERT_RETURN_STATUS)
+ set_target_properties(${RVS_TARGET}fail PROPERTIES
+@@ -187,7 +188,7 @@ add_test(NAME unit.ttf.rvs.config.noconfig
+ )
+
+ ## define include directories
+-include_directories(${UT_INC})
++include_directories(${UT_INC} ${YAML_INC_DIR})
+ ## define lib directories
+ link_directories(${UT_LIB} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ROCT_LIB_DIR})
+ ## additional libraries for unit tests
+@@ -211,6 +212,7 @@ FOREACH(SINGLE_TEST ${TESTSOURCES})
+ ${PROJECT_TEST_LINK_LIBS}
+ rvslib rvslibut gtest_main gtest pthread
+ ${ROCM_SMI_LIB} ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET}
++ ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so
+ )
+ add_dependencies(${TEST_NAME} rvs_gtest_target)
+
+diff --git a/rvslib/CMakeLists.txt b/rvslib/CMakeLists.txt
+index 8d29590..d52aee3 100644
+--- a/rvslib/CMakeLists.txt
++++ b/rvslib/CMakeLists.txt
+@@ -116,7 +116,7 @@ endif()
+
+ ## define include directories
+ include_directories(./ ../ ../rvs
+- ${ROCM_SMI_INC_DIR} ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR})
++ ${ROCM_SMI_INC_DIR} ${HIP_PATH} ${ROCBLAS_INC_DIR} ${YAML_INC_DIR})
+
+ link_directories(${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+
+diff --git a/smqt.so/CMakeLists.txt b/smqt.so/CMakeLists.txt
+index 042586f..0133c00 100644
+--- a/smqt.so/CMakeLists.txt
++++ b/smqt.so/CMakeLists.txt
+@@ -106,11 +106,11 @@ else()
+ endif()
+
+ ## define include directories
+-include_directories(./ ../ pci)
++include_directories(./ ../ pci ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp)
+diff --git a/testif.so/CMakeLists.txt b/testif.so/CMakeLists.txt
+index 4cba0f9..34b491e 100644
+--- a/testif.so/CMakeLists.txt
++++ b/testif.so/CMakeLists.txt
+@@ -108,11 +108,11 @@ endif()
+
+
+ ## define include directories
+-include_directories(./ ../ pci)
++include_directories(./ ../ pci ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+
+ ## define source files
+ ## set(SOURCES src/rvs_module.cpp src/action.cpp src/worker.cpp)
+@@ -124,7 +124,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if_methods.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+@@ -145,7 +145,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if0.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+@@ -166,7 +166,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if0_methods.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+@@ -187,7 +187,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if1.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+@@ -208,7 +208,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if1_methods.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+@@ -229,7 +229,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_fail_init.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+@@ -250,7 +250,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_fail_create_action.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+--
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/rocm-validation-suite/package.py b/var/spack/repos/builtin/packages/rocm-validation-suite/package.py
index 52e267f580..03b1c0d45e 100644
--- a/var/spack/repos/builtin/packages/rocm-validation-suite/package.py
+++ b/var/spack/repos/builtin/packages/rocm-validation-suite/package.py
@@ -15,13 +15,14 @@ class RocmValidationSuite(CMakePackage):
computing environment, enabled using the ROCm software stack on a
compatible platform."""
- homepage = "https://github.com/ROCm-Developer-Tools/ROCmValidationSuite"
- url = "https://github.com/ROCm-Developer-Tools/ROCmValidationSuite/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/ROCmValidationSuite"
+ url = "https://github.com/ROCm/ROCmValidationSuite/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
license("MIT")
maintainers("srekolam", "renjithravindrankannath")
+ version("6.0.0", sha256="a84e36b5e50e70ba033fb6bc6fa99da2e32bf7eaef2098df3164365a77a8f14c")
version("5.7.1", sha256="202f2b6e014bbbeec40af5d3ec630c042f09a61087a77bd70715d81044ea4d65")
version("5.7.0", sha256="f049b7786a220e9b6dfe099f17727dd0d9e41be9e680fe8309eae400cc5536ea")
version("5.6.1", sha256="d5e4100e2d07311dfa101563c15d026a8130442cdee8af9ef861832cd7866c0d")
@@ -122,9 +123,8 @@ class RocmValidationSuite(CMakePackage):
"007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.6.patch",
when="@5.6",
)
- patch(
- "008-correcting-library-and-include-path-WITHOUT-RVS-BUILD-TESTS.patch", when="@5.7.0:5.7"
- )
+ patch("008-correcting-library-and-include-path-WITHOUT-RVS-BUILD-TESTS.patch", when="@5.7")
+ patch("009-replacing-rocm-path-with-package-path.patch", when="@6.0")
depends_on("cmake@3.5:", type="build")
depends_on("zlib-api", type="link")
depends_on("yaml-cpp~shared")
@@ -165,6 +165,7 @@ class RocmValidationSuite(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocminfo@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocminfo/package.py b/var/spack/repos/builtin/packages/rocminfo/package.py
index 3d70c7024b..a71259914a 100644
--- a/var/spack/repos/builtin/packages/rocminfo/package.py
+++ b/var/spack/repos/builtin/packages/rocminfo/package.py
@@ -10,14 +10,15 @@ from spack.package import *
class Rocminfo(CMakePackage):
"""Radeon Open Compute (ROCm) Runtime rocminfo tool"""
- homepage = "https://github.com/RadeonOpenCompute/rocminfo"
- git = "https://github.com/RadeonOpenCompute/rocminfo.git"
- url = "https://github.com/RadeonOpenCompute/rocminfo/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocminfo"
+ git = "https://github.com/ROCm/rocminfo.git"
+ url = "https://github.com/ROCm/rocminfo/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath", "haampie")
version("master", branch="master")
+ version("6.0.0", sha256="bc29f1798644b6dea73895353dffada9db7366d0058274e587ebd3291a4d3844")
version("5.7.1", sha256="642dc2ec4254b3c30c43064e6690861486db820b25f4906ec78bdb47e68dcd0b")
version("5.7.0", sha256="a5a3c19513bf26f17f163a03ba5288c5c761619ef55f0cb9e15472771748b93e")
version("5.6.1", sha256="780b186ac7410a503eca1060f4bbc35db1b7b4d1d714d15c7534cd26d8af7b54")
@@ -136,12 +137,13 @@ class Rocminfo(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
"master",
]:
depends_on("hsakmt-roct@" + ver, when="@" + ver)
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
def cmake_args(self):
diff --git a/var/spack/repos/builtin/packages/rocmlir/package.py b/var/spack/repos/builtin/packages/rocmlir/package.py
index e7be5107d6..0c57ef3b4f 100644
--- a/var/spack/repos/builtin/packages/rocmlir/package.py
+++ b/var/spack/repos/builtin/packages/rocmlir/package.py
@@ -12,9 +12,9 @@ class Rocmlir(CMakePackage):
targetting AMD hardware. This generator is mainly used from MIOpen and MIGraphX,
but it can be used on a standalone basis."""
- homepage = "https://github.com/ROCmSoftwarePlatform/rocMLIR"
- git = "https://github.com/ROCmSoftwarePlatform/rocMLIR.git"
- url = "https://github.com/ROCmSoftwarePlatform/rocMLIR/archive/refs/tags/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocMLIR"
+ git = "https://github.com/ROCm/rocMLIR.git"
+ url = "https://github.com/ROCm/rocMLIR/archive/refs/tags/rocm-6.0.0.tar.gz"
maintainers("srekolam")
version("5.5.1", commit="8c29325e7e68e3248e863172bf0e7f97055d45ee")
diff --git a/var/spack/repos/builtin/packages/rocprim/package.py b/var/spack/repos/builtin/packages/rocprim/package.py
index a6fd4806c1..fc0e594d15 100644
--- a/var/spack/repos/builtin/packages/rocprim/package.py
+++ b/var/spack/repos/builtin/packages/rocprim/package.py
@@ -9,14 +9,15 @@ from spack.package import *
class Rocprim(CMakePackage):
"""Radeon Open Compute Parallel Primitives Library"""
- homepage = "https://github.com/ROCmSoftwarePlatform/rocPRIM"
- git = "https://github.com/ROCmSoftwarePlatform/rocPRIM.git"
- url = "https://github.com/ROCmSoftwarePlatform/rocPRIM/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocPRIM"
+ git = "https://github.com/ROCm/rocPRIM.git"
+ url = "https://github.com/ROCm/rocPRIM/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
license("MIT")
maintainers("cgmb", "srekolam", "renjithravindrankannath")
+ version("6.0.0", sha256="51f26c9f891a64c8db8df51d75d86d404d682092fd9d243e966ac6b2a6de381a")
version("5.7.1", sha256="15d820a0f61aed60efbba88b6efe6942878b02d912f523f9cf8f33a4583d6cd7")
version("5.7.0", sha256="a1bf94bbad13a0410b49476771270606d8a9d257188ee3ec3a37eee80540fe9b")
version("5.6.1", sha256="e9ec1b0039c07cf3096653a04224fe5fe755afc6ba000f6838b3a8bc84df27de")
@@ -147,6 +148,7 @@ class Rocprim(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("comgr@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocprofiler-dev/package.py b/var/spack/repos/builtin/packages/rocprofiler-dev/package.py
index d87dd3ad2d..b9375fd7ac 100644
--- a/var/spack/repos/builtin/packages/rocprofiler-dev/package.py
+++ b/var/spack/repos/builtin/packages/rocprofiler-dev/package.py
@@ -11,9 +11,9 @@ from spack.package import *
class RocprofilerDev(CMakePackage):
"""ROCPROFILER library for AMD HSA runtime API extension support"""
- homepage = "https://github.com/ROCm-Developer-Tools/rocprofiler"
- git = "https://github.com/ROCm-Developer-Tools/rocprofiler.git"
- url = "https://github.com/ROCm-Developer-Tools/rocprofiler/archive/refs/tags/rocm-5.4.3.tar.gz"
+ homepage = "https://github.com/ROCm/rocprofiler"
+ git = "https://github.com/ROCm/rocprofiler.git"
+ url = "https://github.com/ROCm/rocprofiler/archive/refs/tags/rocm-5.4.3.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
@@ -133,7 +133,7 @@ class RocprofilerDev(CMakePackage):
depends_on("roctracer-dev-api@" + ver, when="@" + ver)
depends_on("numactl", type="link", when="@4.3.1")
- # See https://github.com/ROCm-Developer-Tools/rocprofiler/pull/50
+ # See https://github.com/ROCm/rocprofiler/pull/50
patch("fix-includes.patch")
patch("0001-Continue-build-in-absence-of-aql-profile-lib.patch", when="@5.3:")
diff --git a/var/spack/repos/builtin/packages/rocrand/package.py b/var/spack/repos/builtin/packages/rocrand/package.py
index 775f1eee69..d83857f346 100644
--- a/var/spack/repos/builtin/packages/rocrand/package.py
+++ b/var/spack/repos/builtin/packages/rocrand/package.py
@@ -14,9 +14,9 @@ class Rocrand(CMakePackage):
"""The rocRAND project provides functions that generate
pseudo-random and quasi-random numbers."""
- homepage = "https://github.com/ROCmSoftwarePlatform/rocRAND"
- git = "https://github.com/ROCmSoftwarePlatform/rocRAND.git"
- url = "https://github.com/ROCmSoftwarePlatform/rocRAND/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocRAND"
+ git = "https://github.com/ROCm/rocRAND.git"
+ url = "https://github.com/ROCm/rocRAND/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("cgmb", "srekolam", "renjithravindrankannath")
@@ -26,6 +26,7 @@ class Rocrand(CMakePackage):
version("develop", branch="develop")
version("master", branch="master")
+ version("6.0.0", sha256="cee93231c088be524bb2cb0e6093ec47e62e61a55153486bebbc2ca5b3d49360")
version("5.7.1", sha256="885cd905bbd23d02ba8f3f87d5c0b79bc44bd020ea9af190f3959cf5aa33d07d")
version("5.7.0", sha256="d6053d986821e5cbc6cfec0778476efb1411ef943f11e7a8b973b1814a259dcf")
version("5.6.1", sha256="6bf71e687ffa0fcc1b00e3567dd43da4147a82390f1b2db5e6f1f594dee6066d")
@@ -149,7 +150,7 @@ class Rocrand(CMakePackage):
]:
resource(
name="hipRAND",
- git="https://github.com/ROCmSoftwarePlatform/hipRAND.git",
+ git="https://github.com/ROCm/hipRAND.git",
commit=d_commit,
destination="",
placement="hiprand",
@@ -157,7 +158,7 @@ class Rocrand(CMakePackage):
)
resource(
name="hipRAND",
- git="https://github.com/ROCmSoftwarePlatform/hipRAND.git",
+ git="https://github.com/ROCm/hipRAND.git",
branch="master",
destination="",
placement="hiprand",
@@ -165,7 +166,7 @@ class Rocrand(CMakePackage):
)
resource(
name="hipRAND",
- git="https://github.com/ROCmSoftwarePlatform/hipRAND.git",
+ git="https://github.com/ROCm/hipRAND.git",
branch="develop",
destination="",
placement="hiprand",
@@ -202,6 +203,7 @@ class Rocrand(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocsolver/package.py b/var/spack/repos/builtin/packages/rocsolver/package.py
index ea85a69965..576675a371 100644
--- a/var/spack/repos/builtin/packages/rocsolver/package.py
+++ b/var/spack/repos/builtin/packages/rocsolver/package.py
@@ -13,9 +13,9 @@ class Rocsolver(CMakePackage):
"""rocSOLVER is a work-in-progress implementation of a
subset of LAPACK functionality on the ROCm platform."""
- homepage = "https://github.com/ROCmSoftwarePlatform/rocSOLVER"
- git = "https://github.com/ROCmSoftwarePlatform/rocSOLVER.git"
- url = "https://github.com/ROCmSoftwarePlatform/rocSOLVER/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocSOLVER"
+ git = "https://github.com/ROCm/rocSOLVER.git"
+ url = "https://github.com/ROCm/rocSOLVER/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie")
@@ -41,6 +41,7 @@ class Rocsolver(CMakePackage):
version("develop", branch="develop")
version("master", branch="master")
+ version("6.0.0", sha256="5fcaba96f3efafc2ecc3f4ec104095d96545c16e1b9f95410bd571cb0fc643ae")
version("5.7.1", sha256="83e0c137b8690dbeb2e85d9e25415d96bd06979f09f2b10b2aff8e4c9f833fa4")
version("5.7.0", sha256="bb16d360f14b34fe6e8a6b8ddc6e631672a5ffccbdcb25f0ce319edddd7f9682")
version("5.6.1", sha256="6a8f366218aee599a0e56755030f94ee690b34f30e6d602748632226c5dc21bb")
@@ -136,7 +137,7 @@ class Rocsolver(CMakePackage):
depends_on("netlib-lapack@3.7.1:", type="test")
patch("link-clients-blas.patch", when="@4.3.0:4.3.2")
- # Backport https://github.com/ROCmSoftwarePlatform/rocSOLVER/commit/2bbfb8976f6e4d667499c77e41a6433850063e88
+ # Backport https://github.com/ROCm/rocSOLVER/commit/2bbfb8976f6e4d667499c77e41a6433850063e88
patch("fmt-8.1-compatibility.patch", when="@4.5.0:5.1.3")
# Maximize compatibility with other libraries that are using fmt.
patch("fmt-9-compatibility.patch", when="@5.2.0:5.5")
@@ -180,10 +181,11 @@ class Rocsolver(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocblas@" + ver, when="@" + ver)
- for ver in ["5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocsparse@5.2:", when="@5.6:")
for tgt in itertools.chain(["auto"], amdgpu_targets):
diff --git a/var/spack/repos/builtin/packages/rocsparse/package.py b/var/spack/repos/builtin/packages/rocsparse/package.py
index 98c02e8807..211afb0d36 100644
--- a/var/spack/repos/builtin/packages/rocsparse/package.py
+++ b/var/spack/repos/builtin/packages/rocsparse/package.py
@@ -15,9 +15,9 @@ class Rocsparse(CMakePackage):
and toolchains. rocSPARSE is created using the HIP programming
language and optimized for AMD's latest discrete GPUs."""
- homepage = "https://github.com/ROCmSoftwarePlatform/rocSPARSE"
- git = "https://github.com/ROCmSoftwarePlatform/rocSPARSE.git"
- url = "https://github.com/ROCmSoftwarePlatform/rocSPARSE/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocSPARSE"
+ git = "https://github.com/ROCm/rocSPARSE.git"
+ url = "https://github.com/ROCm/rocSPARSE/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("cgmb", "srekolam", "renjithravindrankannath")
@@ -34,7 +34,7 @@ class Rocsparse(CMakePackage):
variant("test", default=False, description="Build rocsparse-test client")
license("MIT")
-
+ version("6.0.0", sha256="bdc618677ec78830c6af315d61194d6ab8532345b8daeeb115aca96f274d4ca4")
version("5.7.1", sha256="4c09b182b371124675d4057246021b5ed45e2833fdbf265b37a9b06b668baf0a")
version("5.7.0", sha256="a42f0eb531b015b719e2bdcdff0cfb214e9894f73107966260f26931f982ecbc")
version("5.6.1", sha256="6a50a64354507f1374e1a86aa7f5c07d1aaa96ac193ac292c279153087bb5d54")
@@ -153,6 +153,7 @@ class Rocsparse(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocprim@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocthrust/package.py b/var/spack/repos/builtin/packages/rocthrust/package.py
index c5e8dd1acc..01da0551b1 100644
--- a/var/spack/repos/builtin/packages/rocthrust/package.py
+++ b/var/spack/repos/builtin/packages/rocthrust/package.py
@@ -12,12 +12,13 @@ class Rocthrust(CMakePackage):
HIP/ROCm platform, which uses the rocPRIM library. The HIP ported
library works on HIP/ROCm platforms"""
- homepage = "https://github.com/ROCmSoftwarePlatform/rocThrust"
- git = "https://github.com/ROCmSoftwarePlatform/rocThrust.git"
- url = "https://github.com/ROCmSoftwarePlatform/rocThrust/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocThrust"
+ git = "https://github.com/ROCm/rocThrust.git"
+ url = "https://github.com/ROCm/rocThrust/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("cgmb", "srekolam", "renjithravindrankannath")
+ version("6.0.0", sha256="a3fdafe4b6124118e07f23a3b0270d91740da324f61aaa3e8c034da08d9312b1")
version("5.7.1", sha256="b7cb9ea6c42b2c6b610c34d2c438443e0f99245bd391aff18591949bf1cd53ee")
version("5.7.0", sha256="64e10f071acfc5b8e3c168b9178289cf1afc7b168bf1962793fc256b25074d3a")
version("5.6.1", sha256="63df61d5ab46d4cfda6066d748274bacecc77151692e372e6f7df5e91852bdc2")
@@ -149,6 +150,7 @@ class Rocthrust(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocprim@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/roctracer-dev-api/package.py b/var/spack/repos/builtin/packages/roctracer-dev-api/package.py
index a944ff3970..e93c202ccf 100644
--- a/var/spack/repos/builtin/packages/roctracer-dev-api/package.py
+++ b/var/spack/repos/builtin/packages/roctracer-dev-api/package.py
@@ -11,14 +11,15 @@ class RoctracerDevApi(Package):
package, mainly to avoid circular dependencies in the ROCm ecosystem.
For the ROC-tracer library, please check out roctracer-dev."""
- homepage = "https://github.com/ROCm-Developer-Tools/roctracer"
- git = "https://github.com/ROCm-Developer-Tools/roctracer.git"
- url = "https://github.com/ROCm-Developer-Tools/roctracer/archive/refs/tags/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/roctracer"
+ git = "https://github.com/ROCm/roctracer.git"
+ url = "https://github.com/ROCm/roctracer/archive/refs/tags/rocm-6.0.0.tar.gz"
tags = ["rocm"]
license("MIT")
maintainers("srekolam", "renjithravindrankannath")
+ version("6.0.0", sha256="941166a0363c5689bfec118d54e986c43fb1ec8cbf18d95721d9a824bd52c0f8")
version("5.7.1", sha256="ec0453adac7e62b142eb0df1e1e2506863aac4c3f2ce9d117c3184c08c0c6b48")
version("5.7.0", sha256="40bb757920488466e29df90bb80a975cc340bf7f8771fb1d754dfbb6b688d78e")
version("5.6.1", sha256="007c498be25b067ad9a7631a2b0892f9129150ee9714e471a921225875d45e69")
diff --git a/var/spack/repos/builtin/packages/roctracer-dev/package.py b/var/spack/repos/builtin/packages/roctracer-dev/package.py
index aa15dca00e..3c5f81e643 100644
--- a/var/spack/repos/builtin/packages/roctracer-dev/package.py
+++ b/var/spack/repos/builtin/packages/roctracer-dev/package.py
@@ -13,16 +13,16 @@ class RoctracerDev(CMakePackage, ROCmPackage):
The goal of the implementation is to provide a generic independent from
specific runtime profiler to trace API and asyncronous activity."""
- homepage = "https://github.com/ROCm-Developer-Tools/roctracer"
- git = "https://github.com/ROCm-Developer-Tools/roctracer.git"
- url = "https://github.com/ROCm-Developer-Tools/roctracer/archive/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/roctracer"
+ git = "https://github.com/ROCm/roctracer.git"
+ url = "https://github.com/ROCm/roctracer/archive/rocm-6.0.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
libraries = ["libroctracer64"]
license("MIT")
-
+ version("6.0.0", sha256="941166a0363c5689bfec118d54e986c43fb1ec8cbf18d95721d9a824bd52c0f8")
version("5.7.1", sha256="ec0453adac7e62b142eb0df1e1e2506863aac4c3f2ce9d117c3184c08c0c6b48")
version("5.7.0", sha256="40bb757920488466e29df90bb80a975cc340bf7f8771fb1d754dfbb6b688d78e")
version("5.6.1", sha256="007c498be25b067ad9a7631a2b0892f9129150ee9714e471a921225875d45e69")
@@ -83,6 +83,7 @@ class RoctracerDev(CMakePackage, ROCmPackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("hsakmt-roct@" + ver, when="@" + ver)
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
@@ -105,7 +106,7 @@ class RoctracerDev(CMakePackage, ROCmPackage):
]:
depends_on("rocprofiler-dev@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-core@" + ver, when="@" + ver)
patch("0001-include-rocprofiler-dev-path.patch", when="@5.3:5.4")
diff --git a/var/spack/repos/builtin/packages/rocwmma/package.py b/var/spack/repos/builtin/packages/rocwmma/package.py
index 8d5a9fdbea..ee5418b1c8 100644
--- a/var/spack/repos/builtin/packages/rocwmma/package.py
+++ b/var/spack/repos/builtin/packages/rocwmma/package.py
@@ -19,14 +19,15 @@ class Rocwmma(CMakePackage):
generation of kernel assembly, and does not incur additional overhead costs of
linking to external runtime libraries or having to launch separate kernels."""
- homepage = "https://github.com/ROCmSoftwarePlatform/rocWMMA"
- git = "https://github.com/ROCmSoftwarePlatform/rocWMMA.git"
- url = "https://github.com/ROCmSoftwarePlatform/rocWMMA/archive/refs/tags/rocm-5.5.0.tar.gz"
+ homepage = "https://github.com/ROCm/rocWMMA"
+ git = "https://github.com/ROCm/rocWMMA.git"
+ url = "https://github.com/ROCm/rocWMMA/archive/refs/tags/rocm-6.0.0.tar.gz"
tags = ["rocm"]
license("MIT")
maintainers("srekolam", "renjithravindrankannath")
+ version("6.0.0", sha256="f9e97e7c6c552d43ef8c7348e4402bead2cd978d0f81a9657d6a0f6c83a6139b")
version("5.7.1", sha256="a998a1385e6ad7062707ddb9ff82bef727ca48c39a10b4d861667024e3ffd2a3")
version("5.7.0", sha256="a8f1b090e9e504a149a924c80cfb6aca817359b43833a6512ba32e178245526f")
version("5.6.1", sha256="41a5159ee1ad5fc411fe6220f37bd754e26d3883c24c0f2378f50ef628bc1b8f")
@@ -78,6 +79,7 @@ class Rocwmma(CMakePackage):
"5.6.1",
"5.7.0",
"5.7.1",
+ "6.0.0",
]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
depends_on("llvm-amdgpu@" + ver, type="build", when="@" + ver)
@@ -85,7 +87,7 @@ class Rocwmma(CMakePackage):
depends_on("rocblas@" + ver, type="build", when="@" + ver)
depends_on("rocm-openmp-extras@" + ver, type="build", when="@" + ver)
- for ver in ["5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+ for ver in ["5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
depends_on("rocm-smi-lib@" + ver, when="@" + ver)
for tgt in itertools.chain(["auto"], amdgpu_targets):
diff --git a/var/spack/repos/builtin/packages/rpp/0003-include-half-through-spack-package.patch b/var/spack/repos/builtin/packages/rpp/0003-include-half-through-spack-package.patch
new file mode 100644
index 0000000000..2e7e08c2ac
--- /dev/null
+++ b/var/spack/repos/builtin/packages/rpp/0003-include-half-through-spack-package.patch
@@ -0,0 +1,61 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 137896e..ca82e98 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -129,6 +129,9 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
+
+ # OpenMP
+ find_package(OpenMP REQUIRED)
++find_path(HALF_INCLUDE_DIR half.hpp)
++message(STATUS "HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")
++
+ if(APPLE)
+ if(CMAKE_C_COMPILER_ID MATCHES "Clang")
+ set(OpenMP_C "${CMAKE_C_COMPILER}")
+@@ -278,6 +281,7 @@ target_include_directories(${PROJECT_NAME}
+ PUBLIC
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ ${ROCM_PATH}/include
++ ${HALF_INCLUDE_DIR}
+ PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/include/cpu
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/include/common
+diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt
+index 2a64d77..80c5686 100644
+--- a/src/modules/CMakeLists.txt
++++ b/src/modules/CMakeLists.txt
+@@ -81,6 +81,8 @@ if("${TIME_INFO}" STREQUAL "1")
+ endif()
+
+ # Backend specific settings
++find_path(HALF_INCLUDE_DIR half.hpp)
++message(STATUS "HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")
+
+ if( "${BACKEND}" STREQUAL "HIP")
+ # Add HIP kernels
+@@ -99,7 +101,7 @@ if( "${BACKEND}" STREQUAL "HIP")
+ # Add HIP specific includes
+ set(ROCM_INC ${ROCM_PATH}/include/)
+ list(APPEND HIP_LOCAL_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/src/include/hip/ ${CMAKE_SOURCE_DIR}/src/include/common/)
+- set(INCLUDE_LIST ${ROCM_INC} ${HIP_LOCAL_INCLUDE_DIRS} ${INCLUDE_LIST})
++ set(INCLUDE_LIST ${ROCM_INC} ${HIP_LOCAL_INCLUDE_DIRS} ${INCLUDE_LIST} ${HALF_INCLUDE_DIR})
+ elseif( "${BACKEND}" STREQUAL "OCL")
+ # Add OpenCL kernels
+ file(GLOB MOD_CL_CPP "cl/*.cpp" )
+@@ -114,7 +116,7 @@ elseif( "${BACKEND}" STREQUAL "OCL")
+ # Add OpenCL specific includes
+ set(ROCM_INC ${ROCM_PATH}/include/)
+ list(APPEND OCL_LOCAL_INCLUDE_LIST ${CMAKE_SOURCE_DIR}/src/include/cl/ ${CMAKE_SOURCE_DIR}/src/include/common/)
+- set(INCLUDE_LIST ${ROCM_INC} ${OCL_LOCAL_INCLUDE_LIST} ${INCLUDE_LIST})
++ set(INCLUDE_LIST ${ROCM_INC} ${OCL_LOCAL_INCLUDE_LIST} ${INCLUDE_LIST} ${HALF_INCLUDE_DIR})
+ elseif( "${BACKEND}" STREQUAL "CPU")
+ # Add CPU specific includes
+ set(INCLUDE_LIST ${CMAKE_SOURCE_DIR}/src/include/common/)
+@@ -136,6 +138,7 @@ target_include_directories( ${PROJECT_NAME}
+ PUBLIC
+ ${CMAKE_SOURCE_DIR}/include
+ ${ROCM_INC}
++ ${HALF_INCLUDE_DIR}
+ PRIVATE
+ ${CMAKE_SOURCE_DIR}/src/include/cpu
+ ${CMAKE_SOURCE_DIR}/src/include/common \ No newline at end of file
diff --git a/var/spack/repos/builtin/packages/rpp/package.py b/var/spack/repos/builtin/packages/rpp/package.py
index 116fa90328..7049b342cd 100644
--- a/var/spack/repos/builtin/packages/rpp/package.py
+++ b/var/spack/repos/builtin/packages/rpp/package.py
@@ -29,6 +29,7 @@ class Rpp(CMakePackage):
license("MIT")
+ version("6.0.0", sha256="3626a648bc773520f5cd5ca15f494de6e74b422baf32491750ce0737c3367f15")
version("5.7.1", sha256="36fff5f1c52d969c3e2e0c75b879471f731770f193c9644aa6ab993fb8fa4bbf")
version("5.7.0", sha256="1c612cde3c3d3840ae75ee5c1ee59bd8d61b1fdbf84421ae535cda863470fc06")
version("1.2.0", sha256="660a11e1bd8706967835597b26daa874fd1507459bfebe22818149444bec540c")
@@ -54,8 +55,9 @@ class Rpp(CMakePackage):
description="add utilities folder which contains rpp unit tests",
)
- patch("0001-include-half-openmp-through-spack-package.patch")
+ patch("0001-include-half-openmp-through-spack-package.patch", when="@:5.7")
patch("0002-declare-handle-in-header.patch")
+ patch("0003-include-half-through-spack-package.patch", when="@6.0:")
# adds half.hpp include directory and modifies how the libjpegturbo
# library is linked for the rpp unit test
@@ -118,7 +120,11 @@ class Rpp(CMakePackage):
conflicts("+opencl+hip")
with when("+hip"):
- depends_on("hip@5:")
+ with when("@5.7:"):
+ for ver in ["5.7.0", "5.7.1", "6.0.0"]:
+ depends_on("hip@" + ver, when="@" + ver)
+ with when("@:1.2"):
+ depends_on("hip@5:")
with when("~hip"):
depends_on("rocm-opencl@5:")
diff --git a/var/spack/repos/builtin/packages/sundials/Change-HIP_PLATFORM-from-HCC-to-AMD-and-NVCC-to-NVIDIA.patch b/var/spack/repos/builtin/packages/sundials/Change-HIP_PLATFORM-from-HCC-to-AMD-and-NVCC-to-NVIDIA.patch
new file mode 100644
index 0000000000..4dd9dc7a47
--- /dev/null
+++ b/var/spack/repos/builtin/packages/sundials/Change-HIP_PLATFORM-from-HCC-to-AMD-and-NVCC-to-NVIDIA.patch
@@ -0,0 +1,28 @@
+From d4afbed86fc4f9925e55367267b3796a522ba5d5 Mon Sep 17 00:00:00 2001
+From: Renjith Ravindran <Renjith.RavindranKannath@amd.com>
+Date: Sun, 14 Jan 2024 10:20:21 +0000
+Subject: [PATCH] Change HIP_PLATFORM from HCC to AMD and NVCC to NVIDIA
+
+---
+ include/sundials/sundials_hip_policies.hpp | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/include/sundials/sundials_hip_policies.hpp b/include/sundials/sundials_hip_policies.hpp
+index d759bbc..f6dfe41 100644
+--- a/include/sundials/sundials_hip_policies.hpp
++++ b/include/sundials/sundials_hip_policies.hpp
+@@ -30,9 +30,9 @@ namespace sundials
+ namespace hip
+ {
+
+-#if defined(__HIP_PLATFORM_HCC__)
++#if defined(__HIP_PLATFORM_AMD__)
+ constexpr const sunindextype WARP_SIZE = 64;
+-#elif defined(__HIP_PLATFORM_NVCC__)
++#elif defined(__HIP_PLATFORM_NVIDIA__)
+ constexpr const sunindextype WARP_SIZE = 32;
+ #endif
+ constexpr const sunindextype MAX_BLOCK_SIZE = 1024;
+--
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/sundials/package.py b/var/spack/repos/builtin/packages/sundials/package.py
index 48f5ec65d7..3a906e6c2c 100644
--- a/var/spack/repos/builtin/packages/sundials/package.py
+++ b/var/spack/repos/builtin/packages/sundials/package.py
@@ -285,6 +285,10 @@ class Sundials(CMakePackage, CudaPackage, ROCmPackage):
# https://github.com/spack/spack/issues/29526
patch("nvector-pic.patch", when="@6.1.0:6.2.0 +rocm")
+ # ROCm 6.0 drops the backward-compatible __HIP_PLATFORM_HCC__ and
+ # __HIP_PLATFORM_NVCC__ macros; similar fix: https://github.com/LLNL/RAJA/pull/1568
+ patch("Change-HIP_PLATFORM-from-HCC-to-AMD-and-NVCC-to-NVIDIA.patch", when="^hip@6.0.0 +rocm")
+
# remove OpenMP header file and function from hypre vector test code
patch("test_nvector_parhyp.patch", when="@2.7.0:3.0.0")
patch("FindPackageMultipass.cmake.patch", when="@5.0.0")
diff --git a/var/spack/repos/builtin/packages/trilinos/0001-use-the-gcnArchName-inplace-of-gcnArch-as-gcnArch-is.patch b/var/spack/repos/builtin/packages/trilinos/0001-use-the-gcnArchName-inplace-of-gcnArch-as-gcnArch-is.patch
new file mode 100644
index 0000000000..ea2b8b98a4
--- /dev/null
+++ b/var/spack/repos/builtin/packages/trilinos/0001-use-the-gcnArchName-inplace-of-gcnArch-as-gcnArch-is.patch
@@ -0,0 +1,26 @@
+From e7fa7ea37423d3d17d77334ac849c5df00feb20e Mon Sep 17 00:00:00 2001
+From: sreenivasa murthy kolam <sreenivasamurthy.kolam@amd.com>
+Date: Tue, 16 Jan 2024 10:09:34 +0000
+Subject: [PATCH] use the gcnArchName inplace of gcnArch as gcnArch is
+ deprecated from rocm-6.0.0
+
+---
+ packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
+index 7840ad9..882d143 100644
+--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
++++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
+@@ -86,7 +86,7 @@ void HIPInternal::print_configuration(std::ostream &s) const {
+ KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i));
+
+ s << "Kokkos::HIP[ " << i << " ] "
+- << "gcnArch " << hipProp.gcnArch << ", Total Global Memory: "
++ << "gcnArchName " << hipProp.gcnArchName << ", Total Global Memory: "
+ << ::Kokkos::Impl::human_memory_size(hipProp.totalGlobalMem)
+ << ", Shared Memory per Block: "
+ << ::Kokkos::Impl::human_memory_size(hipProp.sharedMemPerBlock);
+--
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/trilinos/package.py b/var/spack/repos/builtin/packages/trilinos/package.py
index d1de74f11c..e015bb7f4e 100644
--- a/var/spack/repos/builtin/packages/trilinos/package.py
+++ b/var/spack/repos/builtin/packages/trilinos/package.py
@@ -489,6 +489,11 @@ class Trilinos(CMakePackage, CudaPackage, ROCmPackage):
# workaround an NVCC bug with c++14 (https://github.com/trilinos/Trilinos/issues/6954)
# avoid calling deprecated functions with CUDA-11
patch("fix_cxx14_cuda11.patch", when="@13.0.0:13.0.1 cxxstd=14 ^cuda@11:")
+ patch(
+ "0001-use-the-gcnArchName-inplace-of-gcnArch-as-gcnArch-is.patch",
+ when="@15.0.0 ^hip@6.0.0 +rocm",
+ )
+
# Allow building with +teko gotype=long
patch(
"https://github.com/trilinos/Trilinos/commit/b17f20a0b91e0b9fc5b1b0af3c8a34e2a4874f3f.patch?full_index=1",