Bump up the version for ROCm-6.0.0 (#42026)

* Bump up the version for ROCm-6.0.0 * Adding patch files * Style check failure fix * Style check fixes * Style check error fixes * Patch to remove hipblas client file installation in 6.0 * Patch needs to be applied on all 5.7 releases * 6.0 update for math libs and other packages, new github url etc * Correct package-audit failures * Correcting shasum for rocfft patch and limiting patch in rocblas * Reverting updates in rocprofiler-dev due to ci-gitlab failure * Fixes for ci-gitlab failure due to disabling hip backward compatibility * Adding patch file to change HIP_PLATFORM from HCC to AMD and NVCC to NVIDIA * Use the gcnArchName in place of gcnArch as gcnArch is deprecated from rocm-6.0.0 * Patches to fix magma and blaspp build error with rocm 6.0.0 * Patch for mfem and arborx for rocm 6.0 * Style check error fix * Correcting style check errors * Updating dependent version * Update for petsc to build with rocm 6.0. Need reverting-operator-mixup-fix-for-slate.patch for rocm 6.0 * Reverting the change in url for 2.7.4-rocm-enhanced * hip-tensor 6.0.0 update
author: renjithravindrankannath <94420380+renjithravindrankannath@users.noreply.github.com> 2024-01-22 10:19:28 -0800
committer: GitHub <noreply@github.com> 2024-01-22 10:19:28 -0800
commit: c673979feeaadcf03fc8803e2261809c40df8362 (patch)
tree: f496d602a3bb56d9648db4755a8f7096cc41bb05
parent: 7acd5bdc7f0fa646cf4ac1dd7acf7c85d62e3193 (diff)
download: spack-c673979feeaadcf03fc8803e2261809c40df8362.tar.gz
spack-c673979feeaadcf03fc8803e2261809c40df8362.tar.bz2
spack-c673979feeaadcf03fc8803e2261809c40df8362.tar.xz
spack-c673979feeaadcf03fc8803e2261809c40df8362.zip
86 files changed, 13036 insertions, 280 deletions
diff --git a/var/spack/repos/builtin/packages/amdsmi/package.py b/var/spack/repos/builtin/packages/amdsmi/package.py
index ecd2ca1f1d..e7543fdb8b 100644
--- a/var/spack/repos/builtin/packages/amdsmi/package.py
+++ b/var/spack/repos/builtin/packages/amdsmi/package.py
@@ -12,8 +12,8 @@ class Amdsmi(CMakePackage):
     is a C library for Linux that provides a user space interface for
     applications to monitor and control AMD device."""
 
-    homepage = "https://github.com/RadeonOpenCompute/amdsmi"
-    url = "https://github.com/RadeonOpenCompute/amdsmi/archive/refs/tags/rocm-5.6.0.tar.gz"
+    homepage = "https://github.com/ROCm/amdsmi"
+    url = "https://github.com/ROCm/amdsmi/archive/refs/tags/rocm-5.6.0.tar.gz"
 
     tags = ["rocm"]
     maintainers("srekolam", "renjithravindrankannath")
diff --git a/var/spack/repos/builtin/packages/aomp/package.py b/var/spack/repos/builtin/packages/aomp/package.py
index e32dc705e3..6a9603a9de 100644
--- a/var/spack/repos/builtin/packages/aomp/package.py
+++ b/var/spack/repos/builtin/packages/aomp/package.py
@@ -7,8 +7,8 @@ import re
 
 from spack.package import *
 
-tools_url = "https://github.com/ROCm-Developer-Tools"
-compute_url = "https://github.com/RadeonOpenCompute"
+tools_url = "https://github.com/ROCm"
+compute_url = "https://github.com/ROCm"
 
 
 aomp = [
@@ -368,7 +368,7 @@ class Aomp(Package):
             "-DCMAKE_C_COMPILER={0}".format(self.compiler.cc),
             "-DCMAKE_CXX_COMPILER={0}".format(self.compiler.cxx),
             "-DCMAKE_ASM_COMPILER={0}".format(self.compiler.cc),
-            "-DBUG_REPORT_URL=https://github.com/ROCm-Developer-Tools/aomp",
+            "-DBUG_REPORT_URL=https://github.com/ROCm/aomp",
             "-DLLVM_ENABLE_BINDINGS=OFF",
             "-DLLVM_INCLUDE_BENCHMARKS=OFF",
             "-DLLVM_BUILD_TESTS=OFF",
diff --git a/var/spack/repos/builtin/packages/arborx/0001-update-major-version-required-for-rocm-6.0.patch b/var/spack/repos/builtin/packages/arborx/0001-update-major-version-required-for-rocm-6.0.patch
new file mode 100644
index 0000000000..009a40f984
--- /dev/null
+++ b/var/spack/repos/builtin/packages/arborx/0001-update-major-version-required-for-rocm-6.0.patch
@@ -0,0 +1,24 @@
+From a31d3766f5a7a3a3e20d5bc0c315ad6295a82298 Mon Sep 17 00:00:00 2001
+From: Afzal Patel <afzal.patel@amd.com>
+Date: Wed, 17 Jan 2024 11:50:18 -0800
+Subject: [PATCH] Changed required version of rocthrust to 3 for rocm 6.0
+
+---
+ CMakeLists.txt | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 8c3c99a..1af6d13 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -22,7 +22,7 @@ if(Kokkos_ENABLE_HIP AND ARBORX_ENABLE_ROCTHRUST)
+   # Require at least rocThrust-2.10.5 (that comes with ROCm 3.9) because
+   # rocPRIM dependency is not set properly in exported configuration for
+   # earlier versions
+-  find_package(rocthrust 2.10.5 REQUIRED CONFIG)
++  find_package(rocthrust 3 REQUIRED CONFIG)
+   target_link_libraries(ArborX INTERFACE roc::rocthrust)
+ endif()
+
+--
+2.25.1
diff --git a/var/spack/repos/builtin/packages/arborx/package.py b/var/spack/repos/builtin/packages/arborx/package.py
index 6eb003252c..1414a22d7a 100644
--- a/var/spack/repos/builtin/packages/arborx/package.py
+++ b/var/spack/repos/builtin/packages/arborx/package.py
@@ -96,6 +96,7 @@ class Arborx(CMakePackage, CudaPackage, ROCmPackage):
     depends_on("trilinos@13.4.0:", when="@1.3+trilinos")
     depends_on("trilinos@14.0.0:", when="@1.4:+trilinos")
     patch("trilinos14.0-kokkos-major-version.patch", when="@1.4+trilinos ^trilinos@14.0.0")
+    patch("0001-update-major-version-required-for-rocm-6.0.patch", when="+rocm ^hip@6.0:")
     conflicts("~serial", when="+trilinos")
     conflicts("+cuda", when="+trilinos")
 
diff --git a/var/spack/repos/builtin/packages/atmi/package.py b/var/spack/repos/builtin/packages/atmi/package.py
index 98fc5999f5..96c588174f 100644
--- a/var/spack/repos/builtin/packages/atmi/package.py
+++ b/var/spack/repos/builtin/packages/atmi/package.py
@@ -13,9 +13,9 @@ class Atmi(CMakePackage):
     consistent, declarative API to create task graphs on CPUs and GPUs
     (integrated and discrete)."""
 
-    homepage = "https://github.com/RadeonOpenCompute/atmi"
-    git = "https://github.com/RadeonOpenCompute/atmi.git"
-    url = "https://github.com/RadeonOpenCompute/atmi/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/atmi"
+    git = "https://github.com/ROCm/atmi.git"
+    url = "https://github.com/ROCm/atmi/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     license("MIT")
diff --git a/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py b/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py
index 28045fd8ef..f831c88537 100644
--- a/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py
+++ b/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py
@@ -11,9 +11,9 @@ class AwsOfiRccl(AutotoolsPackage):
     libfabric as a network provider while running AMD's RCCL based
     applications."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl"
-    git = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git"
-    url = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git"
+    homepage = "https://github.com/ROCm/aws-ofi-rccl"
+    git = "https://github.com/ROCm/aws-ofi-rccl.git"
+    url = "https://github.com/ROCm/aws-ofi-rccl.git"
     tags = ["rocm"]
 
     maintainers("bvanessen")
diff --git a/var/spack/repos/builtin/packages/babelstream/package.py b/var/spack/repos/builtin/packages/babelstream/package.py
index 0d09e2f5d1..4b2a1c5857 100644
--- a/var/spack/repos/builtin/packages/babelstream/package.py
+++ b/var/spack/repos/builtin/packages/babelstream/package.py
@@ -157,7 +157,7 @@ class Babelstream(CMakePackage, CudaPackage, ROCmPackage):
         when="+thrust",
         msg="Which Thrust implementation to use, supported options include:\
          - CUDA (via https://github.com/NVIDIA/thrust)\
-         - ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust)",
+         - ROCM (via https://github.com/ROCm/rocThrust)",
     )
 
     # This applies to all
diff --git a/var/spack/repos/builtin/packages/blaspp/0001-fix-blaspp-build-error-with-rocm-6.0.0.patch b/var/spack/repos/builtin/packages/blaspp/0001-fix-blaspp-build-error-with-rocm-6.0.0.patch
new file mode 100644
index 0000000000..3ce15f0859
--- /dev/null
+++ b/var/spack/repos/builtin/packages/blaspp/0001-fix-blaspp-build-error-with-rocm-6.0.0.patch
@@ -0,0 +1,50 @@
+From a75f399bfa77680e7736d126ef3e5a520e1a1702 Mon Sep 17 00:00:00 2001
+From: sreenivasa murthy kolam <sreenivasamurthy.kolam@amd.com>
+Date: Wed, 17 Jan 2024 12:55:06 +0000
+Subject: [PATCH] fix build error with rocm-6.0.0 by adding extra parameters
+ for rocblas function calls rocblas_ztrmm() ,rocblas_strmm(),
+ rocblas_ctrmm(),rocblas_dtrmm()
+
+---
+ src/rocblas_wrappers.cc | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/src/rocblas_wrappers.cc b/src/rocblas_wrappers.cc
+index 0e01a95..44ab150 100644
+--- a/src/rocblas_wrappers.cc
++++ b/src/rocblas_wrappers.cc
+@@ -667,6 +667,7 @@ void trmm(
+             m, n,
+             &alpha,
+             dA, ldda,
++            dB, lddb,
+             dB, lddb ) );
+ }
+ 
+@@ -686,6 +687,7 @@ void trmm(
+             m, n,
+             &alpha,
+             dA, ldda,
++            dB, lddb,
+             dB, lddb ) );
+ }
+ 
+@@ -705,6 +707,7 @@ void trmm(
+             m, n,
+             (rocblas_float_complex*) &alpha,
+             (rocblas_float_complex*) dA, ldda,
++            (rocblas_float_complex*) dB, lddb,
+             (rocblas_float_complex*) dB, lddb ) );
+ }
+ 
+@@ -724,6 +727,7 @@ void trmm(
+             m, n,
+             (rocblas_double_complex*) &alpha,
+             (rocblas_double_complex*) dA, ldda,
++            (rocblas_double_complex*) dB, lddb,
+             (rocblas_double_complex*) dB, lddb ) );
+ }
+ 
+-- 
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/blaspp/package.py b/var/spack/repos/builtin/packages/blaspp/package.py
index e0de779540..78a2fce1d6 100644
--- a/var/spack/repos/builtin/packages/blaspp/package.py
+++ b/var/spack/repos/builtin/packages/blaspp/package.py
@@ -22,6 +22,9 @@ class Blaspp(CMakePackage, CudaPackage, ROCmPackage):
 
     version("master", branch="master")
     version(
+        "2023.11.05", sha256="62dfc03ec07c0826e0466dc2c204b460caa929d53ad4f050cb132d92670be7ce"
+    )
+    version(
         "2023.08.25", sha256="1d9c7227a6d8776944aa866592142b7b51c6e4ba5529d168eb8ae2b329c47401"
     )
     version(
@@ -76,6 +79,10 @@ class Blaspp(CMakePackage, CudaPackage, ROCmPackage):
 
     requires("%oneapi", when="+sycl", msg="blaspp+sycl must be compiled with %oneapi")
 
+    patch(
+        "0001-fix-blaspp-build-error-with-rocm-6.0.0.patch", when="@2023.06.00: ^hip@6.0.0 +rocm"
+    )
+
     def cmake_args(self):
         spec = self.spec
         backend_config = "-Duse_cuda=%s" % ("+cuda" in spec)
diff --git a/var/spack/repos/builtin/packages/comgr/package.py b/var/spack/repos/builtin/packages/comgr/package.py
index f713ccba6d..f8bbd4e526 100644
--- a/var/spack/repos/builtin/packages/comgr/package.py
+++ b/var/spack/repos/builtin/packages/comgr/package.py
@@ -12,9 +12,9 @@ class Comgr(CMakePackage):
     """This provides various Lightning Compiler related services. It currently
     contains one library, the Code Object Manager (Comgr)"""
 
-    homepage = "https://github.com/RadeonOpenCompute/ROCm-CompilerSupport"
-    git = "https://github.com/RadeonOpenCompute/ROCm-CompilerSupport.git"
-    url = "https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/ROCm-CompilerSupport"
+    git = "https://github.com/ROCm/ROCm-CompilerSupport.git"
+    url = "https://github.com/ROCm/ROCm-CompilerSupport/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath", "haampie")
@@ -23,6 +23,7 @@ class Comgr(CMakePackage):
     license("NCSA")
 
     version("master", branch="amd-stg-open")
+    version("6.0.0", sha256="04353d27a512642a5e5339532a39d0aabe44e0964985de37b150a2550385800a")
     version("5.7.1", sha256="3b9433b4a0527167c3e9dfc37a3c54e0550744b8d4a8e1be298c8d4bcedfee7c")
     version("5.7.0", sha256="e234bcb93d602377cfaaacb59aeac5796edcd842a618162867b7e670c3a2c42c")
     version("5.6.1", sha256="0a85d84619f98be26ca7a32c71f94ed3c4e9866133789eabb451be64ce739300")
@@ -152,6 +153,7 @@ class Comgr(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
         "master",
     ]:
         # llvm libs are linked statically, so this *could* be a build dep
@@ -163,7 +165,7 @@ class Comgr(CMakePackage):
             "rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver)
         )
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
     root_cmakelists_dir = join_path("lib", "comgr")
diff --git a/var/spack/repos/builtin/packages/composable-kernel/package.py b/var/spack/repos/builtin/packages/composable-kernel/package.py
index afbb86f01f..10bdf7183c 100644
--- a/var/spack/repos/builtin/packages/composable-kernel/package.py
+++ b/var/spack/repos/builtin/packages/composable-kernel/package.py
@@ -11,14 +11,15 @@ class ComposableKernel(CMakePackage):
     """Composable Kernel: Performance Portable Programming Model
     for Machine Learning Tensor Operators."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/composable_kernel"
-    git = "https://github.com/ROCmSoftwarePlatform/composable_kernel.git"
-    url = "https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/refs/tags/rocm-5.7.1.tar.gz"
+    homepage = "https://github.com/ROCm/composable_kernel"
+    git = "https://github.com/ROCm/composable_kernel.git"
+    url = "https://github.com/ROCm/composable_kernel/archive/refs/tags/rocm-5.7.1.tar.gz"
     maintainers("srekolam", "afzpatel")
 
     license("MIT")
 
     version("master", branch="develop")
+    version("6.0.0", sha256="a8f736f2f2a8afa4cddd06301205be27774d85f545429049b4a2bbbe6fcd67df")
     version("5.7.1", sha256="75f66e023c2e31948e91fa26366eaeac72d871fc2e5188361d4465179f13876e")
     version("5.7.0", sha256="d9624dbaef04e0138f9f73596c49b4fe9ded69974bae7236354baa32649bf21a")
     version("5.6.1", commit="f5ec04f091fa5c48c67d7bacec36a414d0be06a5")
@@ -46,7 +47,18 @@ class ComposableKernel(CMakePackage):
     depends_on("pkgconfig", type="build")
     depends_on("cmake@3.16:", type="build")
 
-    for ver in ["master", "5.7.1", "5.7.0", "5.6.1", "5.6.0", "5.5.1", "5.5.0", "5.4.3", "5.4.0"]:
+    for ver in [
+        "master",
+        "6.0.0",
+        "5.7.1",
+        "5.7.0",
+        "5.6.1",
+        "5.6.0",
+        "5.5.1",
+        "5.5.0",
+        "5.4.3",
+        "5.4.0",
+    ]:
         depends_on("hip@" + ver, when="@" + ver)
         depends_on("llvm-amdgpu@" + ver, when="@" + ver)
         depends_on("rocm-cmake@" + ver, when="@" + ver, type="build")
diff --git a/var/spack/repos/builtin/packages/heffte/package.py b/var/spack/repos/builtin/packages/heffte/package.py
index 228e813973..1472116be9 100644
--- a/var/spack/repos/builtin/packages/heffte/package.py
+++ b/var/spack/repos/builtin/packages/heffte/package.py
@@ -114,7 +114,7 @@ class Heffte(CMakePackage, CudaPackage, ROCmPackage):
             if "none" not in rocm_arch:
                 args.append("-DCMAKE_CXX_FLAGS={0}".format(self.hip_flags(rocm_arch)))
 
-            # See https://github.com/ROCmSoftwarePlatform/rocFFT/issues/322
+            # See https://github.com/ROCm/rocFFT/issues/322
             if self.spec.satisfies("^cmake@3.21.0:3.21.2"):
                 args.append(self.define("__skip_rocmclang", "ON"))
 
diff --git a/var/spack/repos/builtin/packages/hip-examples/package.py b/var/spack/repos/builtin/packages/hip-examples/package.py
index c2e8aaa97e..22f5705389 100644
--- a/var/spack/repos/builtin/packages/hip-examples/package.py
+++ b/var/spack/repos/builtin/packages/hip-examples/package.py
@@ -11,9 +11,9 @@ from spack.package import *
 class HipExamples(Package):
     """Examples for HIP"""
 
-    homepage = "https://github.com/ROCm-Developer-Tools/HIP-Examples/"
-    git = "https://github.com/ROCm-Developer-Tools/HIP-Examples.git"
-    url = "https://github.com/ROCm-Developer-Tools/HIP-Examples/archive/rocm-5.4.3.tar.gz"
+    homepage = "https://github.com/ROCm/HIP-Examples/"
+    git = "https://github.com/ROCm/HIP-Examples.git"
+    url = "https://github.com/ROCm/HIP-Examples/archive/rocm-5.4.3.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath", "afzpatel")
diff --git a/var/spack/repos/builtin/packages/hip-rocclr/package.py b/var/spack/repos/builtin/packages/hip-rocclr/package.py
index 2ae9e375e0..22c1232e2c 100644
--- a/var/spack/repos/builtin/packages/hip-rocclr/package.py
+++ b/var/spack/repos/builtin/packages/hip-rocclr/package.py
@@ -12,8 +12,8 @@ class HipRocclr(CMakePackage):
     with to different backends such as ROCr or PAL This abstraction allows
     runtimes to work on Windows as well as on Linux without much effort."""
 
-    homepage = "https://github.com/ROCm-Developer-Tools/ROCclr"
-    git = "https://github.com/ROCm-Developer-Tools/ROCclr.git"
+    homepage = "https://github.com/ROCm/ROCclr"
+    git = "https://github.com/ROCm/ROCclr.git"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
@@ -21,9 +21,9 @@ class HipRocclr(CMakePackage):
     def url_for_version(self, version):
         # Fix up a typo in the 3.5.0 release.
         if version == Version("3.5.0"):
-            return "https://github.com/ROCm-Developer-Tools/ROCclr/archive/roc-3.5.0.tar.gz"
+            return "https://github.com/ROCm/ROCclr/archive/roc-3.5.0.tar.gz"
 
-        url = "https://github.com/ROCm-Developer-Tools/ROCclr/archive/rocm-{0}.tar.gz"
+        url = "https://github.com/ROCm/ROCclr/archive/rocm-{0}.tar.gz"
         return url.format(version)
 
     license("MIT")
@@ -152,13 +152,13 @@ class HipRocclr(CMakePackage):
         depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
         depends_on("comgr@" + ver, when="@" + ver)
 
-    # See: https://github.com/ROCm-Developer-Tools/ROCclr/pull/16
+    # See: https://github.com/ROCm/ROCclr/pull/16
     # In 3.7.0 the find opengl things have changed slightly.
     patch("opengl.patch", when="@3.5.0")
 
     resource(
         name="opencl-on-vdi",
-        url="https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/archive/roc-3.5.0.tar.gz",
+        url="https://github.com/ROCm/ROCm-OpenCL-Runtime/archive/roc-3.5.0.tar.gz",
         sha256="511b617d5192f2d4893603c1a02402b2ac9556e9806ff09dd2a91d398abf39a0",
         expand=True,
         destination="",
@@ -197,7 +197,7 @@ class HipRocclr(CMakePackage):
     ]:
         resource(
             name="opencl-on-vdi",
-            url="https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/archive/rocm-{0}.tar.gz".format(
+            url="https://github.com/ROCm/ROCm-OpenCL-Runtime/archive/rocm-{0}.tar.gz".format(
                 d_version
             ),
             sha256=d_shasum,
@@ -209,7 +209,7 @@ class HipRocclr(CMakePackage):
 
     resource(
         name="opencl-on-vdi",
-        git="https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git",
+        git="https://github.com/ROCm/ROCm-OpenCL-Runtime.git",
         destination="",
         placement="opencl-on-vdi",
         branch="main",
diff --git a/var/spack/repos/builtin/packages/hip-tensor/package.py b/var/spack/repos/builtin/packages/hip-tensor/package.py
index e925031945..86fd4e385d 100644
--- a/var/spack/repos/builtin/packages/hip-tensor/package.py
+++ b/var/spack/repos/builtin/packages/hip-tensor/package.py
@@ -17,10 +17,11 @@ class HipTensor(CMakePackage, ROCmPackage):
     maintainers("srekolam", "afzpatel")
 
     version("master", branch="master")
+    version("6.0.0", sha256="268d7f114784b7e824f89c21c65c2efedbb5486f09a356a56dca1b89bde1ef7a")
     version("5.7.1", sha256="96743d4e695fe865aef4097ae31d9b4e42a2d5a92135a005b0d187d9c0b17645")
     version("5.7.0", sha256="4b17f6d43b17fe2dc1d0c61e9663d4752006f7898cc94231206444a1663eb252")
 
-    for ver in ["5.7.0", "5.7.1", "master"]:
+    for ver in ["5.7.0", "5.7.1", "6.0.0", "master"]:
         depends_on(f"composable-kernel@{ver}", when=f"@{ver}")
         depends_on(f"rocm-cmake@{ver}", when=f"@{ver}")
 
diff --git a/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.6.0.patch b/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.6.0.patch
new file mode 100644
index 0000000000..597baa2e5d
--- /dev/null
+++ b/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.6.0.patch
@@ -0,0 +1,61 @@
+diff --git a/clr/hipamd/CMakeLists.txt b/clr/hipamd/CMakeLists.txt
+index 7ad3001..aaf6ad0 100755
+--- a/clr/hipamd/CMakeLists.txt
++++ b/clr/hipamd/CMakeLists.txt
+@@ -297,16 +297,6 @@ if(HIP_RUNTIME STREQUAL "rocclr")
+    add_subdirectory(src)
+ endif()
+ 
+-# Download libamdhip64.so.5
+-if(HIP_PLATFORM STREQUAL "amd")
+-    if(NOT WIN32)
+-        execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/download_libamhip64_v5.sh" WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND_ECHO STDERR RESULT_VARIABLE DWLD_HIP_SO_RC)
+-        if (DWLD_HIP_SO_RC AND NOT DWLD_HIP_SO_RC EQUAL 0)
+-            message(FATAL_ERROR "Failed to download libamdhip64.so.5")
+-        endif()
+-    endif()
+-endif()
+-
+ # Build doxygen documentation
+ find_program(DOXYGEN_EXE doxygen)
+ if(DOXYGEN_EXE)
+@@ -408,8 +398,6 @@ if (NOT ${HIPCC_BIN_DIR} STREQUAL "")
+   install(PROGRAMS ${HIPCC_BIN_DIR}/hipcc.pl DESTINATION bin)
+   install(PROGRAMS ${HIPCC_BIN_DIR}/hipconfig.pl DESTINATION bin)
+   install(PROGRAMS ${HIPCC_BIN_DIR}/hipvars.pm DESTINATION bin)
+-  install(PROGRAMS ${HIPCC_BIN_DIR}/hipcc.bat DESTINATION bin)
+-  install(PROGRAMS ${HIPCC_BIN_DIR}/hipconfig.bat DESTINATION bin)
+ endif()
+ 
+ #############################
+diff --git a/hipcc/bin/hipcc.pl b/hipcc/bin/hipcc.pl
+index 513a427..cd2d6ac 100755
+--- a/hipcc/bin/hipcc.pl
++++ b/hipcc/bin/hipcc.pl
+@@ -160,11 +160,14 @@ if ($HIP_PLATFORM eq "amd") {
+     if($isWindows) {
+         $execExtension = ".exe";
+     }
+-    $HIPCC=get_normalized_path("$HIP_CLANG_PATH/clang++" . $execExtension);
++    # llvm_path is set inside the hip recipe
++    $LLVM_PATH= $ENV{'LLVM_PATH'};
++    $HIPCC="${LLVM_PATH}/bin/clang++" . $execExtension;
+ 
+     # If $HIPCC clang++ is not compiled, use clang instead
+     if ( ! -e $HIPCC ) {
+-        $HIPCC=get_normalized_path("$HIP_CLANG_PATH/clang" . $execExtension);
++        $LLVM_PATH= $ENV{'LLVM_PATH'};
++        $HIPCC="${LLVM_PATH}/bin/clang" . $execExtension;
+         $HIPLDFLAGS = "--driver-mode=g++";
+     }
+     # to avoid using dk linker or MSVC linker
+@@ -484,7 +487,8 @@ if($HIP_PLATFORM eq "amd"){
+             $targetsStr = $ENV{HCC_AMDGPU_TARGET};
+         } elsif (not $isWindows) {
+             # Else try using rocm_agent_enumerator
+-            $ROCM_AGENT_ENUM = "${ROCM_PATH}/bin/rocm_agent_enumerator";
++            $ROCMINFO_PATH = $ENV{'ROCMINFO_PATH'} // $ROCMINFO_PATH;
++            $ROCM_AGENT_ENUM = "${ROCMINFO_PATH}/bin/rocm_agent_enumerator";
+             $targetsStr = `${ROCM_AGENT_ENUM} -t GPU`;
+             $targetsStr =~ s/\n/,/g;
+         }
diff --git a/var/spack/repos/builtin/packages/hip/0018-reverting-hipMemoryType-with-memoryType.patch b/var/spack/repos/builtin/packages/hip/0018-reverting-hipMemoryType-with-memoryType.patch
new file mode 100644
index 0000000000..c77075d640
--- /dev/null
+++ b/var/spack/repos/builtin/packages/hip/0018-reverting-hipMemoryType-with-memoryType.patch
@@ -0,0 +1,17 @@
+diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
+index 88e6850..d280504 100644
+--- a/include/hip/hip_runtime_api.h
++++ b/include/hip/hip_runtime_api.h
+@@ -259,7 +259,11 @@ typedef enum hipMemoryType {
+  * Pointer attributes
+  */
+ typedef struct hipPointerAttribute_t {
+-    enum hipMemoryType type;
++    union {
++      // Deprecated, use instead type
++      enum hipMemoryType memoryType;
++      enum hipMemoryType type;
++    };
+     int device;
+     void* devicePointer;
+     void* hostPointer;
diff --git a/var/spack/repos/builtin/packages/hip/package.py b/var/spack/repos/builtin/packages/hip/package.py
index 29b23fecca..a6fd946955 100644
--- a/var/spack/repos/builtin/packages/hip/package.py
+++ b/var/spack/repos/builtin/packages/hip/package.py
@@ -16,9 +16,9 @@ class Hip(CMakePackage):
     create portable applications for AMD and NVIDIA GPUs from
     single source code."""
 
-    homepage = "https://github.com/ROCm-Developer-Tools/HIP"
-    git = "https://github.com/ROCm-Developer-Tools/HIP.git"
-    url = "https://github.com/ROCm-Developer-Tools/HIP/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/HIP"
+    git = "https://github.com/ROCm/HIP.git"
+    url = "https://github.com/ROCm/HIP/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath", "haampie")
@@ -27,6 +27,7 @@ class Hip(CMakePackage):
     license("MIT")
 
     version("master", branch="master")
+    version("6.0.0", sha256="0d575788e0b731124a8489a36652014a165b9ebab92d5456ec3c976e062f3a82")
     version("5.7.1", sha256="eaa0e14a9ae45c58ed37863797b683a7778b3cbbf92f5b6529ec65fd61d61f3e")
     version("5.7.0", sha256="cb61234eec7879fb7e20937659ad535b93a6e66fc8de0a543da8b7702474f2fc")
     version("5.6.1", sha256="4b3c4dfcf8595da0e1b8c3e8067b1ccebeaac337762ff098db14375fa8dd4487")
@@ -172,6 +173,7 @@ class Hip(CMakePackage):
             "5.6.1",
             "5.7.0",
             "5.7.1",
+            "6.0.0",
         ]:
             depends_on("hsakmt-roct@" + ver, when="@" + ver)
             depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
@@ -180,12 +182,22 @@ class Hip(CMakePackage):
             depends_on("rocminfo@" + ver, when="@" + ver)
             depends_on("roctracer-dev-api@" + ver, when="@" + ver)
 
-        for ver in ["5.4.0", "5.4.3", "5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+        for ver in [
+            "5.4.0",
+            "5.4.3",
+            "5.5.0",
+            "5.5.1",
+            "5.6.0",
+            "5.6.1",
+            "5.7.0",
+            "5.7.1",
+            "6.0.0",
+        ]:
             depends_on("hipify-clang", when="@" + ver)
-        for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+        for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
             depends_on("rocm-core@" + ver, when="@" + ver)
         # hipcc likes to add `-lnuma` by default :(
-        # ref https://github.com/ROCm-Developer-Tools/HIP/pull/2202
+        # ref https://github.com/ROCm/HIP/pull/2202
         depends_on("numactl", when="@3.7.0:")
 
     # roc-obj-ls requirements
@@ -212,9 +224,7 @@ class Hip(CMakePackage):
     ]:
         resource(
             name="hipamd",
-            url="https://github.com/ROCm-Developer-Tools/hipamd/archive/rocm-{0}.tar.gz".format(
-                d_version
-            ),
+            url="https://github.com/ROCm/hipamd/archive/rocm-{0}.tar.gz".format(d_version),
             sha256=d_shasum,
             expand=True,
             destination="",
@@ -241,7 +251,7 @@ class Hip(CMakePackage):
     ]:
         resource(
             name="opencl",
-            url="https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/archive/rocm-{0}.tar.gz".format(
+            url="https://github.com/ROCm/ROCm-OpenCL-Runtime/archive/rocm-{0}.tar.gz".format(
                 d_version
             ),
             sha256=d_shasum,
@@ -269,9 +279,7 @@ class Hip(CMakePackage):
     ]:
         resource(
             name="rocclr",
-            url="https://github.com/ROCm-Developer-Tools/ROCclr/archive/rocm-{0}.tar.gz".format(
-                d_version
-            ),
+            url="https://github.com/ROCm/ROCclr/archive/rocm-{0}.tar.gz".format(d_version),
             sha256=d_shasum,
             expand=True,
             destination="",
@@ -280,6 +288,7 @@ class Hip(CMakePackage):
         )
     # Add hip-clr sources thru the below
     for d_version, d_shasum in [
+        ("6.0.0", "798b55b5b5fb90dd19db54f136d8d8e1da9ae1e408d5b12b896101d635f97e50"),
         ("5.7.1", "c78490335233a11b4d8a5426ace7417c555f5e2325de10422df06c0f0f00f7eb"),
         ("5.7.0", "bc2447cb6fd86dff6a333b04e77ce85755104d9011a14a044af53caf02449573"),
         ("5.6.1", "0b88af1e99643899d11b1c8cf8a3c46601051b328a5e0ffbd44ee88b7eb0db33"),
@@ -287,9 +296,7 @@ class Hip(CMakePackage):
     ]:
         resource(
             name="clr",
-            url="https://github.com/ROCm-Developer-Tools/clr/archive/refs/tags/rocm-{0}.tar.gz".format(
-                d_version
-            ),
+            url="https://github.com/ROCm/clr/archive/refs/tags/rocm-{0}.tar.gz".format(d_version),
             sha256=d_shasum,
             expand=True,
             destination="",
@@ -299,6 +306,7 @@ class Hip(CMakePackage):
 
     # Add hipcc sources thru the below
     for d_version, d_shasum in [
+        ("6.0.0", "e9cfaaecaf0e6ed363946439197f340c115e8e1189f96dbd716cf20245c29255"),
         ("5.7.1", "d47d27ef2b5de7f49cdfd8547832ac9b437a32e6fc6f0e9c1646f4b704c90aee"),
         ("5.7.0", "9f839bf7226e5e26f3150f8ba6eca507ab9a668e68b207736301b3bb9040c973"),
         ("5.6.1", "5800fac92b841ef6f52acda78d9bf86f83970bec0fb848a6265d239bdb7eb51a"),
@@ -306,7 +314,7 @@ class Hip(CMakePackage):
     ]:
         resource(
             name="hipcc",
-            url="https://github.com/ROCm-Developer-Tools/HIPCC/archive/refs/tags/rocm-{0}.tar.gz".format(
+            url="https://github.com/ROCm/HIPCC/archive/refs/tags/rocm-{0}.tar.gz".format(
                 d_version
             ),
             sha256=d_shasum,
@@ -317,6 +325,7 @@ class Hip(CMakePackage):
         )
     # Add hiptests sources thru the below
     for d_version, d_shasum in [
+        ("6.0.0", "e8f92a0f5d1f6093ca1fb24ff1b7140128900fcdc6e9f01f153d6907e5c2d807"),
         ("5.7.1", "28fbdf49f405adfee903bc0f05a43ac392c55b34c514c3582dfb7d6d67e79985"),
         ("5.7.0", "b1dae3cfc715e71dce92ac1da94265a9398944c76cee85ffab8f0c93665a48d6"),
         ("5.6.1", "5b3002ddfafda162329e4d9e6ac1200eeb48ff08e666b342aa8aeca30750f48b"),
@@ -324,7 +333,7 @@ class Hip(CMakePackage):
     ]:
         resource(
             name="hip-tests",
-            url="https://github.com/ROCm-Developer-Tools/hip-tests/archive/refs/tags/rocm-{0}.tar.gz".format(
+            url="https://github.com/ROCm/hip-tests/archive/refs/tags/rocm-{0}.tar.gz".format(
                 d_version
             ),
             sha256=d_shasum,
@@ -366,10 +375,10 @@ class Hip(CMakePackage):
     )
     patch("0013-remove-compiler-rt-linkage-for-host.5.3.0.patch", when="@5.3.0:5.4")
 
-    # See https://github.com/ROCm-Developer-Tools/HIP/pull/2141
+    # See https://github.com/ROCm/HIP/pull/2141
     patch("0002-Fix-detection-of-HIP_CLANG_ROOT.patch", when="@:3.9.0")
 
-    # See https://github.com/ROCm-Developer-Tools/HIP/pull/2218
+    # See https://github.com/ROCm/HIP/pull/2218
     patch("0003-Improve-compilation-without-git-repo.3.7.0.patch", when="@3.7.0:3.9.0")
     patch("0003-Improve-compilation-without-git-repo.3.10.0.patch", when="@3.10.0:4.0.0")
     patch("0003-Improve-compilation-without-git-repo.4.1.0.patch", when="@4.1.0")
@@ -383,7 +392,7 @@ class Hip(CMakePackage):
         "_disabletests.4.5.0.patch",
         when="@4.5.0:4.5.3",
     )
-    # See https://github.com/ROCm-Developer-Tools/HIP/pull/2219
+    # See https://github.com/ROCm/HIP/pull/2219
     patch("0004-Drop-clang-rt-builtins-linking-on-hip-host.3.7.0.patch", when="@3.7.0:3.9.0")
     patch("0004-Drop-clang-rt-builtins-linking-on-hip-host.3.10.0.patch", when="@3.10.0:4.1.0")
 
@@ -400,14 +409,16 @@ class Hip(CMakePackage):
     patch("0014-remove-compiler-rt-linkage-for-host.5.5.0.patch", when="@5.5")
     patch("0014-remove-compiler-rt-linkage-for-host.5.6.0.patch", when="@5.6.0:5.6")
     patch("0014-Remove-compiler-rt-linkage-for-host-for-5.7.0.patch", when="@5.7.0:5.7")
-    patch("0015-reverting-operator-mixup-fix-for-slate.patch", when="@5.6:")
-    patch("0017-Set-PARAMETERS_MIN_ALIGNMENT-to-the-native-alignment.patch", when="@5.7")
+    patch("0014-remove-compiler-rt-linkage-for-host.6.0.patch", when="@6.0:")
+    patch("0015-reverting-operator-mixup-fix-for-slate.patch", when="@5.6:6.0")
+    patch("0017-Set-PARAMETERS_MIN_ALIGNMENT-to-the-native-alignment.patch", when="@5.7:6.0")
+    patch("0018-reverting-hipMemoryType-with-memoryType.patch", when="@6.0")
 
-    # See https://github.com/ROCm-Developer-Tools/HIP/pull/3206
+    # See https://github.com/ROCm/HIP/pull/3206
     patch(
-        "https://github.com/ROCm-Developer-Tools/HIP/commit/50ee82f6bc4aad10908ce09198c9f7ebfb2a3561.patch?full_index=1",
+        "https://github.com/ROCm/HIP/commit/50ee82f6bc4aad10908ce09198c9f7ebfb2a3561.patch?full_index=1",
         sha256="c2ee21cdc55262c7c6ba65546b5ca5f65ea89730",
-        when="@5.2:",
+        when="@5.2:5.7",
     )
 
     @property
@@ -533,7 +544,7 @@ class Hip(CMakePackage):
 
             # This is a variable that does not exist in hipcc but was introduced
             # in a patch of ours since 3.5.0 to locate rocm_agent_enumerator:
-            # https://github.com/ROCm-Developer-Tools/HIP/pull/2138
+            # https://github.com/ROCm/HIP/pull/2138
             env.set("ROCMINFO_PATH", paths["rocminfo"])
 
             # This one is used in hipcc to run `clang --hip-device-lib-path=...`
@@ -548,7 +559,7 @@ class Hip(CMakePackage):
 
             # Used in comgr and seems necessary when using the JIT compiler, e.g.
             # hiprtcCreateProgram:
-            # https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/blob/rocm-4.0.0/lib/comgr/src/comgr-env.cpp
+            # https://github.com/ROCm/ROCm-CompilerSupport/blob/rocm-4.0.0/lib/comgr/src/comgr-env.cpp
             env.set("LLVM_PATH", paths["llvm-amdgpu"])
             env.set("COMGR_PATH", paths["comgr"])
 
@@ -560,7 +571,7 @@ class Hip(CMakePackage):
             # and parsing of the <prefix>/bin/.hipVersion file. Let's just set this
             # to the hip prefix directory for non-external builds so that the
             # bin/.hipVersion file can still be parsed.
-            # See also https://github.com/ROCm-Developer-Tools/HIP/issues/2223
+            # See also https://github.com/ROCm/HIP/issues/2223
             if "@3.8.0:" in self.spec:
                 env.append_path(
                     "HIPCC_COMPILE_FLAGS_APPEND",
diff --git a/var/spack/repos/builtin/packages/hipblas/package.py b/var/spack/repos/builtin/packages/hipblas/package.py
index e05dfd3768..46b02ad352 100644
--- a/var/spack/repos/builtin/packages/hipblas/package.py
+++ b/var/spack/repos/builtin/packages/hipblas/package.py
@@ -12,9 +12,9 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage):
     """hipBLAS is a BLAS marshalling library, with multiple
     supported backends"""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/hipBLAS"
-    git = "https://github.com/ROCmSoftwarePlatform/hipBLAS.git"
-    url = "https://github.com/ROCmSoftwarePlatform/hipBLAS/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/hipBLAS"
+    git = "https://github.com/ROCm/hipBLAS.git"
+    url = "https://github.com/ROCm/hipBLAS/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie")
@@ -24,6 +24,7 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage):
 
     version("develop", branch="develop")
     version("master", branch="master")
+    version("6.0.0", sha256="8fbd0c244fe82eded866e06d2399b1d91ab5d43d2ebcb73382c7ce1ae48d9cb3")
     version("5.7.1", sha256="794e9298f48ffbe3bd1c1ab87a5c2c2b953713500155fdec9ef8cbb11f81fc8a")
     version("5.7.0", sha256="8c6cd2ffa4ce6ab03e05feffe074685b5525610870aebe9d78f817b3037f33a4")
     version("5.6.1", sha256="f9da82fbefc68b84081ea0ed0139b91d2a540357fcf505c7f1d57eab01eb327c")
@@ -136,11 +137,14 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage):
     patch("link-clients-blas.patch", when="@4.3.0:4.3.2")
     patch("link-clients-blas-4.5.0.patch", when="@4.5.0:4.5.2")
     patch("hipblas-link-clients-blas-5.0.0.patch", when="@5.0.0:5.0.2")
-    patch("remove-hipblas-clients-file-installation.patch", when="@5.5:")
+    patch("remove-hipblas-clients-file-installation.patch", when="@5.5:5.7.1")
+    patch("remove-hipblas-clients-file-installation-6.0.patch", when="@6.0:")
 
-    depends_on("rocm-cmake@5.2.0:", type="build", when="@5.2.0:")
+    depends_on("rocm-cmake@5.2.0:", type="build", when="@5.2.0:5.7")
     depends_on("rocm-cmake@4.5.0:", type="build", when="@4.5.0:")
     depends_on("rocm-cmake@3.5.0:", type="build")
+    for ver in ["6.0.0"]:
+        depends_on("rocm-cmake@" + ver, type="build", when="+rocm @" + ver)
 
     depends_on("hip +cuda", when="+cuda")
 
@@ -174,12 +178,12 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
         "master",
         "develop",
     ]:
         depends_on("rocsolver@" + ver, when="+rocm @" + ver)
         depends_on("rocblas@" + ver, when="+rocm @" + ver)
-
     for tgt in ROCmPackage.amdgpu_targets:
         depends_on(
             "rocblas amdgpu_target={0}".format(tgt), when="+rocm amdgpu_target={0}".format(tgt)
diff --git a/var/spack/repos/builtin/packages/hipblas/remove-hipblas-clients-file-installation-6.0.patch b/var/spack/repos/builtin/packages/hipblas/remove-hipblas-clients-file-installation-6.0.patch
new file mode 100644
index 0000000000..ca6fa8f413
--- /dev/null
+++ b/var/spack/repos/builtin/packages/hipblas/remove-hipblas-clients-file-installation-6.0.patch
@@ -0,0 +1,32 @@
+From 120af1b2483868ebdc2ee5f137418d23c14178ad Mon Sep 17 00:00:00 2001
+From: Renjith Ravindran <Renjith.RavindranKannath@amd.com>
+Date: Wed, 10 Jan 2024 04:28:15 +0000
+Subject: [PATCH] Remove hipblas clients file installation
+
+---
+ clients/CMakeLists.txt | 12 ------------
+ 1 file changed, 12 deletions(-)
+
+diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt
+index 8206ad7..6a59808 100644
+--- a/clients/CMakeLists.txt
++++ b/clients/CMakeLists.txt
+@@ -135,15 +135,3 @@ add_custom_command( OUTPUT "${HIPBLAS_GENTEST}"
+ 
+ add_custom_target( hipblas-common DEPENDS "${HIPBLAS_COMMON}" "${HIPBLAS_TEMPLATE}" "${HIPBLAS_SMOKE}" "${HIPBLAS_GENTEST}" )
+ 
+-if( BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_BENCHMARKS )
+-  rocm_install(
+-    FILES ${HIPBLAS_COMMON} ${HIPBLAS_TEMPLATE} ${HIPBLAS_SMOKE}
+-    DESTINATION "${CMAKE_INSTALL_BINDIR}"
+-    COMPONENT clients-common
+-  )
+-  rocm_install(
+-    PROGRAMS ${HIPBLAS_GENTEST}
+-    DESTINATION "${CMAKE_INSTALL_BINDIR}"
+-    COMPONENT clients-common
+-  )
+-endif()
+-- 
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/hipcub/package.py b/var/spack/repos/builtin/packages/hipcub/package.py
index 61c05e7431..34e16cd4bc 100644
--- a/var/spack/repos/builtin/packages/hipcub/package.py
+++ b/var/spack/repos/builtin/packages/hipcub/package.py
@@ -9,14 +9,15 @@ from spack.package import *
 class Hipcub(CMakePackage, CudaPackage, ROCmPackage):
     """Radeon Open Compute Parallel Primitives Library"""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/hipCUB"
-    git = "https://github.com/ROCmSoftwarePlatform/hipCUB.git"
-    url = "https://github.com/ROCmSoftwarePlatform/hipCUB/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/hipCUB"
+    git = "https://github.com/ROCm/hipCUB.git"
+    url = "https://github.com/ROCm/hipCUB/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     license("BSD-3-Clause")
 
     maintainers("srekolam", "renjithravindrankannath")
+    version("6.0.0", sha256="8d9f6e1e3f8433a2ceae1b0efd6727c21383980077e264725d00d5fee165bd30")
     version("5.7.1", sha256="9b23a58408bc4c549d3c754196cb3e2c1a50e177ab0a286101cbea2f7f173945")
     version("5.7.0", sha256="899356867f662d9a6f3870bb4a496f605a3143c6ad4d1fa9e9faead68fa8d13b")
     version("5.6.1", sha256="4b9479daa40424c9ddbc14ce967aa170680f8ca1ed01a514e6e30ccfa22552ce")
@@ -157,6 +158,7 @@ class Hipcub(CMakePackage, CudaPackage, ROCmPackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocprim@" + ver, when="+rocm @" + ver)
         depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/hipfft/001-remove-submodule-and-sync-shared-files-from-rocFFT.patch b/var/spack/repos/builtin/packages/hipfft/001-remove-submodule-and-sync-shared-files-from-rocFFT.patch
new file mode 100644
index 0000000000..537794d3cc
--- /dev/null
+++ b/var/spack/repos/builtin/packages/hipfft/001-remove-submodule-and-sync-shared-files-from-rocFFT.patch
@@ -0,0 +1,11431 @@
+From 27ae15a459f45f1acfcb1a9b1c8d491d9f731fd4 Mon Sep 17 00:00:00 2001
+From: Steve Leung <Steve.Leung@amd.com>
+Date: Thu, 4 Jan 2024 16:36:08 -0700
+Subject: [PATCH] remove submodule and sync shared files from rocFFT, update
+ CHANGELOG.md
+
+---
+ clients/CMakeLists.txt                   |   15 -
+ clients/bench/CMakeLists.txt             |    4 +-
+ clients/bench/bench.cpp                  |    2 +-
+ clients/hipfft_params.h                  |    2 +-
+ clients/tests/CMakeLists.txt             |   11 +-
+ clients/tests/accuracy_test_1D.cpp       |    8 +-
+ clients/tests/accuracy_test_2D.cpp       |    8 +-
+ clients/tests/accuracy_test_3D.cpp       |    8 +-
+ clients/tests/accuracy_test_callback.cpp |    2 +-
+ clients/tests/gtest_main.cpp             |    6 +-
+ clients/tests/hipfft_accuracy_test.cpp   |   11 +-
+ clients/tests/hipfft_accuracy_test.h     |    2 +-
+ clients/tests/multi_device_test.cpp      |    2 +-
+ cmake/dependencies.cmake                 |    3 -
+ library/src/amd_detail/hipfft.cpp        |    8 +-
+ shared/accuracy_test.h                   | 1949 +++++++++++++
+ shared/arithmetic.h                      |   61 +
+ shared/array_predicate.h                 |   47 +
+ shared/array_validator.cpp               |  549 ++++
+ shared/array_validator.h                 |   31 +
+ shared/concurrency.h                     |   41 +
+ shared/data_gen_device.h                 | 1303 +++++++++
+ shared/data_gen_host.h                   |  881 ++++++
+ shared/device_properties.h               |   74 +
+ shared/enum_to_string.h                  |   81 +
+ shared/environment.h                     |   97 +
+ shared/fft_params.h                      | 3274 ++++++++++++++++++++++
+ shared/fftw_transform.h                  |  493 ++++
+ shared/gpubuf.h                          |  134 +
+ shared/hip_object_wrapper.h              |   86 +
+ shared/hostbuf.h                         |  158 ++
+ shared/increment.h                       |  100 +
+ shared/precision_type.h                  |   70 +
+ shared/printbuffer.h                     |  108 +
+ shared/ptrdiff.h                         |   40 +
+ shared/rocfft_accuracy_test.h            |   29 +
+ shared/rocfft_against_fftw.h             |  231 ++
+ shared/rocfft_complex.h                  |  346 +++
+ shared/rocfft_hip.h                      |   52 +
+ shared/rocfft_params.h                   |  585 ++++
+ shared/test_params.h                     |   51 +
+ shared/work_queue.h                      |   49 +
+ 46 files changed, 10966 insertions(+), 66 deletions(-)
+ create mode 100644 shared/accuracy_test.h
+ create mode 100644 shared/arithmetic.h
+ create mode 100644 shared/array_predicate.h
+ create mode 100644 shared/array_validator.cpp
+ create mode 100644 shared/array_validator.h
+ create mode 100644 shared/concurrency.h
+ create mode 100644 shared/data_gen_device.h
+ create mode 100644 shared/data_gen_host.h
+ create mode 100644 shared/device_properties.h
+ create mode 100644 shared/enum_to_string.h
+ create mode 100644 shared/environment.h
+ create mode 100644 shared/fft_params.h
+ create mode 100644 shared/fftw_transform.h
+ create mode 100644 shared/gpubuf.h
+ create mode 100644 shared/hip_object_wrapper.h
+ create mode 100644 shared/hostbuf.h
+ create mode 100644 shared/increment.h
+ create mode 100644 shared/precision_type.h
+ create mode 100644 shared/printbuffer.h
+ create mode 100644 shared/ptrdiff.h
+ create mode 100644 shared/rocfft_accuracy_test.h
+ create mode 100644 shared/rocfft_against_fftw.h
+ create mode 100644 shared/rocfft_complex.h
+ create mode 100644 shared/rocfft_hip.h
+ create mode 100644 shared/rocfft_params.h
+ create mode 100644 shared/test_params.h
+ create mode 100644 shared/work_queue.h
+
+diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt
+index 1db0d9c..b99a9e5 100644
+--- a/clients/CMakeLists.txt
++++ b/clients/CMakeLists.txt
+@@ -65,21 +65,6 @@ if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" AND NOT CMAKE_CXX_COMPILER_ID STR
+ endif()
+ 
+ 
+-if( GIT_FOUND AND EXISTS "${CMAKE_SOURCE_DIR}/.git" )
+-  message(STATUS "rocFFT submodule update")
+-  execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive
+-    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/clients/rocFFT
+-    RESULT_VARIABLE GIT_SUBMOD_RESULT)
+-  if( NOT GIT_SUBMOD_RESULT EQUAL "0" )
+-    message(FATAL_ERROR "git submodule update --init --recursive failed with ${GIT_SUBMOD_RESULT}, please checkout submodules manually.")
+-  endif( )
+-endif( )
+-
+-if( NOT EXISTS "${CMAKE_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt" )
+-  message(FATAL_ERROR "The rocFFT submodule is not present!  Please update git submodules and try again. ${CMAKE_CURRENT_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt")
+-endif( )
+-
+-
+ # This option only works for make/nmake and the ninja generators, but no reason it shouldn't be on
+ # all the time
+ # This tells cmake to create a compile_commands.json file that can be used with clang tooling or vim
+diff --git a/clients/bench/CMakeLists.txt b/clients/bench/CMakeLists.txt
+index b5cef9b..ccb8c29 100644
+--- a/clients/bench/CMakeLists.txt
++++ b/clients/bench/CMakeLists.txt
+@@ -26,8 +26,8 @@ find_package( Boost COMPONENTS program_options REQUIRED)
+ set( Boost_USE_STATIC_LIBS OFF )
+ 
+ 
+-set( hipfft_bench_source bench.cpp ../rocFFT/shared/array_validator.cpp )
+-set( hipfft_bench_includes bench.h ../rocFFT/shared/array_validator.h )
++set( hipfft_bench_source bench.cpp ../../shared/array_validator.cpp )
++set( hipfft_bench_includes bench.h ../../shared/array_validator.h )
+ 
+ add_executable( hipfft-bench ${hipfft_bench_source} ${hipfft_bench_includes} )
+ 
+diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp
+index 894769c..a906879 100644
+--- a/clients/bench/bench.cpp
++++ b/clients/bench/bench.cpp
+@@ -29,7 +29,7 @@
+ #include <boost/program_options.hpp>
+ namespace po = boost::program_options;
+ 
+-#include "../rocFFT/shared/gpubuf.h"
++#include "../../shared/gpubuf.h"
+ 
+ int main(int argc, char* argv[])
+ {
+diff --git a/clients/hipfft_params.h b/clients/hipfft_params.h
+index b8b58ac..75d9db9 100644
+--- a/clients/hipfft_params.h
++++ b/clients/hipfft_params.h
+@@ -23,9 +23,9 @@
+ 
+ #include <optional>
+ 
++#include "../shared/fft_params.h"
+ #include "hipfft/hipfft.h"
+ #include "hipfft/hipfftXt.h"
+-#include "rocFFT/shared/fft_params.h"
+ 
+ inline fft_status fft_status_from_hipfftparams(const hipfftResult_t val)
+ {
+diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt
+index 9742a45..2d1aac0 100644
+--- a/clients/tests/CMakeLists.txt
++++ b/clients/tests/CMakeLists.txt
+@@ -37,14 +37,7 @@ set( hipfft-test_source
+   accuracy_test_3D.cpp
+   accuracy_test_callback.cpp
+   multi_device_test.cpp
+-  ../rocFFT/shared/array_validator.cpp
+-  )
+-
+-set( hipfft-test_includes
+-  ../rocFFT/clients/tests/fftw_transform.h
+-  ../rocFFT/clients/tests/rocfft_against_fftw.h
+-  ../rocFFT/clients/tests/misc/include/test_exception.h
+-  ../rocFFT/shared/array_validator.h
++  ../../shared/array_validator.cpp
+   )
+ 
+ add_executable( hipfft-test ${hipfft-test_source} ${hipfft-test_includes} )
+@@ -56,8 +49,6 @@ target_include_directories(
+   $<BUILD_INTERFACE:${FFTW_INCLUDE_DIRS}>
+   $<BUILD_INTERFACE:${hip_INCLUDE_DIRS}>
+   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../library/include>
+-  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../rocFFT/library/include>
+-  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../rocFFT/clients/tests>
+   )
+ 
+ 
+diff --git a/clients/tests/accuracy_test_1D.cpp b/clients/tests/accuracy_test_1D.cpp
+index 27e849d..57d846a 100644
+--- a/clients/tests/accuracy_test_1D.cpp
++++ b/clients/tests/accuracy_test_1D.cpp
+@@ -23,11 +23,11 @@
+ #include <stdexcept>
+ #include <vector>
+ 
+-#include "../rocFFT/shared/fft_params.h"
++#include "../../shared/fft_params.h"
+ 
+-#include "accuracy_test.h"
+-#include "fftw_transform.h"
+-#include "rocfft_against_fftw.h"
++#include "../../shared/accuracy_test.h"
++#include "../../shared/fftw_transform.h"
++#include "../../shared/rocfft_against_fftw.h"
+ 
+ using ::testing::ValuesIn;
+ 
+diff --git a/clients/tests/accuracy_test_2D.cpp b/clients/tests/accuracy_test_2D.cpp
+index 1674593..6f618c0 100644
+--- a/clients/tests/accuracy_test_2D.cpp
++++ b/clients/tests/accuracy_test_2D.cpp
+@@ -23,11 +23,11 @@
+ #include <stdexcept>
+ #include <vector>
+ 
+-#include "../rocFFT/shared/fft_params.h"
++#include "../../shared/fft_params.h"
+ 
+-#include "accuracy_test.h"
+-#include "fftw_transform.h"
+-#include "rocfft_against_fftw.h"
++#include "../../shared/accuracy_test.h"
++#include "../../shared/fftw_transform.h"
++#include "../../shared/rocfft_against_fftw.h"
+ 
+ using ::testing::ValuesIn;
+ 
+diff --git a/clients/tests/accuracy_test_3D.cpp b/clients/tests/accuracy_test_3D.cpp
+index a87476a..941ec24 100644
+--- a/clients/tests/accuracy_test_3D.cpp
++++ b/clients/tests/accuracy_test_3D.cpp
+@@ -23,11 +23,11 @@
+ #include <stdexcept>
+ #include <vector>
+ 
+-#include "../rocFFT/shared/fft_params.h"
++#include "../../shared/fft_params.h"
+ 
+-#include "accuracy_test.h"
+-#include "fftw_transform.h"
+-#include "rocfft_against_fftw.h"
++#include "../../shared/accuracy_test.h"
++#include "../../shared/fftw_transform.h"
++#include "../../shared/rocfft_against_fftw.h"
+ 
+ using ::testing::ValuesIn;
+ 
+diff --git a/clients/tests/accuracy_test_callback.cpp b/clients/tests/accuracy_test_callback.cpp
+index 4782830..b5cc4a7 100644
+--- a/clients/tests/accuracy_test_callback.cpp
++++ b/clients/tests/accuracy_test_callback.cpp
+@@ -18,7 +18,7 @@
+ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ // THE SOFTWARE.
+ 
+-#include "accuracy_test.h"
++#include "../../shared/accuracy_test.h"
+ 
+ std::vector<std::vector<size_t>> callback_sizes = {
+     // some single kernel sizes
+diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp
+index 1f0ae83..2f7674e 100644
+--- a/clients/tests/gtest_main.cpp
++++ b/clients/tests/gtest_main.cpp
+@@ -30,10 +30,10 @@
+ #include <streambuf>
+ #include <string>
+ 
++#include "../../shared/concurrency.h"
++#include "../../shared/environment.h"
++#include "../../shared/work_queue.h"
+ #include "../hipfft_params.h"
+-#include "../rocFFT/shared/concurrency.h"
+-#include "../rocFFT/shared/environment.h"
+-#include "../rocFFT/shared/work_queue.h"
+ #include "hipfft/hipfft.h"
+ #include "hipfft_accuracy_test.h"
+ #include "hipfft_test_params.h"
+diff --git a/clients/tests/hipfft_accuracy_test.cpp b/clients/tests/hipfft_accuracy_test.cpp
+index 2abaf74..609239a 100644
+--- a/clients/tests/hipfft_accuracy_test.cpp
++++ b/clients/tests/hipfft_accuracy_test.cpp
+@@ -29,11 +29,12 @@
+ #include "hipfft/hipfft.h"
+ 
+ #include "../hipfft_params.h"
+-#include "../rocFFT/clients/tests/fftw_transform.h"
+-#include "../rocFFT/clients/tests/rocfft_accuracy_test.h"
+-#include "../rocFFT/clients/tests/rocfft_against_fftw.h"
+-#include "../rocFFT/shared/gpubuf.h"
+-#include "../rocFFT/shared/rocfft_complex.h"
++
++#include "../../shared/accuracy_test.h"
++#include "../../shared/fftw_transform.h"
++#include "../../shared/gpubuf.h"
++#include "../../shared/rocfft_against_fftw.h"
++#include "../../shared/rocfft_complex.h"
+ 
+ void fft_vs_reference(hipfft_params& params, bool round_trip)
+ {
+diff --git a/clients/tests/hipfft_accuracy_test.h b/clients/tests/hipfft_accuracy_test.h
+index 0491bd9..181150e 100644
+--- a/clients/tests/hipfft_accuracy_test.h
++++ b/clients/tests/hipfft_accuracy_test.h
+@@ -23,8 +23,8 @@
+ #ifndef ROCFFT_ACCURACY_TEST
+ #define ROCFFT_ACCURACY_TEST
+ 
++#include "../../shared/accuracy_test.h"
+ #include "../hipfft_params.h"
+-#include "../rocFFT/clients/tests/accuracy_test.h"
+ 
+ void fft_vs_reference(hipfft_params& params, bool round_trip = false);
+ 
+diff --git a/clients/tests/multi_device_test.cpp b/clients/tests/multi_device_test.cpp
+index b3dc4c9..3274b80 100644
+--- a/clients/tests/multi_device_test.cpp
++++ b/clients/tests/multi_device_test.cpp
+@@ -18,7 +18,7 @@
+ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ // THE SOFTWARE.
+ 
+-#include "accuracy_test.h"
++#include "../../shared/accuracy_test.h"
+ #include <gtest/gtest.h>
+ #include <hip/hip_runtime_api.h>
+ 
+diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake
+index 5810e37..bdbf689 100644
+--- a/cmake/dependencies.cmake
++++ b/cmake/dependencies.cmake
+@@ -21,9 +21,6 @@
+ #
+ # #############################################################################
+ 
+-# Git
+-find_package(Git REQUIRED)
+-
+ # HIP
+ if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
+   if( NOT BUILD_WITH_LIB STREQUAL "CUDA" )
+diff --git a/library/src/amd_detail/hipfft.cpp b/library/src/amd_detail/hipfft.cpp
+index c2f7036..3d4f61f 100644
+--- a/library/src/amd_detail/hipfft.cpp
++++ b/library/src/amd_detail/hipfft.cpp
+@@ -27,10 +27,10 @@
+ #include <string>
+ #include <vector>
+ 
+-#include "../../../clients/rocFFT/shared/arithmetic.h"
+-#include "../../../clients/rocFFT/shared/gpubuf.h"
+-#include "../../../clients/rocFFT/shared/ptrdiff.h"
+-#include "../../../clients/rocFFT/shared/rocfft_hip.h"
++#include "../../../shared/arithmetic.h"
++#include "../../../shared/gpubuf.h"
++#include "../../../shared/ptrdiff.h"
++#include "../../../shared/rocfft_hip.h"
+ 
+ #define ROC_FFT_CHECK_ALLOC_FAILED(ret)   \
+     {                                     \
+diff --git a/shared/accuracy_test.h b/shared/accuracy_test.h
+new file mode 100644
+index 0000000..362a7c1
+--- /dev/null
++++ b/shared/accuracy_test.h
+@@ -0,0 +1,1949 @@
++// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++
++#ifndef ACCURACY_TEST
++#define ACCURACY_TEST
++
++#include <algorithm>
++#include <functional>
++#include <future>
++#include <iterator>
++#include <string>
++#include <vector>
++
++#include "enum_to_string.h"
++#include "fft_params.h"
++#include "fftw_transform.h"
++#include "gpubuf.h"
++#include "rocfft_against_fftw.h"
++#include "test_params.h"
++
++extern int    verbose;
++extern size_t ramgb;
++extern bool   fftw_compare;
++
++static const size_t ONE_GiB = 1 << 30;
++
++inline size_t bytes_to_GiB(const size_t bytes)
++{
++    return bytes == 0 ? 0 : (bytes - 1 + ONE_GiB) / ONE_GiB;
++}
++
++typedef std::tuple<fft_transform_type, fft_result_placement, fft_array_type, fft_array_type>
++    type_place_io_t;
++
++// Remember the results of the last FFT we computed with FFTW.  Tests
++// are ordered so that later cases can often reuse this result.
++struct last_cpu_fft_cache
++{
++    // keys to the cache
++    std::vector<size_t> length;
++    size_t              nbatch         = 0;
++    fft_transform_type  transform_type = fft_transform_type_complex_forward;
++    bool                run_callbacks  = false;
++    fft_precision       precision      = fft_precision_single;
++
++    // FFTW input/output
++    std::vector<hostbuf> cpu_input;
++    std::vector<hostbuf> cpu_output;
++};
++extern last_cpu_fft_cache last_cpu_fft_data;
++
++struct system_memory
++{
++    size_t total_bytes = 0;
++    size_t free_bytes  = 0;
++};
++extern system_memory start_memory;
++
++system_memory get_system_memory();
++
++// Estimate the amount of host memory needed for buffers.
++inline size_t needed_ram_buffers(const fft_params& params, const int verbose)
++{
++    // This calculation is assuming contiguous data but noncontiguous buffers
++    // are assumed to require a close enough amount of space for the purposes
++    // of this estimate.
++
++    size_t needed_ram = 6
++                        * std::accumulate(params.length.begin(),
++                                          params.length.end(),
++                                          static_cast<size_t>(1),
++                                          std::multiplies<size_t>());
++
++    // Account for precision and data type:
++    if(params.transform_type != fft_transform_type_real_forward
++       && params.transform_type != fft_transform_type_real_inverse)
++    {
++        needed_ram *= 2;
++    }
++    switch(params.precision)
++    {
++    case fft_precision_half:
++        needed_ram *= 2;
++        break;
++    case fft_precision_single:
++        needed_ram *= 4;
++        break;
++    case fft_precision_double:
++        needed_ram *= 8;
++        break;
++    }
++
++    needed_ram *= params.nbatch;
++
++    if(verbose)
++    {
++        std::cout << "required host memory for buffers (GiB): " << bytes_to_GiB(needed_ram) << "\n";
++    }
++
++    return needed_ram;
++}
++
++template <typename Tfloat>
++bool fftw_plan_uses_bluestein(const typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan)
++{
++#ifdef FFTW_HAVE_SPRINT_PLAN
++    char*       print_plan_c_str = fftw_sprint_plan<Tfloat>(cpu_plan);
++    std::string print_plan(print_plan_c_str);
++    free(print_plan_c_str);
++    return print_plan.find("bluestein") != std::string::npos;
++#else
++    // assume worst case (bluestein is always used)
++    return true;
++#endif
++}
++
++// Estimate the amount of host memory needed for fftw.
++template <typename Tfloat>
++inline size_t needed_ram_fftw(const fft_params&                                  contiguous_params,
++                              const typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan,
++                              const int                                          verbose)
++{
++    size_t total_length = std::accumulate(contiguous_params.length.begin(),
++                                          contiguous_params.length.end(),
++                                          static_cast<size_t>(1),
++                                          std::multiplies<size_t>());
++    size_t needed_ram   = 0;
++    // Detect Bluestein in plan
++    if(fftw_plan_uses_bluestein<Tfloat>(cpu_plan))
++    {
++        for(size_t dim : contiguous_params.length)
++        {
++            size_t needed_ram_dim = dim;
++            // Round dim up to the next power of two: decrement, smear the highest
++            // set bit into all lower bits (covers dim < 2^32), then increment.
++            needed_ram_dim--;
++            needed_ram_dim |= needed_ram_dim >> 1;
++            needed_ram_dim |= needed_ram_dim >> 2;
++            needed_ram_dim |= needed_ram_dim >> 4;
++            needed_ram_dim |= needed_ram_dim >> 8;
++            needed_ram_dim |= needed_ram_dim >> 16;
++
++            needed_ram_dim++;
++            // Double it and multiply by the product of the other lengths.
++            needed_ram_dim *= 2 * (total_length / dim);
++
++            if(needed_ram_dim > needed_ram)
++            {
++                needed_ram = needed_ram_dim;
++            }
++        }
++    }
++
++    // Account for precision and data type:
++    if(contiguous_params.transform_type != fft_transform_type_real_forward
++       && contiguous_params.transform_type != fft_transform_type_real_inverse)
++    {
++        needed_ram *= 2;
++    }
++    switch(contiguous_params.precision)
++    {
++    case fft_precision_half:
++        needed_ram *= 2;
++        break;
++    case fft_precision_single:
++        needed_ram *= 4;
++        break;
++    case fft_precision_double:
++        needed_ram *= 8;
++        break;
++    }
++
++    needed_ram *= contiguous_params.nbatch;
++
++    if(verbose)
++    {
++        std::cout << "required host memory for FFTW (GiB): " << bytes_to_GiB(needed_ram) << "\n";
++    }
++
++    return needed_ram;
++}
++
++// Base gtest class for comparison with FFTW.
++class accuracy_test : public ::testing::TestWithParam<fft_params>
++{
++protected:
++    void SetUp() override {}
++    void TearDown() override {}
++
++public:
++    static std::string TestName(const testing::TestParamInfo<accuracy_test::ParamType>& info)
++    {
++        return info.param.token();
++    }
++};
++
++const static std::vector<size_t> batch_range = {2, 1};
++
++const static std::vector<fft_precision> precision_range_full
++    = {fft_precision_double, fft_precision_single, fft_precision_half};
++const static std::vector<fft_precision> precision_range_sp_dp
++    = {fft_precision_double, fft_precision_single};
++
++const static std::vector<fft_result_placement> place_range
++    = {fft_placement_inplace, fft_placement_notinplace};
++const static std::vector<fft_transform_type> trans_type_range
++    = {fft_transform_type_complex_forward, fft_transform_type_real_forward};
++const static std::vector<fft_transform_type> trans_type_range_complex
++    = {fft_transform_type_complex_forward};
++const static std::vector<fft_transform_type> trans_type_range_real
++    = {fft_transform_type_real_forward};
++
++// Given one vector of candidate lengths per dimension, generate every unique
++// combination (one length picked per dimension), returned in sorted order.
++inline std::vector<std::vector<size_t>>
++    generate_lengths(const std::vector<std::vector<size_t>>& inlengths)
++{
++    std::vector<std::vector<size_t>> output;
++    if(inlengths.size() == 0)
++    {
++        return output;
++    }
++    const size_t        dim = inlengths.size();
++    std::vector<size_t> looplength(dim);
++    for(unsigned int i = 0; i < dim; ++i)
++    {
++        looplength[i] = inlengths[i].size();
++    }
++    // One pass from the all-zero index visits every combination exactly once.
++    {
++        std::vector<size_t> index(dim);
++        do
++        {
++            std::vector<size_t> length(dim);
++            for(unsigned int i = 0; i < dim; ++i)
++            {
++                length[i] = inlengths[i][index[i]];
++            }
++            output.push_back(length);
++        } while(increment_rowmajor(index, looplength));
++    }
++    // uniquify the result
++    std::sort(output.begin(), output.end());
++    output.erase(std::unique(output.begin(), output.end()), output.end());
++    return output;
++}
++
++// List the (input, output) array-type pairs that are valid for the given
++// transform type and result placement.
++//
++// The interleaved pair is always offered.  Planar pairs are only offered
++// when `planar` is true, and pairs that mix layouts (or real with planar
++// hermitian) additionally require a not-in-place transform.
++//
++// Throws std::runtime_error for an unrecognised transform type.
++inline std::vector<std::pair<fft_array_type, fft_array_type>>
++    iotypes(const fft_transform_type   transformType,
++            const fft_result_placement place,
++            const bool                 planar = true)
++{
++    using io_pair = std::pair<fft_array_type, fft_array_type>;
++
++    const bool out_of_place = place == fft_placement_notinplace;
++
++    std::vector<io_pair> result;
++    switch(transformType)
++    {
++    case fft_transform_type_complex_forward:
++    case fft_transform_type_complex_inverse:
++        result.emplace_back(fft_array_type_complex_interleaved,
++                            fft_array_type_complex_interleaved);
++        if(!planar)
++        {
++            break;
++        }
++        result.emplace_back(fft_array_type_complex_planar, fft_array_type_complex_planar);
++        if(out_of_place)
++        {
++            result.emplace_back(fft_array_type_complex_planar,
++                                fft_array_type_complex_interleaved);
++            result.emplace_back(fft_array_type_complex_interleaved,
++                                fft_array_type_complex_planar);
++        }
++        break;
++    case fft_transform_type_real_forward:
++        result.emplace_back(fft_array_type_real, fft_array_type_hermitian_interleaved);
++        if(planar && out_of_place)
++        {
++            result.emplace_back(fft_array_type_real, fft_array_type_hermitian_planar);
++        }
++        break;
++    case fft_transform_type_real_inverse:
++        result.emplace_back(fft_array_type_hermitian_interleaved, fft_array_type_real);
++        if(planar && out_of_place)
++        {
++            result.emplace_back(fft_array_type_hermitian_planar, fft_array_type_real);
++        }
++        break;
++    default:
++        throw std::runtime_error("Invalid transform type");
++    }
++    return result;
++}
++
++// For one transform type, build every (transform type, placement, input
++// type, output type) tuple: each placement in place_range is combined with
++// each input/output pair that iotypes() reports for it.
++static std::vector<type_place_io_t>
++    generate_types(fft_transform_type                       transform_type,
++                   const std::vector<fft_result_placement>& place_range,
++                   const bool                               planar)
++{
++    std::vector<type_place_io_t> combos;
++    for(const auto& placement : place_range)
++    {
++        const auto io_pairs = iotypes(transform_type, placement, planar);
++        for(const auto& io : io_pairs)
++        {
++            combos.push_back(std::make_tuple(transform_type, placement, io.first, io.second));
++        }
++    }
++    return combos;
++}
++
++// Produces the stride/dist combinations to test for a given problem.
++// The base implementation simply replays a fixed list of strides;
++// derived generators may override generate() to add computed entries.
++struct stride_generator
++{
++    // One stride vector together with the batch distance to use with it.
++    struct stride_dist
++    {
++        stride_dist(const std::vector<size_t>& s, size_t d)
++            : stride(s)
++            , dist(d)
++        {
++        }
++        std::vector<size_t> stride;
++        size_t              dist;
++    };
++
++    // NOTE: allow for this ctor to be implicit, so it's less typing for a test writer
++    //
++    // cppcheck-suppress noExplicitConstructor
++    stride_generator(const std::vector<std::vector<size_t>>& stride_list_in)
++        : stride_list(stride_list_in)
++    {
++    }
++
++    // Return one entry per configured stride vector, each with dist 0.
++    // `lengths` and `batch` are not used by the base implementation.
++    virtual std::vector<stride_dist> generate(const std::vector<size_t>& lengths,
++                                              size_t                     batch) const
++    {
++        std::vector<stride_dist> entries;
++        entries.reserve(stride_list.size());
++        for(const auto& configured : stride_list)
++        {
++            entries.emplace_back(configured, 0);
++        }
++        return entries;
++    }
++
++    // Strides supplied at construction.
++    std::vector<std::vector<size_t>> stride_list;
++};
++
++// Generate strides such that batch is essentially the innermost dimension
++// e.g. given a batch-2 4x3x2 transform which logically looks like:
++//
++// batch0:
++// A B A B
++// A B A B
++// A B A B
++//
++// A B A B
++// A B A B
++// A B A B
++//
++// batch1:
++// A B A B
++// A B A B
++// A B A B
++//
++// A B A B
++// A B A B
++// A B A B
++//
++// we instead do stride-2 4x3x2 transform where first batch is the
++// A's and second batch is the B's.
++//
++// The computed entry uses strides
++// {lengths[1] * lengths[2] * batch, lengths[2] * batch, batch} and dist 1,
++// and is appended after the entries of the base generator.
++struct stride_generator_3D_inner_batch : public stride_generator
++{
++    explicit stride_generator_3D_inner_batch(const std::vector<std::vector<size_t>>& stride_list_in)
++        : stride_generator(stride_list_in)
++    {
++    }
++    std::vector<stride_dist> generate(const std::vector<size_t>& lengths,
++                                      size_t                     batch) const override
++    {
++        std::vector<stride_dist> ret = stride_generator::generate(lengths, batch);
++        // The inner-batch layout needs three lengths.  Reading lengths[1]
++        // and lengths[2] of a shorter vector would be out of bounds, so
++        // for lower-dimensional problems only the base entries are returned.
++        if(lengths.size() < 3)
++        {
++            return ret;
++        }
++        std::vector<size_t> strides{lengths[1] * lengths[2] * batch, lengths[2] * batch, batch};
++        ret.emplace_back(strides, 1);
++        return ret;
++    }
++};
++
++// Create an array of parameters to pass to gtest.  Base generator
++// that allows choosing transform type.
++//
++// One fft_params is built for every combination of transform type, length,
++// precision, batch size, type/placement tuple (from types_generator),
++// input stride/dist, output stride/dist, input offset and output offset.
++//
++//  - Entries of v_lengths that are empty or have more than 3 dimensions
++//    are skipped.
++//  - When run_callbacks is true, combinations with a planar input or
++//    output type are dropped and the rest are marked to run callbacks.
++//  - Planar and callback cases are randomly thinned out according to
++//    planar_prob and callback_prob; the generator is seeded from
++//    random_seed and the test token so the selection is repeatable.
++//  - Only parameters for which param.valid(0) holds are returned.
++inline auto param_generator_base(const std::vector<fft_transform_type>&   type_range,
++                                 const std::vector<std::vector<size_t>>&  v_lengths,
++                                 const std::vector<fft_precision>&        precision_range,
++                                 const std::vector<size_t>&               batch_range,
++                                 decltype(generate_types)                 types_generator,
++                                 const stride_generator&                  istride,
++                                 const stride_generator&                  ostride,
++                                 const std::vector<std::vector<size_t>>&  ioffset_range,
++                                 const std::vector<std::vector<size_t>>&  ooffset_range,
++                                 const std::vector<fft_result_placement>& place_range,
++                                 const bool                               planar        = true,
++                                 const bool                               run_callbacks = false)
++{
++
++    std::vector<fft_params> params;
++
++    // For any length, we compute double-precision CPU reference
++    // for largest batch size first and reuse for smaller batch
++    // sizes, then convert to single-precision.
++
++    for(auto& transform_type : type_range)
++    {
++        for(const auto& lengths : v_lengths)
++        {
++            // try to ensure that we are given literal lengths, not
++            // something to be passed to generate_lengths
++            if(lengths.empty() || lengths.size() > 3)
++            {
++                continue;
++            }
++            for(const auto precision : precision_range)
++            {
++                for(const auto batch : batch_range)
++                {
++                    for(const auto& types :
++                        types_generator(transform_type, place_range, planar))
++                    {
++                        for(const auto& istride_dist : istride.generate(lengths, batch))
++                        {
++                            for(const auto& ostride_dist : ostride.generate(lengths, batch))
++                            {
++                                for(const auto& ioffset : ioffset_range)
++                                {
++                                    for(const auto& ooffset : ooffset_range)
++                                    {
++                                        fft_params param;
++
++                                        param.length         = lengths;
++                                        param.istride        = istride_dist.stride;
++                                        param.ostride        = ostride_dist.stride;
++                                        param.nbatch         = batch;
++                                        param.precision      = precision;
++                                        param.transform_type = std::get<0>(types);
++                                        param.placement      = std::get<1>(types);
++                                        param.idist          = istride_dist.dist;
++                                        param.odist          = ostride_dist.dist;
++                                        param.itype          = std::get<2>(types);
++                                        param.otype          = std::get<3>(types);
++                                        param.ioffset        = ioffset;
++                                        param.ooffset        = ooffset;
++
++                                        if(run_callbacks)
++                                        {
++                                            // only add a test if both input and
++                                            // output support callbacks, i.e.
++                                            // neither side is planar
++                                            if(param.itype == fft_array_type_complex_planar
++                                               || param.itype == fft_array_type_hermitian_planar
++                                               || param.otype == fft_array_type_complex_planar
++                                               || param.otype == fft_array_type_hermitian_planar)
++                                            {
++                                                continue;
++                                            }
++                                            param.run_callbacks = true;
++                                        }
++                                        param.validate();
++
++                                        // Keeping the random number generator here
++                                        // allows one to run the same tests for a given
++                                        // random seed; ie the test suite is repeatable.
++                                        std::hash<std::string>           hasher;
++                                        std::ranlux24_base               gen(random_seed
++                                                               + hasher(param.token()));
++                                        std::uniform_real_distribution<> dis(0.0, 1.0);
++
++                                        if(param.is_planar())
++                                        {
++                                            const double roll = dis(gen);
++                                            if(roll > planar_prob)
++                                            {
++                                                if(verbose > 4)
++                                                {
++                                                    // skipped because the roll
++                                                    // exceeded the probability
++                                                    std::cout << "Planar transform skipped "
++                                                                 "(planar_prob: "
++                                                              << planar_prob << " < " << roll
++                                                              << ")\n";
++                                                }
++                                                continue;
++                                            }
++                                        }
++                                        if(run_callbacks)
++                                        {
++                                            const double roll = dis(gen);
++                                            if(roll > callback_prob)
++                                            {
++                                                if(verbose > 4)
++                                                {
++                                                    // report the probability that
++                                                    // actually governed this skip
++                                                    std::cout << "Callback transform skipped "
++                                                                 "(callback_prob: "
++                                                              << callback_prob << " < " << roll
++                                                              << ")\n";
++                                                }
++                                                continue;
++                                            }
++                                        }
++
++                                        if(param.valid(0))
++                                        {
++                                            params.push_back(param);
++                                        }
++                                    }
++                                }
++                            }
++                        }
++                    }
++                }
++            }
++        }
++    }
++    return params;
++}
++
++// Create an array of parameters to pass to gtest.  Default generator:
++// forwards to param_generator_base with trans_type_range (forward complex
++// and forward real transforms) and generate_types as the types generator.
++// All other arguments are passed through unchanged.
++inline auto param_generator(const std::vector<std::vector<size_t>>&  v_lengths,
++                            const std::vector<fft_precision>&        precision_range,
++                            const std::vector<size_t>&               batch_range,
++                            const stride_generator&                  istride,
++                            const stride_generator&                  ostride,
++                            const std::vector<std::vector<size_t>>&  ioffset_range,
++                            const std::vector<std::vector<size_t>>&  ooffset_range,
++                            const std::vector<fft_result_placement>& place_range,
++                            const bool                               planar,
++                            const bool                               run_callbacks = false)
++{
++    return param_generator_base(trans_type_range,
++                                v_lengths,
++                                precision_range,
++                                batch_range,
++                                generate_types,
++                                istride,
++                                ostride,
++                                ioffset_range,
++                                ooffset_range,
++                                place_range,
++                                planar,
++                                run_callbacks);
++}
++
++// Create an array of parameters to pass to gtest.  Only tests complex-type
++// transforms: forwards to param_generator_base with
++// trans_type_range_complex (forward complex only).  All other arguments
++// are passed through unchanged.
++inline auto param_generator_complex(const std::vector<std::vector<size_t>>&  v_lengths,
++                                    const std::vector<fft_precision>&        precision_range,
++                                    const std::vector<size_t>&               batch_range,
++                                    const stride_generator&                  istride,
++                                    const stride_generator&                  ostride,
++                                    const std::vector<std::vector<size_t>>&  ioffset_range,
++                                    const std::vector<std::vector<size_t>>&  ooffset_range,
++                                    const std::vector<fft_result_placement>& place_range,
++                                    const bool                               planar,
++                                    const bool                               run_callbacks = false)
++{
++    return param_generator_base(trans_type_range_complex,
++                                v_lengths,
++                                precision_range,
++                                batch_range,
++                                generate_types,
++                                istride,
++                                ostride,
++                                ioffset_range,
++                                ooffset_range,
++                                place_range,
++                                planar,
++                                run_callbacks);
++}
++
++// Create an array of parameters to pass to gtest.  Only tests real-type
++// transforms: forwards to param_generator_base with trans_type_range_real
++// (forward real only).  All other arguments are passed through unchanged.
++inline auto param_generator_real(const std::vector<std::vector<size_t>>&  v_lengths,
++                                 const std::vector<fft_precision>&        precision_range,
++                                 const std::vector<size_t>&               batch_range,
++                                 const stride_generator&                  istride,
++                                 const stride_generator&                  ostride,
++                                 const std::vector<std::vector<size_t>>&  ioffset_range,
++                                 const std::vector<std::vector<size_t>>&  ooffset_range,
++                                 const std::vector<fft_result_placement>& place_range,
++                                 const bool                               planar,
++                                 const bool                               run_callbacks = false)
++{
++    return param_generator_base(trans_type_range_real,
++                                v_lengths,
++                                precision_range,
++                                batch_range,
++                                generate_types,
++                                istride,
++                                ostride,
++                                ioffset_range,
++                                ooffset_range,
++                                place_range,
++                                planar,
++                                run_callbacks);
++}
++
++// Build one fft_params per token, in order, by default-constructing the
++// parameters and filling them in with fft_params::from_token.
++// `tokens` may be any container that provides size() and iteration.
++template <class Tcontainer>
++auto param_generator_token(const Tcontainer& tokens)
++{
++    std::vector<fft_params> params;
++    params.reserve(tokens.size());
++    // tokens are only read, so iterate by const reference instead of
++    // copying every element
++    for(const auto& t : tokens)
++    {
++        params.emplace_back();
++        params.back().from_token(t);
++    }
++    return params;
++}
++
++// User data handed to the load/store test callbacks.  execute_gpu_fft fills
++// one instance on the host per callback, copies it to device memory and
++// passes the device pointer to set_callbacks.
++struct callback_test_data
++{
++    // scalar to modify the input/output with
++    double scalar;
++    // base address of input, to ensure that each callback gets an offset from that base
++    // (execute_gpu_fft stores the first input buffer pointer here for the
++    // load callback and the first output buffer pointer for the store callback)
++    void* base;
++};
++
++// Callback helpers, declared here and defined elsewhere.
++//
++// get_load_callback_host / get_store_callback_host: return the callback
++// pointer that execute_gpu_fft passes to set_callbacks for the given array
++// type and precision.
++// NOTE(review): presumably round_trip_inverse selects the callback variant
++// used for the inverse pass of a round-trip test - confirm in the definition.
++//
++// apply_load_callback / apply_store_callback: applied by execute_cpu_fft to
++// the CPU input before, and the CPU output after, the reference transform.
++void* get_load_callback_host(fft_array_type itype,
++                             fft_precision  precision,
++                             bool           round_trip_inverse);
++void  apply_load_callback(const fft_params& params, std::vector<hostbuf>& input);
++void  apply_store_callback(const fft_params& params, std::vector<hostbuf>& output);
++void* get_store_callback_host(fft_array_type otype,
++                              fft_precision  precision,
++                              bool           round_trip_inverse);
++
++// Allocate the host buffer(s) used by the CPU reference transform for the
++// given precision, array type and size.
++static auto allocate_cpu_fft_buffer(const fft_precision        precision,
++                                    const fft_array_type       type,
++                                    const std::vector<size_t>& size)
++{
++    // FFTW has no half-precision support; half-precision references are
++    // computed in single precision, so size the buffer for single.
++    const fft_precision cpu_precision
++        = (precision == fft_precision_half) ? fft_precision_single : precision;
++    return allocate_host_buffer(cpu_precision, type, size);
++}
++
++// Run the CPU (FFTW) reference transform described by contiguous_params,
++// applying the test load/store callbacks from params around it.
++//
++// On return cpu_output holds the reference result, the FFTW plan has been
++// destroyed and cpu_plan is reset to nullptr.  cpu_input is left unmodified
++// when callbacks are enabled or the transform is a real inverse, because
++// the work is then done on a copy.
++template <typename Tfloat>
++inline void execute_cpu_fft(fft_params&                                  params,
++                            fft_params&                                  contiguous_params,
++                            typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan,
++                            std::vector<hostbuf>&                        cpu_input,
++                            std::vector<hostbuf>&                        cpu_output)
++{
++    // FFTW may not have needed an output buffer during planning, in which
++    // case none has been allocated for us yet.
++    if(cpu_output.empty())
++    {
++        cpu_output = allocate_cpu_fft_buffer(
++            contiguous_params.precision, contiguous_params.otype, contiguous_params.osize);
++    }
++
++    // C2R transforms and load callbacks both modify the input, so in those
++    // cases work on a scratch copy instead of the caller's buffers.
++    const bool needs_scratch
++        = params.run_callbacks
++          || contiguous_params.transform_type == fft_transform_type_real_inverse;
++
++    std::vector<hostbuf> scratch_input(cpu_input.size());
++    if(needs_scratch)
++    {
++        for(size_t buf = 0; buf < cpu_input.size(); ++buf)
++        {
++            scratch_input[buf] = cpu_input[buf].copy();
++        }
++    }
++    std::vector<hostbuf>& fft_input = needs_scratch ? scratch_input : cpu_input;
++
++    // run FFTW (which may destroy CPU input)
++    apply_load_callback(params, fft_input);
++    fftw_run<Tfloat>(contiguous_params.transform_type, cpu_plan, fft_input, cpu_output);
++
++    // tear the plan down, and have FFTW drop its cached plan details too
++    fftw_destroy_plan_type(cpu_plan);
++    fftw_cleanup();
++    cpu_plan = nullptr;
++
++    apply_store_callback(params, cpu_output);
++}
++
++// execute the GPU transform
++//
++// Steps:
++//  1. If params.run_callbacks is set, upload load/store callback data to
++//     the device and register the callbacks on params.
++//  2. Execute the transform on the given input/output buffer pointers.
++//  3. Unless fftw_compare is false, finalize a multi-GPU transform and copy
++//     each output buffer from the device into gpu_output.
++//
++// Throws std::runtime_error if setting callbacks or executing the plan
++// fails.  HIP failures increment n_hip_failures and then either skip or
++// fail the test depending on skip_runtime_fails.
++// NOTE: the GoogleTest skip/fatal-failure macros used here return from
++// this function, so nothing after a failed check runs.
++template <class Tparams>
++inline void execute_gpu_fft(Tparams&              params,
++                            std::vector<void*>&   pibuffer,
++                            std::vector<void*>&   pobuffer,
++                            std::vector<gpubuf>&  obuffer,
++                            std::vector<hostbuf>& gpu_output,
++                            bool                  round_trip_inverse = false)
++{
++    // Device copies of the callback data; declared at function scope so
++    // they stay alive while the transform executes below.
++    gpubuf_t<callback_test_data> load_cb_data_dev;
++    gpubuf_t<callback_test_data> store_cb_data_dev;
++    if(params.run_callbacks)
++    {
++        void* load_cb_host
++            = get_load_callback_host(params.itype, params.precision, round_trip_inverse);
++
++        callback_test_data load_cb_data_host;
++
++        // On the inverse leg of a round trip the load callback uses the
++        // store scalar (and the store callback, below, the load scalar).
++        if(round_trip_inverse)
++        {
++            load_cb_data_host.scalar = params.store_cb_scalar;
++        }
++        else
++        {
++            load_cb_data_host.scalar = params.load_cb_scalar;
++        }
++
++        load_cb_data_host.base = pibuffer.front();
++
++        auto hip_status = hipSuccess;
++
++        // allocate and upload the load callback data
++        hip_status = load_cb_data_dev.alloc(sizeof(callback_test_data));
++        if(hip_status != hipSuccess)
++        {
++            ++n_hip_failures;
++            if(skip_runtime_fails)
++            {
++                GTEST_SKIP();
++            }
++            else
++            {
++                GTEST_FAIL();
++            }
++        }
++        hip_status = hipMemcpy(load_cb_data_dev.data(),
++                               &load_cb_data_host,
++                               sizeof(callback_test_data),
++                               hipMemcpyHostToDevice);
++        if(hip_status != hipSuccess)
++        {
++            ++n_hip_failures;
++            if(skip_runtime_fails)
++            {
++                GTEST_SKIP();
++            }
++            else
++            {
++                GTEST_FAIL();
++            }
++        }
++
++        void* store_cb_host
++            = get_store_callback_host(params.otype, params.precision, round_trip_inverse);
++
++        callback_test_data store_cb_data_host;
++
++        if(round_trip_inverse)
++        {
++            store_cb_data_host.scalar = params.load_cb_scalar;
++        }
++        else
++        {
++            store_cb_data_host.scalar = params.store_cb_scalar;
++        }
++
++        store_cb_data_host.base = pobuffer.front();
++
++        // allocate and upload the store callback data
++        hip_status = store_cb_data_dev.alloc(sizeof(callback_test_data));
++        if(hip_status != hipSuccess)
++        {
++            ++n_hip_failures;
++            if(skip_runtime_fails)
++            {
++                GTEST_SKIP();
++            }
++            else
++            {
++                GTEST_FAIL();
++            }
++        }
++
++        hip_status = hipMemcpy(store_cb_data_dev.data(),
++                               &store_cb_data_host,
++                               sizeof(callback_test_data),
++                               hipMemcpyHostToDevice);
++        if(hip_status != hipSuccess)
++        {
++            ++n_hip_failures;
++            if(skip_runtime_fails)
++            {
++                GTEST_SKIP();
++            }
++            else
++            {
++                GTEST_FAIL();
++            }
++        }
++
++        // register both callbacks together with their device-side data
++        auto fft_status = params.set_callbacks(
++            load_cb_host, load_cb_data_dev.data(), store_cb_host, store_cb_data_dev.data());
++        if(fft_status != fft_status_success)
++            throw std::runtime_error("set callback failure");
++    }
++
++    // Execute the transform:
++    auto fft_status = params.execute(pibuffer.data(), pobuffer.data());
++    if(fft_status != fft_status_success)
++        throw std::runtime_error("rocFFT plan execution failure");
++
++    // if not comparing, then just executing the GPU FFT is all we
++    // need to do
++    if(!fftw_compare)
++        return;
++
++    // finalize a multi-GPU transform
++    params.multi_gpu_finalize(obuffer, pobuffer);
++
++    // copy every output buffer back to the host
++    ASSERT_TRUE(!gpu_output.empty()) << "no output buffers";
++    for(unsigned int idx = 0; idx < gpu_output.size(); ++idx)
++    {
++        ASSERT_TRUE(gpu_output[idx].data() != nullptr)
++            << "output buffer index " << idx << " is empty";
++        auto hip_status = hipMemcpy(gpu_output[idx].data(),
++                                    pobuffer.at(idx),
++                                    gpu_output[idx].size(),
++                                    hipMemcpyDeviceToHost);
++        if(hip_status != hipSuccess)
++        {
++            ++n_hip_failures;
++            if(skip_runtime_fails)
++            {
++                GTEST_SKIP() << "hipMemcpy failure";
++            }
++            else
++            {
++                GTEST_FAIL() << "hipMemcpy failure";
++            }
++        }
++    }
++    // optional diagnostics, controlled by the global verbosity level
++    if(verbose > 2)
++    {
++        std::cout << "GPU output:\n";
++        params.print_obuffer(gpu_output);
++    }
++    if(verbose > 5)
++    {
++        std::cout << "flat GPU output:\n";
++        params.print_obuffer_flat(gpu_output);
++    }
++}
++
++// Assert that element `idx` of `output` still equals `orig_value`.
++// Only declared here; the explicit specializations that follow provide the
++// implementation for float, double and the two rocfft_complex types.
++template <typename Tfloat>
++static void assert_init_value(const std::vector<hostbuf>& output,
++                              const size_t                idx,
++                              const Tfloat                orig_value);
++
++// float: read element idx of the first buffer as float and require it to
++// equal orig_value.
++template <>
++void assert_init_value(const std::vector<hostbuf>& output, const size_t idx, const float orig_value)
++{
++    const auto* values       = reinterpret_cast<const float*>(output.front().data());
++    const float actual_value = values[idx];
++    ASSERT_EQ(actual_value, orig_value) << "index " << idx;
++}
++
++// double: read element idx of the first buffer as double and require it to
++// equal orig_value.
++template <>
++void assert_init_value(const std::vector<hostbuf>& output,
++                       const size_t                idx,
++                       const double                orig_value)
++{
++    const auto*  values       = reinterpret_cast<const double*>(output.front().data());
++    const double actual_value = values[idx];
++    ASSERT_EQ(actual_value, orig_value) << "index " << idx;
++}
++
++// rocfft_complex<float>: require both components of element idx to equal
++// those of orig_value.
++template <>
++void assert_init_value(const std::vector<hostbuf>& output,
++                       const size_t                idx,
++                       const rocfft_complex<float> orig_value)
++{
++    // A single buffer holds interleaved complex values and can be read
++    // directly; otherwise the data is planar, with the x components in the
++    // first buffer and the y components in the last.
++    const bool interleaved = output.size() == 1;
++
++    const rocfft_complex<float> actual_value
++        = interleaved
++              ? reinterpret_cast<const rocfft_complex<float>*>(output.front().data())[idx]
++              : rocfft_complex<float>{
++                  reinterpret_cast<const float*>(output.front().data())[idx],
++                  reinterpret_cast<const float*>(output.back().data())[idx]};
++
++    ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
++    ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
++}
++
++// rocfft_complex<double>: require both components of element idx to equal
++// those of orig_value.
++template <>
++void assert_init_value(const std::vector<hostbuf>&  output,
++                       const size_t                 idx,
++                       const rocfft_complex<double> orig_value)
++{
++    // A single buffer holds interleaved complex values and can be read
++    // directly; otherwise the data is planar, with the x components in the
++    // first buffer and the y components in the last.
++    const bool interleaved = output.size() == 1;
++
++    const rocfft_complex<double> actual_value
++        = interleaved
++              ? reinterpret_cast<const rocfft_complex<double>*>(output.front().data())[idx]
++              : rocfft_complex<double>{
++                  reinterpret_cast<const double*>(output.front().data())[idx],
++                  reinterpret_cast<const double*>(output.back().data())[idx]};
++
++    ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx;
++    ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx;
++}
++
++// Byte value that output buffers are filled with (via hipMemset) before a
++// transform when output strides are being validated.
++static const int OUTPUT_INIT_PATTERN = 0xcd;
++
++// Recursively check, for dimension `i` of the given length/stride
++// description, that elements which lie in the gaps left by the strides
++// still hold the initial fill pattern.
++//
++//   output - host copy of the output buffer(s)
++//   offset - element offset of the current sub-array
++//   length - lengths per dimension, index 0 first
++//   stride - strides per dimension, parallel to length
++//   i      - dimension currently being checked
++//
++// NOTE(review): `offset` is only forwarded to the recursive call; none of
++// the indexes passed to assert_init_value include it, so every level
++// inspects positions relative to the start of the buffer.  Confirm whether
++// that is intended before relying on this check for inner sub-arrays.
++template <class Tfloat>
++void check_single_output_stride(const std::vector<hostbuf>& output,
++                                const size_t                offset,
++                                const std::vector<size_t>&  length,
++                                const std::vector<size_t>&  stride,
++                                const size_t                i)
++{
++    // the value every byte-filled, untouched element is expected to have
++    Tfloat orig;
++    memset(static_cast<void*>(&orig), OUTPUT_INIT_PATTERN, sizeof(Tfloat));
++
++    size_t curLength         = length[i];
++    size_t curStride         = stride[i];
++    // zero for the last dimension, which marks it as the fastest one below
++    size_t nextSmallerLength = i == length.size() - 1 ? 0 : length[i + 1];
++    size_t nextSmallerStride = i == stride.size() - 1 ? 0 : stride[i + 1];
++
++    if(nextSmallerLength == 0)
++    {
++        // this is the fastest dim, indexes that are not multiples of
++        // the stride should be the initial value
++        for(size_t idx = 0; idx < (curLength - 1) * curStride; ++idx)
++        {
++            if(idx % curStride != 0)
++                assert_init_value<Tfloat>(output, idx, orig);
++        }
++    }
++    else
++    {
++        for(size_t lengthIdx = 0; lengthIdx < curLength; ++lengthIdx)
++        {
++            // check that the space after the next smaller dim and the
++            // end of this dim is initial value
++            for(size_t idx = nextSmallerLength * nextSmallerStride; idx < curStride; ++idx)
++                assert_init_value<Tfloat>(output, idx, orig);
++
++            // descend into the next dimension for this slice
++            check_single_output_stride<Tfloat>(
++                output, offset + lengthIdx * curStride, length, stride, i + 1);
++        }
++    }
++}
++
++// Verify that the gaps between strided output elements were left untouched
++// by the transform described by params.  The element type used for the
++// check is chosen from the output array type (real or complex) and the
++// precision; any precision other than single is checked as double.
++template <class Tparams>
++void check_output_strides(const std::vector<hostbuf>& output, Tparams& params)
++{
++    std::vector<size_t> length;
++    std::vector<size_t> stride;
++
++    // a batched transform is treated as one more, highest dimension whose
++    // length is the batch count and whose stride is the output dist
++    if(params.nbatch > 1)
++    {
++        length.push_back(params.nbatch);
++        stride.push_back(params.odist);
++    }
++
++    const auto olength = params.olength();
++    length.insert(length.end(), olength.begin(), olength.end());
++    stride.insert(stride.end(), params.ostride.begin(), params.ostride.end());
++
++    const bool real_output      = params.otype == fft_array_type_real;
++    const bool single_precision = params.precision == fft_precision_single;
++
++    if(single_precision && real_output)
++    {
++        check_single_output_stride<float>(output, 0, length, stride, 0);
++    }
++    else if(single_precision)
++    {
++        check_single_output_stride<rocfft_complex<float>>(output, 0, length, stride, 0);
++    }
++    else if(real_output)
++    {
++        check_single_output_stride<double>(output, 0, length, stride, 0);
++    }
++    else
++    {
++        check_single_output_stride<rocfft_complex<double>>(output, 0, length, stride, 0);
++    }
++}
++
++/**
++ * Run the rocFFT inverse transform for a round-trip test.
++ *
++ * Validates params, creates the plan, optionally pre-fills the output
++ * buffers with OUTPUT_INIT_PATTERN (only for runs that are not in-place
++ * and have params.check_output_strides set) and then calls
++ * execute_gpu_fft().
++ *
++ * GTEST_SKIP(), GTEST_FAIL() and a failing ASSERT_TRUE()/ASSERT_EQ() all
++ * return from this function, so nothing after them runs for that test.
++ *
++ * params     - transform description; validated and planned here
++ * obuffer    - output buffers, indexed in step with params.obuffer_sizes()
++ * pibuffer   - input pointers handed on to execute_gpu_fft()
++ * pobuffer   - output pointers handed on to execute_gpu_fft()
++ * gpu_output - host buffers handed on to execute_gpu_fft()
++ */
++template <class Tparams>
++inline void run_round_trip_inverse(Tparams&              params,
++                                   std::vector<gpubuf>&  obuffer,
++                                   std::vector<void*>&   pibuffer,
++                                   std::vector<void*>&   pobuffer,
++                                   std::vector<hostbuf>& gpu_output)
++{
++    params.validate();
++
++    // Make sure that the parameters make sense (fatal assertion: returns on failure):
++    ASSERT_TRUE(params.valid(verbose));
++
++    // Create FFT plan - this will also allocate work buffer, but will throw a
++    // specific exception if that step fails
++    auto plan_status = fft_status_success;
++    try
++    {
++        plan_status = params.create_plan();
++    }
++    catch(fft_params::work_buffer_alloc_failure& e)
++    {
++        std::stringstream ss;
++        ss << "Failed to allocate work buffer (size: " << params.workbuffersize << ")";
++        ++n_hip_failures; // counted whether the test is then skipped or failed
++        if(skip_runtime_fails)
++        {
++            GTEST_SKIP() << ss.str();
++        }
++        else
++        {
++            GTEST_FAIL() << ss.str();
++        }
++    }
++    ASSERT_EQ(plan_status, fft_status_success) << "round trip inverse plan creation failed";
++
++    auto obuffer_sizes = params.obuffer_sizes();
++
++    if(params.placement != fft_placement_inplace) // presumably in-place output shares the input buffer - TODO confirm
++    {
++        for(unsigned int i = 0; i < obuffer_sizes.size(); ++i)
++        {
++            // If we're validating output strides, init the
++            // output buffer to a known pattern and we can check
++            // that the pattern is untouched in places that
++            // shouldn't have been touched.
++            if(params.check_output_strides)
++            {
++                auto hip_status
++                    = hipMemset(obuffer[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]);
++                if(hip_status != hipSuccess)
++                {
++                    ++n_hip_failures; // counted whether the test is then skipped or failed
++                    if(skip_runtime_fails)
++                    {
++                        GTEST_SKIP() << "hipMemset failure";
++                    }
++                    else
++                    {
++                        GTEST_FAIL() << "hipMemset failure";
++                    }
++                }
++            }
++        }
++    }
++
++    // execute GPU transform (NOTE(review): the trailing 'true' is defined by execute_gpu_fft - confirm its meaning there)
++    execute_gpu_fft(params, pibuffer, pobuffer, obuffer, gpu_output, true);
++}
++
++/**
++ * Compare the rocFFT inverse (round-trip) output with the data that was
++ * fed to the forward transform.
++ *
++ * Optionally runs check_output_strides(), computes the norms of
++ * gpu_output asynchronously, measures the distance between cpu_input and
++ * gpu_output, folds the normalized errors into the per-precision
++ * max_linf_eps and max_l2_eps running maxima, and reports failures
++ * through EXPECT_TRUE (non-fatal, so every check below is evaluated).
++ *
++ * params            - transform that produced gpu_output; supplies the
++ *                     output type/stride/dist/offset and the precision
++ * contiguous_params - supplies itype/istride/idist used to read cpu_input
++ * gpu_output        - host copy of the inverse transform result
++ * cpu_input         - reference data the result is compared against
++ * cpu_input_norm    - norms of cpu_input, used to normalize the errors
++ * total_length      - used for the log-based tolerances and as the
++ *                     1.0 / total_length argument passed to distance()
++ */
++template <class Tparams>
++inline void compare_round_trip_inverse(Tparams&              params,
++                                       fft_params&           contiguous_params,
++                                       std::vector<hostbuf>& gpu_output,
++                                       std::vector<hostbuf>& cpu_input,
++                                       const VectorNorms&    cpu_input_norm,
++                                       size_t                total_length)
++{
++    if(params.check_output_strides)
++    {
++        check_output_strides<Tparams>(gpu_output, params);
++    }
++
++    // compute GPU output norm on another thread; results are read via gpu_norm.get() below
++    std::shared_future<VectorNorms> gpu_norm = std::async(std::launch::async, [&]() {
++        return norm(gpu_output,
++                    params.olength(),
++                    params.nbatch,
++                    params.precision,
++                    params.otype,
++                    params.ostride,
++                    params.odist,
++                    params.ooffset);
++    });
++
++    // compare GPU inverse output to CPU forward input (failure list only exists when verbose > 1, else null is passed)
++    std::unique_ptr<std::vector<std::pair<size_t, size_t>>> linf_failures;
++    if(verbose > 1)
++        linf_failures = std::make_unique<std::vector<std::pair<size_t, size_t>>>();
++    const double linf_cutoff
++        = type_epsilon(params.precision) * cpu_input_norm.l_inf * log(total_length);
++
++    VectorNorms diff = distance(cpu_input,
++                                gpu_output,
++                                params.olength(),
++                                params.nbatch,
++                                params.precision,
++                                contiguous_params.itype,
++                                contiguous_params.istride,
++                                contiguous_params.idist,
++                                params.otype,
++                                params.ostride,
++                                params.odist,
++                                linf_failures.get(),
++                                linf_cutoff,
++                                {0}, // presumably the offset into cpu_input - TODO confirm against distance()
++                                params.ooffset,
++                                1.0 / total_length); // presumably rescales the unnormalized round trip - TODO confirm
++
++    if(verbose > 1)
++    {
++        std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n";
++        std::cout << "GPU output L2 norm:   " << gpu_norm.get().l_2 << "\n";
++        std::cout << "GPU linf norm failures:";
++        std::sort(linf_failures->begin(), linf_failures->end()); // non-null here: allocated above under the same verbose > 1 test
++        for(const auto& i : *linf_failures)
++        {
++            std::cout << " (" << i.first << "," << i.second << ")";
++        }
++        std::cout << std::endl;
++    }
++
++    EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str();
++    EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str();
++
++    switch(params.precision) // update the running maxima for this precision; NOTE(review): the L2 value multiplies by sqrt(log2(N)) while the L2 check below scales its threshold by it - confirm intended
++    {
++    case fft_precision_half:
++        max_linf_eps_half
++            = std::max(max_linf_eps_half, diff.l_inf / cpu_input_norm.l_inf / log(total_length));
++        max_l2_eps_half
++            = std::max(max_l2_eps_half, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length)));
++        break;
++    case fft_precision_single:
++        max_linf_eps_single
++            = std::max(max_linf_eps_single, diff.l_inf / cpu_input_norm.l_inf / log(total_length));
++        max_l2_eps_single
++            = std::max(max_l2_eps_single, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length)));
++        break;
++    case fft_precision_double:
++        max_linf_eps_double
++            = std::max(max_linf_eps_double, diff.l_inf / cpu_input_norm.l_inf / log(total_length));
++        max_l2_eps_double
++            = std::max(max_l2_eps_double, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length)));
++        break;
++    }
++
++    if(verbose > 1)
++    {
++        std::cout << "L2 diff: " << diff.l_2 << "\n";
++        std::cout << "Linf diff: " << diff.l_inf << "\n";
++    }
++
++    EXPECT_TRUE(diff.l_inf <= linf_cutoff) // absolute Linf error against epsilon * |input|_inf * log(N)
++        << "Linf test failed.  Linf:" << diff.l_inf
++        << "\tnormalized Linf: " << diff.l_inf / cpu_input_norm.l_inf << "\tcutoff: " << linf_cutoff
++        << params.str();
++
++    EXPECT_TRUE(diff.l_2 / cpu_input_norm.l_2 // relative L2 error against sqrt(log2(N)) * epsilon
++                < sqrt(log2(total_length)) * type_epsilon(params.precision))
++        << "L2 test failed. L2: " << diff.l_2
++        << "\tnormalized L2: " << diff.l_2 / cpu_input_norm.l_2
++        << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision)
++        << params.str();
++}
++
++// Scope guard for the CPU reference data: holds references to the
++// caller's input/output buffers and, when destroyed, swaps their
++// contents with the matching vectors in last_cpu_fft_data.
++struct StoreCPUDataToCache
++{
++    StoreCPUDataToCache(std::vector<hostbuf>& input_buffers,
++                        std::vector<hostbuf>& output_buffers)
++        : cpu_input(input_buffers)
++        , cpu_output(output_buffers)
++    {
++    }
++
++    // swap is symmetric, so the referenced vectors end up holding
++    // whatever the cache held and vice versa
++    ~StoreCPUDataToCache()
++    {
++        cpu_output.swap(last_cpu_fft_data.cpu_output);
++        cpu_input.swap(last_cpu_fft_data.cpu_input);
++    }
++
++    // references only - the buffers must outlive this object
++    std::vector<hostbuf>& cpu_input;
++    std::vector<hostbuf>& cpu_output;
++};
++
++// run CPU + rocFFT transform with the given params and compare
++template <class Tfloat, class Tparams>
++inline void fft_vs_reference_impl(Tparams& params, bool round_trip)
++{
++    // Call hipGetLastError to reset any errors
++    // returned by previous HIP runtime API calls.
++    hipError_t hip_status = hipGetLastError();
++
++    // Make sure that the parameters make sense:
++    ASSERT_TRUE(params.valid(verbose));
++
++    size_t needed_ram = needed_ram_buffers(params, verbose);
++
++    if(ramgb > 0 && needed_ram > ramgb * ONE_GiB)
++    {
++        GTEST_SKIP() << "needed_ramgb: " << bytes_to_GiB(needed_ram) << ", ramgb limit: " << ramgb
++                     << ".\n";
++    }
++
++    auto ibuffer_sizes = params.ibuffer_sizes();
++    auto obuffer_sizes = params.obuffer_sizes();
++
++    size_t vram_avail = 0;
++
++    if(vramgb == 0)
++    {
++        // Check free and total available memory:
++        size_t free       = 0;
++        size_t total      = 0;
++        auto   hip_status = hipMemGetInfo(&free, &total);
++        if(hip_status != hipSuccess || total == 0)
++        {
++            ++n_hip_failures;
++            std::stringstream ss;
++            if(total == 0)
++                ss << "hipMemGetInfo claims there there isn't any vram";
++            else
++                ss << "hipMemGetInfo failure with error " << hip_status;
++            if(skip_runtime_fails)
++            {
++                GTEST_SKIP() << ss.str();
++            }
++            else
++            {
++                GTEST_FAIL() << ss.str();
++            }
++        }
++        vram_avail = total;
++    }
++    else
++    {
++        vram_avail = vramgb * ONE_GiB;
++    }
++
++    // First try a quick estimation of vram footprint, to speed up skipping tests
++    // that are too large to fit in the gpu (no plan created with the rocFFT backend)
++    const auto raw_vram_footprint
++        = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);
++
++    if(!vram_fits_problem(raw_vram_footprint, vram_avail))
++    {
++        GTEST_SKIP() << "Raw problem size (" << bytes_to_GiB(raw_vram_footprint)
++                     << " GiB) raw data too large for device";
++    }
++
++    if(verbose > 2)
++    {
++        std::cout << "Raw problem size: " << raw_vram_footprint << std::endl;
++    }
++
++    // If it passed the quick estimation test, go for the more
++    // accurate calculation that actually creates the plan and
++    // take into account the work buffer size
++    const auto vram_footprint = params.vram_footprint();
++    if(!vram_fits_problem(vram_footprint, vram_avail))
++    {
++        if(verbose)
++        {
++            std::cout << "Problem raw data won't fit on device; skipped." << std::endl;
++        }
++        GTEST_SKIP() << "Problem size (" << bytes_to_GiB(vram_footprint)
++                     << " GiB) raw data too large for device";
++    }
++
++    // Create FFT plan - this will also allocate work buffer, but
++    // will throw a specific exception if that step fails
++    auto plan_status = fft_status_success;
++    try
++    {
++        plan_status = params.create_plan();
++    }
++    catch(fft_params::work_buffer_alloc_failure& e)
++    {
++        ++n_hip_failures;
++        std::stringstream ss;
++        ss << "Work buffer allocation failed with size: " << params.workbuffersize;
++        if(skip_runtime_fails)
++        {
++            GTEST_SKIP() << ss.str();
++        }
++        else
++        {
++            GTEST_FAIL() << ss.str();
++        }
++    }
++    ASSERT_EQ(plan_status, fft_status_success) << "plan creation failed";
++
++    if(!vram_fits_problem(vram_footprint, vram_avail))
++    {
++        if(verbose)
++        {
++            std::cout << "Problem won't fit on device; skipped." << std::endl;
++        }
++        GTEST_SKIP() << "Problem size (" << vram_footprint << ") too large for device";
++        return;
++    }
++
++    fft_params contiguous_params;
++    contiguous_params.length         = params.length;
++    contiguous_params.precision      = params.precision;
++    contiguous_params.placement      = fft_placement_notinplace;
++    contiguous_params.transform_type = params.transform_type;
++    contiguous_params.nbatch         = params.nbatch;
++    contiguous_params.itype          = contiguous_itype(params.transform_type);
++    contiguous_params.otype          = contiguous_otype(contiguous_params.transform_type);
++
++    contiguous_params.validate();
++
++    if(!contiguous_params.valid(verbose))
++    {
++        throw std::runtime_error("Invalid contiguous params");
++    }
++
++    if(verbose > 3)
++    {
++        std::cout << "CPU params:\n";
++        std::cout << contiguous_params.str("\n\t") << std::endl;
++    }
++
++    std::vector<gpubuf> ibuffer(ibuffer_sizes.size());
++    std::vector<void*>  pibuffer(ibuffer_sizes.size());
++    for(unsigned int i = 0; i < ibuffer.size(); ++i)
++    {
++        hip_status = ibuffer[i].alloc(ibuffer_sizes[i]);
++        if(hip_status != hipSuccess)
++        {
++            std::stringstream ss;
++            ss << "hipMalloc failure for input buffer " << i << " size " << ibuffer_sizes[i] << "("
++               << bytes_to_GiB(ibuffer_sizes[i]) << " GiB)"
++               << " with code " << hipError_to_string(hip_status);
++            ++n_hip_failures;
++            if(skip_runtime_fails)
++            {
++                GTEST_SKIP() << ss.str();
++            }
++            else
++            {
++                GTEST_FAIL() << ss.str();
++            }
++        }
++        pibuffer[i] = ibuffer[i].data();
++    }
++
++    // allocation counts in elements, ibuffer_sizes is in bytes
++    auto ibuffer_sizes_elems = ibuffer_sizes;
++    for(auto& buf : ibuffer_sizes_elems)
++        buf /= var_size<size_t>(params.precision, params.itype);
++
++    // Check cache first - nbatch is a >= comparison because we compute
++    // the largest batch size and cache it.  Smaller batch runs can
++    // compare against the larger data.
++    std::vector<hostbuf>                 cpu_input;
++    std::vector<hostbuf>                 cpu_output;
++    std::shared_future<void>             convert_cpu_output_precision;
++    std::shared_future<void>             convert_cpu_input_precision;
++    bool                                 run_fftw = true;
++    std::unique_ptr<StoreCPUDataToCache> store_to_cache;
++    if(fftw_compare && last_cpu_fft_data.length == params.length
++       && last_cpu_fft_data.transform_type == params.transform_type
++       && last_cpu_fft_data.run_callbacks == params.run_callbacks)
++    {
++        if(last_cpu_fft_data.nbatch >= params.nbatch)
++        {
++            // use the cached input/output
++            cpu_input.swap(last_cpu_fft_data.cpu_input);
++            cpu_output.swap(last_cpu_fft_data.cpu_output);
++            run_fftw = false;
++
++            store_to_cache = std::make_unique<StoreCPUDataToCache>(cpu_input, cpu_output);
++
++            if(params.precision != last_cpu_fft_data.precision)
++            {
++                // Tests should be ordered so we do wider first, then narrower.
++                switch(params.precision)
++                {
++                case fft_precision_double:
++                    std::cerr
++                        << "test ordering is incorrect: double precision follows a narrower one"
++                        << std::endl;
++                    abort();
++                    break;
++                case fft_precision_single:
++                    if(last_cpu_fft_data.precision != fft_precision_double)
++                    {
++                        std::cerr
++                            << "test ordering is incorrect: float precision follows a narrower one"
++                            << std::endl;
++                        abort();
++                    }
++                    // convert the input/output to single-precision
++                    convert_cpu_output_precision = std::async(std::launch::async, [&]() {
++                        narrow_precision_inplace<double, float>(cpu_output.front());
++                    });
++                    convert_cpu_input_precision  = std::async(std::launch::async, [&]() {
++                        narrow_precision_inplace<double, float>(cpu_input.front());
++                    });
++                    break;
++                case fft_precision_half:
++                    // convert to half precision
++                    if(last_cpu_fft_data.precision == fft_precision_double)
++                    {
++                        convert_cpu_output_precision = std::async(std::launch::async, [&]() {
++                            narrow_precision_inplace<double, _Float16>(cpu_output.front());
++                        });
++                        convert_cpu_input_precision  = std::async(std::launch::async, [&]() {
++                            narrow_precision_inplace<double, _Float16>(cpu_input.front());
++                        });
++                    }
++                    else if(last_cpu_fft_data.precision == fft_precision_single)
++                    {
++                        convert_cpu_output_precision = std::async(std::launch::async, [&]() {
++                            narrow_precision_inplace<float, _Float16>(cpu_output.front());
++                        });
++                        convert_cpu_input_precision  = std::async(std::launch::async, [&]() {
++                            narrow_precision_inplace<float, _Float16>(cpu_input.front());
++                        });
++                    }
++                    else
++                    {
++                        std::cerr << "unhandled previous precision, cannot convert to half"
++                                  << std::endl;
++                        abort();
++                    }
++                    break;
++                }
++                last_cpu_fft_data.precision = params.precision;
++            }
++        }
++        // If the last result has a smaller batch than the new
++        // params, that might be a developer error - tests should be
++        // ordered to generate the bigger batch first.  But if tests
++        // got filtered or skipped due to insufficient memory, we
++        // might never have tried to generate the bigger batch first.
++        // So just fall through and redo the CPU FFT.
++    }
++    else
++    {
++        // Clear cache explicitly so that even if we didn't get a hit,
++        // we're not uselessly holding on to cached cpu input/output
++        last_cpu_fft_data = last_cpu_fft_cache();
++    }
++
++    // Allocate CPU input
++    if(run_fftw)
++    {
++        cpu_input = allocate_cpu_fft_buffer(
++            contiguous_params.precision, contiguous_params.itype, contiguous_params.isize);
++    }
++
++    // Create FFTW plan - this may write to input, but that's fine
++    // since there's nothing in there right now
++    typename fftw_trait<Tfloat>::fftw_plan_type cpu_plan = nullptr;
++    if(run_fftw)
++    {
++        // Normally, we would want to defer allocation of CPU output
++        // buffer until when we actually do the CPU FFT.  But if we're
++        // using FFTW wisdom, FFTW needs an output buffer at plan
++        // creation time.
++        if(use_fftw_wisdom)
++        {
++            cpu_output = allocate_cpu_fft_buffer(
++                contiguous_params.precision, contiguous_params.otype, contiguous_params.osize);
++        }
++        cpu_plan = fftw_plan_via_rocfft<Tfloat>(contiguous_params.length,
++                                                contiguous_params.istride,
++                                                contiguous_params.ostride,
++                                                contiguous_params.nbatch,
++                                                contiguous_params.idist,
++                                                contiguous_params.odist,
++                                                contiguous_params.transform_type,
++                                                cpu_input,
++                                                cpu_output);
++
++        needed_ram += needed_ram_fftw<Tfloat>(contiguous_params, cpu_plan, verbose);
++
++        if(ramgb > 0 && needed_ram > ramgb * ONE_GiB)
++        {
++            if(verbose)
++            {
++                std::cout << "Problem exceeds memory limit; skipped [rocfft_transform]."
++                          << std::endl;
++            }
++            GTEST_SKIP();
++            return;
++        }
++    }
++
++    std::vector<hostbuf> gpu_input_data;
++
++    // allocate and populate the input buffer (cpu/gpu)
++    if(run_fftw)
++    {
++        gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems);
++
++        //generate the input directly on the gpu
++        params.compute_input(ibuffer);
++
++        // Copy the input to CPU
++        if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride
++           || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize)
++        {
++            // Copy input to CPU
++            for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
++            {
++                hip_status = hipMemcpy(gpu_input_data.at(idx).data(),
++                                       ibuffer[idx].data(),
++                                       ibuffer_sizes[idx],
++                                       hipMemcpyDeviceToHost);
++                if(hip_status != hipSuccess)
++                {
++                    ++n_hip_failures;
++                    if(skip_runtime_fails)
++                    {
++                        GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
++                    }
++                    else
++                    {
++                        GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
++                    }
++                }
++            }
++
++            copy_buffers(gpu_input_data,
++                         cpu_input,
++                         params.ilength(),
++                         params.nbatch,
++                         params.precision,
++                         params.itype,
++                         params.istride,
++                         params.idist,
++                         contiguous_params.itype,
++                         contiguous_params.istride,
++                         contiguous_params.idist,
++                         params.ioffset,
++                         contiguous_params.ioffset);
++        }
++        else
++        {
++            // Copy input to CPU
++            for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
++            {
++                hip_status = hipMemcpy(cpu_input.at(idx).data(),
++                                       ibuffer[idx].data(),
++                                       ibuffer_sizes[idx],
++                                       hipMemcpyDeviceToHost);
++                if(hip_status != hipSuccess)
++                {
++                    ++n_hip_failures;
++                    if(skip_runtime_fails)
++                    {
++                        GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
++                    }
++                    else
++                    {
++                        GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
++                    }
++                }
++            }
++        }
++    }
++    else if(fftw_compare)
++    {
++        gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems);
++
++        // In case the cached cpu input needed conversion, wait for it
++        if(convert_cpu_input_precision.valid())
++            convert_cpu_input_precision.get();
++
++        // gets a pre-computed gpu input buffer from the cpu cache
++        std::vector<hostbuf>* gpu_input = &cpu_input;
++
++        if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride
++           || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize)
++        {
++            copy_buffers(cpu_input,
++                         gpu_input_data,
++                         params.ilength(),
++                         params.nbatch,
++                         params.precision,
++                         contiguous_params.itype,
++                         contiguous_params.istride,
++                         contiguous_params.idist,
++                         params.itype,
++                         params.istride,
++                         params.idist,
++                         {0},
++                         params.ioffset);
++            gpu_input = &gpu_input_data;
++        }
++
++        // Copy input to GPU
++        for(unsigned int idx = 0; idx < gpu_input->size(); ++idx)
++        {
++            hip_status = hipMemcpy(ibuffer[idx].data(),
++                                   gpu_input->at(idx).data(),
++                                   ibuffer_sizes[idx],
++                                   hipMemcpyHostToDevice);
++
++            if(hip_status != hipSuccess)
++            {
++                ++n_hip_failures;
++                if(skip_runtime_fails)
++                {
++                    GTEST_SKIP() << "hipMemcpy failure with error " << hip_status;
++                }
++                else
++                {
++                    GTEST_FAIL() << "hipMemcpy failure with error " << hip_status;
++                }
++            }
++        }
++    }
++
++    if(verbose > 3)
++    {
++        std::cout << "CPU input:\n";
++        contiguous_params.print_ibuffer(cpu_input);
++    }
++
++    // compute input norm
++    std::shared_future<VectorNorms> cpu_input_norm;
++    if(fftw_compare)
++        cpu_input_norm = std::async(std::launch::async, [&]() {
++            // in case the cached cpu input needed conversion, wait for it
++            if(convert_cpu_input_precision.valid())
++                convert_cpu_input_precision.get();
++
++            auto input_norm = norm(cpu_input,
++                                   contiguous_params.ilength(),
++                                   contiguous_params.nbatch,
++                                   contiguous_params.precision,
++                                   contiguous_params.itype,
++                                   contiguous_params.istride,
++                                   contiguous_params.idist,
++                                   contiguous_params.ioffset);
++            if(verbose > 2)
++            {
++                std::cout << "CPU Input Linf norm:  " << input_norm.l_inf << "\n";
++                std::cout << "CPU Input L2 norm:    " << input_norm.l_2 << "\n";
++            }
++            return input_norm;
++        });
++
++    std::vector<gpubuf>  obuffer_data;
++    std::vector<gpubuf>* obuffer = &obuffer_data;
++    std::vector<void*>   pobuffer;
++
++    // allocate the output buffer
++
++    if(params.placement == fft_placement_inplace)
++    {
++        obuffer = &ibuffer;
++    }
++    else
++    {
++        auto obuffer_sizes = params.obuffer_sizes();
++        obuffer_data.resize(obuffer_sizes.size());
++        for(unsigned int i = 0; i < obuffer_data.size(); ++i)
++        {
++            hip_status = obuffer_data[i].alloc(obuffer_sizes[i]);
++            if(hip_status != hipSuccess)
++            {
++                ++n_hip_failures;
++                std::stringstream ss;
++                ss << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i]
++                   << "(" << bytes_to_GiB(obuffer_sizes[i]) << " GiB)"
++                   << " with code " << hipError_to_string(hip_status);
++                if(skip_runtime_fails)
++                {
++                    GTEST_SKIP() << ss.str();
++                }
++                else
++                {
++                    GTEST_FAIL() << ss.str();
++                }
++            }
++
++            // If we're validating output strides, init the
++            // output buffer to a known pattern and we can check
++            // that the pattern is untouched in places that
++            // shouldn't have been touched.
++            if(params.check_output_strides)
++            {
++                hip_status
++                    = hipMemset(obuffer_data[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]);
++                if(hip_status != hipSuccess)
++                {
++                    ++n_hip_failures;
++                    if(skip_runtime_fails)
++                    {
++                        GTEST_SKIP() << "hipMemset failure with error " << hip_status;
++                    }
++                    else
++                    {
++                        GTEST_FAIL() << "hipMemset failure with error " << hip_status;
++                    }
++                }
++            }
++        }
++    }
++    pobuffer.resize(obuffer->size());
++    for(unsigned int i = 0; i < obuffer->size(); ++i)
++    {
++        pobuffer[i] = obuffer->at(i).data();
++    }
++
++    // Run CPU transform
++    //
++    // NOTE: This must happen after input is copied to GPU and input
++    // norm is computed, since the CPU FFT may overwrite the input.
++    VectorNorms              cpu_output_norm;
++    std::shared_future<void> cpu_fft;
++    if(fftw_compare)
++        cpu_fft = std::async(std::launch::async, [&]() {
++            // wait for input norm to finish, since we might overwrite input
++            cpu_input_norm.get();
++
++            if(run_fftw)
++                execute_cpu_fft<Tfloat>(params, contiguous_params, cpu_plan, cpu_input, cpu_output);
++            // in case the cached cpu output needed conversion, wait for it
++            else if(convert_cpu_output_precision.valid())
++                convert_cpu_output_precision.get();
++
++            if(verbose > 3)
++            {
++                std::cout << "CPU output:\n";
++                contiguous_params.print_obuffer(cpu_output);
++            }
++
++            cpu_output_norm = norm(cpu_output,
++                                   params.olength(),
++                                   params.nbatch,
++                                   params.precision,
++                                   contiguous_params.otype,
++                                   contiguous_params.ostride,
++                                   contiguous_params.odist,
++                                   contiguous_params.ooffset);
++            if(verbose > 2)
++            {
++                std::cout << "CPU Output Linf norm: " << cpu_output_norm.l_inf << "\n";
++                std::cout << "CPU Output L2 norm:   " << cpu_output_norm.l_2 << "\n";
++            }
++        });
++
++    // scatter data out to multi-GPUs if this is a multi-GPU test
++    params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer);
++
++    // execute GPU transform
++    std::vector<hostbuf> gpu_output
++        = allocate_host_buffer(params.precision, params.otype, params.osize);
++
++    execute_gpu_fft(params, pibuffer, pobuffer, *obuffer, gpu_output);
++
++    params.free();
++
++    if(params.check_output_strides)
++    {
++        check_output_strides<Tparams>(gpu_output, params);
++    }
++
++    // compute GPU output norm
++    std::shared_future<VectorNorms> gpu_norm;
++    if(fftw_compare)
++        gpu_norm = std::async(std::launch::async, [&]() {
++            return norm(gpu_output,
++                        params.olength(),
++                        params.nbatch,
++                        params.precision,
++                        params.otype,
++                        params.ostride,
++                        params.odist,
++                        params.ooffset);
++        });
++
++    // compare output
++    //
++    // Compute the l-infinity and l-2 distance between the CPU and GPU output:
++    // wait for cpu FFT so we can compute cutoff
++
++    const auto total_length = std::accumulate(params.length.begin(),
++                                              params.length.end(),
++                                              static_cast<size_t>(1),
++                                              std::multiplies<size_t>());
++
++    std::unique_ptr<std::vector<std::pair<size_t, size_t>>> linf_failures;
++    if(verbose > 1)
++        linf_failures = std::make_unique<std::vector<std::pair<size_t, size_t>>>();
++    double      linf_cutoff;
++    VectorNorms diff;
++
++    std::shared_future<void> compare_output;
++    if(fftw_compare)
++        compare_output = std::async(std::launch::async, [&]() {
++            cpu_fft.get();
++            linf_cutoff
++                = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(total_length);
++
++            diff = distance(cpu_output,
++                            gpu_output,
++                            params.olength(),
++                            params.nbatch,
++                            params.precision,
++                            contiguous_params.otype,
++                            contiguous_params.ostride,
++                            contiguous_params.odist,
++                            params.otype,
++                            params.ostride,
++                            params.odist,
++                            linf_failures.get(),
++                            linf_cutoff,
++                            {0},
++                            params.ooffset);
++        });
++
++    // Update the cache if this current transform is different from
++    // what's stored.  But if this transform only has a smaller batch
++    // than what's cached, we can still keep the cache around since
++    // the input/output we already have is still valid.
++    const bool update_last_cpu_fft_data
++        = last_cpu_fft_data.length != params.length
++          || last_cpu_fft_data.transform_type != params.transform_type
++          || last_cpu_fft_data.run_callbacks != params.run_callbacks
++          || last_cpu_fft_data.precision != params.precision
++          || params.nbatch > last_cpu_fft_data.nbatch;
++
++    // store cpu output in cache
++    if(update_last_cpu_fft_data)
++    {
++        last_cpu_fft_data.length         = params.length;
++        last_cpu_fft_data.nbatch         = params.nbatch;
++        last_cpu_fft_data.transform_type = params.transform_type;
++        last_cpu_fft_data.run_callbacks  = params.run_callbacks;
++        last_cpu_fft_data.precision      = params.precision;
++    }
++
++    if(compare_output.valid())
++        compare_output.get();
++
++    if(!store_to_cache)
++        store_to_cache = std::make_unique<StoreCPUDataToCache>(cpu_input, cpu_output);
++
++    Tparams params_inverse;
++
++    if(round_trip)
++    {
++        params_inverse.inverse_from_forward(params);
++
++        run_round_trip_inverse<Tparams>(
++            params_inverse, ibuffer, pobuffer, pibuffer, gpu_input_data);
++    }
++
++    if(fftw_compare)
++    {
++        ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_2));
++        ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_inf));
++
++        ASSERT_TRUE(std::isfinite(cpu_output_norm.l_2));
++        ASSERT_TRUE(std::isfinite(cpu_output_norm.l_inf));
++
++        if(verbose > 1)
++        {
++            std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n";
++            std::cout << "GPU output L2 norm:   " << gpu_norm.get().l_2 << "\n";
++            std::cout << "GPU linf norm failures:";
++            std::sort(linf_failures->begin(), linf_failures->end());
++            for(const auto& i : *linf_failures)
++            {
++                std::cout << " (" << i.first << "," << i.second << ")";
++            }
++            std::cout << std::endl;
++        }
++
++        EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str();
++        EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str();
++    }
++
++    switch(params.precision)
++    {
++    case fft_precision_half:
++        max_linf_eps_half
++            = std::max(max_linf_eps_half, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
++        max_l2_eps_half
++            = std::max(max_l2_eps_half, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length)));
++        break;
++    case fft_precision_single:
++        max_linf_eps_single
++            = std::max(max_linf_eps_single, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
++        max_l2_eps_single = std::max(max_l2_eps_single,
++                                     diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length)));
++        break;
++    case fft_precision_double:
++        max_linf_eps_double
++            = std::max(max_linf_eps_double, diff.l_inf / cpu_output_norm.l_inf / log(total_length));
++        max_l2_eps_double = std::max(max_l2_eps_double,
++                                     diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length)));
++        break;
++    }
++
++    if(verbose > 1)
++    {
++        std::cout << "L2 diff: " << diff.l_2 << "\n";
++        std::cout << "Linf diff: " << diff.l_inf << "\n";
++    }
++
++    if(fftw_compare)
++    {
++        EXPECT_TRUE(diff.l_inf <= linf_cutoff)
++            << "Linf test failed.  Linf:" << diff.l_inf
++            << "\tnormalized Linf: " << diff.l_inf / cpu_output_norm.l_inf
++            << "\tcutoff: " << linf_cutoff << params.str();
++
++        EXPECT_TRUE(diff.l_2 / cpu_output_norm.l_2
++                    < sqrt(log2(total_length)) * type_epsilon(params.precision))
++            << "L2 test failed. L2: " << diff.l_2
++            << "\tnormalized L2: " << diff.l_2 / cpu_output_norm.l_2
++            << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision)
++            << params.str();
++    }
++
++    if(round_trip && fftw_compare)
++    {
++        compare_round_trip_inverse<Tparams>(params_inverse,
++                                            contiguous_params,
++                                            gpu_input_data,
++                                            cpu_input,
++                                            cpu_input_norm.get(),
++                                            total_length);
++    }
++}
++
++#endif
+diff --git a/shared/arithmetic.h b/shared/arithmetic.h
+new file mode 100644
+index 0000000..774d342
+--- /dev/null
++++ b/shared/arithmetic.h
+@@ -0,0 +1,61 @@
++/******************************************************************************
++* Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a copy
++* of this software and associated documentation files (the "Software"), to deal
++* in the Software without restriction, including without limitation the rights
++* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++* copies of the Software, and to permit persons to whom the Software is
++* furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice shall be included in
++* all copies or substantial portions of the Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++* THE SOFTWARE.
++*******************************************************************************/
++
++#pragma once
++
++#include <numeric>
++#include <stddef.h>
++
++// arithmetic helper functions
++
++static inline bool IsPo2(size_t u) // true only for non-zero powers of two
++{
++    return u != 0 && (u & (u - 1)) == 0; // clearing the lowest set bit must leave 0
++}
++
++// Helper: find the smallest power of 2 that is >= n and return its exponent,
++// e.g., CeilPo2(7) returns 3 (2^3 >= 7); CeilPo2(0) and CeilPo2(1) return 0.
++static inline size_t CeilPo2(size_t n)
++{
++    size_t exponent = 0;
++    // Double the candidate until it reaches n.  The candidate wraps to 0 once the
++    // power no longer fits in size_t; stop there instead of looping forever.
++    for(size_t pow2 = 1; pow2 != 0 && pow2 < n; pow2 <<= 1)
++    {
++        ++exponent;
++    }
++
++    return exponent;
++}
++
++template <typename T> // integer division of a by b, rounded up for a >= 0, b > 0
++static inline T DivRoundingUp(T a, T b)
++{
++    return (b - 1 + a) / b; // the b - 1 bias lifts any non-multiple to the next one
++}
++
++template <typename Titer> // product of every element in [begin, end)
++typename Titer::value_type product(Titer begin, Titer end)
++{
++    using value_t = typename Titer::value_type;
++    return std::accumulate(begin, end, value_t(1), std::multiplies<value_t>()); // 1 if empty
++}
+diff --git a/shared/array_predicate.h b/shared/array_predicate.h
+new file mode 100644
+index 0000000..92e45b4
+--- /dev/null
++++ b/shared/array_predicate.h
+@@ -0,0 +1,47 @@
++// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_ARRAY_PREDICATE_H
++#define ROCFFT_ARRAY_PREDICATE_H
++
++#include "rocfft/rocfft.h"
++
++namespace
++{
++    // True when type is one of the two *_interleaved array types.
++    bool array_type_is_interleaved(rocfft_array_type type)
++    {
++        return rocfft_array_type_complex_interleaved == type
++               || rocfft_array_type_hermitian_interleaved == type;
++    }
++    // True when type is one of the two *_planar array types.
++    bool array_type_is_planar(rocfft_array_type type)
++    {
++        return rocfft_array_type_complex_planar == type
++               || rocfft_array_type_hermitian_planar == type;
++    }
++    // True for complex and hermitian types, i.e. every interleaved or planar one.
++    bool array_type_is_complex(rocfft_array_type type)
++    {
++        return array_type_is_interleaved(type) || array_type_is_planar(type);
++    }
++}
++
++#endif
+diff --git a/shared/array_validator.cpp b/shared/array_validator.cpp
+new file mode 100644
+index 0000000..70abb08
+--- /dev/null
++++ b/shared/array_validator.cpp
+@@ -0,0 +1,549 @@
++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#include <iostream>
++#include <numeric>
++#include <unordered_set>
++
++#include "array_validator.h"
++#include "increment.h"
++
++// Check a 2D array for collisions: two distinct index pairs share an offset only
++// if both axes can reach the least common multiple of the two strides.
++bool valid_length_stride_2d(const size_t l0, const size_t l1, const size_t s0, const size_t s1)
++{
++    const auto common     = std::lcm(s0, s1);
++    const bool reaches_s0 = s0 * (l0 - 1) >= common; // largest offset along axis 0
++    const bool reaches_s1 = s1 * (l1 - 1) >= common; // largest offset along axis 1
++    return s0 != s1 && !(reaches_s0 && reaches_s1);
++}
++
++// Compare a 1D direction with a multi-index hyperface for collisions.
++// Axis idx of (l, s) is tested against the face spanned by the remaining axes;
++// returns false if some non-zero offset is reachable along both, true otherwise.
++bool valid_length_stride_1d_multi(const unsigned int        idx,
++                                  const std::vector<size_t> l,
++                                  const std::vector<size_t> s,
++                                  const int                 verbose)
++{
++    size_t              l0{0}, s0{0};
++    std::vector<size_t> l1{}, s1{};
++    for(unsigned int i = 0; i < l.size(); ++i)
++    {
++        if(i == idx)
++        {
++            l0 = l[i];
++            s0 = s[i];
++        }
++        else
++        {
++            l1.push_back(l[i]);
++            s1.push_back(s[i]);
++        }
++    }
++
++    if(verbose > 4)
++        std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl;
++
++    // We only need to go to the maximum pointer offset for (l1,s1), which is
++    // reached when every index of the face sits at its last position.
++    size_t max_offset = 0;
++    for(size_t d = 0; d < l1.size(); ++d)
++        max_offset += (l1[d] - 1) * s1[d];
++    std::unordered_set<size_t> a0{};
++    for(size_t i = 1; i < l0; ++i)
++    {
++        const auto val = i * s0;
++        if(val <= max_offset)
++            a0.insert(val);
++        else
++            break;
++    }
++
++    if(verbose > 5)
++    {
++        std::cout << "a0:";
++        for(auto i : a0)
++            std::cout << " " << i;
++        std::cout << std::endl;
++
++        std::cout << "l1:";
++        for(auto i : l1)
++            std::cout << " " << i;
++        std::cout << std::endl;
++
++        std::cout << "s1:";
++        for(auto i : s1)
++            std::cout << " " << i;
++        std::cout << std::endl;
++    }
++
++    // TODO: this can be multi-threaded, since find(...) is thread-safe.
++    std::vector<size_t> index(l1.size()); // value-initialised: starts at all zeros
++    do
++    {
++        const size_t i = std::inner_product(index.begin(), index.end(), s1.begin(), (size_t)0);
++        if(i > 0 && (i % s0 == 0)) // i is size_t so large offsets are not truncated
++        {
++            // TODO: use an ordered set and binary search
++            if(verbose > 6)
++                std::cout << i << std::endl;
++            if(a0.find(i) != a0.end())
++            {
++                if(verbose > 4)
++                {
++                    std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl;
++                    std::cout << "l1:";
++                    for(const auto li : l1)
++                        std::cout << " " << li;
++                    std::cout << " s1:";
++                    for(const auto si : s1)
++                        std::cout << " " << si;
++                    std::cout << std::endl;
++                    std::cout << "Found duplicate: " << i << std::endl;
++                }
++                return false;
++            }
++        }
++    } while(increment_rowmajor(index, l1));
++
++    return true;
++}
++
++// Compare a hyperface with another hyperface for collisions.
++// Returns false if a non-zero offset can be reached both with the axes in
++// (l0, s0) and with the axes in (l1, s1); returns true otherwise.
++bool valid_length_stride_multi_multi(const std::vector<size_t> l0,
++                                     const std::vector<size_t> s0,
++                                     const std::vector<size_t> l1,
++                                     const std::vector<size_t> s1)
++{
++    // Largest offset reachable on face 1: every index at its last position.
++    size_t max_offset = 0;
++    for(size_t d = 0; d < l1.size(); ++d)
++        max_offset += (l1[d] - 1) * s1[d];
++
++    // Record the face-0 offsets that face 1 could possibly reach.
++    std::unordered_set<size_t> a0{};
++    std::vector<size_t>        index0(l0.size());
++    do
++    {
++        const auto i = std::inner_product(index0.begin(), index0.end(), s0.begin(), (size_t)0);
++        if(i <= max_offset)
++            a0.insert(i);
++    } while(increment_rowmajor(index0, l0));
++
++    // Any non-zero face-1 offset found in the set is shared by both faces.
++    std::vector<size_t> index1(l1.size());
++    do
++    {
++        const auto i = std::inner_product(index1.begin(), index1.end(), s1.begin(), (size_t)0);
++        if(i > 0)
++        {
++            // TODO: use an ordered set and binary search
++            if(a0.find(i) != a0.end())
++                return false;
++        }
++    } while(increment_rowmajor(index1, l1));
++
++    return true;
++}
++
++// Validate a 3D (length, stride) pair: every pair of axes must be free of
++// collisions, and so must each axis against the face of the other two.
++bool valid_length_stride_3d(const std::vector<size_t>& l,
++                            const std::vector<size_t>& s,
++                            const int                  verbose)
++{
++    // Check that 2D faces are valid:
++    if(!valid_length_stride_2d(l[0], l[1], s[0], s[1])
++       || !valid_length_stride_2d(l[0], l[2], s[0], s[2])
++       || !valid_length_stride_2d(l[1], l[2], s[1], s[2]))
++        return false;
++
++    // If the 2D faces are valid, check an axis vs a face for collisions:
++    bool invalid = false;
++#ifdef _OPENMP
++#pragma omp parallel for
++#endif
++    for(int idx = 0; idx < 3; ++idx)
++    {
++        if(!valid_length_stride_1d_multi(idx, l, s, verbose))
++        {
++            // Set the flag before the cancel directive: with cancellation enabled
++            // the thread leaves the loop at that point and would never set it.
++            invalid = true;
++#ifdef _OPENMP
++#pragma omp cancel for
++#endif
++        }
++    }
++    return !invalid;
++}
++
++// Validate a 4D (length, stride) pair by checking every pair of axes, every
++// axis against the face of the other three, and every pair of 2D faces.
++bool valid_length_stride_4d(const std::vector<size_t>& l,
++                            const std::vector<size_t>& s,
++                            const int                  verbose)
++{
++    if(l.size() != 4)
++    {
++        throw std::runtime_error("Incorrect dimensions for valid_length_stride_4d");
++    }
++
++    // Check that 2D faces are valid:
++    for(int idx0 = 0; idx0 < 3; ++idx0)
++    {
++        for(int idx1 = idx0 + 1; idx1 < 4; ++idx1)
++        {
++            if(!valid_length_stride_2d(l[idx0], l[idx1], s[idx0], s[idx1]))
++                return false;
++        }
++    }
++
++    bool invalid = false;
++    // Check that 1D vs 3D faces are valid:
++#ifdef _OPENMP
++#pragma omp parallel for
++#endif
++    for(int idx0 = 0; idx0 < 4; ++idx0)
++    {
++        if(!valid_length_stride_1d_multi(idx0, l, s, verbose))
++        {
++            // Set the flag first: with cancellation enabled, the cancel directive
++            // below ends this thread's share of the loop immediately.
++            invalid = true;
++#ifdef _OPENMP
++#pragma omp cancel for
++#endif
++        }
++    }
++    if(invalid)
++        return false;
++
++    // Check that 2D vs 2D faces are valid:
++
++    // First, get all the permutations
++    std::vector<std::vector<size_t>> perms;
++    std::vector<size_t>              v(l.size());
++    std::fill(v.begin(), v.begin() + 2, 0);
++    std::fill(v.begin() + 2, v.end(), 1);
++    do
++    {
++        perms.push_back(v);
++        if(verbose > 3)
++        {
++            std::cout << "v:";
++            for(const auto i : v)
++            {
++                std::cout << " " << i;
++            }
++            std::cout << "\n";
++        }
++    } while(std::next_permutation(v.begin(), v.end()));
++
++    // Then loop over all of the permutations.
++#ifdef _OPENMP
++#pragma omp parallel for
++#endif
++    for(size_t iperm = 0; iperm < perms.size(); ++iperm)
++    {
++        // Start empty: the loop below appends the two axes belonging to each face.
++        std::vector<size_t> l0, s0;
++        std::vector<size_t> l1, s1;
++        for(size_t i = 0; i < l.size(); ++i)
++        {
++            if(perms[iperm][i] == 0)
++            {
++                l0.push_back(l[i]);
++                s0.push_back(s[i]);
++            }
++            else
++            {
++                l1.push_back(l[i]);
++                s1.push_back(s[i]);
++            }
++        }
++
++        if(verbose > 3)
++        {
++            std::cout << "\tl0:";
++            for(const auto i : l0)
++            {
++                std::cout << " " << i;
++            }
++            std::cout << "\n";
++            std::cout << "\ts0:";
++            for(const auto i : s0)
++            {
++                std::cout << " " << i;
++            }
++            std::cout << "\n";
++            std::cout << "\tl1:";
++            for(const auto i : l1)
++            {
++                std::cout << " " << i;
++            }
++            std::cout << "\n";
++            std::cout << "\ts1:";
++            for(const auto i : s1)
++            {
++                std::cout << " " << i;
++            }
++            std::cout << "\n";
++        }
++
++        if(!valid_length_stride_multi_multi(l0, s0, l1, s1))
++        {
++            invalid = true; // must precede the cancel directive, as above
++#ifdef _OPENMP
++#pragma omp cancel for
++#endif
++        }
++    }
++    return !invalid;
++}
++
++// Validate a (length, stride) pair of any dimension: recurse on the faces with
++// one axis removed, then test each axis against the face of the others, then
++// test every split of the axes into two faces of at least two axes each.
++bool valid_length_stride_generald(const std::vector<size_t> l,
++                                  const std::vector<size_t> s,
++                                  const int                 verbose)
++{
++    if(verbose > 2)
++    {
++        std::cout << "checking dimension " << l.size() << std::endl;
++    }
++
++    // Recurse on d-1 hyper-faces:
++    for(unsigned int idx = 0; idx < l.size(); ++idx)
++    {
++        std::vector<size_t> l0{};
++        std::vector<size_t> s0{};
++        for(size_t i = 0; i < l.size(); ++i)
++        {
++            if(i != idx)
++            {
++                l0.push_back(l[i]);
++                s0.push_back(s[i]);
++            }
++        }
++        if(!array_valid(l0, s0, verbose))
++            return false;
++    }
++
++    // Handle the 1D vs (N-1) case:
++    for(unsigned int idx = 0; idx < l.size(); ++idx)
++    {
++        if(!valid_length_stride_1d_multi(idx, l, s, verbose))
++            return false;
++    }
++
++    for(size_t dim0 = 2; dim0 <= l.size() / 2; ++dim0)
++    {
++        const size_t dim1 = l.size() - dim0;
++        if(verbose > 2)
++            std::cout << "dims: " << dim0 << " " << dim1 << std::endl;
++
++        // We iterate over all permutations of an array of length l.size() which contains dim0 zeros
++        // and dim1 ones.  We start with {0, ..., 0, 1, ... 1} to guarantee that we hit all the
++        // possibilities.
++        // First, get all the permutations
++        std::vector<std::vector<size_t>> perms;
++        std::vector<size_t>              v(l.size());
++        std::fill(v.begin(), v.begin() + dim0, 0);
++        std::fill(v.begin() + dim0, v.end(), 1);
++        do
++        {
++            perms.push_back(v);
++            if(verbose > 3)
++            {
++                std::cout << "v:";
++                for(const auto i : v)
++                {
++                    std::cout << " " << i;
++                }
++                std::cout << "\n";
++            }
++        } while(std::next_permutation(v.begin(), v.end()));
++
++        bool invalid = false;
++        // Then loop over all of the permutations.
++#ifdef _OPENMP
++#pragma omp parallel for
++#endif
++        for(size_t iperm = 0; iperm < perms.size(); ++iperm)
++        {
++            // Start empty; the zeros of this permutation select the dim0 axes of
++            // face 0 and its ones select the dim1 axes of face 1.
++            std::vector<size_t> l0, s0;
++            std::vector<size_t> l1, s1;
++            for(size_t i = 0; i < l.size(); ++i)
++            {
++                if(perms[iperm][i] == 0)
++                {
++                    l0.push_back(l[i]);
++                    s0.push_back(s[i]);
++                }
++                else
++                {
++                    l1.push_back(l[i]);
++                    s1.push_back(s[i]);
++                }
++            }
++
++            if(verbose > 3)
++            {
++                std::cout << "\tl0:";
++                for(const auto i : l0)
++                {
++                    std::cout << " " << i;
++                }
++                std::cout << "\n";
++                std::cout << "\ts0:";
++                for(const auto i : s0)
++                {
++                    std::cout << " " << i;
++                }
++                std::cout << "\n";
++                std::cout << "\tl1:";
++                for(const auto i : l1)
++                {
++                    std::cout << " " << i;
++                }
++                std::cout << "\n";
++                std::cout << "\ts1:";
++                for(const auto i : s1)
++                {
++                    std::cout << " " << i;
++                }
++                std::cout << "\n";
++            }
++
++            if(!valid_length_stride_multi_multi(l0, s0, l1, s1))
++            {
++                invalid = true; // set before the cancel directive so it is never skipped
++#ifdef _OPENMP
++#pragma omp cancel for
++#endif
++            }
++        }
++        if(invalid)
++            return false;
++    }
++
++    return true;
++}
++
++bool sort_by_stride(const std::pair<size_t, size_t>& ls0, const std::pair<size_t, size_t>& ls1)
++{
++    return std::get<1>(ls0) < std::get<1>(ls1); // compare (length, stride) pairs on stride only
++}
++
++// Checks a (length, stride) description for multi-index collisions.  Returns
++// false when length and stride differ in size, when a dimension longer than 1
++// has stride 0, or when one of the checks below finds a collision; returns true
++// otherwise.
++bool array_valid(const std::vector<size_t>& length,
++                 const std::vector<size_t>& stride,
++                 const int                  verbose)
++{
++    if(length.size() != stride.size())
++        return false;
++
++    // If a length is 1, then the stride is irrelevant.
++    // If a length is > 1, then the corresponding stride must be > 0.
++    std::vector<size_t> l{}, s{};
++    for(size_t dim = 0; dim < length.size(); ++dim)
++    {
++        if(length[dim] <= 1)
++            continue;
++        if(stride[dim] == 0)
++            return false;
++        l.push_back(length[dim]);
++        s.push_back(stride[dim]);
++    }
++
++    if(length.size() > 1)
++    {
++        // Check happy path: with the dimensions sorted by stride, every stride is
++        // at least length * stride of the dimension that precedes it.
++        std::vector<std::pair<size_t, size_t>> ls;
++        for(size_t idx = 0; idx < length.size(); ++idx)
++        {
++            ls.push_back(std::pair(length[idx], stride[idx]));
++        }
++        std::sort(ls.begin(), ls.end(), sort_by_stride);
++
++        if(verbose > 2)
++        {
++            for(const auto& entry : ls)
++            {
++                std::cout << entry.first << "\t" << entry.second << "\n";
++            }
++        }
++
++        // Stop at the first dimension that overlaps the one before it.
++        bool happy_path = true;
++        for(size_t idx = 1; happy_path && idx < ls.size(); ++idx)
++        {
++            happy_path = ls[idx].second >= ls[idx - 1].first * ls[idx - 1].second;
++        }
++        if(happy_path)
++        {
++            if(verbose > 2)
++            {
++                std::cout << "happy path\n";
++            }
++            return true;
++        }
++    }
++
++    // No shortcut applies: dispatch on the number of dimensions that are longer
++    // than 1 (l and s hold only those dimensions).
++    switch(l.size())
++    {
++    case 0:
++        // No dimension is longer than 1.
++        return true;
++
++    case 1:
++        // Always true here: a zero stride was already rejected above.
++        return s[0] != 0;
++
++    case 2:
++        // Closed-form check on the two strides.
++        return valid_length_stride_2d(l[0], l[1], s[0], s[1]);
++
++    case 3:
++        // Pairwise checks plus each axis against the other two.
++        return valid_length_stride_3d(l, s, verbose);
++
++    case 4:
++        // Adds the 2D-face versus 2D-face checks.
++        return valid_length_stride_4d(l, s, verbose);
++
++    default:
++        // Any higher dimension goes through the recursive check.
++        return valid_length_stride_generald(l, s, verbose);
++    }
++}
+diff --git a/shared/array_validator.h b/shared/array_validator.h
+new file mode 100644
+index 0000000..ce85173
+--- /dev/null
++++ b/shared/array_validator.h
+@@ -0,0 +1,31 @@
++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ARRAY_VALIDATOR_H
++#define ARRAY_VALIDATOR_H
++
++#include <vector>
++
++// Returns true if the array with the given length and stride has no multi-index collisions.
++bool array_valid(const std::vector<size_t>& length,
++                 const std::vector<size_t>& stride,
++                 const int                  verbose = 0);
++
++#endif
+diff --git a/shared/concurrency.h b/shared/concurrency.h
+new file mode 100644
+index 0000000..a36c7c1
+--- /dev/null
++++ b/shared/concurrency.h
+@@ -0,0 +1,41 @@
++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++
++#include <thread>
++
++#ifndef WIN32
++#include <sched.h>
++#endif
++
++// Work out how many parallel tasks to run, based on available resources.
++// Unless WIN32 is defined, this counts the CPUs in the affinity mask (which might
++// be restricted in a container); otherwise, or if that query fails, it returns
++// std::thread::hardware_concurrency().
++static unsigned int rocfft_concurrency()
++{
++#ifndef WIN32
++    cpu_set_t affinity;
++    if(0 == sched_getaffinity(0, sizeof(affinity), &affinity))
++        return CPU_COUNT(&affinity);
++#endif
++    return std::thread::hardware_concurrency();
++}
+diff --git a/shared/data_gen_device.h b/shared/data_gen_device.h
+new file mode 100644
+index 0000000..77fb012
+--- /dev/null
++++ b/shared/data_gen_device.h
+@@ -0,0 +1,1303 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef DATA_GEN_DEVICE_H
++#define DATA_GEN_DEVICE_H
++
++// rocRAND can generate warnings if inline asm is not available for
++// some architectures.  data generation isn't performance-critical,
++// so just disable inline asm to prevent the warnings.
++#define ROCRAND_DISABLE_INLINE_ASM
++
++#include "../shared/arithmetic.h"
++#include "../shared/device_properties.h"
++#include "../shared/gpubuf.h"
++#include "../shared/increment.h"
++#include "../shared/rocfft_complex.h"
++#include <hip/hip_runtime.h>
++#include <hip/hip_runtime_api.h>
++#include <hiprand/hiprand.h>
++#include <hiprand/hiprand_kernel.h>
++#include <limits>
++#include <vector>
++
++static const unsigned int DATA_GEN_THREADS    = 8; // __launch_bounds__ of the data-gen kernels; also the x block size assumed by their index math
++static const unsigned int DATA_GEN_GRID_Y_MAX = 64; // cap applied to the grid Y dimension in generate_data_gridDim()
++
++template <typename T>
++struct input_val_1D // one value per dimension of a 1D problem (used for lengths, strides and coordinates)
++{
++    T val1; // component for the first dimension
++};
++
++template <typename T>
++struct input_val_2D // one value per dimension of a 2D problem (used for lengths, strides and coordinates)
++{
++    T val1; // component for the first dimension
++    T val2; // component for the second dimension
++};
++
++template <typename T>
++struct input_val_3D // one value per dimension of a 3D problem (used for lengths, strides and coordinates)
++{
++    T val1; // component for the first dimension
++    T val2; // component for the second dimension
++    T val3; // component for the third dimension
++};
++
++template <typename T>
++static input_val_1D<T> get_input_val(const T& val) // wraps a single value as a 1D input_val
++{
++    return input_val_1D<T>{val};
++}
++
++template <typename T>
++static input_val_2D<T> get_input_val(const std::tuple<T, T>& val) // converts a 2-tuple to a 2D input_val
++{
++    return input_val_2D<T>{std::get<0>(val), std::get<1>(val)}; // element 0 -> val1, element 1 -> val2
++}
++
++template <typename T>
++static input_val_3D<T> get_input_val(const std::tuple<T, T, T>& val) // converts a 3-tuple to a 3D input_val
++{
++    return input_val_3D<T>{std::get<0>(val), std::get<1>(val), std::get<2>(val)}; // elements 0,1,2 -> val1,val2,val3
++}
++
++template <typename T> // 1D: element offset of coordinate `length` given `stride`, relative to `base`
++__device__ static size_t
++    compute_index(const input_val_1D<T>& length, const input_val_1D<T>& stride, size_t base)
++{
++    return (length.val1 * stride.val1) + base; // coordinate * stride, shifted by base
++}
++
++template <typename T> // 2D: element offset of coordinate `length` given `stride`, relative to `base`
++__device__ static size_t
++    compute_index(const input_val_2D<T>& length, const input_val_2D<T>& stride, size_t base)
++{
++    return (length.val1 * stride.val1) + (length.val2 * stride.val2) + base; // sum of coordinate * stride per dimension
++}
++
++template <typename T> // 3D: element offset of coordinate `length` given `stride`, relative to `base`
++__device__ static size_t
++    compute_index(const input_val_3D<T>& length, const input_val_3D<T>& stride, size_t base)
++{
++    return (length.val1 * stride.val1) + (length.val2 * stride.val2) + (length.val3 * stride.val3) // sum of coordinate * stride per dimension
++           + base;
++}
++
++template <typename T> // 1D: all-zero value; the argument's contents are not read
++static inline input_val_1D<T> make_zero_length(const input_val_1D<T>& whole_length)
++{
++    return input_val_1D<T>{0};
++}
++
++template <typename T> // 2D: all-zero value; the argument's contents are not read
++static inline input_val_2D<T> make_zero_length(const input_val_2D<T>& whole_length)
++{
++    return input_val_2D<T>{0, 0};
++}
++
++template <typename T> // 3D: all-zero value; the argument's contents are not read
++static inline input_val_3D<T> make_zero_length(const input_val_3D<T>& whole_length)
++{
++    return input_val_3D<T>{0, 0, 0};
++}
++
++template <typename T> // 1D: stride of a densely packed array (always 1; the argument's contents are not read)
++static inline input_val_1D<T> make_unit_stride(const input_val_1D<T>& whole_length)
++{
++    return input_val_1D<T>{1};
++}
++
++template <typename T> // 2D: strides of a densely packed array whose first dimension varies fastest
++static inline input_val_2D<T> make_unit_stride(const input_val_2D<T>& whole_length)
++{
++    return input_val_2D<T>{1, whole_length.val1}; // second dimension steps over one full first-dimension line
++}
++
++template <typename T> // 3D: strides of a densely packed array whose first dimension varies fastest
++static inline input_val_3D<T> make_unit_stride(const input_val_3D<T>& whole_length)
++{
++    return input_val_3D<T>{1, whole_length.val1, whole_length.val1 * whole_length.val2}; // 1, line size, plane size
++}
++
++template <typename T> // 1D: coordinate of flat element index i inside its batch entry
++__device__ static input_val_1D<T> get_length(const size_t i, const input_val_1D<T>& whole_length)
++{
++    auto xlen = whole_length.val1;
++
++    auto xidx = i % xlen; // position along the first dimension; the quotient is what get_batch() returns
++
++    return input_val_1D<T>{xidx};
++}
++
++template <typename T> // 2D: coordinate of flat element index i inside its batch entry (first dimension fastest)
++__device__ static input_val_2D<T> get_length(const size_t i, const input_val_2D<T>& whole_length)
++{
++    auto xlen = whole_length.val1;
++    auto ylen = whole_length.val2;
++
++    auto xidx = i % xlen; // position along the first dimension
++    auto yidx = i / xlen % ylen; // position along the second dimension
++
++    return input_val_2D<T>{xidx, yidx};
++}
++
++template <typename T> // 3D: coordinate of flat element index i inside its batch entry (first dimension fastest)
++__device__ static input_val_3D<T> get_length(const size_t i, const input_val_3D<T>& whole_length)
++{
++    auto xlen = whole_length.val1;
++    auto ylen = whole_length.val2;
++    auto zlen = whole_length.val3;
++
++    auto xidx = i % xlen; // position along the first dimension
++    auto yidx = i / xlen % ylen; // position along the second dimension
++    auto zidx = i / xlen / ylen % zlen; // position along the third dimension
++
++    return input_val_3D<T>{xidx, yidx, zidx};
++}
++
++template <typename T> // 1D: batch entry that flat element index i belongs to
++__device__ static size_t get_batch(const size_t i, const input_val_1D<T>& whole_length)
++{
++    auto xlen = whole_length.val1;
++
++    auto yidx = i / xlen; // whole batch entries (of xlen elements each) that precede element i
++
++    return yidx;
++}
++
++template <typename T> // 2D: batch entry that flat element index i belongs to
++__device__ static size_t get_batch(const size_t i, const input_val_2D<T>& whole_length)
++{
++    auto xlen = whole_length.val1;
++    auto ylen = whole_length.val2;
++
++    auto zidx = i / xlen / ylen; // whole batch entries (of xlen * ylen elements each) that precede element i
++
++    return zidx;
++}
++
++template <typename T> // 3D: batch entry that flat element index i belongs to
++__device__ static size_t get_batch(const size_t i, const input_val_3D<T>& length)
++{
++    auto xlen = length.val1;
++    auto ylen = length.val2;
++    auto zlen = length.val3;
++
++    auto widx = i / xlen / ylen / zlen; // whole batch entries (of xlen * ylen * zlen elements each) that precede element i
++
++    return widx;
++}
++
++__device__ static double make_random_val(hiprandStatePhilox4_32_10* gen_state, double offset) // double overload
++{
++    return hiprand_uniform_double(gen_state) + offset; // uniform sample shifted by offset; advances gen_state
++}
++
++__device__ static float make_random_val(hiprandStatePhilox4_32_10* gen_state, float offset) // float overload
++{
++    return hiprand_uniform(gen_state) + offset; // uniform sample shifted by offset; advances gen_state
++}
++
++__device__ static _Float16 make_random_val(hiprandStatePhilox4_32_10* gen_state, _Float16 offset) // half-precision overload
++{
++    return static_cast<_Float16>(hiprand_uniform(gen_state)) + offset; // float sample narrowed to _Float16, then shifted
++}
++
++template <typename Tcomplex> // interleaved layout: Tcomplex exposes the imaginary part as member .y
++__device__ static void set_imag_zero(const size_t pos, Tcomplex* x)
++{
++    x[pos].y = 0.0; // real part (.x) is left untouched
++}
++
++template <typename Tfloat> // planar layout: real and imaginary parts live in separate arrays
++__device__ static void set_imag_zero(const size_t pos, Tfloat* xreal, Tfloat* ximag)
++{
++    ximag[pos] = 0.0; // xreal is not accessed; it is taken only to mirror the conjugate() signature
++}
++
++template <typename Tcomplex> // interleaved layout: x[pos] = complex conjugate of x[cpos]
++__device__ static void conjugate(const size_t pos, const size_t cpos, Tcomplex* x)
++{
++    x[pos].x = x[cpos].x; // copy the real part
++    x[pos].y = -x[cpos].y; // negate the imaginary part
++}
++
++template <typename Tfloat> // planar layout: element pos = complex conjugate of element cpos
++__device__ static void conjugate(const size_t pos, const size_t cpos, Tfloat* xreal, Tfloat* ximag)
++{
++    xreal[pos] = xreal[cpos]; // copy the real part
++    ximag[pos] = -ximag[cpos]; // negate the imaginary part
++}
++
++template <typename Tint, typename Treal> // one thread per element: writes a random complex value (interleaved layout)
++__global__ static void __launch_bounds__(DATA_GEN_THREADS)
++    generate_random_interleaved_data_kernel(const Tint             whole_length, // per-dimension lengths of one batch entry
++                                            const Tint             zero_length, // all-zero coordinate (presumably from make_zero_length)
++                                            const size_t           idist, // element distance between batch entries
++                                            const size_t           isize, // total number of elements to write
++                                            const Tint             istride, // per-dimension strides of `data`
++                                            rocfft_complex<Treal>* data) // output buffer
++{
++    auto const i = static_cast<size_t>(threadIdx.x) + static_cast<size_t>(blockIdx.x) * blockDim.x
++                   + static_cast<size_t>(blockIdx.y) * gridDim.x * DATA_GEN_THREADS; // widened before multiplying so i cannot wrap at 2^32
++    static_assert(sizeof(i) >= sizeof(isize));
++    if(i < isize) // surplus threads of the last block do nothing
++    {
++        auto i_length = get_length(i, whole_length); // coordinate inside the batch entry
++        auto i_batch  = get_batch(i, whole_length); // which batch entry
++        auto i_base   = i_batch * idist; // element offset of that batch entry
++
++        auto seed = compute_index(zero_length, istride, i_base); // equals i_base when zero_length is all zeros
++        auto idx  = compute_index(i_length, istride, i_base); // strided offset of this element
++
++        hiprandStatePhilox4_32_10 gen_state;
++        hiprand_init(seed, idx, 0, &gen_state); // seed per batch entry, subsequence per element, offset 0
++
++        data[idx].x = make_random_val(&gen_state, static_cast<Treal>(-0.5)); // uniform sample shifted by -0.5
++        data[idx].y = make_random_val(&gen_state, static_cast<Treal>(-0.5));
++    }
++}
++
++template <typename Tint, typename Treal> // one thread per element: writes a deterministic complex value (interleaved layout)
++__global__ static void __launch_bounds__(DATA_GEN_THREADS)
++    generate_interleaved_data_kernel(const Tint             whole_length, // per-dimension lengths of one batch entry
++                                     const size_t           idist, // element distance between batch entries
++                                     const size_t           isize, // total number of elements to write
++                                     const Tint             istride, // per-dimension strides of `data`
++                                     const Tint             ustride, // strides used to number elements (presumably from make_unit_stride)
++                                     const Treal            inv_scale, // factor applied to the element number
++                                     rocfft_complex<Treal>* data) // output buffer
++{
++    auto const i = static_cast<size_t>(threadIdx.x) + static_cast<size_t>(blockIdx.x) * blockDim.x
++                   + static_cast<size_t>(blockIdx.y) * gridDim.x * DATA_GEN_THREADS; // widened before multiplying so i cannot wrap at 2^32
++    static_assert(sizeof(i) >= sizeof(isize));
++    if(i < isize) // surplus threads of the last block do nothing
++    {
++        const auto i_length = get_length(i, whole_length); // coordinate inside the batch entry
++        const auto i_batch  = get_batch(i, whole_length); // which batch entry
++        const auto i_base   = i_batch * idist; // element offset of that batch entry
++
++        const auto val = static_cast<Treal>(-0.5) // value = -0.5 + (element number under ustride) * inv_scale
++                         + static_cast<Treal>(
++                               static_cast<unsigned long long>(compute_index(i_length, ustride, 0)))
++                               * inv_scale;
++
++        const auto idx = compute_index(i_length, istride, i_base); // strided offset of this element
++
++        data[idx].x = val; // real and imaginary parts receive the same value
++        data[idx].y = val;
++    }
++}
++
++template <typename Tint, typename Treal> // one thread per element: writes a random complex value (planar layout)
++__global__ static void __launch_bounds__(DATA_GEN_THREADS)
++    generate_random_planar_data_kernel(const Tint   whole_length, // per-dimension lengths of one batch entry
++                                       const Tint   zero_length, // all-zero coordinate (presumably from make_zero_length)
++                                       const size_t idist, // element distance between batch entries
++                                       const size_t isize, // total number of elements to write
++                                       const Tint   istride, // per-dimension strides of both output arrays
++                                       Treal*       real_data, // output: real parts
++                                       Treal*       imag_data) // output: imaginary parts
++{
++    auto const i = static_cast<size_t>(threadIdx.x) + static_cast<size_t>(blockIdx.x) * blockDim.x
++                   + static_cast<size_t>(blockIdx.y) * gridDim.x * DATA_GEN_THREADS; // widened before multiplying so i cannot wrap at 2^32
++    static_assert(sizeof(i) >= sizeof(isize));
++    if(i < isize) // surplus threads of the last block do nothing
++    {
++        auto i_length = get_length(i, whole_length); // coordinate inside the batch entry
++        auto i_batch  = get_batch(i, whole_length); // which batch entry
++        auto i_base   = i_batch * idist; // element offset of that batch entry
++
++        auto seed = compute_index(zero_length, istride, i_base); // equals i_base when zero_length is all zeros
++        auto idx  = compute_index(i_length, istride, i_base); // strided offset of this element
++
++        hiprandStatePhilox4_32_10 gen_state;
++        hiprand_init(seed, idx, 0, &gen_state); // seed per batch entry, subsequence per element, offset 0
++
++        real_data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5)); // uniform sample shifted by -0.5
++        imag_data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5));
++    }
++}
++
++template <typename Tint, typename Treal> // one thread per element: writes a deterministic complex value (planar layout)
++__global__ static void __launch_bounds__(DATA_GEN_THREADS)
++    generate_planar_data_kernel(const Tint   whole_length, // per-dimension lengths of one batch entry
++                                const size_t idist, // element distance between batch entries
++                                const size_t isize, // total number of elements to write
++                                const Tint   istride, // per-dimension strides of both output arrays
++                                const Tint   ustride, // strides used to number elements (presumably from make_unit_stride)
++                                const Treal  inv_scale, // factor applied to the element number
++                                Treal*       real_data, // output: real parts
++                                Treal*       imag_data) // output: imaginary parts
++{
++    auto const i = static_cast<size_t>(threadIdx.x) + static_cast<size_t>(blockIdx.x) * blockDim.x
++                   + static_cast<size_t>(blockIdx.y) * gridDim.x * DATA_GEN_THREADS; // widened before multiplying so i cannot wrap at 2^32
++    static_assert(sizeof(i) >= sizeof(isize));
++    if(i < isize) // surplus threads of the last block do nothing
++    {
++        const auto i_length = get_length(i, whole_length); // coordinate inside the batch entry
++        const auto i_batch  = get_batch(i, whole_length); // which batch entry
++        const auto i_base   = i_batch * idist; // element offset of that batch entry
++
++        const auto val = static_cast<Treal>(-0.5) // value = -0.5 + (element number under ustride) * inv_scale
++                         + static_cast<Treal>(
++                               static_cast<unsigned long long>(compute_index(i_length, ustride, 0)))
++                               * inv_scale;
++
++        const auto idx = compute_index(i_length, istride, i_base); // strided offset of this element
++
++        real_data[idx] = val; // real and imaginary parts receive the same value
++        imag_data[idx] = val;
++    }
++}
++
++template <typename Tint, typename Treal> // one thread per element: writes a random real value
++__global__ static void __launch_bounds__(DATA_GEN_THREADS)
++    generate_random_real_data_kernel(const Tint   whole_length, // per-dimension lengths of one batch entry
++                                     const Tint   zero_length, // all-zero coordinate (presumably from make_zero_length)
++                                     const size_t idist, // element distance between batch entries
++                                     const size_t isize, // total number of elements to write
++                                     const Tint   istride, // per-dimension strides of `data`
++                                     Treal*       data) // output buffer
++{
++    auto const i = static_cast<size_t>(threadIdx.x) + static_cast<size_t>(blockIdx.x) * blockDim.x
++                   + static_cast<size_t>(blockIdx.y) * gridDim.x * DATA_GEN_THREADS; // widened before multiplying so i cannot wrap at 2^32
++    static_assert(sizeof(i) >= sizeof(isize));
++    if(i < isize) // surplus threads of the last block do nothing
++    {
++        auto i_length = get_length(i, whole_length); // coordinate inside the batch entry
++        auto i_batch  = get_batch(i, whole_length); // which batch entry
++        auto i_base   = i_batch * idist; // element offset of that batch entry
++
++        auto seed = compute_index(zero_length, istride, i_base); // equals i_base when zero_length is all zeros
++        auto idx  = compute_index(i_length, istride, i_base); // strided offset of this element
++
++        hiprandStatePhilox4_32_10 gen_state;
++        hiprand_init(seed, idx, 0, &gen_state); // seed per batch entry, subsequence per element, offset 0
++
++        data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5)); // uniform sample shifted by -0.5
++    }
++}
++
++template <typename Tint, typename Treal> // one thread per element: writes a deterministic real value
++__global__ static void __launch_bounds__(DATA_GEN_THREADS)
++    generate_real_data_kernel(const Tint   whole_length, // per-dimension lengths of one batch entry
++                              const size_t idist, // element distance between batch entries
++                              const size_t isize, // total number of elements to write
++                              const Tint   istride, // per-dimension strides of `data`
++                              const Tint   ustride, // strides used to number elements (presumably from make_unit_stride)
++                              const Treal  inv_scale, // factor applied to the element number
++                              Treal*       data) // output buffer
++{
++    auto const i = static_cast<size_t>(threadIdx.x) + static_cast<size_t>(blockIdx.x) * blockDim.x
++                   + static_cast<size_t>(blockIdx.y) * gridDim.x * DATA_GEN_THREADS; // widened before multiplying so i cannot wrap at 2^32
++    static_assert(sizeof(i) >= sizeof(isize));
++    if(i < isize) // surplus threads of the last block do nothing
++    {
++        const auto i_length = get_length(i, whole_length); // coordinate inside the batch entry
++        const auto i_batch  = get_batch(i, whole_length); // which batch entry
++        const auto i_base   = i_batch * idist; // element offset of that batch entry
++
++        const auto val = static_cast<Treal>(-0.5) // value = -0.5 + (element number under ustride) * inv_scale
++                         + static_cast<Treal>(
++                               static_cast<unsigned long long>(compute_index(i_length, ustride, 0)))
++                               * inv_scale;
++
++        const auto idx = compute_index(i_length, istride, i_base); // strided offset of this element
++
++        data[idx] = val;
++    }
++}
++
++// For complex-to-real transforms, the input data must be Hermitian-symmetric.
++// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier
++// space.  For multi-dimensional data, this means that we only need to store a bit more
++// than half of the complex values; the rest are redundant.  However, there are still
++// some restrictions:
++// * the origin and Nyquist value(s) must be real-valued
++// * some of the remaining values are still redundant, and you might get different results
++//   than you expect if the values don't agree.
++
++template <typename Tcomplex> // 1D, interleaved: zeroes the imaginary part of element 0 and, if Nxeven, of element Nx/2
++__global__ static void impose_hermitian_symmetry_interleaved_1D_kernel(Tcomplex*    x,
++                                                                       const size_t Nx,
++                                                                       const size_t xstride,
++                                                                       const size_t dist,
++                                                                       const size_t batch_total,
++                                                                       const bool   Nxeven)
++{
++    auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x; // one thread per batch entry
++    static_assert(sizeof(id_batch) == sizeof(size_t));
++
++    if(id_batch < batch_total)
++    {
++        id_batch *= dist; // from here on: element offset of this batch entry
++
++        set_imag_zero(id_batch, x); // first element of the batch entry
++
++        if(Nxeven) // presumably Nx % 2 == 0, supplied by the caller
++            set_imag_zero(id_batch + (Nx / 2) * xstride, x); // element at index Nx/2
++    }
++}
++
++template <typename Tfloat> // 1D, planar: zeroes the imaginary part of element 0 and, if Nxeven, of element Nx/2
++__global__ static void impose_hermitian_symmetry_planar_1D_kernel(Tfloat*      xreal,
++                                                                  Tfloat*      ximag,
++                                                                  const size_t Nx,
++                                                                  const size_t xstride,
++                                                                  const size_t dist,
++                                                                  const size_t batch_total,
++                                                                  const bool   Nxeven)
++{
++    auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x; // one thread per batch entry
++    static_assert(sizeof(id_batch) == sizeof(size_t));
++
++    if(id_batch < batch_total)
++    {
++        id_batch *= dist; // from here on: element offset of this batch entry
++
++        set_imag_zero(id_batch, xreal, ximag); // first element of the batch entry
++
++        if(Nxeven) // presumably Nx % 2 == 0, supplied by the caller
++            set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag); // element at index Nx/2
++    }
++}
++
++template <typename Tcomplex> // 2D, interleaved: zeroes selected imaginary parts and mirrors x-pairs as conjugates
++__global__ static void impose_hermitian_symmetry_interleaved_2D_kernel(Tcomplex*    x,
++                                                                       const size_t Nx,
++                                                                       const size_t Ny,
++                                                                       const size_t xstride,
++                                                                       const size_t ystride,
++                                                                       const size_t dist,
++                                                                       const size_t batch_total,
++                                                                       const size_t x_total,
++                                                                       const bool   Nxeven,
++                                                                       const bool   Nyeven)
++{
++    auto       id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x; // grid x -> batch entry
++    const auto id_x     = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y; // grid y -> x-pair index
++    static_assert(sizeof(id_batch) == sizeof(size_t));
++    static_assert(sizeof(id_x) == sizeof(size_t));
++
++    if(id_batch < batch_total)
++    {
++        id_batch *= dist; // from here on: element offset of this batch entry
++
++        if(id_x == 0) // only the id_x == 0 thread zeroes the special points
++            set_imag_zero(id_batch, x); // (0, 0)
++
++        if(id_x == 0 && Nxeven)
++            set_imag_zero(id_batch + (Nx / 2) * xstride, x); // (Nx/2, 0)
++
++        if(id_x == 0 && Nyeven)
++            set_imag_zero(id_batch + ystride * (Ny / 2), x); // (0, Ny/2)
++
++        if(id_x == 0 && Nxeven && Nyeven)
++            set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x); // (Nx/2, Ny/2)
++
++        if(id_x < x_total) // x_total bounds the number of mirrored x-pairs (value chosen by the caller)
++        {
++            conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x); // y = 0: x[Nx-1-id_x] = conj(x[id_x+1])
++
++            if(Nyeven)
++                conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), // same pairing on the y = Ny/2 row
++                          id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
++                          x);
++        }
++    }
++}
++
++template <typename Tfloat> // 2D, planar: zeroes selected imaginary parts and mirrors x-pairs as conjugates
++__global__ static void impose_hermitian_symmetry_planar_2D_kernel(Tfloat*      xreal,
++                                                                  Tfloat*      ximag,
++                                                                  const size_t Nx,
++                                                                  const size_t Ny,
++                                                                  const size_t xstride,
++                                                                  const size_t ystride,
++                                                                  const size_t dist,
++                                                                  const size_t batch_total,
++                                                                  const size_t x_total,
++                                                                  const bool   Nxeven,
++                                                                  const bool   Nyeven)
++{
++    auto       id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x; // grid x -> batch entry
++    const auto id_x     = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y; // grid y -> x-pair index
++    static_assert(sizeof(id_batch) == sizeof(size_t));
++    static_assert(sizeof(id_x) == sizeof(size_t));
++
++    if(id_batch < batch_total)
++    {
++        id_batch *= dist; // from here on: element offset of this batch entry
++
++        if(id_x == 0) // only the id_x == 0 thread zeroes the special points
++            set_imag_zero(id_batch, xreal, ximag); // (0, 0)
++
++        if(id_x == 0 && Nxeven)
++            set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag); // (Nx/2, 0)
++
++        if(id_x == 0 && Nyeven)
++            set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag); // (0, Ny/2)
++
++        if(id_x == 0 && Nxeven && Nyeven)
++            set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag); // (Nx/2, Ny/2)
++
++        if(id_x < x_total) // x_total bounds the number of mirrored x-pairs (value chosen by the caller)
++        {
++            conjugate(id_batch + xstride * (Nx - (id_x + 1)), // y = 0: element Nx-1-id_x = conj(element id_x+1)
++                      id_batch + xstride * (id_x + 1),
++                      xreal,
++                      ximag);
++
++            if(Nyeven)
++                conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), // same pairing on the y = Ny/2 row
++                          id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
++                          xreal,
++                          ximag);
++        }
++    }
++}
++
++template <typename Tcomplex> // 3D, interleaved: zeroes selected imaginary parts and mirrors (x, y) pairs as conjugates
++__global__ static void impose_hermitian_symmetry_interleaved_3D_kernel(Tcomplex*    x,
++                                                                       const size_t Nx,
++                                                                       const size_t Ny,
++                                                                       const size_t Nz,
++                                                                       const size_t xstride,
++                                                                       const size_t ystride,
++                                                                       const size_t zstride,
++                                                                       const size_t dist,
++                                                                       const size_t batch_total,
++                                                                       const size_t x_total,
++                                                                       const size_t y_total,
++                                                                       const size_t y_total_half,
++                                                                       const bool   Nxeven,
++                                                                       const bool   Nyeven,
++                                                                       const bool   Nzeven)
++{
++    auto       id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x; // grid x -> batch entry
++    const auto id_x     = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y; // grid y -> x-pair index
++    const auto id_y     = static_cast<size_t>(threadIdx.z) + blockIdx.z * blockDim.z; // grid z -> y-pair index
++    static_assert(sizeof(id_batch) == sizeof(size_t));
++    static_assert(sizeof(id_x) == sizeof(size_t));
++    static_assert(sizeof(id_y) == sizeof(size_t));
++
++    if(id_batch < batch_total)
++    {
++        auto id_x_y_zero = (id_x == 0 && id_y == 0); // the single thread per batch entry that zeroes the special points
++
++        id_batch *= dist; // from here on: element offset of this batch entry
++
++        if(id_x_y_zero)
++            set_imag_zero(id_batch, x); // (0, 0, 0)
++
++        if(Nxeven && id_x_y_zero)
++            set_imag_zero(id_batch + xstride * (Nx / 2), x); // (Nx/2, 0, 0)
++
++        if(Nyeven && id_x_y_zero)
++            set_imag_zero(id_batch + ystride * (Ny / 2), x); // (0, Ny/2, 0)
++
++        if(Nzeven && id_x_y_zero)
++            set_imag_zero(id_batch + zstride * (Nz / 2), x); // (0, 0, Nz/2)
++
++        if(Nxeven && Nyeven && id_x_y_zero)
++            set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x); // (Nx/2, Ny/2, 0)
++
++        if(Nxeven && Nzeven && id_x_y_zero)
++            set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), x); // (Nx/2, 0, Nz/2)
++
++        if(Nyeven && Nzeven && id_x_y_zero)
++            set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), x); // (0, Ny/2, Nz/2)
++
++        if(Nxeven && Nyeven && Nzeven && id_x_y_zero)
++            set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2), // (Nx/2, Ny/2, Nz/2)
++                          x);
++
++        if(id_x == 0 && id_y < y_total_half) // z = 0 plane, x = 0 line: mirror in y
++            conjugate(id_batch + ystride * (Ny - (id_y + 1)), id_batch + ystride * (id_y + 1), x);
++
++        if(Nxeven && id_x == 0 && id_y < y_total_half) // z = 0 plane, x = Nx/2 line: mirror in y
++            conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)),
++                      id_batch + xstride * (Nx / 2) + ystride * (id_y + 1),
++                      x);
++
++        if(id_x < x_total && id_y == 0) // z = 0 plane, y = 0 row: mirror in x
++            conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x);
++
++        if(Nyeven && id_x < x_total && id_y == 0) // z = 0 plane, y = Ny/2 row: mirror in x
++            conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
++                      id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
++                      x);
++
++        if(id_x < x_total && id_y < y_total) // z = 0 plane, general point: mirror in both x and y
++            conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)),
++                      id_batch + xstride * (id_x + 1) + ystride * (id_y + 1),
++                      x);
++
++        if(Nzeven) // repeat the same five pairings on the z = Nz/2 plane
++        {
++            if(id_x < x_total && id_y == 0) // y = 0 row
++                conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
++                          id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
++                          x);
++
++            if(Nyeven && id_x < x_total && id_y == 0) // y = Ny/2 row (the ystride term was missing, duplicating the y = 0 case)
++                conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2) + zstride * (Nz / 2),
++                          id_batch + xstride * (id_x + 1) + ystride * (Ny / 2) + zstride * (Nz / 2),
++                          x);
++
++            if(id_x == 0 && id_y < y_total_half) // x = 0 line
++                conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2),
++                          id_batch + ystride * (id_y + 1) + zstride * (Nz / 2),
++                          x);
++
++            if(Nxeven && id_x == 0 && id_y < y_total_half) // x = Nx/2 line
++                conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1))
++                              + zstride * (Nz / 2),
++                          id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2),
++                          x);
++
++            if(id_x < x_total && id_y < y_total) // general point
++                conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1))
++                              + zstride * (Nz / 2),
++                          id_batch + xstride * (id_x + 1) + ystride * (id_y + 1)
++                              + zstride * (Nz / 2),
++                          x);
++        }
++    }
++}
++
++template <typename Tfloat> // 3D, planar: zeroes selected imaginary parts and mirrors (x, y) pairs as conjugates
++__global__ static void impose_hermitian_symmetry_planar_3D_kernel(Tfloat*      xreal,
++                                                                  Tfloat*      ximag,
++                                                                  const size_t Nx,
++                                                                  const size_t Ny,
++                                                                  const size_t Nz,
++                                                                  const size_t xstride,
++                                                                  const size_t ystride,
++                                                                  const size_t zstride,
++                                                                  const size_t dist,
++                                                                  const size_t batch_total,
++                                                                  const size_t x_total,
++                                                                  const size_t y_total,
++                                                                  const size_t y_total_half,
++                                                                  const bool   Nxeven,
++                                                                  const bool   Nyeven,
++                                                                  const bool   Nzeven)
++{
++    auto       id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x; // grid x -> batch entry
++    const auto id_x     = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y; // grid y -> x-pair index
++    const auto id_y     = static_cast<size_t>(threadIdx.z) + blockIdx.z * blockDim.z; // grid z -> y-pair index
++    static_assert(sizeof(id_batch) == sizeof(size_t));
++    static_assert(sizeof(id_x) == sizeof(size_t));
++    static_assert(sizeof(id_y) == sizeof(size_t));
++
++    if(id_batch < batch_total)
++    {
++        auto id_x_y_zero = (id_x == 0 && id_y == 0); // the single thread per batch entry that zeroes the special points
++
++        id_batch *= dist; // from here on: element offset of this batch entry
++
++        if(id_x_y_zero)
++            set_imag_zero(id_batch, xreal, ximag); // (0, 0, 0)
++
++        if(Nxeven && id_x_y_zero)
++            set_imag_zero(id_batch + xstride * (Nx / 2), xreal, ximag); // (Nx/2, 0, 0)
++
++        if(Nyeven && id_x_y_zero)
++            set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag); // (0, Ny/2, 0)
++
++        if(Nzeven && id_x_y_zero)
++            set_imag_zero(id_batch + zstride * (Nz / 2), xreal, ximag); // (0, 0, Nz/2)
++
++        if(Nxeven && Nyeven && id_x_y_zero)
++            set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag); // (Nx/2, Ny/2, 0)
++
++        if(Nxeven && Nzeven && id_x_y_zero)
++            set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), xreal, ximag); // (Nx/2, 0, Nz/2)
++
++        if(Nyeven && Nzeven && id_x_y_zero)
++            set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), xreal, ximag); // (0, Ny/2, Nz/2)
++
++        if(Nxeven && Nyeven && Nzeven && id_x_y_zero)
++            set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2), // (Nx/2, Ny/2, Nz/2)
++                          xreal,
++                          ximag);
++
++        if(id_x == 0 && id_y < y_total_half) // z = 0 plane, x = 0 line: mirror in y
++            conjugate(id_batch + ystride * (Ny - (id_y + 1)),
++                      id_batch + ystride * (id_y + 1),
++                      xreal,
++                      ximag);
++
++        if(Nxeven && id_x == 0 && id_y < y_total_half) // z = 0 plane, x = Nx/2 line: mirror in y
++            conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)),
++                      id_batch + xstride * (Nx / 2) + ystride * (id_y + 1),
++                      xreal,
++                      ximag);
++
++        if(id_x < x_total && id_y == 0) // z = 0 plane, y = 0 row: mirror in x
++            conjugate(id_batch + xstride * (Nx - (id_x + 1)),
++                      id_batch + xstride * (id_x + 1),
++                      xreal,
++                      ximag);
++
++        if(Nyeven && id_x < x_total && id_y == 0) // z = 0 plane, y = Ny/2 row: mirror in x
++            conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2),
++                      id_batch + xstride * (id_x + 1) + ystride * (Ny / 2),
++                      xreal,
++                      ximag);
++
++        if(id_x < x_total && id_y < y_total) // z = 0 plane, general point: mirror in both x and y
++            conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)),
++                      id_batch + xstride * (id_x + 1) + ystride * (id_y + 1),
++                      xreal,
++                      ximag);
++
++        if(Nzeven) // repeat the same five pairings on the z = Nz/2 plane
++        {
++            if(id_x < x_total && id_y == 0) // y = 0 row
++                conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2),
++                          id_batch + xstride * (id_x + 1) + zstride * (Nz / 2),
++                          xreal,
++                          ximag);
++
++            if(Nyeven && id_x < x_total && id_y == 0) // y = Ny/2 row (the ystride term was missing, duplicating the y = 0 case)
++                conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2) + zstride * (Nz / 2),
++                          id_batch + xstride * (id_x + 1) + ystride * (Ny / 2) + zstride * (Nz / 2),
++                          xreal,
++                          ximag);
++
++            if(id_x == 0 && id_y < y_total_half) // x = 0 line
++                conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2),
++                          id_batch + ystride * (id_y + 1) + zstride * (Nz / 2),
++                          xreal,
++                          ximag);
++
++            if(Nxeven && id_x == 0 && id_y < y_total_half) // x = Nx/2 line
++                conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1))
++                              + zstride * (Nz / 2),
++                          id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2),
++                          xreal,
++                          ximag);
++
++            if(id_x < x_total && id_y < y_total) // general point
++                conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1))
++                              + zstride * (Nz / 2),
++                          id_batch + xstride * (id_x + 1) + ystride * (id_y + 1)
++                              + zstride * (Nz / 2),
++                          xreal,
++                          ximag);
++        }
++    }
++}
++
++// get grid dimensions for data gen kernel: one thread per element,
++// DATA_GEN_THREADS threads per block, blocks spread over grid X and Y
++static dim3 generate_data_gridDim(const size_t isize)
++{
++    // total number of blocks needed in the grid
++    const size_t numBlocks = DivRoundingUp<size_t>(isize, DATA_GEN_THREADS);
++
++    // Total work items per dimension in the grid is counted in
++    // uint32_t, so very large inputs would overflow if all blocks
++    // sat in one grid dimension; split them across X and Y instead.
++    // Y is capped because it is more limited than X.  Do the
++    // arithmetic in size_t and narrow only the results, so a block
++    // count above UINT_MAX is not truncated before the division.
++    const size_t yMax      = DATA_GEN_GRID_Y_MAX;
++    const auto   gridDim_y = static_cast<unsigned int>(std::min(yMax, numBlocks));
++    const auto   gridDim_x = static_cast<unsigned int>(DivRoundingUp<size_t>(numBlocks, yMax));
++    return {gridDim_x, gridDim_y};
++}
++
++// get grid dimensions for hermitian symmetrizer kernel
++static dim3 generate_hermitian_gridDim(const std::vector<size_t>& length,
++                                       const size_t               batch,
++                                       const size_t               blockSize)
++{
++    dim3 gridDim;
++
++    switch(length.size())
++    {
++    case 1:
++        gridDim = dim3(DivRoundingUp<size_t>(batch, blockSize));
++        break;
++    case 2:
++        gridDim = dim3(DivRoundingUp<size_t>(batch, blockSize),
++                       DivRoundingUp<size_t>((length[0] + 1) / 2 - 1, blockSize));
++        break;
++    case 3:
++        gridDim = dim3(DivRoundingUp<size_t>(batch, blockSize),
++                       DivRoundingUp<size_t>((length[0] + 1) / 2 - 1, blockSize),
++                       DivRoundingUp<size_t>(length[1] - 1, blockSize));
++        break;
++    default:
++        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
++    }
++
++    return gridDim;
++}
++
++static dim3 generate_blockDim(const std::vector<size_t>& length, const size_t blockSize)
++{
++    dim3 blockDim;
++
++    switch(length.size())
++    {
++    case 1:
++        blockDim = dim3(blockSize);
++        break;
++    case 2:
++        blockDim = dim3(blockSize, blockSize);
++        break;
++    case 3:
++        blockDim = dim3(blockSize, blockSize, blockSize);
++        break;
++    default:
++        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
++    }
++
++    return blockDim;
++}
++
++template <typename Tint, typename Treal>
++static void generate_random_interleaved_data(const Tint&            whole_length,
++                                             const size_t           idist,
++                                             const size_t           isize,
++                                             const Tint&            whole_stride,
++                                             rocfft_complex<Treal>* input_data,
++                                             const hipDeviceProp_t& deviceProp)
++{
++    auto input_length = get_input_val(whole_length);
++    auto zero_length  = make_zero_length(input_length);
++    auto input_stride = get_input_val(whole_stride);
++
++    dim3 gridDim = generate_data_gridDim(isize);
++    dim3 blockDim{DATA_GEN_THREADS};
++
++    launch_limits_check("generate_random_interleaved_data_kernel", gridDim, blockDim, deviceProp);
++
++    hipLaunchKernelGGL(
++        HIP_KERNEL_NAME(generate_random_interleaved_data_kernel<decltype(input_length), Treal>),
++        gridDim,
++        blockDim,
++        0, // sharedMemBytes
++        0, // stream
++        input_length,
++        zero_length,
++        idist,
++        isize,
++        input_stride,
++        input_data);
++    auto err = hipGetLastError();
++    if(err != hipSuccess)
++        throw std::runtime_error("generate_random_interleaved_data_kernel launch failure: "
++                                 + std::string(hipGetErrorName(err)));
++}
++
++template <typename Tint, typename Treal>
++static void generate_interleaved_data(const Tint&            whole_length,
++                                      const size_t           idist,
++                                      const size_t           isize,
++                                      const Tint&            whole_stride,
++                                      const size_t           nbatch,
++                                      rocfft_complex<Treal>* input_data,
++                                      const hipDeviceProp_t& deviceProp)
++{
++    const auto input_length = get_input_val(whole_length);
++    const auto input_stride = get_input_val(whole_stride);
++    const auto unit_stride  = make_unit_stride(input_length);
++
++    const auto inv_scale
++        = static_cast<Treal>(1.0)
++          / static_cast<Treal>(static_cast<unsigned long long>(isize) / nbatch - 1);
++
++    dim3 gridDim = generate_data_gridDim(isize);
++    dim3 blockDim{DATA_GEN_THREADS};
++
++    launch_limits_check("generate_interleaved_data_kernel", gridDim, blockDim, deviceProp);
++
++    hipLaunchKernelGGL(
++        HIP_KERNEL_NAME(generate_interleaved_data_kernel<decltype(input_length), Treal>),
++        gridDim,
++        blockDim,
++        0, // sharedMemBytes
++        0, // stream
++        input_length,
++        idist,
++        isize,
++        input_stride,
++        unit_stride,
++        inv_scale,
++        input_data);
++    auto err = hipGetLastError();
++    if(err != hipSuccess)
++        throw std::runtime_error("generate_interleaved_data_kernel launch failure: "
++                                 + std::string(hipGetErrorName(err)));
++}
++
++template <typename Tint, typename Treal>
++static void generate_random_planar_data(const Tint&            whole_length,
++                                        const size_t           idist,
++                                        const size_t           isize,
++                                        const Tint&            whole_stride,
++                                        Treal*                 real_data,
++                                        Treal*                 imag_data,
++                                        const hipDeviceProp_t& deviceProp)
++{
++    const auto input_length = get_input_val(whole_length);
++    const auto zero_length  = make_zero_length(input_length);
++    const auto input_stride = get_input_val(whole_stride);
++
++    dim3 gridDim = generate_data_gridDim(isize);
++    dim3 blockDim{DATA_GEN_THREADS};
++
++    launch_limits_check("generate_random_planar_data_kernel", gridDim, blockDim, deviceProp);
++
++    hipLaunchKernelGGL(
++        HIP_KERNEL_NAME(generate_random_planar_data_kernel<decltype(input_length), Treal>),
++        gridDim,
++        blockDim,
++        0, // sharedMemBytes
++        0, // stream
++        input_length,
++        zero_length,
++        idist,
++        isize,
++        input_stride,
++        real_data,
++        imag_data);
++    auto err = hipGetLastError();
++    if(err != hipSuccess)
++        throw std::runtime_error("generate_random_planar_data_kernel launch failure: "
++                                 + std::string(hipGetErrorName(err)));
++}
++
++template <typename Tint, typename Treal>
++static void generate_planar_data(const Tint&            whole_length,
++                                 const size_t           idist,
++                                 const size_t           isize,
++                                 const Tint&            whole_stride,
++                                 const size_t           nbatch,
++                                 Treal*                 real_data,
++                                 Treal*                 imag_data,
++                                 const hipDeviceProp_t& deviceProp)
++{
++    const auto input_length = get_input_val(whole_length);
++    const auto input_stride = get_input_val(whole_stride);
++    const auto unit_stride  = make_unit_stride(input_length);
++
++    const auto inv_scale
++        = static_cast<Treal>(1.0)
++          / static_cast<Treal>(static_cast<unsigned long long>(isize) / nbatch - 1);
++
++    dim3 gridDim = generate_data_gridDim(isize);
++    dim3 blockDim{DATA_GEN_THREADS};
++
++    launch_limits_check("generate_planar_data_kernel", gridDim, blockDim, deviceProp);
++
++    hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_planar_data_kernel<decltype(input_length), Treal>),
++                       gridDim,
++                       blockDim,
++                       0, // sharedMemBytes
++                       0, // stream
++                       input_length,
++                       idist,
++                       isize,
++                       input_stride,
++                       unit_stride,
++                       inv_scale,
++                       real_data,
++                       imag_data);
++    auto err = hipGetLastError();
++    if(err != hipSuccess)
++        throw std::runtime_error("generate_planar_data_kernel launch failure: "
++                                 + std::string(hipGetErrorName(err)));
++}
++
++template <typename Tint, typename Treal>
++static void generate_random_real_data(const Tint&            whole_length,
++                                      const size_t           idist,
++                                      const size_t           isize,
++                                      const Tint&            whole_stride,
++                                      Treal*                 input_data,
++                                      const hipDeviceProp_t& deviceProp)
++{
++    const auto input_length = get_input_val(whole_length);
++    const auto zero_length  = make_zero_length(input_length);
++    const auto input_stride = get_input_val(whole_stride);
++
++    dim3 gridDim = generate_data_gridDim(isize);
++    dim3 blockDim{DATA_GEN_THREADS};
++
++    launch_limits_check("generate_random_real_data_kernel", gridDim, blockDim, deviceProp);
++
++    hipLaunchKernelGGL(
++        HIP_KERNEL_NAME(generate_random_real_data_kernel<decltype(input_length), Treal>),
++        gridDim,
++        blockDim,
++        0, // sharedMemBytes
++        0, // stream
++        input_length,
++        zero_length,
++        idist,
++        isize,
++        input_stride,
++        input_data);
++    auto err = hipGetLastError();
++    if(err != hipSuccess)
++        throw std::runtime_error("generate_random_real_data_kernel launch failure: "
++                                 + std::string(hipGetErrorName(err)));
++}
++
++template <typename Tint, typename Treal>
++static void generate_real_data(const Tint&            whole_length,
++                               const size_t           idist,
++                               const size_t           isize,
++                               const Tint&            whole_stride,
++                               const size_t           nbatch,
++                               Treal*                 input_data,
++                               const hipDeviceProp_t& deviceProp)
++{
++    const auto input_length = get_input_val(whole_length);
++    const auto input_stride = get_input_val(whole_stride);
++    const auto unit_stride  = make_unit_stride(input_length);
++
++    const auto inv_scale
++        = static_cast<Treal>(1.0)
++          / static_cast<Treal>(static_cast<unsigned long long>(isize) / nbatch - 1);
++
++    dim3 gridDim = generate_data_gridDim(isize);
++    dim3 blockDim{DATA_GEN_THREADS};
++
++    launch_limits_check("generate_real_data_kernel", gridDim, blockDim, deviceProp);
++
++    hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_real_data_kernel<decltype(input_length), Treal>),
++                       gridDim,
++                       blockDim,
++                       0, // sharedMemBytes
++                       0, // stream
++                       input_length,
++                       idist,
++                       isize,
++                       input_stride,
++                       unit_stride,
++                       inv_scale,
++                       input_data);
++    auto err = hipGetLastError();
++    if(err != hipSuccess)
++        throw std::runtime_error("generate_real_data_kernel launch failure: "
++                                 + std::string(hipGetErrorName(err)));
++}
++
++template <typename Tcomplex>
++static void impose_hermitian_symmetry_interleaved(const std::vector<size_t>& length,
++                                                  const std::vector<size_t>& ilength,
++                                                  const std::vector<size_t>& stride,
++                                                  const size_t               dist,
++                                                  const size_t               batch,
++                                                  Tcomplex*                  input_data,
++                                                  const hipDeviceProp_t&     deviceProp)
++{
++    auto blockSize = DATA_GEN_THREADS;
++    auto blockDim  = generate_blockDim(length, blockSize);
++    auto gridDim   = generate_hermitian_gridDim(length, batch, blockSize);
++
++    switch(length.size())
++    {
++    case 1:
++    {
++        launch_limits_check(
++            "impose_hermitian_symmetry_interleaved_1D_kernel", gridDim, blockDim, deviceProp);
++
++        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1D_kernel<Tcomplex>,
++                           gridDim,
++                           blockDim,
++                           0,
++                           0,
++                           input_data,
++                           length[0],
++                           stride[0],
++                           dist,
++                           batch,
++                           length[0] % 2 == 0);
++
++        break;
++    }
++    case 2:
++    {
++        launch_limits_check(
++            "impose_hermitian_symmetry_interleaved_2D_kernel", gridDim, blockDim, deviceProp);
++
++        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2D_kernel<Tcomplex>,
++                           gridDim,
++                           blockDim,
++                           0,
++                           0,
++                           input_data,
++                           length[0],
++                           length[1],
++                           stride[0],
++                           stride[1],
++                           dist,
++                           batch,
++                           (ilength[0] + 1) / 2 - 1,
++                           length[0] % 2 == 0,
++                           length[1] % 2 == 0);
++
++        break;
++    }
++    case 3:
++    {
++        launch_limits_check(
++            "impose_hermitian_symmetry_interleaved_3D_kernel", gridDim, blockDim, deviceProp);
++
++        hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3D_kernel<Tcomplex>,
++                           gridDim,
++                           blockDim,
++                           0,
++                           0,
++                           input_data,
++                           length[0],
++                           length[1],
++                           length[2],
++                           stride[0],
++                           stride[1],
++                           stride[2],
++                           dist,
++                           batch,
++                           (ilength[0] + 1) / 2 - 1,
++                           ilength[1] - 1,
++                           (ilength[1] + 1) / 2 - 1,
++                           length[0] % 2 == 0,
++                           length[1] % 2 == 0,
++                           length[2] % 2 == 0);
++        break;
++    }
++    default:
++        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
++    }
++    auto err = hipGetLastError();
++    if(err != hipSuccess)
++        throw std::runtime_error("impose_hermitian_symmetry_interleaved_kernel launch failure: "
++                                 + std::string(hipGetErrorName(err)));
++}
++
++template <typename Tfloat>
++static void impose_hermitian_symmetry_planar(const std::vector<size_t>& length,
++                                             const std::vector<size_t>& ilength,
++                                             const std::vector<size_t>& stride,
++                                             const size_t               dist,
++                                             const size_t               batch,
++                                             Tfloat*                    input_data_real,
++                                             Tfloat*                    input_data_imag,
++                                             const hipDeviceProp_t&     deviceProp)
++{
++    auto blockSize = DATA_GEN_THREADS;
++    auto blockDim  = generate_blockDim(length, blockSize);
++    auto gridDim   = generate_hermitian_gridDim(length, batch, blockSize);
++
++    switch(length.size())
++    {
++    case 1:
++    {
++        launch_limits_check(
++            "impose_hermitian_symmetry_planar_1D_kernel", gridDim, blockDim, deviceProp);
++
++        hipLaunchKernelGGL(impose_hermitian_symmetry_planar_1D_kernel<Tfloat>,
++                           gridDim,
++                           blockDim,
++                           0,
++                           0,
++                           input_data_real,
++                           input_data_imag,
++                           length[0],
++                           stride[0],
++                           dist,
++                           batch,
++                           length[0] % 2 == 0);
++
++        break;
++    }
++    case 2:
++    {
++        launch_limits_check(
++            "impose_hermitian_symmetry_planar_2D_kernel", gridDim, blockDim, deviceProp);
++
++        hipLaunchKernelGGL(impose_hermitian_symmetry_planar_2D_kernel<Tfloat>,
++                           gridDim,
++                           blockDim,
++                           0,
++                           0,
++                           input_data_real,
++                           input_data_imag,
++                           length[0],
++                           length[1],
++                           stride[0],
++                           stride[1],
++                           dist,
++                           batch,
++                           (ilength[0] + 1) / 2 - 1,
++                           length[0] % 2 == 0,
++                           length[1] % 2 == 0);
++
++        break;
++    }
++    case 3:
++    {
++        launch_limits_check(
++            "impose_hermitian_symmetry_planar_3D_kernel", gridDim, blockDim, deviceProp);
++
++        hipLaunchKernelGGL(impose_hermitian_symmetry_planar_3D_kernel<Tfloat>,
++                           gridDim,
++                           blockDim,
++                           0,
++                           0,
++                           input_data_real,
++                           input_data_imag,
++                           length[0],
++                           length[1],
++                           length[2],
++                           stride[0],
++                           stride[1],
++                           stride[2],
++                           dist,
++                           batch,
++                           (ilength[0] + 1) / 2 - 1,
++                           ilength[1] - 1,
++                           (ilength[1] + 1) / 2 - 1,
++                           length[0] % 2 == 0,
++                           length[1] % 2 == 0,
++                           length[2] % 2 == 0);
++        break;
++    }
++    default:
++        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
++    }
++    auto err = hipGetLastError();
++    if(err != hipSuccess)
++        throw std::runtime_error("impose_hermitian_symmetry_planar_kernel launch failure: "
++                                 + std::string(hipGetErrorName(err)));
++}
++
++#endif // DATA_GEN_DEVICE_H
+diff --git a/shared/data_gen_host.h b/shared/data_gen_host.h
+new file mode 100644
+index 0000000..29d3854
+--- /dev/null
++++ b/shared/data_gen_host.h
+@@ -0,0 +1,881 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef DATA_GEN_HOST_H
++#define DATA_GEN_HOST_H
++
++#include "../shared/hostbuf.h"
++#include "../shared/increment.h"
++#include <complex>
++#include <limits>
++#include <random>
++#include <tuple>
++#include <vector>
++
++// Specialized computation of index given 1-, 2-, 3- dimension length + stride
++template <typename T1, typename T2>
++size_t compute_index(T1 length, T2 stride, size_t base)
++{
++    return (length * stride) + base;
++}
++
++template <typename T1, typename T2>
++size_t
++    compute_index(const std::tuple<T1, T1>& length, const std::tuple<T2, T2>& stride, size_t base)
++{
++    static_assert(std::is_integral<T1>::value, "Integral required.");
++    static_assert(std::is_integral<T2>::value, "Integral required.");
++    return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride))
++           + base;
++}
++
++template <typename T1, typename T2>
++size_t compute_index(const std::tuple<T1, T1, T1>& length,
++                     const std::tuple<T2, T2, T2>& stride,
++                     size_t                        base)
++{
++    static_assert(std::is_integral<T1>::value, "Integral required.");
++    static_assert(std::is_integral<T2>::value, "Integral required.");
++    return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride))
++           + (std::get<2>(length) * std::get<2>(stride)) + base;
++}
++
++// count the number of total iterations for 1-, 2-, and 3-D dimensions
++template <typename T1>
++size_t count_iters(const T1& i)
++{
++    return i;
++}
++
++template <typename T1>
++size_t count_iters(const std::tuple<T1, T1>& i)
++{
++    return std::get<0>(i) * std::get<1>(i);
++}
++
++template <typename T1>
++size_t count_iters(const std::tuple<T1, T1, T1>& i)
++{
++    return std::get<0>(i) * std::get<1>(i) * std::get<2>(i);
++}
++
++template <typename T1>
++T1 make_unit_stride(const T1& whole_length)
++{
++    return static_cast<T1>(1);
++}
++
++template <typename T1>
++std::tuple<T1, T1> make_unit_stride(const std::tuple<T1, T1>& whole_length)
++{
++    return std::make_tuple(static_cast<T1>(1), static_cast<T1>(std::get<0>(whole_length)));
++}
++
++template <typename T1>
++std::tuple<T1, T1, T1> make_unit_stride(const std::tuple<T1, T1, T1>& whole_length)
++{
++    return std::make_tuple(static_cast<T1>(1),
++                           static_cast<T1>(std::get<0>(whole_length)),
++                           static_cast<T1>(std::get<0>(whole_length))
++                               * static_cast<T1>(std::get<1>(whole_length)));
++}
++
++// Work out how many partitions to break our iteration problem into
++template <typename T1>
++static size_t compute_partition_count(T1 length)
++{
++#ifdef _OPENMP
++    // we seem to get contention from too many threads, which slows
++    // things down.  particularly noticeable with mix_3D tests
++    static const size_t MAX_PARTITIONS = 8;
++    size_t              iters          = count_iters(length);
++    size_t hw_threads = std::min(MAX_PARTITIONS, static_cast<size_t>(omp_get_num_procs()));
++    if(!hw_threads)
++        return 1;
++
++    // don't bother threading problem sizes that are too small. pick
++    // an arbitrary number of iterations and ensure that each thread
++    // has at least that many iterations to process
++    static const size_t MIN_ITERS_PER_THREAD = 2048;
++
++    // either use the whole CPU, or roughly ceil(iters/iters_per_thread); the "+ 1" keeps it >= 1
++    return std::min(hw_threads, (iters + MIN_ITERS_PER_THREAD + 1) / MIN_ITERS_PER_THREAD);
++#else
++    return 1;
++#endif
++}
++
++// Break a scalar length into some number of pieces, returning
++// [(start0, end0), (start1, end1), ...]
++template <typename T1>
++std::vector<std::pair<T1, T1>> partition_base(const T1& length, size_t num_parts)
++{
++    static_assert(std::is_integral<T1>::value, "Integral required.");
++
++    // make sure we don't exceed the length
++    num_parts = std::min(length, num_parts);
++
++    std::vector<std::pair<T1, T1>> ret(num_parts);
++    auto                           partition_size = length / num_parts;
++    T1                             cur_partition  = 0;
++    for(size_t i = 0; i < num_parts; ++i, cur_partition += partition_size)
++    {
++        ret[i].first  = cur_partition;
++        ret[i].second = cur_partition + partition_size;
++    }
++    // last partition might not divide evenly, fix it up
++    ret.back().second = length;
++    return ret;
++}
++
++// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths
++template <typename T1>
++std::vector<std::pair<T1, T1>> partition_rowmajor(const T1& length)
++{
++    return partition_base(length, compute_partition_count(length));
++}
++
++// Partition on the leftmost part of the tuple, for row-major indexing
++template <typename T1>
++std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>>
++    partition_rowmajor(const std::tuple<T1, T1>& length)
++{
++    auto partitions = partition_base(std::get<0>(length), compute_partition_count(length));
++    std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ret(partitions.size());
++    for(size_t i = 0; i < partitions.size(); ++i)
++    {
++        std::get<0>(ret[i].first)  = partitions[i].first;
++        std::get<1>(ret[i].first)  = 0;
++        std::get<0>(ret[i].second) = partitions[i].second;
++        std::get<1>(ret[i].second) = std::get<1>(length);
++    }
++    return ret;
++}
++template <typename T1>
++std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>>
++    partition_rowmajor(const std::tuple<T1, T1, T1>& length)
++{
++    auto partitions = partition_base(std::get<0>(length), compute_partition_count(length));
++    std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ret(partitions.size());
++    for(size_t i = 0; i < partitions.size(); ++i)
++    {
++        std::get<0>(ret[i].first)  = partitions[i].first;
++        std::get<1>(ret[i].first)  = 0;
++        std::get<2>(ret[i].first)  = 0;
++        std::get<0>(ret[i].second) = partitions[i].second;
++        std::get<1>(ret[i].second) = std::get<1>(length);
++        std::get<2>(ret[i].second) = std::get<2>(length);
++    }
++    return ret;
++}
++
++// For complex-to-real transforms, the input data must be Hermitian-symmetric.
++// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier
++// space.  For multi-dimensional data, this means that we only need to store a bit more
++// than half of the complex values; the rest are redundant.  However, there are still
++// some restrictions:
++// * the origin and Nyquist value(s) must be real-valued
++// * some of the remaining values are still redundant, and you might get different results
++//   than you expect if the values don't agree.
++// Below are some example kernels which impose Hermitian symmetry on a complex array
++// of the given dimensions.
++
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_interleaved_1D(std::vector<hostbuf>&     vals,
++                                                     const std::vector<Tsize>& length,
++                                                     const std::vector<Tsize>& istride,
++                                                     const Tsize               idist,
++                                                     const Tsize               nbatch)
++{
++    for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
++    {
++        auto data = ((std::complex<Tfloat>*)vals[0].data()) + ibatch * idist;
++
++        data[0].imag(0.0);
++
++        if(length[0] % 2 == 0)
++        {
++            data[istride[0] * (length[0] / 2)].imag(0.0);
++        }
++    }
++}
++
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_planar_1D(std::vector<hostbuf>&     vals,
++                                                const std::vector<Tsize>& length,
++                                                const std::vector<Tsize>& istride,
++                                                const Tsize               idist,
++                                                const Tsize               nbatch)
++{
++    for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
++    {
++        auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist;
++
++        data_imag[0] = 0.0;
++
++        if(length[0] % 2 == 0)
++        {
++            data_imag[istride[0] * (length[0] / 2)] = 0.0;
++        }
++    }
++}
++
++// Interleaved 2D variant, operating on std::complex<Tfloat> data in vals[0].
++// For each batch (idist elements apart):
++//  - the imaginary part is zeroed at index 0 and, for every even length, at
++//    the corresponding half-length offsets (individually and combined);
++//  - along dimension 0, element (length[0] - i) is overwritten with the
++//    conjugate of element i for 1 <= i < (length[0] + 1) / 2, both at
++//    offset 0 in dimension 1 and, when length[1] is even, at the
++//    length[1] / 2 offset.
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_interleaved_2D(std::vector<hostbuf>&     vals,
++                                                     const std::vector<Tsize>& length,
++                                                     const std::vector<Tsize>& istride,
++                                                     const Tsize               idist,
++                                                     const Tsize               nbatch)
++{
++    for(unsigned int batch = 0; batch < nbatch; ++batch)
++    {
++        auto* cdata = ((std::complex<Tfloat>*)vals[0].data()) + batch * idist;
++
++        const bool even0 = (length[0] % 2 == 0);
++        const bool even1 = (length[1] % 2 == 0);
++
++        // Half-length offsets; only used for addressing when the matching
++        // length is even.
++        const auto mid0 = istride[0] * (length[0] / 2);
++        const auto mid1 = istride[1] * (length[1] / 2);
++
++        // Exclusive upper bound of the mirroring loops.
++        const auto mirror_end = (length[0] + 1) / 2;
++
++        cdata[0].imag(0.0);
++        if(even0)
++            cdata[mid0].imag(0.0);
++        if(even1)
++            cdata[mid1].imag(0.0);
++        if(even0 && even1)
++            cdata[mid0 + mid1].imag(0.0);
++
++        for(unsigned int i = 1; i < mirror_end; ++i)
++            cdata[istride[0] * (length[0] - i)] = std::conj(cdata[istride[0] * i]);
++
++        if(even1)
++        {
++            for(unsigned int i = 1; i < mirror_end; ++i)
++            {
++                const auto src = istride[0] * i + mid1;
++                const auto dst = istride[0] * (length[0] - i) + mid1;
++                cdata[dst]     = std::conj(cdata[src]);
++            }
++        }
++    }
++}
++
++// Planar 2D variant: vals[0] and vals[1] are read as Tfloat arrays holding
++// the real and imaginary parts respectively.  For each batch (idist
++// elements apart):
++//  - the imaginary value is zeroed at index 0 and, for every even length, at
++//    the corresponding half-length offsets (individually and combined);
++//  - along dimension 0, element (length[0] - i) receives the real value and
++//    the negated imaginary value of element i for
++//    1 <= i < (length[0] + 1) / 2, both at offset 0 in dimension 1 and, when
++//    length[1] is even, at the length[1] / 2 offset.
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_planar_2D(std::vector<hostbuf>&     vals,
++                                                const std::vector<Tsize>& length,
++                                                const std::vector<Tsize>& istride,
++                                                const Tsize               idist,
++                                                const Tsize               nbatch)
++{
++    for(unsigned int batch = 0; batch < nbatch; ++batch)
++    {
++        Tfloat* re = ((Tfloat*)vals[0].data()) + batch * idist;
++        Tfloat* im = ((Tfloat*)vals[1].data()) + batch * idist;
++
++        const bool even0 = (length[0] % 2 == 0);
++        const bool even1 = (length[1] % 2 == 0);
++
++        // Half-length offsets; only used for addressing when the matching
++        // length is even.
++        const auto mid0 = istride[0] * (length[0] / 2);
++        const auto mid1 = istride[1] * (length[1] / 2);
++
++        // Exclusive upper bound of the mirroring loops.
++        const auto mirror_end = (length[0] + 1) / 2;
++
++        im[0] = 0.0;
++        if(even0)
++            im[mid0] = 0.0;
++        if(even1)
++            im[mid1] = 0.0;
++        if(even0 && even1)
++            im[mid0 + mid1] = 0.0;
++
++        for(unsigned int i = 1; i < mirror_end; ++i)
++        {
++            const auto src = istride[0] * i;
++            const auto dst = istride[0] * (length[0] - i);
++            re[dst]        = re[src];
++            im[dst]        = -im[src];
++        }
++
++        if(even1)
++        {
++            for(unsigned int i = 1; i < mirror_end; ++i)
++            {
++                const auto src = istride[0] * i + mid1;
++                const auto dst = istride[0] * (length[0] - i) + mid1;
++                re[dst]        = re[src];
++                im[dst]        = -im[src];
++            }
++        }
++    }
++}
++
++// Impose Hermitian symmetry on interleaved complex 3D data held in vals[0].
++//
++// For each of the nbatch batches (idist elements apart) this:
++//  - zeroes the imaginary part at index 0 and at every combination of
++//    half-length ("nyquist") offsets whose lengths are even;
++//  - in the plane at offset 0 along dimension 2 and, when length[2] is even,
++//    in the plane at the dimension-2 nyquist offset, overwrites the mirrored
++//    elements along dimensions 0 and 1 with the complex conjugate of their
++//    counterparts.
++//
++// length and istride must each hold at least three entries.
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_interleaved_3D(std::vector<hostbuf>&     vals,
++                                                     const std::vector<Tsize>& length,
++                                                     const std::vector<Tsize>& istride,
++                                                     const Tsize               idist,
++                                                     const Tsize               nbatch)
++{
++    for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
++    {
++        auto data = ((std::complex<Tfloat>*)vals[0].data()) + ibatch * idist;
++
++        const bool even_x = length[0] % 2 == 0;
++        const bool even_y = length[1] % 2 == 0;
++        const bool even_z = length[2] % 2 == 0;
++
++        // Offsets of the nyquist index along each dimension; only used for
++        // addressing when the corresponding length is even.
++        const auto x_nyq = istride[0] * (length[0] / 2);
++        const auto y_nyq = istride[1] * (length[1] / 2);
++        const auto z_nyq = istride[2] * (length[2] / 2);
++
++        // Mirror loops cover 1 <= index < (length + 1) / 2, which skips
++        // index 0 and, for even lengths, the nyquist index.
++        const auto x_half = (length[0] + 1) / 2;
++        const auto y_half = (length[1] + 1) / 2;
++
++        // Points that are their own mirror image must be purely real.
++        data[0].imag(0.0);
++
++        if(even_x)
++            data[x_nyq].imag(0.0);
++        if(even_y)
++            data[y_nyq].imag(0.0);
++        if(even_z)
++            data[z_nyq].imag(0.0);
++
++        if(even_x && even_y)
++            data[x_nyq + y_nyq].imag(0.0);
++        if(even_x && even_z)
++            data[x_nyq + z_nyq].imag(0.0);
++        if(even_y && even_z)
++            data[y_nyq + z_nyq].imag(0.0);
++
++        if(even_x && even_y && even_z)
++            data[x_nyq + y_nyq + z_nyq].imag(0.0);
++
++        // y-axis:
++        for(unsigned int j = 1; j < y_half; ++j)
++        {
++            data[istride[1] * (length[1] - j)] = std::conj(data[istride[1] * j]);
++        }
++
++        if(even_x)
++        {
++            // y-axis at x-nyquist
++            for(unsigned int j = 1; j < y_half; ++j)
++            {
++                data[x_nyq + istride[1] * (length[1] - j)]
++                    = std::conj(data[x_nyq + istride[1] * j]);
++            }
++        }
++
++        // x-axis:
++        for(unsigned int i = 1; i < x_half; ++i)
++        {
++            data[istride[0] * (length[0] - i)] = std::conj(data[istride[0] * i]);
++        }
++
++        if(even_y)
++        {
++            // x-axis at y-nyquist
++            for(unsigned int i = 1; i < x_half; ++i)
++            {
++                data[istride[0] * (length[0] - i) + y_nyq]
++                    = std::conj(data[istride[0] * i + y_nyq]);
++            }
++        }
++
++        // x-y plane:
++        for(unsigned int i = 1; i < x_half; ++i)
++        {
++            for(unsigned int j = 1; j < length[1]; ++j)
++            {
++                data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)]
++                    = std::conj(data[istride[0] * i + istride[1] * j]);
++            }
++        }
++
++        if(even_z)
++        {
++            // x-axis at z-nyquist
++            for(unsigned int i = 1; i < x_half; ++i)
++            {
++                data[istride[0] * (length[0] - i) + z_nyq]
++                    = std::conj(data[istride[0] * i + z_nyq]);
++            }
++
++            if(even_y)
++            {
++                // x-axis at yz-nyquist: both the y and z nyquist offsets
++                // apply here; without the y offset this loop would merely
++                // repeat the z-nyquist x-axis pass above.
++                for(unsigned int i = 1; i < x_half; ++i)
++                {
++                    data[istride[0] * (length[0] - i) + y_nyq + z_nyq]
++                        = std::conj(data[istride[0] * i + y_nyq + z_nyq]);
++                }
++            }
++
++            // y-axis: at z-nyquist
++            for(unsigned int j = 1; j < y_half; ++j)
++            {
++                data[istride[1] * (length[1] - j) + z_nyq]
++                    = std::conj(data[istride[1] * j + z_nyq]);
++            }
++
++            if(even_x)
++            {
++                // y-axis: at xz-nyquist
++                for(unsigned int j = 1; j < y_half; ++j)
++                {
++                    data[x_nyq + istride[1] * (length[1] - j) + z_nyq]
++                        = std::conj(data[x_nyq + istride[1] * j + z_nyq]);
++                }
++            }
++
++            // x-y plane: at z-nyquist
++            for(unsigned int i = 1; i < x_half; ++i)
++            {
++                for(unsigned int j = 1; j < length[1]; ++j)
++                {
++                    data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j) + z_nyq]
++                        = std::conj(data[istride[0] * i + istride[1] * j + z_nyq]);
++                }
++            }
++        }
++    }
++}
++
++// Impose Hermitian symmetry on planar complex 3D data: vals[0] and vals[1]
++// are read as Tfloat arrays holding the real and imaginary parts.
++//
++// For each of the nbatch batches (idist elements apart) this:
++//  - zeroes the imaginary value at index 0 and at every combination of
++//    half-length ("nyquist") offsets whose lengths are even;
++//  - in the plane at offset 0 along dimension 2 and, when length[2] is even,
++//    in the plane at the dimension-2 nyquist offset, overwrites the mirrored
++//    elements along dimensions 0 and 1 with the real value and the negated
++//    imaginary value of their counterparts.
++//
++// length and istride must each hold at least three entries.
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_planar_3D(std::vector<hostbuf>&     vals,
++                                                const std::vector<Tsize>& length,
++                                                const std::vector<Tsize>& istride,
++                                                const Tsize               idist,
++                                                const Tsize               nbatch)
++{
++    for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch)
++    {
++        auto data_real = ((Tfloat*)vals[0].data()) + ibatch * idist;
++        auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist;
++
++        const bool even_x = length[0] % 2 == 0;
++        const bool even_y = length[1] % 2 == 0;
++        const bool even_z = length[2] % 2 == 0;
++
++        // Offsets of the nyquist index along each dimension; only used for
++        // addressing when the corresponding length is even.
++        const auto x_nyq = istride[0] * (length[0] / 2);
++        const auto y_nyq = istride[1] * (length[1] / 2);
++        const auto z_nyq = istride[2] * (length[2] / 2);
++
++        // Mirror loops cover 1 <= index < (length + 1) / 2, which skips
++        // index 0 and, for even lengths, the nyquist index.
++        const auto x_half = (length[0] + 1) / 2;
++        const auto y_half = (length[1] + 1) / 2;
++
++        // Points that are their own mirror image must be purely real.
++        data_imag[0] = 0.0;
++
++        if(even_x)
++            data_imag[x_nyq] = 0.0;
++        if(even_y)
++            data_imag[y_nyq] = 0.0;
++        if(even_z)
++            data_imag[z_nyq] = 0.0;
++
++        if(even_x && even_y)
++            data_imag[x_nyq + y_nyq] = 0.0;
++        if(even_x && even_z)
++            data_imag[x_nyq + z_nyq] = 0.0;
++        if(even_y && even_z)
++            data_imag[y_nyq + z_nyq] = 0.0;
++
++        if(even_x && even_y && even_z)
++            data_imag[x_nyq + y_nyq + z_nyq] = 0.0;
++
++        // y-axis:
++        for(unsigned int j = 1; j < y_half; ++j)
++        {
++            data_real[istride[1] * (length[1] - j)] = data_real[istride[1] * j];
++            data_imag[istride[1] * (length[1] - j)] = -data_imag[istride[1] * j];
++        }
++
++        if(even_x)
++        {
++            // y-axis at x-nyquist
++            for(unsigned int j = 1; j < y_half; ++j)
++            {
++                data_real[x_nyq + istride[1] * (length[1] - j)]
++                    = data_real[x_nyq + istride[1] * j];
++                data_imag[x_nyq + istride[1] * (length[1] - j)]
++                    = -data_imag[x_nyq + istride[1] * j];
++            }
++        }
++
++        // x-axis:
++        for(unsigned int i = 1; i < x_half; ++i)
++        {
++            data_real[istride[0] * (length[0] - i)] = data_real[istride[0] * i];
++            data_imag[istride[0] * (length[0] - i)] = -data_imag[istride[0] * i];
++        }
++
++        if(even_y)
++        {
++            // x-axis at y-nyquist
++            for(unsigned int i = 1; i < x_half; ++i)
++            {
++                data_real[istride[0] * (length[0] - i) + y_nyq]
++                    = data_real[istride[0] * i + y_nyq];
++                data_imag[istride[0] * (length[0] - i) + y_nyq]
++                    = -data_imag[istride[0] * i + y_nyq];
++            }
++        }
++
++        // x-y plane:
++        for(unsigned int i = 1; i < x_half; ++i)
++        {
++            for(unsigned int j = 1; j < length[1]; ++j)
++            {
++                data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)]
++                    = data_real[istride[0] * i + istride[1] * j];
++                data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)]
++                    = -data_imag[istride[0] * i + istride[1] * j];
++            }
++        }
++
++        if(even_z)
++        {
++            // x-axis at z-nyquist
++            for(unsigned int i = 1; i < x_half; ++i)
++            {
++                data_real[istride[0] * (length[0] - i) + z_nyq]
++                    = data_real[istride[0] * i + z_nyq];
++                data_imag[istride[0] * (length[0] - i) + z_nyq]
++                    = -data_imag[istride[0] * i + z_nyq];
++            }
++
++            if(even_y)
++            {
++                // x-axis at yz-nyquist: both the y and z nyquist offsets
++                // apply here; without the y offset this loop would merely
++                // repeat the z-nyquist x-axis pass above.
++                for(unsigned int i = 1; i < x_half; ++i)
++                {
++                    data_real[istride[0] * (length[0] - i) + y_nyq + z_nyq]
++                        = data_real[istride[0] * i + y_nyq + z_nyq];
++                    data_imag[istride[0] * (length[0] - i) + y_nyq + z_nyq]
++                        = -data_imag[istride[0] * i + y_nyq + z_nyq];
++                }
++            }
++
++            // y-axis: at z-nyquist
++            for(unsigned int j = 1; j < y_half; ++j)
++            {
++                data_real[istride[1] * (length[1] - j) + z_nyq]
++                    = data_real[istride[1] * j + z_nyq];
++                data_imag[istride[1] * (length[1] - j) + z_nyq]
++                    = -data_imag[istride[1] * j + z_nyq];
++            }
++
++            if(even_x)
++            {
++                // y-axis: at xz-nyquist
++                for(unsigned int j = 1; j < y_half; ++j)
++                {
++                    data_real[x_nyq + istride[1] * (length[1] - j) + z_nyq]
++                        = data_real[x_nyq + istride[1] * j + z_nyq];
++                    data_imag[x_nyq + istride[1] * (length[1] - j) + z_nyq]
++                        = -data_imag[x_nyq + istride[1] * j + z_nyq];
++                }
++            }
++
++            // x-y plane: at z-nyquist
++            for(unsigned int i = 1; i < x_half; ++i)
++            {
++                for(unsigned int j = 1; j < length[1]; ++j)
++                {
++                    data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)
++                              + z_nyq]
++                        = data_real[istride[0] * i + istride[1] * j + z_nyq];
++                    data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)
++                              + z_nyq]
++                        = -data_imag[istride[0] * i + istride[1] * j + z_nyq];
++                }
++            }
++        }
++    }
++}
++
++// Fill interleaved complex input (input[0], read as std::complex<Tfloat>)
++// with pseudo-random values, one batch at a time (batches are idist
++// elements apart).  The index space described by whole_length is split by
++// partition_rowmajor() and the partitions are processed in an OpenMP
++// parallel loop.  Each partition uses its own std::mt19937 seeded with the
++// buffer offset of the partition's first element; every element takes two
++// consecutive draws (real first, then imaginary), each divided by gen.max().
++template <typename Tfloat, typename Tint1>
++static void generate_random_interleaved_data(std::vector<hostbuf>& input,
++                                             const Tint1&          whole_length,
++                                             const Tint1&          whole_stride,
++                                             const size_t          idist,
++                                             const size_t          nbatch)
++{
++    auto out   = (std::complex<Tfloat>*)input[0].data();
++    auto parts = partition_rowmajor(whole_length);
++
++    size_t batch_offset = 0;
++    for(unsigned int batch = 0; batch < nbatch; ++batch)
++    {
++#pragma omp parallel for num_threads(parts.size())
++        for(size_t p = 0; p < parts.size(); ++p)
++        {
++            auto         cursor = parts[p].first;
++            const auto   extent = parts[p].second;
++            std::mt19937 rng(compute_index(cursor, whole_stride, batch_offset));
++            do
++            {
++                const auto   pos = compute_index(cursor, whole_stride, batch_offset);
++                const Tfloat re  = (Tfloat)rng() / (Tfloat)rng.max();
++                const Tfloat im  = (Tfloat)rng() / (Tfloat)rng.max();
++
++                out[pos] = std::complex<Tfloat>(re, im);
++            } while(increment_rowmajor(cursor, extent));
++        }
++        batch_offset += idist;
++    }
++}
++
++// Fill interleaved complex input (input[0], read as std::complex<Tfloat>)
++// with a deterministic ramp, one batch at a time (batches are idist elements
++// apart).  Each element's real and imaginary parts are both set to
++//   -0.5 + (unit-stride index of the element) / (count_iters(whole_length) - 1)
++// The index space is split by partition_rowmajor() and the partitions are
++// processed in an OpenMP parallel loop.
++// NOTE(review): if count_iters(whole_length) can return 1, the scale factor
++// divides by zero -- confirm against callers.
++template <typename Tfloat, typename Tint1>
++static void generate_interleaved_data(std::vector<hostbuf>& input,
++                                      const Tint1&          whole_length,
++                                      const Tint1&          whole_stride,
++                                      const size_t          idist,
++                                      const size_t          nbatch)
++{
++    auto out         = (std::complex<Tfloat>*)input[0].data();
++    auto parts       = partition_rowmajor(whole_length);
++    auto unit_stride = make_unit_stride(whole_length);
++
++    const Tfloat step = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1);
++
++    size_t batch_offset = 0;
++    for(unsigned int batch = 0; batch < nbatch; ++batch)
++    {
++#pragma omp parallel for num_threads(parts.size())
++        for(size_t p = 0; p < parts.size(); ++p)
++        {
++            auto       cursor = parts[p].first;
++            const auto extent = parts[p].second;
++            do
++            {
++                // Position of this element in a densely packed array.
++                const auto ordinal = compute_index(cursor, unit_stride, 0);
++                const auto ramp    = -0.5 + static_cast<Tfloat>(ordinal) * step;
++
++                const auto pos = compute_index(cursor, whole_stride, batch_offset);
++
++                out[pos] = std::complex<Tfloat>(ramp, ramp);
++            } while(increment_rowmajor(cursor, extent));
++        }
++        batch_offset += idist;
++    }
++}
++
++// Fill planar complex input with pseudo-random values: input[0] receives the
++// real parts and input[1] the imaginary parts (both read as Tfloat), one
++// batch at a time (batches are idist elements apart).  The index space
++// described by whole_length is split by partition_rowmajor() and the
++// partitions are processed in an OpenMP parallel loop.  Each partition uses
++// its own std::mt19937 seeded with the buffer offset of the partition's
++// first element.
++//
++// Every element takes two consecutive draws, each divided by gen.max(): the
++// first becomes the real part, the second the imaginary part.  The draws are
++// made in separate statements on purpose: as arguments of a single
++// constructor call their evaluation order would be unspecified, so which
++// draw ended up as the real part could differ between compilers and from
++// generate_random_interleaved_data.
++template <typename Tfloat, typename Tint1>
++static void generate_random_planar_data(std::vector<hostbuf>& input,
++                                        const Tint1&          whole_length,
++                                        const Tint1&          whole_stride,
++                                        const size_t          idist,
++                                        const size_t          nbatch)
++{
++    auto   ireal      = (Tfloat*)input[0].data();
++    auto   iimag      = (Tfloat*)input[1].data();
++    size_t i_base     = 0;
++    auto   partitions = partition_rowmajor(whole_length);
++    for(unsigned int b = 0; b < nbatch; b++, i_base += idist)
++    {
++#pragma omp parallel for num_threads(partitions.size())
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            auto         index  = partitions[part].first;
++            const auto   length = partitions[part].second;
++            std::mt19937 gen(compute_index(index, whole_stride, i_base));
++            do
++            {
++                const auto   i = compute_index(index, whole_stride, i_base);
++                const Tfloat x = (Tfloat)gen() / (Tfloat)gen.max();
++                const Tfloat y = (Tfloat)gen() / (Tfloat)gen.max();
++                ireal[i]       = x;
++                iimag[i]       = y;
++            } while(increment_rowmajor(index, length));
++        }
++    }
++}
++
++// Fill planar complex input (input[0] = real parts, input[1] = imaginary
++// parts, both read as Tfloat) with a deterministic ramp, one batch at a time
++// (batches are idist elements apart).  Both parts of each element are set to
++//   -0.5 + (unit-stride index of the element) / (count_iters(whole_length) - 1)
++// The index space is split by partition_rowmajor() and the partitions are
++// processed in an OpenMP parallel loop.
++// NOTE(review): if count_iters(whole_length) can return 1, the scale factor
++// divides by zero -- confirm against callers.
++template <typename Tfloat, typename Tint1>
++static void generate_planar_data(std::vector<hostbuf>& input,
++                                 const Tint1&          whole_length,
++                                 const Tint1&          whole_stride,
++                                 const size_t          idist,
++                                 const size_t          nbatch)
++{
++    auto re_out      = (Tfloat*)input[0].data();
++    auto im_out      = (Tfloat*)input[1].data();
++    auto parts       = partition_rowmajor(whole_length);
++    auto unit_stride = make_unit_stride(whole_length);
++
++    const Tfloat step = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1);
++
++    size_t batch_offset = 0;
++    for(unsigned int batch = 0; batch < nbatch; ++batch)
++    {
++#pragma omp parallel for num_threads(parts.size())
++        for(size_t p = 0; p < parts.size(); ++p)
++        {
++            auto       cursor = parts[p].first;
++            const auto extent = parts[p].second;
++            do
++            {
++                // Position of this element in a densely packed array.
++                const auto ordinal = compute_index(cursor, unit_stride, 0);
++                const auto ramp    = -0.5 + static_cast<Tfloat>(ordinal) * step;
++
++                const auto pos = compute_index(cursor, whole_stride, batch_offset);
++
++                re_out[pos] = ramp;
++                im_out[pos] = ramp;
++            } while(increment_rowmajor(cursor, extent));
++        }
++        batch_offset += idist;
++    }
++}
++
++// Fill real input (input[0], read as Tfloat) with pseudo-random values, one
++// batch at a time (batches are idist elements apart).  The index space
++// described by whole_length is split by partition_rowmajor() and the
++// partitions are processed in an OpenMP parallel loop.  Each partition uses
++// its own std::mt19937 seeded with the buffer offset of the partition's
++// first element; every element takes one draw divided by gen.max().
++template <typename Tfloat, typename Tint1>
++static void generate_random_real_data(std::vector<hostbuf>& input,
++                                      const Tint1&          whole_length,
++                                      const Tint1&          whole_stride,
++                                      const size_t          idist,
++                                      const size_t          nbatch)
++{
++    auto out   = (Tfloat*)input[0].data();
++    auto parts = partition_rowmajor(whole_length);
++
++    size_t batch_offset = 0;
++    for(unsigned int batch = 0; batch < nbatch; ++batch)
++    {
++#pragma omp parallel for num_threads(parts.size())
++        for(size_t p = 0; p < parts.size(); ++p)
++        {
++            auto         cursor = parts[p].first;
++            const auto   extent = parts[p].second;
++            std::mt19937 rng(compute_index(cursor, whole_stride, batch_offset));
++            do
++            {
++                const auto pos = compute_index(cursor, whole_stride, batch_offset);
++                out[pos]       = (Tfloat)rng() / (Tfloat)rng.max();
++            } while(increment_rowmajor(cursor, extent));
++        }
++        batch_offset += idist;
++    }
++}
++
++// Fill real input (input[0], read as Tfloat) with a deterministic ramp, one
++// batch at a time (batches are idist elements apart).  Each element is set to
++//   -0.5 + (unit-stride index of the element) / (count_iters(whole_length) - 1)
++// The index space is split by partition_rowmajor() and the partitions are
++// processed in an OpenMP parallel loop.
++// NOTE(review): if count_iters(whole_length) can return 1, the scale factor
++// divides by zero -- confirm against callers.
++template <typename Tfloat, typename Tint1>
++static void generate_real_data(std::vector<hostbuf>& input,
++                               const Tint1&          whole_length,
++                               const Tint1&          whole_stride,
++                               const size_t          idist,
++                               const size_t          nbatch)
++{
++    auto out         = (Tfloat*)input[0].data();
++    auto parts       = partition_rowmajor(whole_length);
++    auto unit_stride = make_unit_stride(whole_length);
++
++    const Tfloat step = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1);
++
++    size_t batch_offset = 0;
++    for(unsigned int batch = 0; batch < nbatch; ++batch)
++    {
++#pragma omp parallel for num_threads(parts.size())
++        for(size_t p = 0; p < parts.size(); ++p)
++        {
++            auto       cursor = parts[p].first;
++            const auto extent = parts[p].second;
++            do
++            {
++                const auto pos = compute_index(cursor, whole_stride, batch_offset);
++
++                // Position of this element in a densely packed array.
++                const auto ordinal = compute_index(cursor, unit_stride, 0);
++
++                out[pos] = -0.5 + static_cast<Tfloat>(ordinal) * step;
++            } while(increment_rowmajor(cursor, extent));
++        }
++        batch_offset += idist;
++    }
++}
++
++// Dispatch to the 1D, 2D or 3D interleaved Hermitian-symmetry routine
++// according to the number of entries in length.
++// Throws std::runtime_error for any other number of dimensions.
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_interleaved(std::vector<hostbuf>&     vals,
++                                                  const std::vector<Tsize>& length,
++                                                  const std::vector<Tsize>& istride,
++                                                  const Tsize               idist,
++                                                  const Tsize               nbatch)
++{
++    const auto rank = length.size();
++
++    if(rank == 1)
++    {
++        impose_hermitian_symmetry_interleaved_1D<Tfloat>(vals, length, istride, idist, nbatch);
++    }
++    else if(rank == 2)
++    {
++        impose_hermitian_symmetry_interleaved_2D<Tfloat>(vals, length, istride, idist, nbatch);
++    }
++    else if(rank == 3)
++    {
++        impose_hermitian_symmetry_interleaved_3D<Tfloat>(vals, length, istride, idist, nbatch);
++    }
++    else
++    {
++        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
++    }
++}
++
++// Dispatch to the 1D, 2D or 3D planar Hermitian-symmetry routine according
++// to the number of entries in length.
++// Throws std::runtime_error for any other number of dimensions.
++template <typename Tfloat, typename Tsize>
++static void impose_hermitian_symmetry_planar(std::vector<hostbuf>&     vals,
++                                             const std::vector<Tsize>& length,
++                                             const std::vector<Tsize>& istride,
++                                             const Tsize               idist,
++                                             const Tsize               nbatch)
++{
++    const auto rank = length.size();
++
++    if(rank == 1)
++    {
++        impose_hermitian_symmetry_planar_1D<Tfloat>(vals, length, istride, idist, nbatch);
++    }
++    else if(rank == 2)
++    {
++        impose_hermitian_symmetry_planar_2D<Tfloat>(vals, length, istride, idist, nbatch);
++    }
++    else if(rank == 3)
++    {
++        impose_hermitian_symmetry_planar_3D<Tfloat>(vals, length, istride, idist, nbatch);
++    }
++    else
++    {
++        throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry");
++    }
++}
++
++#endif // DATA_GEN_HOST_H
+diff --git a/shared/device_properties.h b/shared/device_properties.h
+new file mode 100644
+index 0000000..6e2e1e1
+--- /dev/null
++++ b/shared/device_properties.h
+@@ -0,0 +1,74 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_DEVICE_PROPS_H
++#define ROCFFT_DEVICE_PROPS_H
++
++#include <cstdint>
++#include <hip/hip_runtime_api.h>
++#include <stdexcept>
++
++// Look up the device id reported by hipGetDevice() and return that device's
++// hipDeviceProp_t.  Throws std::runtime_error if either HIP call does not
++// return hipSuccess.
++static hipDeviceProp_t get_curr_device_prop()
++{
++    int current_id = 0;
++
++    const hipError_t id_status = hipGetDevice(&current_id);
++    if(id_status != hipSuccess)
++    {
++        throw std::runtime_error("hipGetDevice failed.");
++    }
++
++    hipDeviceProp_t props;
++
++    const hipError_t props_status = hipGetDeviceProperties(&props, current_id);
++    if(props_status != hipSuccess)
++    {
++        throw std::runtime_error("hipGetDeviceProperties failed for deviceId "
++                                 + std::to_string(current_id));
++    }
++
++    return props;
++}
++
++// Validate a kernel launch configuration against the limits recorded in
++// deviceProp.  Three checks run in order -- per-dimension block size, total
++// threads per block, per-dimension grid size -- and the first one that fails
++// throws std::runtime_error with a message naming kernel_name.
++static void launch_limits_check(const std::string&     kernel_name,
++                                const dim3             gridDim,
++                                const dim3             blockDim,
++                                const hipDeviceProp_t& deviceProp)
++{
++    // dim3 members are unsigned while the device properties are signed, so
++    // the limits are cast before comparing to avoid signedness mismatches.
++
++    // Block limits along each dimension
++    const bool block_dim_exceeded
++        = blockDim.x > static_cast<uint32_t>(deviceProp.maxThreadsDim[0])
++          || blockDim.y > static_cast<uint32_t>(deviceProp.maxThreadsDim[1])
++          || blockDim.z > static_cast<uint32_t>(deviceProp.maxThreadsDim[2]);
++    if(block_dim_exceeded)
++    {
++        throw std::runtime_error("max threads per dim exceeded: " + kernel_name);
++    }
++
++    // Total threads for the whole block; the product is formed in 64 bits to
++    // try to avoid overflow.
++    const uint64_t threads_per_block
++        = static_cast<uint64_t>(blockDim.x) * blockDim.y * blockDim.z;
++    if(threads_per_block > static_cast<uint64_t>(deviceProp.maxThreadsPerBlock))
++    {
++        throw std::runtime_error("max threads per block exceeded: " + kernel_name);
++    }
++
++    // Grid dimension limits
++    const bool grid_dim_exceeded
++        = gridDim.x > static_cast<uint32_t>(deviceProp.maxGridSize[0])
++          || gridDim.y > static_cast<uint32_t>(deviceProp.maxGridSize[1])
++          || gridDim.z > static_cast<uint32_t>(deviceProp.maxGridSize[2]);
++    if(grid_dim_exceeded)
++    {
++        throw std::runtime_error("max grid size exceeded: " + kernel_name);
++    }
++}
++
++#endif
+diff --git a/shared/enum_to_string.h b/shared/enum_to_string.h
+new file mode 100644
+index 0000000..1c2fba0
+--- /dev/null
++++ b/shared/enum_to_string.h
+@@ -0,0 +1,81 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ENUM_TO_STRING_H
++#define ENUM_TO_STRING_H
++
++#include "fft_params.h"
++
++// Map a hipError_t value to the spelling of its enumerator.
++// Only the codes listed in the table below are recognised; any other value
++// throws std::runtime_error("unknown hipError").
++static std::string hipError_to_string(const hipError_t ret)
++{
++    struct named_error
++    {
++        hipError_t  code;
++        const char* name;
++    };
++
++    static const named_error known_errors[] = {
++        {hipSuccess, "hipSuccess"},
++        {hipErrorInvalidContext, "hipErrorInvalidContext"},
++        {hipErrorInvalidKernelFile, "hipErrorInvalidKernelFile"},
++        {hipErrorMemoryAllocation, "hipErrorMemoryAllocation"},
++        {hipErrorInitializationError, "hipErrorInitializationError"},
++        {hipErrorLaunchFailure, "hipErrorLaunchFailure"},
++        {hipErrorLaunchOutOfResources, "hipErrorLaunchOutOfResources"},
++        {hipErrorInvalidDevice, "hipErrorInvalidDevice"},
++        {hipErrorInvalidValue, "hipErrorInvalidValue"},
++        {hipErrorInvalidDevicePointer, "hipErrorInvalidDevicePointer"},
++        {hipErrorInvalidMemcpyDirection, "hipErrorInvalidMemcpyDirection"},
++        {hipErrorUnknown, "hipErrorUnknown"},
++        {hipErrorInvalidResourceHandle, "hipErrorInvalidResourceHandle"},
++        {hipErrorNotReady, "hipErrorNotReady"},
++        {hipErrorNoDevice, "hipErrorNoDevice"},
++        {hipErrorPeerAccessAlreadyEnabled, "hipErrorPeerAccessAlreadyEnabled"},
++        {hipErrorPeerAccessNotEnabled, "hipErrorPeerAccessNotEnabled"},
++        {hipErrorRuntimeMemory, "hipErrorRuntimeMemory"},
++        {hipErrorRuntimeOther, "hipErrorRuntimeOther"},
++        {hipErrorHostMemoryAlreadyRegistered, "hipErrorHostMemoryAlreadyRegistered"},
++        {hipErrorHostMemoryNotRegistered, "hipErrorHostMemoryNotRegistered"},
++        {hipErrorMapBufferObjectFailed, "hipErrorMapBufferObjectFailed"},
++        {hipErrorTbd, "hipErrorTbd"},
++    };
++
++    for(const named_error& entry : known_errors)
++    {
++        if(entry.code == ret)
++            return entry.name;
++    }
++
++    throw std::runtime_error("unknown hipError");
++}
++#endif
+diff --git a/shared/environment.h b/shared/environment.h
+new file mode 100644
+index 0000000..7be56a0
+--- /dev/null
++++ b/shared/environment.h
+@@ -0,0 +1,97 @@
++// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++// wrappers around environment variable routines
++
++#pragma once
++
++#include <string>
++
++// Windows provides "getenv" and "_putenv", but those modify the
++// runtime's copy of the environment.  The actual environment in the
++// process control block is accessed using GetEnvironmentVariable and
++// SetEnvironmentVariable.
++
++#ifdef WIN32
++#include <windows.h>
++static void rocfft_setenv(const char* var, const char* value) // Windows variant: set var to value in the process environment
++{
++    SetEnvironmentVariable(var, value); // the BOOL result is not checked, so a failed set goes unnoticed
++}
++static void rocfft_unsetenv(const char* var) // Windows variant: remove var from the process environment
++{
++    SetEnvironmentVariable(var, nullptr); // a null value asks the Win32 API to delete the variable; result not checked
++}
++static std::string rocfft_getenv(const char* var) // Windows variant: value of var, or an empty string when the size query returns 0
++{
++    DWORD       size = GetEnvironmentVariable(var, nullptr, 0); // size query: no buffer is supplied
++    std::string ret; // stays empty when size is 0
++    if(size)
++    {
++        ret.resize(size);
++        GetEnvironmentVariable(var, ret.data(), size); // second call copies the value into the string's storage
++        // GetEnvironmentVariable counts the terminating null, so remove it
++        while(!ret.empty() && ret.back() == 0)
++            ret.pop_back();
++    }
++    return ret;
++}
++
++#else
++
++#include <stdlib.h>
++
++static void rocfft_setenv(const char* var, const char* value) // non-Windows variant: set var to value
++{
++    setenv(var, value, 1); // third argument 1: overwrite an existing value; the int result is not checked
++}
++static void rocfft_unsetenv(const char* var) // non-Windows variant: remove var from the environment
++{
++    unsetenv(var); // the int result is not checked
++}
++static std::string rocfft_getenv(const char* var) // value of var, or "" when getenv finds nothing
++{
++    const char* found = getenv(var);
++    return std::string(found != nullptr ? found : "");
++}
++#endif
++
++// Scoped override of an environment variable: the constructor sets it and the
++// destructor puts back the value read beforehand (unsetting it if that was empty).
++struct EnvironmentSetTemp
++{
++    EnvironmentSetTemp(const char* _var, const char* val)
++        : var(_var)
++        , oldvalue(rocfft_getenv(_var))
++    {
++        rocfft_setenv(_var, val);
++    }
++    ~EnvironmentSetTemp()
++    {
++        // An empty saved value is treated as "was not set".
++        const bool had_value = !oldvalue.empty();
++        if(had_value)
++            rocfft_setenv(var.c_str(), oldvalue.c_str());
++        else
++            rocfft_unsetenv(var.c_str());
++    }
++    std::string var;
++    std::string oldvalue;
++};
+diff --git a/shared/fft_params.h b/shared/fft_params.h
+new file mode 100644
+index 0000000..bf428ef
+--- /dev/null
++++ b/shared/fft_params.h
+@@ -0,0 +1,3274 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef FFT_PARAMS_H
++#define FFT_PARAMS_H
++
++#include <algorithm>
++#include <hip/hip_runtime.h>
++#include <iostream>
++#include <mutex>
++#include <numeric>
++#include <sstream>
++#ifdef _OPENMP
++#include <omp.h>
++#endif
++#include <random>
++#include <tuple>
++#include <unordered_set>
++#include <vector>
++
++#include "../shared/arithmetic.h"
++#include "../shared/array_validator.h"
++#include "../shared/data_gen_device.h"
++#include "../shared/data_gen_host.h"
++#include "../shared/device_properties.h"
++#include "../shared/printbuffer.h"
++#include "../shared/ptrdiff.h"
++
++enum fft_status // unscoped enum: fft_status_success is 0 and the rest follow in declaration order
++{
++    fft_status_success,
++    fft_status_failure,
++    fft_status_invalid_arg_value,
++    fft_status_invalid_dimensions,
++    fft_status_invalid_array_type,
++    fft_status_invalid_strides,
++    fft_status_invalid_distance,
++    fft_status_invalid_offset,
++    fft_status_invalid_work_buffer,
++};
++
++enum fft_transform_type // each value has a fixed prefix in fft_params::token()
++{
++    fft_transform_type_complex_forward, // token prefix "complex_forward_"
++    fft_transform_type_complex_inverse, // token prefix "complex_inverse_"
++    fft_transform_type_real_forward, // token prefix "real_forward_"
++    fft_transform_type_real_inverse, // token prefix "real_inverse_"
++};
++
++enum fft_precision // readable from a stream via operator>> using the words noted below
++{
++    fft_precision_half, // "half"; var_size uses sizeof(_Float16)
++    fft_precision_single, // "single"; var_size uses sizeof(float)
++    fft_precision_double, // "double"; var_size uses sizeof(double)
++};
++
++static std::istream& operator>>(std::istream& str, fft_precision& precision)
++{
++    // Read one whitespace-delimited word and map it onto the precision enum.
++    std::string name;
++    str >> name;
++    if(name == "double")
++        precision = fft_precision_double;
++    else if(name == "single")
++        precision = fft_precision_single;
++    else if(name == "half")
++        precision = fft_precision_half;
++    else
++        throw std::runtime_error("Invalid precision specified");
++    return str;
++}
++
++// fft_input_generator: linearly spaced sequence in [-0.5,0.5]
++// fft_input_random_generator: pseudo-random sequence in [-0.5,0.5]
++enum fft_input_generator
++{
++    fft_input_random_generator_device, // read from the word "0" by operator>>
++    fft_input_random_generator_host, // read from the word "1"
++    fft_input_generator_device, // read from the word "2"
++    fft_input_generator_host, // read from the word "3"
++};
++
++static std::istream& operator>>(std::istream& str, fft_input_generator& gen)
++{
++    // The generator is selected by its numeric code, "0" through "3".
++    static const fft_input_generator by_code[] = {fft_input_random_generator_device,
++                                                  fft_input_random_generator_host,
++                                                  fft_input_generator_device,
++                                                  fft_input_generator_host};
++
++    std::string code;
++    str >> code;
++
++    // Exactly one character in '0'..'3' is accepted; anything else is rejected.
++    if(code.size() != 1 || code[0] < '0' || code[0] > '3')
++        throw std::runtime_error("Invalid input generator specified");
++    gen = by_code[code[0] - '0'];
++    return str;
++}
++
++enum fft_array_type // short codes below are the ones used by fft_params::array_type_name and token()
++{
++    fft_array_type_complex_interleaved, // "CI"
++    fft_array_type_complex_planar, // "CP"
++    fft_array_type_real, // "R"
++    fft_array_type_hermitian_interleaved, // "HI"
++    fft_array_type_hermitian_planar, // "HP"
++    fft_array_type_unset, // "UN"; the default for fft_params::itype and otype
++};
++
++enum fft_result_placement // where the result is written relative to the input
++{
++    fft_placement_inplace, // "ip_" in token(), "in-place" in str()
++    fft_placement_notinplace, // "op_" in token(), "out-of-place" in str()
++};
++
++// Byte size of a single element for the given precision and array type.
++template <typename Tsize>
++inline Tsize var_size(const fft_precision precision, const fft_array_type type)
++{
++    // Width of one scalar; remains 0 when precision matches no enumerator.
++    size_t scalar_bytes = 0;
++    if(precision == fft_precision_half)
++    {
++        scalar_bytes = sizeof(_Float16);
++    }
++    else if(precision == fft_precision_single)
++    {
++        scalar_bytes = sizeof(float);
++    }
++    else if(precision == fft_precision_double)
++    {
++        scalar_bytes = sizeof(double);
++    }
++
++    // The two interleaved layouts count two scalars per element; every other
++    // array type counts one.
++    const bool is_interleaved = (type == fft_array_type_complex_interleaved)
++                                || (type == fft_array_type_hermitian_interleaved);
++    if(is_interleaved)
++        return 2 * scalar_bytes;
++
++    return scalar_bytes;
++}
++// Given an array type and transform length, strides, etc, fill the gpu input buffers with generated
++// data: linearly spaced or pseudo-random as selected by igen. Only the *_device generators act here.
++template <typename Tfloat, typename Tint1>
++inline void set_input(std::vector<gpubuf>&       input,
++                      const fft_input_generator  igen,
++                      const fft_array_type       itype,
++                      const std::vector<size_t>& length,
++                      const std::vector<size_t>& ilength,
++                      const std::vector<size_t>& istride,
++                      const Tint1&               whole_length,
++                      const Tint1&               whole_stride,
++                      const size_t               idist,
++                      const size_t               nbatch,
++                      const hipDeviceProp_t&     deviceProp)
++{
++    auto isize = count_iters(whole_length) * nbatch; // count_iters is declared elsewhere; presumably elements per batch - TODO confirm
++
++    switch(itype)
++    {
++    case fft_array_type_complex_interleaved:
++    case fft_array_type_hermitian_interleaved:
++    {
++        auto ibuffer = (rocfft_complex<Tfloat>*)input[0].data(); // interleaved layouts use only input[0]
++
++        if(igen == fft_input_generator_device) // a *_host igen matches neither branch, so nothing is generated
++            generate_interleaved_data(
++                whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp);
++        else if(igen == fft_input_random_generator_device)
++            generate_random_interleaved_data(
++                whole_length, idist, isize, whole_stride, ibuffer, deviceProp);
++
++        if(itype == fft_array_type_hermitian_interleaved)
++        {
++            auto ibuffer_2 = (rocfft_complex<Tfloat>*)input[0].data(); // same pointer as ibuffer above
++            impose_hermitian_symmetry_interleaved(
++                length, ilength, istride, idist, nbatch, ibuffer_2, deviceProp);
++        }
++
++        break;
++    }
++    case fft_array_type_complex_planar:
++    case fft_array_type_hermitian_planar:
++    {
++        auto ibuffer_real = (Tfloat*)input[0].data(); // planar layouts need two buffers: real parts first,
++        auto ibuffer_imag = (Tfloat*)input[1].data(); // imaginary parts second
++
++        if(igen == fft_input_generator_device)
++            generate_planar_data(whole_length,
++                                 idist,
++                                 isize,
++                                 whole_stride,
++                                 nbatch,
++                                 ibuffer_real,
++                                 ibuffer_imag,
++                                 deviceProp);
++        else if(igen == fft_input_random_generator_device)
++            generate_random_planar_data(
++                whole_length, idist, isize, whole_stride, ibuffer_real, ibuffer_imag, deviceProp);
++
++        if(itype == fft_array_type_hermitian_planar)
++            impose_hermitian_symmetry_planar(
++                length, ilength, istride, idist, nbatch, ibuffer_real, ibuffer_imag, deviceProp);
++
++        break;
++    }
++    case fft_array_type_real:
++    {
++        auto ibuffer = (Tfloat*)input[0].data();
++
++        if(igen == fft_input_generator_device)
++            generate_real_data(
++                whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp);
++        else if(igen == fft_input_random_generator_device)
++            generate_random_real_data(
++                whole_length, idist, isize, whole_stride, ibuffer, deviceProp);
++
++        break;
++    }
++    default: // reached for fft_array_type_unset or any out-of-range value
++        throw std::runtime_error("Input layout format not yet supported");
++    }
++}
++
++template <typename Tfloat, typename Tint1> // host-buffer overload of set_input: only the *_host generators act here
++inline void set_input(std::vector<hostbuf>&      input,
++                      const fft_input_generator  igen,
++                      const fft_array_type       itype,
++                      const std::vector<size_t>& length,
++                      const std::vector<size_t>& ilength,
++                      const std::vector<size_t>& istride,
++                      const Tint1&               whole_length,
++                      const Tint1&               whole_stride,
++                      const size_t               idist,
++                      const size_t               nbatch,
++                      const hipDeviceProp_t&     deviceProp) // ilength and deviceProp are not used in this body
++{
++    switch(itype)
++    {
++    case fft_array_type_complex_interleaved:
++    case fft_array_type_hermitian_interleaved:
++    {
++        if(igen == fft_input_generator_host) // a *_device igen matches neither branch, so nothing is generated
++            generate_interleaved_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
++        else if(igen == fft_input_random_generator_host)
++            generate_random_interleaved_data<Tfloat>(
++                input, whole_length, whole_stride, idist, nbatch);
++
++        if(itype == fft_array_type_hermitian_interleaved) // symmetry is imposed whichever generator ran
++            impose_hermitian_symmetry_interleaved<Tfloat>(input, length, istride, idist, nbatch);
++
++        break;
++    }
++    case fft_array_type_complex_planar:
++    case fft_array_type_hermitian_planar:
++    {
++        if(igen == fft_input_generator_host)
++            generate_planar_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
++        else if(igen == fft_input_random_generator_host)
++            generate_random_planar_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
++
++        if(itype == fft_array_type_hermitian_planar)
++            impose_hermitian_symmetry_planar<Tfloat>(input, length, istride, idist, nbatch);
++
++        break;
++    }
++    case fft_array_type_real:
++    {
++        if(igen == fft_input_generator_host)
++            generate_real_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
++        else if(igen == fft_input_random_generator_host)
++            generate_random_real_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch);
++
++        break;
++    }
++    default: // reached for fft_array_type_unset or any out-of-range value
++        throw std::runtime_error("Input layout format not yet supported");
++    }
++}
++
++// Dispatch set_input on length.size() (1, 2 or 3): ilength/istride are forwarded as a scalar or as a tuple; any other rank aborts.
++template <typename Tbuff, typename Tfloat>
++inline void set_input(std::vector<Tbuff>&        input,
++                      const fft_input_generator  igen,
++                      const fft_array_type       itype,
++                      const std::vector<size_t>& length,
++                      const std::vector<size_t>& ilength,
++                      const std::vector<size_t>& istride,
++                      const size_t               idist,
++                      const size_t               nbatch,
++                      const hipDeviceProp_t&     deviceProp)
++{
++    switch(length.size()) // the rank comes from length; ilength and istride are indexed up to that rank
++    {
++    case 1:
++        set_input<Tfloat>(input,
++                          igen,
++                          itype,
++                          length,
++                          ilength,
++                          istride,
++                          ilength[0], // 1D: plain scalars rather than tuples
++                          istride[0],
++                          idist,
++                          nbatch,
++                          deviceProp);
++        break;
++    case 2:
++        set_input<Tfloat>(input,
++                          igen,
++                          itype,
++                          length,
++                          ilength,
++                          istride,
++                          std::make_tuple(ilength[0], ilength[1]),
++                          std::make_tuple(istride[0], istride[1]),
++                          idist,
++                          nbatch,
++                          deviceProp);
++        break;
++    case 3:
++        set_input<Tfloat>(input,
++                          igen,
++                          itype,
++                          length,
++                          ilength,
++                          istride,
++                          std::make_tuple(ilength[0], ilength[1], ilength[2]),
++                          std::make_tuple(istride[0], istride[1], istride[2]),
++                          idist,
++                          nbatch,
++                          deviceProp);
++        break;
++    default: // ranks other than 1-3 terminate the process instead of throwing
++        abort();
++    }
++}
++
++// Container class for test parameters.
++class fft_params
++{
++public:
++    // All parameters are row-major.
++    std::vector<size_t>  length;
++    std::vector<size_t>  istride;
++    std::vector<size_t>  ostride;
++    size_t               nbatch         = 1;
++    fft_precision        precision      = fft_precision_single;
++    fft_input_generator  igen           = fft_input_random_generator_device;
++    fft_transform_type   transform_type = fft_transform_type_complex_forward;
++    fft_result_placement placement      = fft_placement_inplace;
++    size_t               idist          = 0;
++    size_t               odist          = 0;
++    fft_array_type       itype          = fft_array_type_unset;
++    fft_array_type       otype          = fft_array_type_unset;
++    std::vector<size_t>  ioffset        = {0, 0};
++    std::vector<size_t>  ooffset        = {0, 0};
++
++    std::vector<size_t> isize;
++    std::vector<size_t> osize;
++
++    size_t workbuffersize = 0;
++
++    struct fft_brick
++    {
++        // all vectors here are row-major, with same length as FFT
++        // dimension + 1 (for batch dimension)
++
++        // inclusive lower bound of brick
++        std::vector<size_t> lower;
++        // exclusive upper bound of brick
++        std::vector<size_t> upper;
++        // stride of brick in memory
++        std::vector<size_t> stride;
++
++        // compute the length of this brick
++        std::vector<size_t> length() const
++        {
++            std::vector<size_t> ret;
++            for(size_t i = 0; i < lower.size(); ++i)
++                ret.push_back(upper[i] - lower[i]);
++            return ret;
++        }
++
++        // compute offset of lower bound in a field with the given stride + dist (batch stride
++        // is separate); note the stride parameter shadows the member of the same name
++        size_t lower_field_offset(std::vector<size_t> stride, size_t dist) const
++        {
++            // brick strides include batch, so adjust our input accordingly
++            stride.insert(stride.begin(), dist);
++            // the init value fixes the accumulator type: size_t, not int, so large offsets are not truncated
++            return std::inner_product(lower.begin(), lower.end(), stride.begin(), size_t(0));
++        }
++
++        // location of the brick
++        int device = 0;
++    };
++
++    struct fft_field // one input or output field expressed as a list of bricks
++    {
++        std::vector<fft_brick> bricks; // printed in order by str() and token()
++    };
++    // optional brick decomposition of inputs/outputs
++    std::vector<fft_field> ifields;
++    std::vector<fft_field> ofields;
++
++    // run testing load/store callbacks
++    bool                    run_callbacks   = false;
++    static constexpr double load_cb_scalar  = 0.457813941;
++    static constexpr double store_cb_scalar = 0.391504938;
++
++    // Check that data outside of output strides is not overwritten.
++    // This is only set explicitly on some tests where there's space
++    // between dimensions, but the dimensions are still in-order.
++    // We're not trying to generically find holes in arbitrary data
++    // layouts.
++    //
++    // NOTE: this flag is not included in tokens, since it doesn't
++    // affect how the FFT library behaves.
++    bool check_output_strides = false;
++
++    // scaling factor - we do a pointwise multiplication of outputs by
++    // this factor
++    double scale_factor = 1.0;
++
++    fft_params(){};
++    virtual ~fft_params(){};
++
++    // Name of an array type: the enumerator spelling when verbose, else a short code.
++    static std::string array_type_name(const fft_array_type type, bool verbose = true)
++    {
++        if(type == fft_array_type_complex_interleaved)
++            return verbose ? "fft_array_type_complex_interleaved" : "CI";
++        if(type == fft_array_type_complex_planar)
++            return verbose ? "fft_array_type_complex_planar" : "CP";
++        if(type == fft_array_type_real)
++            return verbose ? "fft_array_type_real" : "R";
++        if(type == fft_array_type_hermitian_interleaved)
++            return verbose ? "fft_array_type_hermitian_interleaved" : "HI";
++        if(type == fft_array_type_hermitian_planar)
++            return verbose ? "fft_array_type_hermitian_planar" : "HP";
++        if(type == fft_array_type_unset)
++            return verbose ? "fft_array_type_unset" : "UN";
++
++        // A value outside the enumerators above has no name in either form.
++        const std::string no_name;
++        return no_name;
++    }
++
++    std::string transform_type_name() const
++    {
++        // Spell out the enumerator currently held in transform_type, exactly
++        // as it is written in the fft_transform_type declaration.
++        if(transform_type == fft_transform_type_complex_forward)
++            return "fft_transform_type_complex_forward";
++        if(transform_type == fft_transform_type_complex_inverse)
++            return "fft_transform_type_complex_inverse";
++        if(transform_type == fft_transform_type_real_forward)
++            return "fft_transform_type_real_forward";
++        if(transform_type == fft_transform_type_real_inverse)
++            return "fft_transform_type_real_inverse";
++
++        // Reached only when transform_type holds a value outside the enum.
++        throw std::runtime_error("Invalid transform type");
++    }
++
++    // Render all parameters as human-readable text; every item is followed by separator (default ", ").
++    std::string str(const std::string& separator = ", ") const
++    {
++        // top-level stride/dist are not used when fields are specified.
++        const bool have_ifields = !ifields.empty();
++        const bool have_ofields = !ofields.empty();
++
++        std::stringstream ss;
++        auto print_size_vec = [&](const char* description, const std::vector<size_t>& vec) { // "description: v0 v1 ..." + separator
++            ss << description << ":";
++            for(auto i : vec)
++                ss << " " << i;
++            ss << separator;
++        };
++        auto print_fields = [&](const char* description, const std::vector<fft_field>& fields) { // one heading per field, then each brick's bounds, stride and device
++            for(unsigned int fidx = 0; fidx < fields.size(); ++fidx)
++            {
++                const auto& f = fields[fidx];
++                ss << description << " " << fidx << ":" << separator;
++                for(unsigned int bidx = 0; bidx < f.bricks.size(); ++bidx)
++                {
++                    const auto& b = f.bricks[bidx];
++                    ss << " brick " << bidx << ":" << separator;
++                    print_size_vec("  lower", b.lower);
++                    print_size_vec("  upper", b.upper);
++                    print_size_vec("  stride", b.stride);
++                    ss << "  device: " << b.device << separator;
++                }
++            }
++        };
++
++        print_size_vec("length", length);
++        if(have_ifields)
++        {
++            print_fields("ifield", ifields);
++        }
++        else
++        {
++            print_size_vec("istride", istride);
++            ss << "idist: " << idist << separator;
++        }
++
++        if(have_ofields)
++        {
++            print_fields("ofield", ofields);
++        }
++        else
++        {
++            print_size_vec("ostride", ostride);
++            ss << "odist: " << odist << separator;
++        }
++
++        ss << "batch: " << nbatch << separator;
++        print_size_vec("isize", isize);
++        print_size_vec("osize", osize);
++
++        print_size_vec("ioffset", ioffset);
++        print_size_vec("ooffset", ooffset);
++
++        if(placement == fft_placement_inplace)
++            ss << "in-place";
++        else
++            ss << "out-of-place";
++        ss << separator;
++        ss << "transform_type: " << transform_type_name() << separator; // transform_type_name() throws on an invalid transform_type
++        ss << array_type_name(itype) << " -> " << array_type_name(otype) << separator;
++        switch(precision) // no default: an out-of-range precision prints nothing before the separator
++        {
++        case fft_precision_half:
++            ss << "half-precision";
++            break;
++        case fft_precision_single:
++            ss << "single-precision";
++            break;
++        case fft_precision_double:
++            ss << "double-precision";
++            break;
++        }
++        ss << separator;
++
++        print_size_vec("ilength", ilength()); // ilength(), olength() and the buffer-size helpers are members defined outside this view
++        print_size_vec("olength", olength());
++
++        print_size_vec("ibuffer_size", ibuffer_sizes());
++        print_size_vec("obuffer_size", obuffer_sizes());
++
++        if(scale_factor != 1.0) // the scale factor is printed only when it differs from the default of 1.0
++            ss << "scale factor: " << scale_factor << separator;
++
++        return ss.str();
++    }
++
++    // Produce a stringified token of the test fft params; pieces are joined with '_', the delimiter from_token splits on.
++    std::string token() const
++    {
++        std::string ret;
++
++        switch(transform_type) // no default: an out-of-range value contributes no prefix
++        {
++        case fft_transform_type_complex_forward:
++            ret += "complex_forward_";
++            break;
++        case fft_transform_type_complex_inverse:
++            ret += "complex_inverse_";
++            break;
++        case fft_transform_type_real_forward:
++            ret += "real_forward_";
++            break;
++        case fft_transform_type_real_inverse:
++            ret += "real_inverse_";
++            break;
++        }
++
++        auto append_size_vec = [&ret](const std::vector<size_t>& vec) { // appends "_<n>" for every element
++            for(auto s : vec)
++            {
++                ret += "_";
++                ret += std::to_string(s);
++            }
++        };
++
++        ret += "len";
++        append_size_vec(length);
++
++        switch(precision)
++        {
++        case fft_precision_half:
++            ret += "_half_";
++            break;
++        case fft_precision_single:
++            ret += "_single_";
++            break;
++        case fft_precision_double:
++            ret += "_double_";
++            break;
++        }
++
++        switch(placement)
++        {
++        case fft_placement_inplace:
++            ret += "ip_";
++            break;
++        case fft_placement_notinplace:
++            ret += "op_";
++            break;
++        }
++
++        ret += "batch_";
++        ret += std::to_string(nbatch);
++
++        auto append_array_type = [&ret](fft_array_type type) { // appends the short code; unknown values become "UN"
++            switch(type)
++            {
++            case fft_array_type_complex_interleaved:
++                ret += "CI";
++                break;
++            case fft_array_type_complex_planar:
++                ret += "CP";
++                break;
++            case fft_array_type_real:
++                ret += "R";
++                break;
++            case fft_array_type_hermitian_interleaved:
++                ret += "HI";
++                break;
++            case fft_array_type_hermitian_planar:
++                ret += "HP";
++                break;
++            default:
++                ret += "UN";
++                break;
++            }
++        };
++
++        auto append_brick_info = [&ret, &append_size_vec](const fft_brick& b) { // "_brick_lower..._upper..._stride..._dev_<n>"
++            ret += "_brick";
++
++            ret += "_lower";
++            append_size_vec(b.lower);
++            ret += "_upper";
++            append_size_vec(b.upper);
++            ret += "_stride";
++            append_size_vec(b.stride);
++            ret += "_dev_";
++            ret += std::to_string(b.device);
++        };
++
++        const bool have_ifields = !ifields.empty(); // when fields are present they replace stride, type, dist and offset in the token
++        const bool have_ofields = !ofields.empty();
++
++        if(have_ifields)
++        {
++            for(const auto& f : ifields)
++            {
++                ret += "_ifield";
++                for(const auto& b : f.bricks)
++                    append_brick_info(b);
++            }
++        }
++        else
++        {
++            ret += "_istride";
++            append_size_vec(istride);
++            ret += "_";
++            append_array_type(itype);
++        }
++
++        if(have_ofields)
++        {
++            for(const auto& f : ofields)
++            {
++                ret += "_ofield";
++                for(const auto& b : f.bricks)
++                    append_brick_info(b);
++            }
++        }
++        else
++        {
++            ret += "_ostride";
++            append_size_vec(ostride);
++            ret += "_";
++            append_array_type(otype);
++        }
++
++        if(!have_ifields)
++        {
++            ret += "_idist_";
++            ret += std::to_string(idist);
++        }
++        if(!have_ofields)
++        {
++            ret += "_odist_";
++            ret += std::to_string(odist);
++        }
++
++        if(!have_ifields)
++        {
++            ret += "_ioffset";
++            append_size_vec(ioffset);
++        }
++
++        if(!have_ofields)
++        {
++            ret += "_ooffset";
++            append_size_vec(ooffset);
++        }
++
++        if(run_callbacks)
++            ret += "_CB";
++
++        if(scale_factor != 1.0) // only the presence of scaling is recorded, not the factor itself
++            ret += "_scale";
++
++        return ret;
++    }
++
++    // Set all params from a stringified token whose fields are separated by '_'.
++    void from_token(std::string token) // throws on malformed or truncated tokens (see below)
++    {
++        std::vector<std::string> vals;
++
++        std::string delimiter = "_";
++        {
++            size_t pos = 0;
++            while((pos = token.find(delimiter)) != std::string::npos)
++            {
++                auto val = token.substr(0, pos);
++                vals.push_back(val);
++                token.erase(0, pos + delimiter.length());
++            }
++            vals.push_back(token); // trailing field; vals is therefore never empty
++        }
++
++        auto size_parser // parses "<token> <number>"; std::runtime_error if either is missing
++            = [](const std::vector<std::string>& vals, const std::string token, size_t& pos) {
++                  if(pos + 1 >= vals.size() || vals[pos++] != token)
++                      throw std::runtime_error("Unable to parse token");
++                  return std::stoull(vals[pos++]); // stoull itself throws on non-numeric text
++              };
++
++        auto vector_parser // parses "<token>" followed by zero or more all-digit fields
++            = [](const std::vector<std::string>& vals, const std::string token, size_t& pos) {
++                  if(pos >= vals.size() || vals[pos++] != token)
++                      throw std::runtime_error("Unable to parse token");
++                  std::vector<size_t> vec;
++                  auto is_digit = [](unsigned char c) { return ::isdigit(c) != 0; };
++                  while(pos < vals.size())
++                  {
++                      if(std::all_of(vals[pos].begin(), vals[pos].end(), is_digit))
++                      {
++                          vec.push_back(std::stoull(vals[pos++]));
++                      }
++                      else
++                      {
++                          break; // first non-numeric field ends the list
++                      }
++                  }
++                  return vec;
++              };
++
++        auto type_parser = [](const std::string& val) { // array-type code -> enum
++            if(val == "CI")
++                return fft_array_type_complex_interleaved;
++            else if(val == "CP")
++                return fft_array_type_complex_planar;
++            else if(val == "R")
++                return fft_array_type_real;
++            else if(val == "HI")
++                return fft_array_type_hermitian_interleaved;
++            else if(val == "HP")
++                return fft_array_type_hermitian_planar;
++            return fft_array_type_unset; // unrecognised codes map to "unset"
++        };
++
++        auto field_parser = [&vector_parser, &size_parser](const std::vector<std::string>& vals,
++                                                           size_t&                         pos,
++                                                           std::vector<fft_field>&         output) {
++            // skip over ifield/ofield word
++            pos++;
++            fft_field& f = output.emplace_back();
++            while(pos < vals.size() && vals[pos] == "brick") // one brick per "brick" keyword
++            {
++                fft_brick& b = f.bricks.emplace_back();
++                pos++;
++                b.lower  = vector_parser(vals, "lower", pos);
++                b.upper  = vector_parser(vals, "upper", pos);
++                b.stride = vector_parser(vals, "stride", pos);
++                b.device = size_parser(vals, "dev", pos);
++            }
++        };
++
++        size_t pos = 0;
++
++        bool complex = vals.at(pos++) == "complex"; // .at(): std::out_of_range if token is short
++        bool forward = vals.at(pos++) == "forward";
++
++        if(complex && forward)
++            transform_type = fft_transform_type_complex_forward;
++        if(complex && !forward)
++            transform_type = fft_transform_type_complex_inverse;
++        if(!complex && forward)
++            transform_type = fft_transform_type_real_forward;
++        if(!complex && !forward)
++            transform_type = fft_transform_type_real_inverse;
++
++        length = vector_parser(vals, "len", pos);
++
++        if(vals.at(pos) == "half") // bounds-checked once; the else-if reads reuse the same pos
++            precision = fft_precision_half;
++        else if(vals[pos] == "single")
++            precision = fft_precision_single;
++        else if(vals[pos] == "double")
++            precision = fft_precision_double;
++        pos++; // any other word leaves precision unchanged
++
++        placement = (vals.at(pos++) == "ip") ? fft_placement_inplace : fft_placement_notinplace;
++
++        nbatch = size_parser(vals, "batch", pos);
++
++        // strides, bricks etc are mixed in from here, so just keep
++        // looking at the next token to decide what to do
++        while(pos < vals.size())
++        {
++            const auto& next_token = vals[pos];
++            if(next_token == "istride")
++            {
++                istride = vector_parser(vals, "istride", pos);
++                itype   = type_parser(vals.at(pos)); // type code follows the stride values
++                pos++;
++            }
++            else if(next_token == "ostride")
++            {
++                ostride = vector_parser(vals, "ostride", pos);
++                otype   = type_parser(vals.at(pos)); // type code follows the stride values
++                pos++;
++            }
++            else if(next_token == "idist")
++                idist = size_parser(vals, "idist", pos);
++            else if(next_token == "odist")
++                odist = size_parser(vals, "odist", pos);
++            else if(next_token == "ioffset")
++                ioffset = vector_parser(vals, "ioffset", pos);
++            else if(next_token == "ooffset")
++                ooffset = vector_parser(vals, "ooffset", pos);
++            else if(next_token == "ifield")
++                field_parser(vals, pos, ifields);
++            else if(next_token == "ofield")
++                field_parser(vals, pos, ofields);
++            else
++                break; // unknown keyword: stop and fall through to the optional flags
++        }
++
++        if(pos < vals.size() && vals[pos] == "CB") // optional callback flag
++        {
++            run_callbacks = true;
++            ++pos;
++        }
++
++        if(pos < vals.size() && vals[pos] == "scale") // optional scaling flag
++        {
++            // just pick some factor that's not zero or one
++            scale_factor = 0.1239;
++            ++pos;
++        }
++    }
++
++    // Stream output operator (for gtest, etc.); writes the str() form of the params.
++    friend std::ostream& operator<<(std::ostream& stream, const fft_params& params)
++    {
++        const auto text = params.str();
++        return stream << text;
++    }
++
++    // Dimension of the transform, i.e. the number of entries in the length vector.
++    size_t dim() const
++    {
++        return length.size();
++    }
++
++    virtual std::vector<size_t> ilength() const // input lengths; copy of length, adjusted below
++    {
++        auto in_len = length;
++        if(fft_transform_type_real_inverse == transform_type)
++            in_len.back() = in_len.back() / 2 + 1; // last dimension shrinks to n/2 + 1
++        return in_len;
++    }
++
++    virtual std::vector<size_t> olength() const // output lengths; copy of length, adjusted below
++    {
++        auto out_len = length;
++        if(fft_transform_type_real_forward == transform_type)
++            out_len.back() = out_len.back() / 2 + 1; // last dimension shrinks to n/2 + 1
++        return out_len;
++    }
++
++    static size_t nbuffer(const fft_array_type type) // number of buffers an array type uses
++    {
++        switch(type)
++        {
++        case fft_array_type_unset:
++            return 0;
++        case fft_array_type_complex_planar:
++        case fft_array_type_hermitian_planar:
++            return 2; // planar layouts use two buffers
++        case fft_array_type_real:
++        case fft_array_type_complex_interleaved:
++        case fft_array_type_hermitian_interleaved:
++            return 1; // real and interleaved layouts use one buffer
++        }
++        return 0; // any value not listed above
++    }
++
++    // Number of input buffers, derived from itype via nbuffer().
++    size_t nibuffer() const
++    {
++        return nbuffer(itype);
++    }
++
++    // Number of output buffers, derived from otype via nbuffer().
++    size_t nobuffer() const
++    {
++        return nbuffer(otype);
++    }
++
++    void set_iotypes() // fill in itype/otype from transform_type when they are still unset
++    {
++        if(itype == fft_array_type_unset) // an explicitly chosen input type is left alone
++        {
++            switch(transform_type)
++            {
++            case fft_transform_type_complex_forward:
++            case fft_transform_type_complex_inverse:
++                itype = fft_array_type_complex_interleaved;
++                break;
++            case fft_transform_type_real_forward:
++                itype = fft_array_type_real;
++                break;
++            case fft_transform_type_real_inverse:
++                itype = fft_array_type_hermitian_interleaved;
++                break;
++            default:
++                throw std::runtime_error("Invalid transform type");
++            }
++        }
++        if(otype == fft_array_type_unset) // an explicitly chosen output type is left alone
++        {
++            switch(transform_type)
++            {
++            case fft_transform_type_complex_forward:
++            case fft_transform_type_complex_inverse:
++                otype = fft_array_type_complex_interleaved;
++                break;
++            case fft_transform_type_real_forward:
++                otype = fft_array_type_hermitian_interleaved;
++                break;
++            case fft_transform_type_real_inverse:
++                otype = fft_array_type_real;
++                break;
++            default:
++                throw std::runtime_error("Invalid transform type");
++            }
++        }
++    }
++
++    // Check that the input and output types are consistent; throws std::runtime_error otherwise.
++    bool check_iotypes() const // returns whether otype is an allowed partner for itype
++    {
++        switch(itype) // itype must be one of the five concrete array types
++        {
++        case fft_array_type_complex_interleaved:
++        case fft_array_type_complex_planar:
++        case fft_array_type_hermitian_interleaved:
++        case fft_array_type_hermitian_planar:
++        case fft_array_type_real:
++            break;
++        default:
++            throw std::runtime_error("Invalid Input array type format");
++        }
++
++        switch(otype) // otype must be one of the five concrete array types
++        {
++        case fft_array_type_complex_interleaved:
++        case fft_array_type_complex_planar:
++        case fft_array_type_hermitian_interleaved:
++        case fft_array_type_hermitian_planar:
++        case fft_array_type_real:
++            break;
++        default:
++            throw std::runtime_error("Invalid Output array type format");
++        }
++
++        // Check that format choices are supported
++        if(transform_type != fft_transform_type_real_forward
++           && transform_type != fft_transform_type_real_inverse)
++        {
++            if(placement == fft_placement_inplace && itype != otype)
++            {
++                throw std::runtime_error(
++                    "In-place transforms must have identical input and output types");
++            }
++        }
++
++        bool okformat = true;
++        switch(itype) // pair each input family with the output family it may produce
++        {
++        case fft_array_type_complex_interleaved:
++        case fft_array_type_complex_planar:
++            okformat = (otype == fft_array_type_complex_interleaved
++                        || otype == fft_array_type_complex_planar);
++            break;
++        case fft_array_type_hermitian_interleaved:
++        case fft_array_type_hermitian_planar:
++            okformat = otype == fft_array_type_real;
++            break;
++        case fft_array_type_real:
++            okformat = (otype == fft_array_type_hermitian_interleaved
++                        || otype == fft_array_type_hermitian_planar);
++            break;
++        default: // not reachable: the first switch already rejected every other itype
++            throw std::runtime_error("Invalid Input array type format");
++        }
++
++        return okformat;
++    }
++
++    // Given a length vector, set the rest of the strides.
++    // The optional argument stride0 supplies the trailing (fastest-varying) stride values.
++    // The optional rcpadding argument sets the stride correctly for in-place
++    // multi-dimensional real/complex transforms.
++    // Format is row-major.  The result always has dim() entries.
++    template <typename T1>
++    std::vector<T1> compute_stride(const std::vector<T1>&     length,
++                                   const std::vector<size_t>& stride0   = std::vector<size_t>(),
++                                   const bool                 rcpadding = false) const
++    {
++        std::vector<T1> stride(dim());
++
++        size_t dimoffset = 0; // becomes 1 when the contiguous stride is filled in here
++
++        if(stride0.size() == 0)
++        {
++            // Set the contiguous stride:
++            stride[dim() - 1] = 1;
++            dimoffset         = 1;
++        }
++        else
++        {
++            // Copy the input values to the end of the stride array:
++            for(size_t i = 0; i < stride0.size(); ++i)
++            {
++                stride[dim() - stride0.size() + i] = stride0[i];
++            }
++        }
++
++        if(stride0.size() < dim())
++        {
++            // Compute any remaining values via recursion.
++            for(size_t i = dim() - dimoffset - stride0.size(); i-- > 0;) // counts down to index 0
++            {
++                auto lengthip1 = length[i + 1];
++                if(rcpadding && i == dim() - 2)
++                {
++                    lengthip1 = 2 * (lengthip1 / 2 + 1); // padded extent of the last dimension
++                }
++                stride[i] = stride[i + 1] * lengthip1;
++            }
++        }
++
++        return stride;
++    }
++
++    void compute_istride() // complete istride; padding applies to in-place real-forward only
++    {
++        const bool rcpadding = placement == fft_placement_inplace
++                               && transform_type == fft_transform_type_real_forward;
++        const auto in_len = ilength();
++        istride           = compute_stride(in_len, istride, rcpadding);
++    }
++
++    void compute_ostride() // complete ostride; padding applies to in-place real-inverse only
++    {
++        const bool rcpadding = placement == fft_placement_inplace
++                               && transform_type == fft_transform_type_real_inverse;
++        const auto out_len = olength();
++        ostride            = compute_stride(out_len, ostride, rcpadding);
++    }
++
++    virtual void compute_isize() // one isize entry per input buffer: ptrdiff plus its offset
++    {
++        auto         extent = ilength();
++        const size_t base   = compute_ptrdiff(extent, istride, nbatch, idist);
++        isize.resize(nibuffer());
++        for(size_t b = 0; b < isize.size(); ++b)
++        {
++            isize[b] = base + ioffset[b];
++        }
++    }
++
++    virtual void compute_osize() // one osize entry per output buffer: ptrdiff plus its offset
++    {
++        auto         extent = olength();
++        const size_t base   = compute_ptrdiff(extent, ostride, nbatch, odist);
++        osize.resize(nobuffer());
++        for(size_t b = 0; b < osize.size(); ++b)
++        {
++            osize[b] = base + ooffset[b];
++        }
++    }
++
++    std::vector<size_t> ibuffer_sizes() const // per-buffer input sizes: isize[i] * var_size
++    {
++        std::vector<size_t> ibuffer_sizes;
++
++        // In-place real-to-complex transforms need to have enough space in the input buffer to
++        // accommodate the output, which is slightly larger.
++        if(placement == fft_placement_inplace && transform_type == fft_transform_type_real_forward)
++        {
++            return obuffer_sizes();
++        }
++
++        if(isize.empty())
++            return ibuffer_sizes; // sizes not computed yet: report no buffers
++
++        switch(itype)
++        {
++        case fft_array_type_complex_planar:
++        case fft_array_type_hermitian_planar:
++            ibuffer_sizes.resize(2); // planar: two buffers
++            break;
++        default:
++            ibuffer_sizes.resize(1);
++        }
++        for(unsigned i = 0; i < ibuffer_sizes.size(); i++)
++        {
++            ibuffer_sizes[i] = isize[i] * var_size<size_t>(precision, itype);
++        }
++        return ibuffer_sizes;
++    }
++
++    virtual std::vector<size_t> obuffer_sizes() const // per-buffer output sizes: osize[i] * var_size
++    {
++        std::vector<size_t> obuffer_sizes;
++
++        if(osize.empty())
++            return obuffer_sizes; // sizes not computed yet: report no buffers
++
++        switch(otype)
++        {
++        case fft_array_type_complex_planar:
++        case fft_array_type_hermitian_planar:
++            obuffer_sizes.resize(2); // planar: two buffers
++            break;
++        default:
++            obuffer_sizes.resize(1);
++        }
++        for(unsigned i = 0; i < obuffer_sizes.size(); i++)
++        {
++            obuffer_sizes[i] = osize[i] * var_size<size_t>(precision, otype);
++        }
++        return obuffer_sizes;
++    }
++
++    // Compute the idist for a given transform based on the placeness, transform type, and data
++    // layout.  Reads length and istride; does not modify the object.
++    size_t compute_idist() const
++    {
++        size_t dist = 0;
++        // In-place 1D transforms need extra dist.
++        if(transform_type == fft_transform_type_real_forward && dim() == 1
++           && placement == fft_placement_inplace)
++        {
++            dist = 2 * (length[0] / 2 + 1) * istride[0];
++            return dist;
++        }
++
++        if(transform_type == fft_transform_type_real_inverse && dim() == 1)
++        {
++            dist = (length[0] / 2 + 1) * istride[0]; // 1D real-inverse input has n/2 + 1 elements
++            return dist;
++        }
++
++        dist = (transform_type == fft_transform_type_real_inverse)
++                   ? (length[dim() - 1] / 2 + 1) * istride[dim() - 1]
++                   : length[dim() - 1] * istride[dim() - 1];
++        for(unsigned int i = 0; i < dim() - 1; ++i) // take the largest length*stride product
++        {
++            dist = std::max(length[i] * istride[i], dist);
++        }
++        return dist;
++    }
++    void set_idist() // compute idist only when it is still zero
++    {
++        const bool unset = (idist == 0);
++        if(unset)
++            idist = compute_idist();
++    }
++
++    // Compute the odist for a given transform based on the placeness, transform type, and data
++    // layout.  Row-major.  Reads length and ostride; does not modify the object.
++    size_t compute_odist() const
++    {
++        size_t dist = 0;
++        // In-place 1D transforms need extra dist.
++        if(transform_type == fft_transform_type_real_inverse && dim() == 1
++           && placement == fft_placement_inplace)
++        {
++            dist = 2 * (length[0] / 2 + 1) * ostride[0];
++            return dist;
++        }
++
++        if(transform_type == fft_transform_type_real_forward && dim() == 1)
++        {
++            dist = (length[0] / 2 + 1) * ostride[0]; // 1D real-forward output has n/2 + 1 elements
++            return dist;
++        }
++
++        dist = (transform_type == fft_transform_type_real_forward)
++                   ? (length[dim() - 1] / 2 + 1) * ostride[dim() - 1]
++                   : length[dim() - 1] * ostride[dim() - 1];
++        for(unsigned int i = 0; i < dim() - 1; ++i) // take the largest length*stride product
++        {
++            dist = std::max(length[i] * ostride[i], dist);
++        }
++        return dist;
++    }
++    void set_odist() // compute odist only when it is still zero
++    {
++        const bool unset = (odist == 0);
++        if(unset)
++            odist = compute_odist();
++    }
++
++    // Put the length, stride, batch, and dist into a single length/stride array and pass off to the
++    // validity checker (array_valid).  Returns false early on size mismatch or a zero stride/dist.
++    bool valid_length_stride_batch_dist(const std::vector<size_t>& l0,
++                                        const std::vector<size_t>& s0,
++                                        const size_t               n,
++                                        const size_t               dist,
++                                        const int                  verbose = 0) const
++    {
++        if(l0.size() != s0.size())
++            return false;
++
++        // Length and stride vectors, including batches:
++        std::vector<size_t> l{}, s{};
++        for(unsigned int i = 0; i < l0.size(); ++i)
++        {
++            if(l0[i] > 1) // dimensions of length 0 or 1 are left out of the check
++            {
++                if(s0[i] == 0)
++                    return false;
++                l.push_back(l0[i]);
++                s.push_back(s0[i]);
++            }
++        }
++        if(n > 1) // the batch is appended as one more dimension with stride dist
++        {
++            if(dist == 0)
++                return false;
++            l.push_back(n);
++            s.push_back(dist);
++        }
++
++        return array_valid(l, s, verbose);
++    }
++
++    // Return true if the given GPU parameters would produce a valid transform.
++    bool valid(const int verbose) const // non-zero verbose prints why a check failed
++    {
++        if(ioffset.size() < nibuffer() || ooffset.size() < nobuffer()) // one offset per buffer
++            return false;
++
++        // Check that in-place transforms have the same input and output stride:
++        if(placement == fft_placement_inplace)
++        {
++            const auto stridesize = std::min(istride.size(), ostride.size());
++            bool       samestride = true;
++            for(unsigned int i = 0; i < stridesize; ++i)
++            {
++                if(istride[i] != ostride[i])
++                    samestride = false;
++            }
++            if((transform_type == fft_transform_type_complex_forward
++                || transform_type == fft_transform_type_complex_inverse)
++               && !samestride)
++            {
++                // In-place transforms require identical input and output strides.
++                if(verbose)
++                {
++                    std::cout << "istride:";
++                    for(const auto& i : istride)
++                        std::cout << " " << i;
++                    std::cout << " ostride0:";
++                    for(const auto& i : ostride)
++                        std::cout << " " << i;
++                    std::cout << " differ; skipped for in-place transforms: skipping test"
++                              << std::endl;
++                }
++                return false;
++            }
++
++            if((transform_type == fft_transform_type_complex_forward
++                || transform_type == fft_transform_type_complex_inverse)
++               && (idist != odist) && nbatch > 1)
++            {
++                // In-place transforms require identical distance, if
++                // batch > 1.  If batch is 1 then dist is ignored and
++                // the FFT should still work.
++                if(verbose)
++                {
++                    std::cout << "idist:" << idist << " odist:" << odist
++                              << " differ; skipped for in-place transforms: skipping test"
++                              << std::endl;
++                }
++                return false;
++            }
++
++            if((transform_type == fft_transform_type_real_forward
++                || transform_type == fft_transform_type_real_inverse)
++               && (istride.back() != 1 || ostride.back() != 1))
++            {
++                // In-place real/complex transforms require unit strides.
++                if(verbose)
++                {
++                    std::cout
++                        << "istride.back(): " << istride.back()
++                        << " ostride.back(): " << ostride.back()
++                        << " must be unitary for in-place real/complex transforms: skipping test"
++                        << std::endl;
++                }
++                return false;
++            }
++
++            if((itype == fft_array_type_complex_interleaved
++                && otype == fft_array_type_complex_planar)
++               || (itype == fft_array_type_complex_planar
++                   && otype == fft_array_type_complex_interleaved))
++            {
++                if(verbose)
++                {
++                    std::cout << "In-place c2c transforms require identical io types; skipped.\n";
++                }
++                return false;
++            }
++
++            // Check offsets
++            switch(transform_type)
++            {
++            case fft_transform_type_complex_forward:
++            case fft_transform_type_complex_inverse:
++                for(unsigned int i = 0; i < nibuffer(); ++i)
++                {
++                    if(ioffset[i] != ooffset[i])
++                        return false;
++                }
++                break;
++            case fft_transform_type_real_forward:
++                if(ioffset[0] != 2 * ooffset[0]) // input offset must be twice the output offset
++                    return false;
++                break;
++            case fft_transform_type_real_inverse:
++                if(2 * ioffset[0] != ooffset[0]) // output offset must be twice the input offset
++                    return false;
++                break;
++            }
++        }
++
++        if(!check_iotypes()) // note: check_iotypes() can also throw std::runtime_error
++            return false;
++
++        // we can only check output strides on out-of-place
++        // transforms, since we need to initialize output to a known
++        // pattern
++        if(placement == fft_placement_inplace && check_output_strides)
++            return false;
++
++        // Check input and output strides
++        if(valid_length_stride_batch_dist(ilength(), istride, nbatch, idist, verbose) != true)
++        {
++            if(verbose)
++                std::cout << "Invalid input data format.\n";
++            return false;
++        }
++        if(!(ilength() == olength() && istride == ostride && idist == odist))
++        {
++            // Only check if different
++            if(valid_length_stride_batch_dist(olength(), ostride, nbatch, odist, verbose) != true)
++            {
++                if(verbose)
++                    std::cout << "Invalid output data format.\n";
++                return false;
++            }
++        }
++
++        // The parameters are valid.
++        return true;
++    }
++
++    // Fill in any missing parameters: array types, then strides, then dists, then sizes.
++    void validate()
++    {
++        set_iotypes();
++        compute_istride(); // strides first: the dist computations below read them
++        compute_ostride();
++        set_idist();
++        set_odist();
++        compute_isize(); // sizes last: they read the strides and dists set above
++        compute_osize();
++
++        validate_fields(); // throws std::runtime_error in this implementation if fields are set
++    }
++
++    virtual void validate_fields() const // this implementation rejects any input/output field
++    {
++        if(!(ifields.empty() && ofields.empty()))
++            throw std::runtime_error("input/output fields are unsupported");
++    }
++
++    // Column-major getters: each returns the corresponding vector in reversed order.
++    std::vector<size_t> length_cm() const
++    {
++        auto reversed = length;
++        std::reverse(reversed.begin(), reversed.end());
++        return reversed;
++    }
++    std::vector<size_t> ilength_cm() const // ilength() in reversed order
++    {
++        auto reversed = ilength();
++        std::reverse(reversed.begin(), reversed.end());
++        return reversed;
++    }
++    std::vector<size_t> olength_cm() const // olength() in reversed order
++    {
++        auto reversed = olength();
++        std::reverse(reversed.begin(), reversed.end());
++        return reversed;
++    }
++    std::vector<size_t> istride_cm() const // istride in reversed order
++    {
++        auto reversed = istride;
++        std::reverse(reversed.begin(), reversed.end());
++        return reversed;
++    }
++    std::vector<size_t> ostride_cm() const // ostride in reversed order
++    {
++        auto reversed = ostride;
++        std::reverse(reversed.begin(), reversed.end());
++        return reversed;
++    }
++    bool is_planar() const // true when either the input or the output array type is planar
++    {
++        const bool planar_in
++            = itype == fft_array_type_complex_planar || itype == fft_array_type_hermitian_planar;
++        const bool planar_out
++            = otype == fft_array_type_complex_planar || otype == fft_array_type_hermitian_planar;
++        return planar_in || planar_out;
++    }
++
++    // Given a data type and dimensions, fill the buffer, imposing Hermitian symmetry if necessary.
++    template <typename Tbuff>
++    inline void compute_input(std::vector<Tbuff>& input) // dispatches to set_input by precision
++    {
++        auto deviceProp = get_curr_device_prop();
++
++        switch(precision) // the three calls differ only in the element type argument
++        {
++        case fft_precision_half:
++            set_input<Tbuff, _Float16>(
++                input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp);
++            break;
++        case fft_precision_double:
++            set_input<Tbuff, double>(
++                input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp);
++            break;
++        case fft_precision_single:
++            set_input<Tbuff, float>(
++                input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp);
++            break;
++        }
++    }
++
++    template <typename Tstream = std::ostream>
++    void print_ibuffer(const std::vector<hostbuf>& buf, Tstream& stream = std::cout) const // NOTE(review): stream is never referenced in this body - confirm where print_buffer writes
++    {
++        switch(itype) // element type: complex for interleaved layouts, scalar otherwise
++        {
++        case fft_array_type_complex_interleaved:
++        case fft_array_type_hermitian_interleaved:
++        {
++            switch(precision)
++            {
++            case fft_precision_half:
++            {
++                buffer_printer<rocfft_complex<_Float16>> s;
++                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
++                break;
++            }
++            case fft_precision_single:
++            {
++                buffer_printer<rocfft_complex<float>> s;
++                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
++                break;
++            }
++            case fft_precision_double:
++            {
++                buffer_printer<rocfft_complex<double>> s;
++                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
++                break;
++            }
++            }
++            break;
++        }
++        case fft_array_type_complex_planar:
++        case fft_array_type_hermitian_planar:
++        case fft_array_type_real:
++        {
++            switch(precision)
++            {
++            case fft_precision_half:
++            {
++                buffer_printer<_Float16> s;
++                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
++                break;
++            }
++            case fft_precision_single:
++            {
++                buffer_printer<float> s;
++                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
++                break;
++            }
++            case fft_precision_double:
++            {
++                buffer_printer<double> s;
++                s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset);
++                break;
++            }
++            }
++            break;
++        }
++        default:
++            throw std::runtime_error("Invalid itype in print_ibuffer");
++        }
++    }
++
++    template <typename Tstream = std::ostream>
++    void print_obuffer(const std::vector<hostbuf>& buf, Tstream& stream = std::cout) const // NOTE(review): stream is never referenced in this body - confirm where print_buffer writes
++    {
++        switch(otype) // element type: complex for interleaved layouts, scalar otherwise
++        {
++        case fft_array_type_complex_interleaved:
++        case fft_array_type_hermitian_interleaved:
++        {
++            switch(precision)
++            {
++            case fft_precision_half:
++            {
++                buffer_printer<rocfft_complex<_Float16>> s;
++                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
++                break;
++            }
++            case fft_precision_single:
++            {
++                buffer_printer<rocfft_complex<float>> s;
++                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
++                break;
++            }
++            case fft_precision_double:
++                buffer_printer<rocfft_complex<double>> s;
++                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
++                break;
++            }
++            break;
++        }
++        case fft_array_type_complex_planar:
++        case fft_array_type_hermitian_planar:
++        case fft_array_type_real:
++        {
++            switch(precision)
++            {
++            case fft_precision_half:
++            {
++                buffer_printer<_Float16> s;
++                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
++                break;
++            }
++            case fft_precision_single:
++            {
++                buffer_printer<float> s;
++                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
++                break;
++            }
++            case fft_precision_double:
++            {
++                buffer_printer<double> s;
++                s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset);
++                break;
++            }
++            }
++            break;
++        }
++
++        default: // the switch is on otype, so the message names otype
++            throw std::runtime_error("Invalid otype in print_obuffer");
++        }
++    }
++
++    void print_ibuffer_flat(const std::vector<hostbuf>& buf) const // flat dump of the input buffers
++    {
++        switch(itype) // element type: complex for interleaved layouts, scalar otherwise
++        {
++        case fft_array_type_complex_interleaved:
++        case fft_array_type_hermitian_interleaved:
++        {
++            switch(precision)
++            {
++            case fft_precision_half:
++            {
++                buffer_printer<rocfft_complex<_Float16>> s;
++                s.print_buffer_flat(buf, isize, ioffset); // input sizes/offsets for the input buffer
++                break;
++            }
++            case fft_precision_single:
++            {
++                buffer_printer<rocfft_complex<float>> s;
++                s.print_buffer_flat(buf, isize, ioffset);
++                break;
++            }
++            case fft_precision_double:
++                buffer_printer<rocfft_complex<double>> s;
++                s.print_buffer_flat(buf, isize, ioffset);
++                break;
++            }
++            break;
++        }
++        case fft_array_type_complex_planar:
++        case fft_array_type_hermitian_planar:
++        case fft_array_type_real:
++        {
++            switch(precision)
++            {
++            case fft_precision_half:
++            {
++                buffer_printer<_Float16> s;
++                s.print_buffer_flat(buf, isize, ioffset);
++                break;
++            }
++            case fft_precision_single:
++            {
++                buffer_printer<float> s;
++                s.print_buffer_flat(buf, isize, ioffset);
++                break;
++            }
++            case fft_precision_double:
++            {
++                buffer_printer<double> s;
++                s.print_buffer_flat(buf, isize, ioffset);
++                break;
++            }
++            }
++            break;
++        default: // label sits inside the last case's braces but still belongs to switch(itype)
++            throw std::runtime_error("Invalid itype in print_ibuffer_flat");
++        }
++        }
++    }
++
++    void print_obuffer_flat(const std::vector<hostbuf>& buf) const // flat dump of the output buffers
++    {
++        switch(otype) // element type: complex for interleaved layouts, scalar otherwise
++        {
++        case fft_array_type_complex_interleaved:
++        case fft_array_type_hermitian_interleaved:
++        {
++            switch(precision)
++            {
++            case fft_precision_half:
++            {
++                buffer_printer<rocfft_complex<_Float16>> s;
++                s.print_buffer_flat(buf, osize, ooffset);
++                break;
++            }
++            case fft_precision_single:
++            {
++                buffer_printer<rocfft_complex<float>> s;
++                s.print_buffer_flat(buf, osize, ooffset);
++                break;
++            }
++            case fft_precision_double:
++                buffer_printer<rocfft_complex<double>> s;
++                s.print_buffer_flat(buf, osize, ooffset);
++                break;
++            }
++            break;
++        }
++        case fft_array_type_complex_planar:
++        case fft_array_type_hermitian_planar:
++        case fft_array_type_real:
++        {
++            switch(precision)
++            {
++            case fft_precision_half:
++            {
++                buffer_printer<_Float16> s;
++                s.print_buffer_flat(buf, osize, ooffset);
++                break;
++            }
++            case fft_precision_single:
++            {
++                buffer_printer<float> s;
++                s.print_buffer_flat(buf, osize, ooffset);
++                break;
++            }
++
++            case fft_precision_double:
++            {
++                buffer_printer<double> s;
++                s.print_buffer_flat(buf, osize, ooffset);
++                break;
++            }
++            }
++            break;
++        default: // label sits inside the last case's braces but still belongs to switch(otype)
++            throw std::runtime_error("Invalid otype in print_obuffer_flat");
++        }
++        }
++    }
++
++    virtual fft_status set_callbacks(void* load_cb_host,
++                                     void* load_cb_data,
++                                     void* store_cb_host,
++                                     void* store_cb_data)
++    {
++        return fft_status_success;
++    }
++
++    virtual fft_status execute(void** in, void** out)
++    {
++        return fft_status_success;
++    };
++
++    size_t fft_params_vram_footprint()
++    {
++        return fft_params::vram_footprint();
++    }
++
++    virtual size_t vram_footprint()
++    {
++        const auto ibuf_size = ibuffer_sizes();
++        size_t     val       = std::accumulate(ibuf_size.begin(), ibuf_size.end(), (size_t)1);
++        if(placement == fft_placement_notinplace)
++        {
++            const auto obuf_size = obuffer_sizes();
++            val += std::accumulate(obuf_size.begin(), obuf_size.end(), (size_t)1);
++        }
++        return val;
++    }
++
++    // Specific exception type for work buffer allocation failure.
++    // Tests that hit this can't fit on the GPU and should be skipped.
++    struct work_buffer_alloc_failure : public std::runtime_error
++    {
++        work_buffer_alloc_failure(const std::string& s)
++            : std::runtime_error(s)
++        {
++        }
++    };
++
++    virtual fft_status create_plan()
++    {
++        return fft_status_success;
++    }
++
++    // Configure this object as the inverse of the forward transform params_forward
++    void inverse_from_forward(fft_params& params_forward)
++    {
++        switch(params_forward.transform_type)
++        {
++        case fft_transform_type_complex_forward:
++            transform_type = fft_transform_type_complex_inverse;
++            break;
++        case fft_transform_type_real_forward:
++            transform_type = fft_transform_type_real_inverse;
++            break;
++        default:
++            throw std::runtime_error("Transform type not forward.");
++        }
++
++        length    = params_forward.length;
++        istride   = params_forward.ostride;
++        ostride   = params_forward.istride;
++        nbatch    = params_forward.nbatch;
++        precision = params_forward.precision;
++        placement = params_forward.placement;
++        idist     = params_forward.odist;
++        odist     = params_forward.idist;
++        itype     = params_forward.otype;
++        otype     = params_forward.itype;
++        ioffset   = params_forward.ooffset;
++        ooffset   = params_forward.ioffset;
++
++        run_callbacks = params_forward.run_callbacks;
++
++        check_output_strides = params_forward.check_output_strides;
++
++        scale_factor = 1 / params_forward.scale_factor;
++    }
++
++    // prepare for multi-GPU transform.  Generated input is in ibuffer.
++    // pibuffer, pobuffer are the pointers that will be passed to the
++    // FFT library's "execute" API.
++    virtual void multi_gpu_prepare(std::vector<gpubuf>& ibuffer,
++                                   std::vector<void*>&  pibuffer,
++                                   std::vector<void*>&  pobuffer)
++    {
++    }
++
++    // finalize multi-GPU transform.  pobuffers are the pointers
++    // provided to the FFT library's "execute" API.  obuffer is the
++    // buffer where transform output needs to go for validation
++    virtual void multi_gpu_finalize(std::vector<gpubuf>& obuffer, std::vector<void*>& pobuffer) {}
++
++    // create bricks in the specified field for the specified number
++    // of devices.  The field is split along the highest FFT
++    // dimension, and the length only includes FFT lengths, not batch
++    // dimension.
++    void distribute_field(int                        deviceCount,
++                          std::vector<fft_field>&    fields,
++                          const std::vector<size_t>& field_length)
++    {
++        size_t slowLen = field_length.front();
++        if(slowLen < static_cast<size_t>(deviceCount))
++            throw std::runtime_error("too many devices to distribute length "
++                                     + std::to_string(slowLen));
++
++        auto& field = fields.emplace_back();
++
++        for(int i = 0; i < deviceCount; ++i)
++        {
++            // start at origin
++            std::vector<size_t> field_lower(field_length.size());
++            std::vector<size_t> field_upper(field_length.size());
++
++            // note: slowest FFT dim is index 0 in these coordinates
++            field_lower[0] = slowLen / deviceCount * i;
++
++            // last brick needs to include the whole slow len
++            if(i == deviceCount - 1)
++            {
++                field_upper[0] = slowLen;
++            }
++            else
++            {
++                field_upper[0] = std::min(slowLen, field_lower[0] + slowLen / deviceCount);
++            }
++
++            for(unsigned int upperDim = 1; upperDim < field_length.size(); ++upperDim)
++            {
++                field_upper[upperDim] = field_length[upperDim];
++            }
++
++            // field coordinates also need to include batch
++            field_lower.insert(field_lower.begin(), 0);
++            field_upper.insert(field_upper.begin(), nbatch);
++
++            // bricks have contiguous strides
++            size_t              brick_dist = 1;
++            std::vector<size_t> brick_stride(field_lower.size());
++            for(size_t distIdx = 0; distIdx < field_lower.size(); ++distIdx)
++            {
++                // fill strides from fastest to slowest
++                *(brick_stride.rbegin() + distIdx) = brick_dist;
++                brick_dist *= *(field_upper.rbegin() + distIdx) - *(field_lower.rbegin() + distIdx);
++            }
++            field.bricks.push_back(
++                fft_params::fft_brick{field_lower, field_upper, brick_stride, i});
++        }
++    }
++
++    void distribute_input(int deviceCount)
++    {
++        distribute_field(deviceCount, ifields, length);
++    }
++
++    void distribute_output(int deviceCount)
++    {
++        distribute_field(deviceCount, ofields, olength());
++    }
++};
++
++// This is used with the program_options class so that the user can type an integer on the
++// command line and we store into an enum variable
++template <typename _Elem, typename _Traits>
++std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream,
++                                               fft_array_type&                     atype)
++{
++    unsigned tmp;
++    stream >> tmp;
++    atype = fft_array_type(tmp);
++    return stream;
++}
++
++// similarly for transform type
++template <typename _Elem, typename _Traits>
++std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream,
++                                               fft_transform_type&                 ttype)
++{
++    unsigned tmp;
++    stream >> tmp;
++    ttype = fft_transform_type(tmp);
++    return stream;
++}
++
++// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths
++template <typename T1>
++std::vector<std::pair<T1, T1>> partition_colmajor(const T1& length)
++{
++    return partition_base(length, compute_partition_count(length));
++}
++
++// Partition on the rightmost part of the tuple, for col-major indexing
++template <typename T1>
++std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>>
++    partition_colmajor(const std::tuple<T1, T1>& length)
++{
++    auto partitions = partition_base(std::get<1>(length), compute_partition_count(length));
++    std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ret(partitions.size());
++    for(size_t i = 0; i < partitions.size(); ++i)
++    {
++        std::get<1>(ret[i].first)  = partitions[i].first;
++        std::get<0>(ret[i].first)  = 0;
++        std::get<1>(ret[i].second) = partitions[i].second;
++        std::get<0>(ret[i].second) = std::get<0>(length);
++    }
++    return ret;
++}
++template <typename T1>
++std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>>
++    partition_colmajor(const std::tuple<T1, T1, T1>& length)
++{
++    auto partitions = partition_base(std::get<2>(length), compute_partition_count(length));
++    std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ret(partitions.size());
++    for(size_t i = 0; i < partitions.size(); ++i)
++    {
++        std::get<2>(ret[i].first)  = partitions[i].first;
++        std::get<1>(ret[i].first)  = 0;
++        std::get<0>(ret[i].first)  = 0;
++        std::get<2>(ret[i].second) = partitions[i].second;
++        std::get<1>(ret[i].second) = std::get<1>(length);
++        std::get<0>(ret[i].second) = std::get<0>(length);
++    }
++    return ret;
++}
++
++// Copy data of dimensions length with strides istride and length idist between batches to
++// a buffer with strides ostride and length odist between batches.  The input and output
++// types are identical.
++template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
++inline void copy_buffers_1to1(const Tval*                input,
++                              Tval*                      output,
++                              const Tint1&               whole_length,
++                              const size_t               nbatch,
++                              const Tint2&               istride,
++                              const size_t               idist,
++                              const Tint3&               ostride,
++                              const size_t               odist,
++                              const std::vector<size_t>& ioffset,
++                              const std::vector<size_t>& ooffset)
++{
++    const bool idx_equals_odx = istride == ostride && idist == odist;
++    size_t     idx_base       = 0;
++    size_t     odx_base       = 0;
++    auto       partitions     = partition_rowmajor(whole_length);
++    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
++    {
++#ifdef _OPENMP
++#pragma omp parallel for num_threads(partitions.size())
++#endif
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            auto       index  = partitions[part].first;
++            const auto length = partitions[part].second;
++            do
++            {
++                const auto idx = compute_index(index, istride, idx_base);
++                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
++                output[odx + ooffset[0]] = input[idx + ioffset[0]];
++            } while(increment_rowmajor(index, length));
++        }
++    }
++}
++
++// Copy data of dimensions length with strides istride and length idist between batches to
++// a buffer with strides ostride and length odist between batches.  The input type is
++// planar and the output type is complex interleaved.
++template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
++inline void copy_buffers_2to1(const Tval*                input0,
++                              const Tval*                input1,
++                              rocfft_complex<Tval>*      output,
++                              const Tint1&               whole_length,
++                              const size_t               nbatch,
++                              const Tint2&               istride,
++                              const size_t               idist,
++                              const Tint3&               ostride,
++                              const size_t               odist,
++                              const std::vector<size_t>& ioffset,
++                              const std::vector<size_t>& ooffset)
++{
++    const bool idx_equals_odx = istride == ostride && idist == odist;
++    size_t     idx_base       = 0;
++    size_t     odx_base       = 0;
++    auto       partitions     = partition_rowmajor(whole_length);
++    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
++    {
++#ifdef _OPENMP
++#pragma omp parallel for num_threads(partitions.size())
++#endif
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            auto       index  = partitions[part].first;
++            const auto length = partitions[part].second;
++            do
++            {
++                const auto idx = compute_index(index, istride, idx_base);
++                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
++                output[odx + ooffset[0]]
++                    = rocfft_complex<Tval>(input0[idx + ioffset[0]], input1[idx + ioffset[1]]);
++            } while(increment_rowmajor(index, length));
++        }
++    }
++}
++
++// Copy data of dimensions length with strides istride and length idist between batches to
++// a buffer with strides ostride and length odist between batches.  The input type is
++// complex interleaved and the output type is planar.
++template <typename Tval, typename Tint1, typename Tint2, typename Tint3>
++inline void copy_buffers_1to2(const rocfft_complex<Tval>* input,
++                              Tval*                       output0,
++                              Tval*                       output1,
++                              const Tint1&                whole_length,
++                              const size_t                nbatch,
++                              const Tint2&                istride,
++                              const size_t                idist,
++                              const Tint3&                ostride,
++                              const size_t                odist,
++                              const std::vector<size_t>&  ioffset,
++                              const std::vector<size_t>&  ooffset)
++{
++    const bool idx_equals_odx = istride == ostride && idist == odist;
++    size_t     idx_base       = 0;
++    size_t     odx_base       = 0;
++    auto       partitions     = partition_rowmajor(whole_length);
++    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
++    {
++#ifdef _OPENMP
++#pragma omp parallel for num_threads(partitions.size())
++#endif
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            auto       index  = partitions[part].first;
++            const auto length = partitions[part].second;
++            do
++            {
++                const auto idx = compute_index(index, istride, idx_base);
++                const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
++                output0[odx + ooffset[0]] = input[idx + ioffset[0]].real();
++                output1[odx + ooffset[1]] = input[idx + ioffset[0]].imag();
++            } while(increment_rowmajor(index, length));
++        }
++    }
++}
++
++// Copy data of dimensions length with strides istride and length idist between batches to
++// a buffer with strides ostride and length odist between batches.  The input type given
++// by itype, and the output type is given by otype.
++template <typename Tint1, typename Tint2, typename Tint3>
++inline void copy_buffers(const std::vector<hostbuf>& input,
++                         std::vector<hostbuf>&       output,
++                         const Tint1&                length,
++                         const size_t                nbatch,
++                         const fft_precision         precision,
++                         const fft_array_type        itype,
++                         const Tint2&                istride,
++                         const size_t                idist,
++                         const fft_array_type        otype,
++                         const Tint3&                ostride,
++                         const size_t                odist,
++                         const std::vector<size_t>&  ioffset,
++                         const std::vector<size_t>&  ooffset)
++{
++    if(itype == otype)
++    {
++        switch(itype)
++        {
++        case fft_array_type_complex_interleaved:
++        case fft_array_type_hermitian_interleaved:
++            switch(precision)
++            {
++            case fft_precision_half:
++                copy_buffers_1to1(
++                    reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
++                    reinterpret_cast<rocfft_complex<_Float16>*>(output[0].data()),
++                    length,
++                    nbatch,
++                    istride,
++                    idist,
++                    ostride,
++                    odist,
++                    ioffset,
++                    ooffset);
++                break;
++            case fft_precision_single:
++                copy_buffers_1to1(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
++                                  reinterpret_cast<rocfft_complex<float>*>(output[0].data()),
++                                  length,
++                                  nbatch,
++                                  istride,
++                                  idist,
++                                  ostride,
++                                  odist,
++                                  ioffset,
++                                  ooffset);
++                break;
++            case fft_precision_double:
++                copy_buffers_1to1(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
++                                  reinterpret_cast<rocfft_complex<double>*>(output[0].data()),
++                                  length,
++                                  nbatch,
++                                  istride,
++                                  idist,
++                                  ostride,
++                                  odist,
++                                  ioffset,
++                                  ooffset);
++                break;
++            }
++            break;
++        case fft_array_type_real:
++        case fft_array_type_complex_planar:
++        case fft_array_type_hermitian_planar:
++            for(unsigned int idx = 0; idx < input.size(); ++idx)
++            {
++                switch(precision)
++                {
++                case fft_precision_half:
++                    copy_buffers_1to1(reinterpret_cast<const _Float16*>(input[idx].data()),
++                                      reinterpret_cast<_Float16*>(output[idx].data()),
++                                      length,
++                                      nbatch,
++                                      istride,
++                                      idist,
++                                      ostride,
++                                      odist,
++                                      ioffset,
++                                      ooffset);
++                    break;
++                case fft_precision_single:
++                    copy_buffers_1to1(reinterpret_cast<const float*>(input[idx].data()),
++                                      reinterpret_cast<float*>(output[idx].data()),
++                                      length,
++                                      nbatch,
++                                      istride,
++                                      idist,
++                                      ostride,
++                                      odist,
++                                      ioffset,
++                                      ooffset);
++                    break;
++                case fft_precision_double:
++                    copy_buffers_1to1(reinterpret_cast<const double*>(input[idx].data()),
++                                      reinterpret_cast<double*>(output[idx].data()),
++                                      length,
++                                      nbatch,
++                                      istride,
++                                      idist,
++                                      ostride,
++                                      odist,
++                                      ioffset,
++                                      ooffset);
++                    break;
++                }
++            }
++            break;
++        default:
++            throw std::runtime_error("Invalid data type");
++        }
++    }
++    else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar)
++            || (itype == fft_array_type_hermitian_interleaved
++                && otype == fft_array_type_hermitian_planar))
++    {
++        // copy 1to2
++        switch(precision)
++        {
++        case fft_precision_half:
++            copy_buffers_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
++                              reinterpret_cast<_Float16*>(output[0].data()),
++                              reinterpret_cast<_Float16*>(output[1].data()),
++                              length,
++                              nbatch,
++                              istride,
++                              idist,
++                              ostride,
++                              odist,
++                              ioffset,
++                              ooffset);
++            break;
++        case fft_precision_single:
++            copy_buffers_1to2(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
++                              reinterpret_cast<float*>(output[0].data()),
++                              reinterpret_cast<float*>(output[1].data()),
++                              length,
++                              nbatch,
++                              istride,
++                              idist,
++                              ostride,
++                              odist,
++                              ioffset,
++                              ooffset);
++            break;
++        case fft_precision_double:
++            copy_buffers_1to2(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
++                              reinterpret_cast<double*>(output[0].data()),
++                              reinterpret_cast<double*>(output[1].data()),
++                              length,
++                              nbatch,
++                              istride,
++                              idist,
++                              ostride,
++                              odist,
++                              ioffset,
++                              ooffset);
++            break;
++        }
++    }
++    else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved)
++            || (itype == fft_array_type_hermitian_planar
++                && otype == fft_array_type_hermitian_interleaved))
++    {
++        // copy 2 to 1
++        switch(precision)
++        {
++        case fft_precision_half:
++            copy_buffers_2to1(reinterpret_cast<const _Float16*>(input[0].data()),
++                              reinterpret_cast<const _Float16*>(input[1].data()),
++                              reinterpret_cast<rocfft_complex<_Float16>*>(output[0].data()),
++                              length,
++                              nbatch,
++                              istride,
++                              idist,
++                              ostride,
++                              odist,
++                              ioffset,
++                              ooffset);
++            break;
++        case fft_precision_single:
++            copy_buffers_2to1(reinterpret_cast<const float*>(input[0].data()),
++                              reinterpret_cast<const float*>(input[1].data()),
++                              reinterpret_cast<rocfft_complex<float>*>(output[0].data()),
++                              length,
++                              nbatch,
++                              istride,
++                              idist,
++                              ostride,
++                              odist,
++                              ioffset,
++                              ooffset);
++            break;
++        case fft_precision_double:
++            copy_buffers_2to1(reinterpret_cast<const double*>(input[0].data()),
++                              reinterpret_cast<const double*>(input[1].data()),
++                              reinterpret_cast<rocfft_complex<double>*>(output[0].data()),
++                              length,
++                              nbatch,
++                              istride,
++                              idist,
++                              ostride,
++                              odist,
++                              ioffset,
++                              ooffset);
++            break;
++        }
++    }
++    else
++    {
++        throw std::runtime_error("Invalid input and output types.");
++    }
++}
++
++// unroll arbitrary-dimension copy_buffers into specializations for 1-, 2-, 3-dimensions
++template <typename Tint1, typename Tint2, typename Tint3>
++inline void copy_buffers(const std::vector<hostbuf>& input,
++                         std::vector<hostbuf>&       output,
++                         const std::vector<Tint1>&   length,
++                         const size_t                nbatch,
++                         const fft_precision         precision,
++                         const fft_array_type        itype,
++                         const std::vector<Tint2>&   istride,
++                         const size_t                idist,
++                         const fft_array_type        otype,
++                         const std::vector<Tint3>&   ostride,
++                         const size_t                odist,
++                         const std::vector<size_t>&  ioffset,
++                         const std::vector<size_t>&  ooffset)
++{
++    switch(length.size())
++    {
++    case 1:
++        return copy_buffers(input,
++                            output,
++                            length[0],
++                            nbatch,
++                            precision,
++                            itype,
++                            istride[0],
++                            idist,
++                            otype,
++                            ostride[0],
++                            odist,
++                            ioffset,
++                            ooffset);
++    case 2:
++        return copy_buffers(input,
++                            output,
++                            std::make_tuple(length[0], length[1]),
++                            nbatch,
++                            precision,
++                            itype,
++                            std::make_tuple(istride[0], istride[1]),
++                            idist,
++                            otype,
++                            std::make_tuple(ostride[0], ostride[1]),
++                            odist,
++                            ioffset,
++                            ooffset);
++    case 3:
++        return copy_buffers(input,
++                            output,
++                            std::make_tuple(length[0], length[1], length[2]),
++                            nbatch,
++                            precision,
++                            itype,
++                            std::make_tuple(istride[0], istride[1], istride[2]),
++                            idist,
++                            otype,
++                            std::make_tuple(ostride[0], ostride[1], ostride[2]),
++                            odist,
++                            ioffset,
++                            ooffset);
++    default:
++        abort();
++    }
++}
++
++// Compute the L-infinity and L-2 distance between an input buffer (strides istride,
++// distance idist between batches) and an output buffer (strides ostride, distance odist
++// between batches).  Both buffers are of complex type.
++
++struct VectorNorms
++{
++    double l_2 = 0.0, l_inf = 0.0;
++};
++
++template <typename Tcomplex, typename Tint1, typename Tint2, typename Tint3>
++inline VectorNorms distance_1to1_complex(const Tcomplex*                         input,
++                                         const Tcomplex*                         output,
++                                         const Tint1&                            whole_length,
++                                         const size_t                            nbatch,
++                                         const Tint2&                            istride,
++                                         const size_t                            idist,
++                                         const Tint3&                            ostride,
++                                         const size_t                            odist,
++                                         std::vector<std::pair<size_t, size_t>>* linf_failures,
++                                         const double                            linf_cutoff,
++                                         const std::vector<size_t>&              ioffset,
++                                         const std::vector<size_t>&              ooffset,
++                                         const double output_scalar = 1.0)
++{
++    double linf = 0.0;
++    double l2   = 0.0;
++
++    std::mutex                             linf_failure_lock;
++    std::vector<std::pair<size_t, size_t>> linf_failures_private;
++
++    const bool idx_equals_odx = istride == ostride && idist == odist;
++    size_t     idx_base       = 0;
++    size_t     odx_base       = 0;
++    auto       partitions     = partition_colmajor(whole_length);
++    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
++    {
++#ifdef _OPENMP
++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private)
++#endif
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            double     cur_linf = 0.0;
++            double     cur_l2   = 0.0;
++            auto       index    = partitions[part].first;
++            const auto length   = partitions[part].second;
++
++            do
++            {
++                const auto   idx = compute_index(index, istride, idx_base);
++                const auto   odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
++                const double rdiff
++                    = std::abs(static_cast<double>(output[odx + ooffset[0]].real()) * output_scalar
++                               - static_cast<double>(input[idx + ioffset[0]].real()));
++                cur_linf = std::max(rdiff, cur_linf);
++                if(cur_linf > linf_cutoff)
++                {
++                    std::pair<size_t, size_t> fval(b, idx);
++                    if(linf_failures)
++                        linf_failures_private.push_back(fval);
++                }
++                cur_l2 += rdiff * rdiff;
++
++                const double idiff
++                    = std::abs(static_cast<double>(output[odx + ooffset[0]].imag()) * output_scalar
++                               - static_cast<double>(input[idx + ioffset[0]].imag()));
++                cur_linf = std::max(idiff, cur_linf);
++                if(cur_linf > linf_cutoff)
++                {
++                    std::pair<size_t, size_t> fval(b, idx);
++                    if(linf_failures)
++                        linf_failures_private.push_back(fval);
++                }
++                cur_l2 += idiff * idiff;
++
++            } while(increment_rowmajor(index, length));
++            linf = std::max(linf, cur_linf);
++            l2 += cur_l2;
++
++            if(linf_failures)
++            {
++                linf_failure_lock.lock();
++                std::copy(linf_failures_private.begin(),
++                          linf_failures_private.end(),
++                          std::back_inserter(*linf_failures));
++                linf_failure_lock.unlock();
++            }
++        }
++    }
++    return {.l_2 = sqrt(l2), .l_inf = linf};
++}
++
++// L-infinity and L-2 norms of (output * output_scalar - input) for two real-valued buffers.
++// input is walked with istride/idist from ioffset[0], output with ostride/odist from ooffset[0].
++// If linf_failures is non-null, flagged (batch, input index) pairs are appended to it.
++template <typename Tfloat, typename Tint1, typename Tint2, typename Tint3>
++inline VectorNorms distance_1to1_real(const Tfloat*                           input,
++                                      const Tfloat*                           output,
++                                      const Tint1&                            whole_length,
++                                      const size_t                            nbatch,
++                                      const Tint2&                            istride,
++                                      const size_t                            idist,
++                                      const Tint3&                            ostride,
++                                      const size_t                            odist,
++                                      std::vector<std::pair<size_t, size_t>>* linf_failures,
++                                      const double                            linf_cutoff,
++                                      const std::vector<size_t>&              ioffset,
++                                      const std::vector<size_t>&              ooffset,
++                                      const double                            output_scalar = 1.0)
++{
++    double linf = 0.0;
++    double l2   = 0.0;
++
++    std::mutex                             linf_failure_lock;
++    std::vector<std::pair<size_t, size_t>> linf_failures_private; // staging, private per OMP thread
++
++    const bool idx_equals_odx = istride == ostride && idist == odist; // same layout: reuse idx
++    size_t     idx_base       = 0;
++    size_t     odx_base       = 0;
++    auto       partitions     = partition_rowmajor(whole_length);
++    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
++    {
++#ifdef _OPENMP
++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private)
++#endif
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            double     cur_linf = 0.0;
++            double     cur_l2   = 0.0;
++            auto       index    = partitions[part].first;
++            const auto length   = partitions[part].second;
++            do
++            {
++                const auto   idx = compute_index(index, istride, idx_base);
++                const auto   odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
++                const double diff
++                    = std::abs(static_cast<double>(output[odx + ooffset[0]]) * output_scalar
++                               - static_cast<double>(input[idx + ioffset[0]]));
++                cur_linf = std::max(diff, cur_linf);
++                if(cur_linf > linf_cutoff) // NOTE(review): tests the running max, not diff
++                {
++                    std::pair<size_t, size_t> fval(b, idx);
++                    if(linf_failures)
++                        linf_failures_private.push_back(fval);
++                }
++                cur_l2 += diff * diff;
++
++            } while(increment_rowmajor(index, length));
++            linf = std::max(linf, cur_linf);
++            l2 += cur_l2;
++
++            if(linf_failures)
++            {
++                std::lock_guard<std::mutex> guard(linf_failure_lock); // serialize shared appends
++                linf_failures->insert(linf_failures->end(),
++                                      linf_failures_private.begin(),
++                                      linf_failures_private.end());
++                linf_failures_private.clear(); // do not re-append on a later partition or batch
++            }
++        }
++    }
++    return {.l_2 = sqrt(l2), .l_inf = linf};
++}
++
++// L-infinity and L-2 norms of (output * output_scalar - input).  input is complex-interleaved
++// (istride/idist, ioffset[0]); output is complex-planar (ostride/odist): output0 is compared
++// to the real parts from ooffset[0], output1 to the imaginary parts from ooffset[1].
++template <typename Tval, typename Tint1, typename T2, typename T3>
++inline VectorNorms distance_1to2(const rocfft_complex<Tval>*             input,
++                                 const Tval*                             output0,
++                                 const Tval*                             output1,
++                                 const Tint1&                            whole_length,
++                                 const size_t                            nbatch,
++                                 const T2&                               istride,
++                                 const size_t                            idist,
++                                 const T3&                               ostride,
++                                 const size_t                            odist,
++                                 std::vector<std::pair<size_t, size_t>>* linf_failures,
++                                 const double                            linf_cutoff,
++                                 const std::vector<size_t>&              ioffset,
++                                 const std::vector<size_t>&              ooffset,
++                                 const double                            output_scalar = 1.0)
++{
++    double linf = 0.0;
++    double l2   = 0.0;
++
++    std::mutex                             linf_failure_lock;
++    std::vector<std::pair<size_t, size_t>> linf_failures_private; // staging, private per OMP thread
++
++    const bool idx_equals_odx = istride == ostride && idist == odist; // same layout: reuse idx
++    size_t     idx_base       = 0;
++    size_t     odx_base       = 0;
++    auto       partitions     = partition_rowmajor(whole_length);
++    for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist)
++    {
++#ifdef _OPENMP
++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private)
++#endif
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            double     cur_linf = 0.0;
++            double     cur_l2   = 0.0;
++            auto       index    = partitions[part].first;
++            const auto length   = partitions[part].second;
++            do
++            {
++                const auto   idx = compute_index(index, istride, idx_base);
++                const auto   odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base);
++                const double rdiff
++                    = std::abs(static_cast<double>(output0[odx + ooffset[0]]) * output_scalar
++                               - static_cast<double>(input[idx + ioffset[0]].real()));
++                cur_linf = std::max(rdiff, cur_linf);
++                if(cur_linf > linf_cutoff) // NOTE(review): tests the running max, not rdiff
++                {
++                    std::pair<size_t, size_t> fval(b, idx);
++                    if(linf_failures)
++                        linf_failures_private.push_back(fval);
++                }
++                cur_l2 += rdiff * rdiff;
++
++                const double idiff
++                    = std::abs(static_cast<double>(output1[odx + ooffset[1]]) * output_scalar
++                               - static_cast<double>(input[idx + ioffset[0]].imag()));
++                cur_linf = std::max(idiff, cur_linf);
++                if(cur_linf > linf_cutoff) // may record the same (b, idx) a second time
++                {
++                    std::pair<size_t, size_t> fval(b, idx);
++                    if(linf_failures)
++                        linf_failures_private.push_back(fval);
++                }
++                cur_l2 += idiff * idiff;
++
++            } while(increment_rowmajor(index, length));
++            linf = std::max(linf, cur_linf);
++            l2 += cur_l2;
++
++            if(linf_failures)
++            {
++                std::lock_guard<std::mutex> guard(linf_failure_lock); // serialize shared appends
++                linf_failures->insert(linf_failures->end(),
++                                      linf_failures_private.begin(),
++                                      linf_failures_private.end());
++                linf_failures_private.clear(); // do not re-append on a later partition or batch
++            }
++        }
++    }
++    return {.l_2 = sqrt(l2), .l_inf = linf};
++}
++
++// Compute the L-infinity and L-2 distance between two sets of host buffers, dispatching on
++// itype, otype, and precision.  Throws std::runtime_error for unsupported type combinations.
++template <typename Tint1, typename Tint2, typename Tint3>
++inline VectorNorms distance(const std::vector<hostbuf>&             input,
++                            const std::vector<hostbuf>&             output,
++                            const Tint1&                            length,
++                            const size_t                            nbatch,
++                            const fft_precision                     precision,
++                            const fft_array_type                    itype,
++                            const Tint2&                            istride,
++                            const size_t                            idist,
++                            const fft_array_type                    otype,
++                            const Tint3&                            ostride,
++                            const size_t                            odist,
++                            std::vector<std::pair<size_t, size_t>>* linf_failures,
++                            const double                            linf_cutoff,
++                            const std::vector<size_t>&              ioffset,
++                            const std::vector<size_t>&              ooffset,
++                            const double                            output_scalar = 1.0)
++{
++    VectorNorms dist; // NOTE(review): relies on VectorNorms' default initialization - confirm
++
++    if(itype == otype)
++    {
++        switch(itype)
++        {
++        case fft_array_type_complex_interleaved:
++        case fft_array_type_hermitian_interleaved:
++            switch(precision)
++            {
++            case fft_precision_half:
++                dist = distance_1to1_complex(
++                    reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
++                    reinterpret_cast<const rocfft_complex<_Float16>*>(output[0].data()),
++                    length,
++                    nbatch,
++                    istride,
++                    idist,
++                    ostride,
++                    odist,
++                    linf_failures,
++                    linf_cutoff,
++                    ioffset,
++                    ooffset,
++                    output_scalar);
++                break;
++            case fft_precision_single:
++                dist = distance_1to1_complex(
++                    reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
++                    reinterpret_cast<const rocfft_complex<float>*>(output[0].data()),
++                    length,
++                    nbatch,
++                    istride,
++                    idist,
++                    ostride,
++                    odist,
++                    linf_failures,
++                    linf_cutoff,
++                    ioffset,
++                    ooffset,
++                    output_scalar);
++                break;
++            case fft_precision_double:
++                dist = distance_1to1_complex(
++                    reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
++                    reinterpret_cast<const rocfft_complex<double>*>(output[0].data()),
++                    length,
++                    nbatch,
++                    istride,
++                    idist,
++                    ostride,
++                    odist,
++                    linf_failures,
++                    linf_cutoff,
++                    ioffset,
++                    ooffset,
++                    output_scalar);
++                break;
++            }
++            dist.l_2 *= dist.l_2; // back to a sum of squares; sqrt is re-applied at the end
++            break;
++        case fft_array_type_real:
++        case fft_array_type_complex_planar:
++        case fft_array_type_hermitian_planar:
++            for(unsigned int idx = 0; idx < input.size(); ++idx) // one pass per buffer (plane)
++            {
++                VectorNorms d;
++                switch(precision)
++                {
++                case fft_precision_half:
++                    d = distance_1to1_real(reinterpret_cast<const _Float16*>(input[idx].data()),
++                                           reinterpret_cast<const _Float16*>(output[idx].data()),
++                                           length,
++                                           nbatch,
++                                           istride,
++                                           idist,
++                                           ostride,
++                                           odist,
++                                           linf_failures,
++                                           linf_cutoff,
++                                           ioffset, // NOTE(review): callee reads only offset[0]
++                                           ooffset,
++                                           output_scalar);
++                    break;
++                case fft_precision_single:
++                    d = distance_1to1_real(reinterpret_cast<const float*>(input[idx].data()),
++                                           reinterpret_cast<const float*>(output[idx].data()),
++                                           length,
++                                           nbatch,
++                                           istride,
++                                           idist,
++                                           ostride,
++                                           odist,
++                                           linf_failures,
++                                           linf_cutoff,
++                                           ioffset,
++                                           ooffset,
++                                           output_scalar);
++                    break;
++                case fft_precision_double:
++                    d = distance_1to1_real(reinterpret_cast<const double*>(input[idx].data()),
++                                           reinterpret_cast<const double*>(output[idx].data()),
++                                           length,
++                                           nbatch,
++                                           istride,
++                                           idist,
++                                           ostride,
++                                           odist,
++                                           linf_failures,
++                                           linf_cutoff,
++                                           ioffset,
++                                           ooffset,
++                                           output_scalar);
++                    break;
++                }
++                dist.l_inf = std::max(d.l_inf, dist.l_inf);
++                dist.l_2 += d.l_2 * d.l_2; // accumulate squared per-buffer L-2 norms
++            }
++            break;
++        default:
++            throw std::runtime_error("Invalid input and output types.");
++        }
++    }
++    else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar)
++            || (itype == fft_array_type_hermitian_interleaved
++                && otype == fft_array_type_hermitian_planar))
++    {
++        switch(precision)
++        {
++        case fft_precision_half:
++            dist = distance_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
++                                 reinterpret_cast<const _Float16*>(output[0].data()),
++                                 reinterpret_cast<const _Float16*>(output[1].data()),
++                                 length,
++                                 nbatch,
++                                 istride,
++                                 idist,
++                                 ostride,
++                                 odist,
++                                 linf_failures,
++                                 linf_cutoff,
++                                 ioffset,
++                                 ooffset,
++                                 output_scalar);
++            break;
++        case fft_precision_single:
++            dist = distance_1to2(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
++                                 reinterpret_cast<const float*>(output[0].data()),
++                                 reinterpret_cast<const float*>(output[1].data()),
++                                 length,
++                                 nbatch,
++                                 istride,
++                                 idist,
++                                 ostride,
++                                 odist,
++                                 linf_failures,
++                                 linf_cutoff,
++                                 ioffset,
++                                 ooffset,
++                                 output_scalar);
++            break;
++        case fft_precision_double:
++            dist = distance_1to2(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
++                                 reinterpret_cast<const double*>(output[0].data()),
++                                 reinterpret_cast<const double*>(output[1].data()),
++                                 length,
++                                 nbatch,
++                                 istride,
++                                 idist,
++                                 ostride,
++                                 odist,
++                                 linf_failures,
++                                 linf_cutoff,
++                                 ioffset,
++                                 ooffset,
++                                 output_scalar);
++            break;
++        }
++        dist.l_2 *= dist.l_2; // back to a sum of squares; sqrt is re-applied at the end
++    }
++    else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved)
++            || (itype == fft_array_type_hermitian_planar
++                && otype == fft_array_type_hermitian_interleaved))
++    {
++        switch(precision) // roles are swapped: the interleaved output is passed as "input"
++        {
++        case fft_precision_half:
++            dist
++                = distance_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(output[0].data()),
++                                reinterpret_cast<const _Float16*>(input[0].data()),
++                                reinterpret_cast<const _Float16*>(input[1].data()),
++                                length,
++                                nbatch,
++                                ostride,
++                                odist,
++                                istride,
++                                idist,
++                                linf_failures,
++                                linf_cutoff,
++                                ooffset, // offsets are swapped along with buffers and strides
++                                ioffset,
++                                output_scalar); // NOTE(review): now scales the planar input
++            break;
++        case fft_precision_single:
++            dist = distance_1to2(reinterpret_cast<const rocfft_complex<float>*>(output[0].data()),
++                                 reinterpret_cast<const float*>(input[0].data()),
++                                 reinterpret_cast<const float*>(input[1].data()),
++                                 length,
++                                 nbatch,
++                                 ostride,
++                                 odist,
++                                 istride,
++                                 idist,
++                                 linf_failures,
++                                 linf_cutoff,
++                                 ooffset, // offsets are swapped along with buffers and strides
++                                 ioffset,
++                                 output_scalar);
++            break;
++        case fft_precision_double:
++            dist = distance_1to2(reinterpret_cast<const rocfft_complex<double>*>(output[0].data()),
++                                 reinterpret_cast<const double*>(input[0].data()),
++                                 reinterpret_cast<const double*>(input[1].data()),
++                                 length,
++                                 nbatch,
++                                 ostride,
++                                 odist,
++                                 istride,
++                                 idist,
++                                 linf_failures,
++                                 linf_cutoff,
++                                 ooffset, // offsets are swapped along with buffers and strides
++                                 ioffset,
++                                 output_scalar);
++            break;
++        }
++        dist.l_2 *= dist.l_2; // back to a sum of squares; sqrt is re-applied at the end
++    }
++    else
++    {
++        throw std::runtime_error("Invalid input and output types.");
++    }
++    dist.l_2 = sqrt(dist.l_2); // every branch above left a sum of squares in l_2
++    return dist;
++}
++
++// Check that stride is the packed row-major stride of length and dist the product of the lengths.
++template <typename Tint1, typename Tint2>
++bool is_contiguous_rowmajor(const std::vector<Tint1>& length,
++                            const std::vector<Tint2>& stride,
++                            size_t                    dist)
++{
++    size_t expected_stride = 1; // packed stride of the last dimension
++    auto   stride_it       = stride.rbegin();
++    auto   length_it       = length.rbegin();
++    for(; stride_it != stride.rend() && length_it != length.rend(); ++stride_it, ++length_it)
++    {
++        if(*stride_it != expected_stride)
++            return false;
++        expected_stride *= *length_it; // packed stride of the next dimension outward
++    }
++    return expected_stride == dist; // batch distance must match the visited lengths' product
++}
++
++// Dispatch an arbitrary-dimension distance to the 1-, 2- or 3-dimensional overload; else abort().
++template <typename Tint1, typename Tint2, typename Tint3>
++inline VectorNorms distance(const std::vector<hostbuf>&             input,
++                            const std::vector<hostbuf>&             output,
++                            std::vector<Tint1>                      length, // by value: may be collapsed
++                            size_t                                  nbatch, // by value: may be collapsed
++                            const fft_precision                     precision,
++                            const fft_array_type                    itype,
++                            std::vector<Tint2>                      istride, // by value: may be collapsed
++                            const size_t                            idist,
++                            const fft_array_type                    otype,
++                            std::vector<Tint3>                      ostride, // by value: may be collapsed
++                            const size_t                            odist,
++                            std::vector<std::pair<size_t, size_t>>* linf_failures,
++                            const double                            linf_cutoff,
++                            const std::vector<size_t>&              ioffset,
++                            const std::vector<size_t>&              ooffset,
++                            const double                            output_scalar = 1.0)
++{
++    // If istride and ostride are both contiguous, collapse all dimensions and batches
++    // down to a single one-dimensional run.  Index calculation is simpler (and faster)
++    // in the 1D case.
++    if(is_contiguous_rowmajor(length, istride, idist)
++       && is_contiguous_rowmajor(length, ostride, odist))
++    {
++        length  = {product(length.begin(), length.end()) * nbatch}; // every batch in one run
++        istride = {static_cast<Tint2>(1)};
++        ostride = {static_cast<Tint3>(1)};
++        nbatch  = 1;
++    }
++
++    switch(length.size())
++    {
++    case 1: // scalars for length and strides
++        return distance(input,
++                        output,
++                        length[0],
++                        nbatch,
++                        precision,
++                        itype,
++                        istride[0],
++                        idist,
++                        otype,
++                        ostride[0],
++                        odist,
++                        linf_failures,
++                        linf_cutoff,
++                        ioffset,
++                        ooffset,
++                        output_scalar);
++    case 2: // 2-tuples for length and strides
++        return distance(input,
++                        output,
++                        std::make_tuple(length[0], length[1]),
++                        nbatch,
++                        precision,
++                        itype,
++                        std::make_tuple(istride[0], istride[1]),
++                        idist,
++                        otype,
++                        std::make_tuple(ostride[0], ostride[1]),
++                        odist,
++                        linf_failures,
++                        linf_cutoff,
++                        ioffset,
++                        ooffset,
++                        output_scalar);
++    case 3: // 3-tuples for length and strides
++        return distance(input,
++                        output,
++                        std::make_tuple(length[0], length[1], length[2]),
++                        nbatch,
++                        precision,
++                        itype,
++                        std::make_tuple(istride[0], istride[1], istride[2]),
++                        idist,
++                        otype,
++                        std::make_tuple(ostride[0], ostride[1], ostride[2]),
++                        odist,
++                        linf_failures,
++                        linf_cutoff,
++                        ioffset,
++                        ooffset,
++                        output_scalar);
++    default: // any other number of dimensions is not handled
++        abort();
++    }
++}
++
++// Compute the L-infinity and L-2 norm of a complex buffer (elements expose .real()/.imag()) with
++// strides istride and distance idist between batches; elements are read starting at offset[0].
++template <typename Tcomplex, typename T1, typename T2>
++inline VectorNorms norm_complex(const Tcomplex*            input,
++                                const T1&                  whole_length,
++                                const size_t               nbatch,
++                                const T2&                  istride,
++                                const size_t               idist,
++                                const std::vector<size_t>& offset)
++{
++    double linf = 0.0;
++    double l2   = 0.0; // running sum of squares over real and imaginary parts
++
++    size_t idx_base   = 0;
++    auto   partitions = partition_rowmajor(whole_length);
++    for(size_t b = 0; b < nbatch; b++, idx_base += idist)
++    {
++#ifdef _OPENMP
++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
++#endif
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            double     cur_linf = 0.0; // partition-local accumulators, merged after the walk
++            double     cur_l2   = 0.0;
++            auto       index    = partitions[part].first;
++            const auto length   = partitions[part].second;
++            do
++            {
++                const auto idx = compute_index(index, istride, idx_base);
++
++                const double rval = std::abs(static_cast<double>(input[idx + offset[0]].real()));
++                cur_linf          = std::max(rval, cur_linf);
++                cur_l2 += rval * rval;
++
++                const double ival = std::abs(static_cast<double>(input[idx + offset[0]].imag()));
++                cur_linf          = std::max(ival, cur_linf);
++                cur_l2 += ival * ival;
++
++            } while(increment_rowmajor(index, length));
++            linf = std::max(linf, cur_linf);
++            l2 += cur_l2;
++        }
++    }
++    return {.l_2 = sqrt(l2), .l_inf = linf};
++}
++
++// Compute the L-infinity and L-2 norm of a real-valued buffer with strides istride and
++// distance idist between batches; elements are read starting at offset[0].
++template <typename Tfloat, typename T1, typename T2>
++inline VectorNorms norm_real(const Tfloat*              input,
++                             const T1&                  whole_length,
++                             const size_t               nbatch,
++                             const T2&                  istride,
++                             const size_t               idist,
++                             const std::vector<size_t>& offset)
++{
++    double linf = 0.0;
++    double l2   = 0.0; // running sum of squares
++
++    size_t idx_base   = 0;
++    auto   partitions = partition_rowmajor(whole_length);
++    for(size_t b = 0; b < nbatch; b++, idx_base += idist)
++    {
++#ifdef _OPENMP
++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size())
++#endif
++        for(size_t part = 0; part < partitions.size(); ++part)
++        {
++            double     cur_linf = 0.0; // partition-local accumulators, merged after the walk
++            double     cur_l2   = 0.0;
++            auto       index    = partitions[part].first;
++            const auto length   = partitions[part].second;
++            do
++            {
++                const auto   idx = compute_index(index, istride, idx_base);
++                const double val = std::abs(static_cast<double>(input[idx + offset[0]]));
++                cur_linf         = std::max(val, cur_linf);
++                cur_l2 += val * val;
++
++            } while(increment_rowmajor(index, length));
++            linf = std::max(linf, cur_linf);
++            l2 += cur_l2;
++        }
++    }
++    return {.l_2 = sqrt(l2), .l_inf = linf};
++}
++
++// Compute the L-infinity and L-2 norm of a set of host buffers with strides istride and distance
++// idist between batches.  Format is given by precision and itype; bad itype throws runtime_error.
++template <typename T1, typename T2>
++inline VectorNorms norm(const std::vector<hostbuf>& input,
++                        const T1&                   length,
++                        const size_t                nbatch,
++                        const fft_precision         precision,
++                        const fft_array_type        itype,
++                        const T2&                   istride,
++                        const size_t                idist,
++                        const std::vector<size_t>&  offset)
++{
++    VectorNorms norm; // NOTE(review): relies on VectorNorms' default initialization - confirm
++
++    switch(itype)
++    {
++    case fft_array_type_complex_interleaved:
++    case fft_array_type_hermitian_interleaved:
++        switch(precision) // only input[0] is read for interleaved data
++        {
++        case fft_precision_half:
++            norm = norm_complex(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()),
++                                length,
++                                nbatch,
++                                istride,
++                                idist,
++                                offset);
++            break;
++        case fft_precision_single:
++            norm = norm_complex(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()),
++                                length,
++                                nbatch,
++                                istride,
++                                idist,
++                                offset);
++            break;
++        case fft_precision_double:
++            norm = norm_complex(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()),
++                                length,
++                                nbatch,
++                                istride,
++                                idist,
++                                offset);
++            break;
++        }
++        norm.l_2 *= norm.l_2; // back to a sum of squares; sqrt is re-applied at the end
++        break;
++    case fft_array_type_real:
++    case fft_array_type_complex_planar:
++    case fft_array_type_hermitian_planar:
++        for(unsigned int idx = 0; idx < input.size(); ++idx) // one pass per buffer (plane)
++        {
++            VectorNorms n;
++            switch(precision)
++            {
++            case fft_precision_half:
++                n = norm_real(reinterpret_cast<const _Float16*>(input[idx].data()),
++                              length,
++                              nbatch,
++                              istride,
++                              idist,
++                              offset);
++                break;
++            case fft_precision_single:
++                n = norm_real(reinterpret_cast<const float*>(input[idx].data()),
++                              length,
++                              nbatch,
++                              istride,
++                              idist,
++                              offset);
++                break;
++            case fft_precision_double:
++                n = norm_real(reinterpret_cast<const double*>(input[idx].data()),
++                              length,
++                              nbatch,
++                              istride,
++                              idist,
++                              offset);
++                break;
++            }
++            norm.l_inf = std::max(n.l_inf, norm.l_inf);
++            norm.l_2 += n.l_2 * n.l_2; // accumulate squared per-buffer L-2 norms
++        }
++        break;
++    default:
++        throw std::runtime_error("Invalid data type");
++    }
++
++    norm.l_2 = sqrt(norm.l_2); // both branches above left a sum of squares in l_2
++    return norm;
++}
++
++// Unroll arbitrary-dimension norm into specializations for 1-, 2-, 3-dimensions
++template <typename T1, typename T2>
++inline VectorNorms norm(const std::vector<hostbuf>& input,
++                        std::vector<T1>             length,
++                        size_t                      nbatch,
++                        const fft_precision         precision,
++                        const fft_array_type        type,
++                        std::vector<T2>             stride,
++                        const size_t                dist,
++                        const std::vector<size_t>&  offset)
++{
++    // If stride is contiguous, collapse it down to one dimension.
++    // Index calculation is simpler (and faster) in the 1D case.
++    if(is_contiguous_rowmajor(length, stride, dist))
++    {
++        length = {product(length.begin(), length.end()) * nbatch};
++        stride = {static_cast<T2>(1)};
++        nbatch = 1;
++    }
++
++    switch(length.size())
++    {
++    case 1:
++        return norm(input, length[0], nbatch, precision, type, stride[0], dist, offset);
++    case 2:
++        return norm(input,
++                    std::make_tuple(length[0], length[1]),
++                    nbatch,
++                    precision,
++                    type,
++                    std::make_tuple(stride[0], stride[1]),
++                    dist,
++                    offset);
++    case 3:
++        return norm(input,
++                    std::make_tuple(length[0], length[1], length[2]),
++                    nbatch,
++                    precision,
++                    type,
++                    std::make_tuple(stride[0], stride[1], stride[2]),
++                    dist,
++                    offset);
++    default:
++        abort();
++    }
++}
++
++// Given a data type and precision and a list of buffer sizes, allocate one
++// host buffer per entry; buffer i gets size[i] * var_size(precision, type).
++static std::vector<hostbuf> allocate_host_buffer(const fft_precision        precision,
++                                                 const fft_array_type       type,
++                                                 const std::vector<size_t>& size)
++{
++    std::vector<hostbuf> buffers(size.size());
++    for(unsigned int i = 0; i < size.size(); ++i)
++    {
++        buffers[i].alloc(size[i] * var_size<size_t>(precision, type));
++    }
++    return buffers;
++}
++
++// Check if the required buffers fit in the device vram (deviceId is unused).
++inline bool vram_fits_problem(const size_t prob_size, const size_t vram_avail, int deviceId = 0)
++{
++    // We keep a small margin (2^27; 128 MiB if sizes are bytes) for fitting into vram:
++    const size_t extra = 1 << 27;
++
++    return vram_avail > prob_size + extra;
++}
++
++// Computes the twiddle table VRAM footprint for r2c/c2r transforms with an
++// even last length.  Returns 0 for an odd last length, and for the other
++// transform types since their VRAM footprint in rocFFT is negligible.
++inline size_t twiddle_table_vram_footprint(const fft_params& params)
++{
++    size_t vram_footprint = 0;
++
++    // Add vram footprint from real/complex even twiddle buffer size.
++    if(params.transform_type == fft_transform_type_real_forward
++       || params.transform_type == fft_transform_type_real_inverse)
++    {
++        const auto realdim = params.length.back();
++        if(realdim % 2 == 0)
++        {
++            const auto complex_size = params.precision == fft_precision_single ? 8 : 16;
++            // even length twiddle size is 1/4 of the real size, in complex elements.
++            // NOTE(review): fft_precision_half also gets 16 bytes above - confirm.
++            vram_footprint += realdim * complex_size / 4;
++        }
++    }
++
++    return vram_footprint;
++}
++
++#endif
+diff --git a/shared/fftw_transform.h b/shared/fftw_transform.h
+new file mode 100644
+index 0000000..873a373
+--- /dev/null
++++ b/shared/fftw_transform.h
+@@ -0,0 +1,493 @@
++// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++#ifndef FFTWTRANSFORM_H
++#define FFTWTRANSFORM_H
++
++#include "hostbuf.h"
++#include "rocfft_complex.h"
++#include "test_params.h"
++#include <fftw3.h>
++#include <vector>
++
++// Function to return maximum error for half, float and double types.
++//
++// Following Schatzman (1996; Accuracy of the Discrete Fourier
++// Transform and the Fast Fourier Transform), the shape of relative
++// l_2 error vs length should look like
++//
++//   epsilon * sqrt(log2(length)).
++//
++// The magic epsilon constants below were chosen so that we get a
++// reasonable upper bound for (all of) our tests.
++//
++// For rocFFT, prime lengths result in the highest error.  As such,
++// the epsilons below are perhaps too loose for pow2 lengths; but they
++// are appropriate for prime lengths.
++template <typename Tfloat>
++inline double type_epsilon();
++template <>
++inline double type_epsilon<_Float16>()
++{
++    return half_epsilon;
++}
++template <>
++inline double type_epsilon<float>()
++{
++    return single_epsilon;
++}
++template <>
++inline double type_epsilon<double>()
++{
++    return double_epsilon;
++}
++
++// C++ traits to translate float->fftwf_complex and
++// double->fftw_complex.
++// The correct FFTW complex type can be accessed via, for example,
++// using complex_t = typename fftw_trait<Tfloat>::fftw_complex_type;
++template <typename Tfloat>
++struct fftw_trait;
++template <>
++struct fftw_trait<_Float16>
++{
++    // fftw does not support half precision, so use single precision and convert
++    using fftw_complex_type = fftwf_complex;
++    using fftw_plan_type    = fftwf_plan;
++};
++template <>
++struct fftw_trait<float>
++{
++    using fftw_complex_type = fftwf_complex;
++    using fftw_plan_type    = fftwf_plan;
++};
++template <>
++struct fftw_trait<double>
++{
++    using fftw_complex_type = fftw_complex;
++    using fftw_plan_type    = fftw_plan;
++};
++
++// Copies the half-precision input buffer to a single-precision
++// buffer.  Note that the input buffer is already sized like it's a
++// single-precision buffer (but only half of it is filled), because
++// we allocate a single-precision buffer for FFTW to plan with.
++static hostbuf half_to_single_copy(const hostbuf& in)
++{
++    auto out      = in.copy();
++    auto in_begin = reinterpret_cast<const _Float16*>(in.data());
++    std::copy_n(in_begin, in.size() / sizeof(_Float16) / 2, reinterpret_cast<float*>(out.data()));
++    return out;
++}
++
++// converts a wider precision buffer to a narrower precision, in-place
++template <typename TfloatIn, typename TfloatOut>
++void narrow_precision_inplace(hostbuf& in)
++{
++    // ensure we're actually shrinking the data
++    static_assert(sizeof(TfloatIn) > sizeof(TfloatOut));
++
++    auto readPtr  = reinterpret_cast<const TfloatIn*>(in.data());
++    auto writePtr = reinterpret_cast<TfloatOut*>(in.data());
++    std::copy_n(readPtr, in.size() / sizeof(TfloatIn), writePtr);
++    in.shrink(in.size() / (sizeof(TfloatIn) / sizeof(TfloatOut)));
++}
++
++static void single_to_half_inplace(hostbuf& in)
++{
++    narrow_precision_inplace<float, _Float16>(in);
++}
++
++// Template wrappers for real-valued FFTW allocators:
++template <typename Tfloat>
++inline Tfloat* fftw_alloc_real_type(size_t n);
++template <>
++inline float* fftw_alloc_real_type<float>(size_t n)
++{
++    return fftwf_alloc_real(n);
++}
++template <>
++inline double* fftw_alloc_real_type<double>(size_t n)
++{
++    return fftw_alloc_real(n);
++}
++
++// Template wrappers for complex-valued FFTW allocators:
++template <typename Tfloat>
++inline typename fftw_trait<Tfloat>::fftw_complex_type* fftw_alloc_complex_type(size_t n);
++template <>
++inline typename fftw_trait<float>::fftw_complex_type* fftw_alloc_complex_type<float>(size_t n)
++{
++    return fftwf_alloc_complex(n);
++}
++template <>
++inline typename fftw_trait<double>::fftw_complex_type* fftw_alloc_complex_type<double>(size_t n)
++{
++    return fftw_alloc_complex(n);
++}
++
++template <typename fftw_type>
++inline fftw_type* fftw_alloc_type(size_t n);
++template <>
++inline float* fftw_alloc_type<float>(size_t n)
++{
++    return fftw_alloc_real_type<float>(n);
++}
++template <>
++inline double* fftw_alloc_type<double>(size_t n)
++{
++    return fftw_alloc_real_type<double>(n);
++}
++template <>
++inline fftwf_complex* fftw_alloc_type<fftwf_complex>(size_t n)
++{
++    return fftw_alloc_complex_type<float>(n);
++}
++template <>
++inline fftw_complex* fftw_alloc_type<fftw_complex>(size_t n)
++{
++    return fftw_alloc_complex_type<double>(n);
++}
++template <>
++inline rocfft_complex<float>* fftw_alloc_type<rocfft_complex<float>>(size_t n)
++{
++    return (rocfft_complex<float>*)fftw_alloc_complex_type<float>(n);
++}
++template <>
++inline rocfft_complex<double>* fftw_alloc_type<rocfft_complex<double>>(size_t n)
++{
++    return (rocfft_complex<double>*)fftw_alloc_complex_type<double>(n);
++}
++
++// Template wrappers for FFTW plan executors:
++template <typename Tfloat>
++inline void fftw_execute_type(typename fftw_trait<Tfloat>::fftw_plan_type plan);
++template <>
++inline void fftw_execute_type<float>(typename fftw_trait<float>::fftw_plan_type plan)
++{
++    return fftwf_execute(plan);
++}
++template <>
++inline void fftw_execute_type<double>(typename fftw_trait<double>::fftw_plan_type plan)
++{
++    return fftw_execute(plan);
++}
++
++// Template wrappers for FFTW plan destroyers:
++template <typename Tfftw_plan>
++inline void fftw_destroy_plan_type(Tfftw_plan plan);
++template <>
++inline void fftw_destroy_plan_type<fftwf_plan>(fftwf_plan plan)
++{
++    return fftwf_destroy_plan(plan);
++}
++template <>
++inline void fftw_destroy_plan_type<fftw_plan>(fftw_plan plan)
++{
++    return fftw_destroy_plan(plan);
++}
++
++// Template wrappers for FFTW c2c planners:
++template <typename Tfloat>
++inline typename fftw_trait<Tfloat>::fftw_plan_type
++    fftw_plan_guru64_dft(int                                             rank,
++                         const fftw_iodim64*                             dims,
++                         int                                             howmany_rank,
++                         const fftw_iodim64*                             howmany_dims,
++                         typename fftw_trait<Tfloat>::fftw_complex_type* in,
++                         typename fftw_trait<Tfloat>::fftw_complex_type* out,
++                         int                                             sign,
++                         unsigned                                        flags);
++
++template <>
++inline typename fftw_trait<_Float16>::fftw_plan_type
++    fftw_plan_guru64_dft<_Float16>(int                                               rank,
++                                   const fftw_iodim64*                               dims,
++                                   int                                               howmany_rank,
++                                   const fftw_iodim64*                               howmany_dims,
++                                   typename fftw_trait<_Float16>::fftw_complex_type* in,
++                                   typename fftw_trait<_Float16>::fftw_complex_type* out,
++                                   int                                               sign,
++                                   unsigned                                          flags)
++{
++    return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags);
++}
++
++template <>
++inline typename fftw_trait<float>::fftw_plan_type
++    fftw_plan_guru64_dft<float>(int                                            rank,
++                                const fftw_iodim64*                            dims,
++                                int                                            howmany_rank,
++                                const fftw_iodim64*                            howmany_dims,
++                                typename fftw_trait<float>::fftw_complex_type* in,
++                                typename fftw_trait<float>::fftw_complex_type* out,
++                                int                                            sign,
++                                unsigned                                       flags)
++{
++    return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags);
++}
++
++template <>
++inline typename fftw_trait<double>::fftw_plan_type
++    fftw_plan_guru64_dft<double>(int                                             rank,
++                                 const fftw_iodim64*                             dims,
++                                 int                                             howmany_rank,
++                                 const fftw_iodim64*                             howmany_dims,
++                                 typename fftw_trait<double>::fftw_complex_type* in,
++                                 typename fftw_trait<double>::fftw_complex_type* out,
++                                 int                                             sign,
++                                 unsigned                                        flags)
++{
++    return fftw_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags);
++}
++
++// Template wrappers for FFTW c2c executors:
++template <typename Tfloat>
++inline void fftw_plan_execute_c2c(typename fftw_trait<Tfloat>::fftw_plan_type plan,
++                                  std::vector<hostbuf>&                       in,
++                                  std::vector<hostbuf>&                       out);
++
++template <>
++inline void fftw_plan_execute_c2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan,
++                                            std::vector<hostbuf>&                         in,
++                                            std::vector<hostbuf>&                         out)
++{
++    // since FFTW does not natively support half precision, convert
++    // input to single, execute, then convert output back to half
++    auto in_single = half_to_single_copy(in.front());
++    fftwf_execute_dft(plan,
++                      reinterpret_cast<fftwf_complex*>(in_single.data()),
++                      reinterpret_cast<fftwf_complex*>(out.front().data()));
++    single_to_half_inplace(out.front());
++}
++
++template <>
++inline void fftw_plan_execute_c2c<float>(typename fftw_trait<float>::fftw_plan_type plan,
++                                         std::vector<hostbuf>&                      in,
++                                         std::vector<hostbuf>&                      out)
++{
++    fftwf_execute_dft(plan,
++                      reinterpret_cast<fftwf_complex*>(in.front().data()),
++                      reinterpret_cast<fftwf_complex*>(out.front().data()));
++}
++
++template <>
++inline void fftw_plan_execute_c2c<double>(typename fftw_trait<double>::fftw_plan_type plan,
++                                          std::vector<hostbuf>&                       in,
++                                          std::vector<hostbuf>&                       out)
++{
++    fftw_execute_dft(plan,
++                     reinterpret_cast<fftw_complex*>(in.front().data()),
++                     reinterpret_cast<fftw_complex*>(out.front().data()));
++}
++
++// Template wrappers for FFTW r2c planners:
++template <typename Tfloat>
++inline typename fftw_trait<Tfloat>::fftw_plan_type
++    fftw_plan_guru64_r2c(int                                             rank,
++                         const fftw_iodim64*                             dims,
++                         int                                             howmany_rank,
++                         const fftw_iodim64*                             howmany_dims,
++                         Tfloat*                                         in,
++                         typename fftw_trait<Tfloat>::fftw_complex_type* out,
++                         unsigned                                        flags);
++template <>
++inline typename fftw_trait<_Float16>::fftw_plan_type
++    fftw_plan_guru64_r2c<_Float16>(int                                               rank,
++                                   const fftw_iodim64*                               dims,
++                                   int                                               howmany_rank,
++                                   const fftw_iodim64*                               howmany_dims,
++                                   _Float16*                                         in,
++                                   typename fftw_trait<_Float16>::fftw_complex_type* out,
++                                   unsigned                                          flags)
++{
++    return fftwf_plan_guru64_dft_r2c(
++        rank, dims, howmany_rank, howmany_dims, reinterpret_cast<float*>(in), out, flags);
++}
++template <>
++inline typename fftw_trait<float>::fftw_plan_type
++    fftw_plan_guru64_r2c<float>(int                                            rank,
++                                const fftw_iodim64*                            dims,
++                                int                                            howmany_rank,
++                                const fftw_iodim64*                            howmany_dims,
++                                float*                                         in,
++                                typename fftw_trait<float>::fftw_complex_type* out,
++                                unsigned                                       flags)
++{
++    return fftwf_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags);
++}
++template <>
++inline typename fftw_trait<double>::fftw_plan_type
++    fftw_plan_guru64_r2c<double>(int                                             rank,
++                                 const fftw_iodim64*                             dims,
++                                 int                                             howmany_rank,
++                                 const fftw_iodim64*                             howmany_dims,
++                                 double*                                         in,
++                                 typename fftw_trait<double>::fftw_complex_type* out,
++                                 unsigned                                        flags)
++{
++    return fftw_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags);
++}
++
++// Template wrappers for FFTW r2c executors:
++template <typename Tfloat>
++inline void fftw_plan_execute_r2c(typename fftw_trait<Tfloat>::fftw_plan_type plan,
++                                  std::vector<hostbuf>&                       in,
++                                  std::vector<hostbuf>&                       out);
++template <>
++inline void fftw_plan_execute_r2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan,
++                                            std::vector<hostbuf>&                         in,
++                                            std::vector<hostbuf>&                         out)
++{
++    // FFTW has no native half precision (fftw_trait<_Float16> maps to fftwf
++    // types): convert input to single, execute, convert output back to half
++    auto in_single = half_to_single_copy(in.front());
++    fftwf_execute_dft_r2c(plan,
++                          reinterpret_cast<float*>(in_single.data()),
++                          reinterpret_cast<fftwf_complex*>(out.front().data()));
++    single_to_half_inplace(out.front());
++}
++template <>
++inline void fftw_plan_execute_r2c<float>(typename fftw_trait<float>::fftw_plan_type plan,
++                                         std::vector<hostbuf>&                      in,
++                                         std::vector<hostbuf>&                      out)
++{
++    fftwf_execute_dft_r2c(plan,
++                          reinterpret_cast<float*>(in.front().data()),
++                          reinterpret_cast<fftwf_complex*>(out.front().data()));
++}
++template <>
++inline void fftw_plan_execute_r2c<double>(typename fftw_trait<double>::fftw_plan_type plan,
++                                          std::vector<hostbuf>&                       in,
++                                          std::vector<hostbuf>&                       out)
++{
++    fftw_execute_dft_r2c(plan,
++                         reinterpret_cast<double*>(in.front().data()),
++                         reinterpret_cast<fftw_complex*>(out.front().data()));
++}
++
++// Template wrappers for FFTW c2r planners:
++template <typename Tfloat>
++inline typename fftw_trait<Tfloat>::fftw_plan_type
++    fftw_plan_guru64_c2r(int                                             rank,
++                         const fftw_iodim64*                             dims,
++                         int                                             howmany_rank,
++                         const fftw_iodim64*                             howmany_dims,
++                         typename fftw_trait<Tfloat>::fftw_complex_type* in,
++                         Tfloat*                                         out,
++                         unsigned                                        flags);
++template <>
++inline typename fftw_trait<_Float16>::fftw_plan_type
++    fftw_plan_guru64_c2r<_Float16>(int                                               rank,
++                                   const fftw_iodim64*                               dims,
++                                   int                                               howmany_rank,
++                                   const fftw_iodim64*                               howmany_dims,
++                                   typename fftw_trait<_Float16>::fftw_complex_type* in,
++                                   _Float16*                                         out,
++                                   unsigned                                          flags)
++{
++    return fftwf_plan_guru64_dft_c2r(
++        rank, dims, howmany_rank, howmany_dims, in, reinterpret_cast<float*>(out), flags);
++}
++template <>
++inline typename fftw_trait<float>::fftw_plan_type
++    fftw_plan_guru64_c2r<float>(int                                            rank,
++                                const fftw_iodim64*                            dims,
++                                int                                            howmany_rank,
++                                const fftw_iodim64*                            howmany_dims,
++                                typename fftw_trait<float>::fftw_complex_type* in,
++                                float*                                         out,
++                                unsigned                                       flags)
++{
++    return fftwf_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags);
++}
++template <>
++inline typename fftw_trait<double>::fftw_plan_type
++    fftw_plan_guru64_c2r<double>(int                                             rank,
++                                 const fftw_iodim64*                             dims,
++                                 int                                             howmany_rank,
++                                 const fftw_iodim64*                             howmany_dims,
++                                 typename fftw_trait<double>::fftw_complex_type* in,
++                                 double*                                         out,
++                                 unsigned                                        flags)
++{
++    return fftw_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags);
++}
++
++// Template wrappers for FFTW c2r executors:
++template <typename Tfloat>
++inline void fftw_plan_execute_c2r(typename fftw_trait<Tfloat>::fftw_plan_type plan,
++                                  std::vector<hostbuf>&                       in,
++                                  std::vector<hostbuf>&                       out);
++template <>
++inline void fftw_plan_execute_c2r<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan,
++                                            std::vector<hostbuf>&                         in,
++                                            std::vector<hostbuf>&                         out)
++{
++    // FFTW has no native half precision (fftw_trait<_Float16> maps to fftwf
++    // types): convert input to single, execute, convert output back to half
++    auto in_single = half_to_single_copy(in.front());
++    fftwf_execute_dft_c2r(plan,
++                          reinterpret_cast<fftwf_complex*>(in_single.data()),
++                          reinterpret_cast<float*>(out.front().data()));
++    single_to_half_inplace(out.front());
++}
++template <>
++inline void fftw_plan_execute_c2r<float>(typename fftw_trait<float>::fftw_plan_type plan,
++                                         std::vector<hostbuf>&                      in,
++                                         std::vector<hostbuf>&                      out)
++{
++    fftwf_execute_dft_c2r(plan,
++                          reinterpret_cast<fftwf_complex*>(in.front().data()),
++                          reinterpret_cast<float*>(out.front().data()));
++}
++template <>
++inline void fftw_plan_execute_c2r<double>(typename fftw_trait<double>::fftw_plan_type plan,
++                                          std::vector<hostbuf>&                       in,
++                                          std::vector<hostbuf>&                       out)
++{
++    fftw_execute_dft_c2r(plan,
++                         reinterpret_cast<fftw_complex*>(in.front().data()),
++                         reinterpret_cast<double*>(out.front().data()));
++}
++
++#ifdef FFTW_HAVE_SPRINT_PLAN
++// Template wrappers for FFTW print plan:
++template <typename Tfloat>
++inline char* fftw_sprint_plan(const typename fftw_trait<Tfloat>::fftw_plan_type plan);
++template <>
++inline char* fftw_sprint_plan<_Float16>(const typename fftw_trait<_Float16>::fftw_plan_type plan)
++{
++    return fftwf_sprint_plan(plan);
++}
++template <>
++inline char* fftw_sprint_plan<float>(const typename fftw_trait<float>::fftw_plan_type plan)
++{
++    return fftwf_sprint_plan(plan);
++}
++template <>
++inline char* fftw_sprint_plan<double>(const typename fftw_trait<double>::fftw_plan_type plan)
++{
++    return fftw_sprint_plan(plan);
++}
++#endif
++
++#endif
+diff --git a/shared/gpubuf.h b/shared/gpubuf.h
+new file mode 100644
+index 0000000..993fa95
+--- /dev/null
++++ b/shared/gpubuf.h
+@@ -0,0 +1,134 @@
++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_GPUBUF_H
++#define ROCFFT_GPUBUF_H
++
++#include "rocfft_hip.h"
++#include <cstdlib>
++
++// Simple RAII class for GPU buffers.  T is the type of pointer that
++// data() returns
++template <class T = void>
++class gpubuf_t
++{
++public:
++    gpubuf_t() {}
++    // buffers are movable but not copyable; moving swaps state with other
++    gpubuf_t(gpubuf_t&& other)
++    {
++        std::swap(buf, other.buf);
++        std::swap(bsize, other.bsize);
++        std::swap(device, other.device);
++    }
++    gpubuf_t& operator=(gpubuf_t&& other)
++    {
++        std::swap(buf, other.buf);
++        std::swap(bsize, other.bsize);
++        std::swap(device, other.device);
++        return *this;
++    }
++    gpubuf_t(const gpubuf_t&) = delete;
++    gpubuf_t& operator=(const gpubuf_t&) = delete;
++
++    ~gpubuf_t()
++    {
++        free();
++    }
++
++    static bool use_alloc_managed()
++    {
++        return std::getenv("ROCFFT_MALLOC_MANAGED"); // true when the variable is set
++    }
++
++    hipError_t alloc(const size_t size)
++    {
++        int  cur_device = 0;
++        auto ret        = hipGetDevice(&cur_device);
++        if(ret != hipSuccess)
++            return ret;
++        free(); // release any old buffer on its own device; this zeroes bsize
++        // remember the current device so we can free on the correct device
++        device                    = cur_device;
++        bsize                     = size;
++        static bool alloc_managed = use_alloc_managed();
++        ret = alloc_managed ? hipMallocManaged(&buf, bsize) : hipMalloc(&buf, bsize);
++        if(ret != hipSuccess)
++        {
++            buf   = nullptr;
++            bsize = 0;
++        }
++        return ret;
++    }
++
++    size_t size() const
++    {
++        return bsize;
++    }
++
++    void free()
++    {
++        if(buf != nullptr)
++        {
++            // free on the device we allocated on
++            rocfft_scoped_device dev(device);
++            (void)hipFree(buf);
++            buf   = nullptr;
++            bsize = 0;
++        }
++    }
++
++    // return a pointer to the allocated memory, offset by the
++    // specified number of bytes
++    T* data_offset(size_t offset_bytes = 0) const
++    {
++        void* ptr = static_cast<char*>(buf) + offset_bytes;
++        return static_cast<T*>(ptr);
++    }
++
++    T* data() const
++    {
++        return static_cast<T*>(buf);
++    }
++
++    // equality/bool tests
++    bool operator==(std::nullptr_t n) const
++    {
++        return buf == n;
++    }
++    bool operator!=(std::nullptr_t n) const
++    {
++        return buf != n;
++    }
++    operator bool() const
++    {
++        return buf;
++    }
++
++private:
++    // The GPU buffer, the size passed to alloc(), and the device current at alloc()
++    void*  buf    = nullptr;
++    size_t bsize  = 0;
++    int    device = 0;
++};
++
++// default gpubuf that gives out void* pointers
++typedef gpubuf_t<> gpubuf;
++#endif
+diff --git a/shared/hip_object_wrapper.h b/shared/hip_object_wrapper.h
+new file mode 100644
+index 0000000..54083ab
+--- /dev/null
++++ b/shared/hip_object_wrapper.h
+@@ -0,0 +1,86 @@
++/******************************************************************************
++* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a copy
++* of this software and associated documentation files (the "Software"), to deal
++* in the Software without restriction, including without limitation the rights
++* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++* copies of the Software, and to permit persons to whom the Software is
++* furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice shall be included in
++* all copies or substantial portions of the Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++* THE SOFTWARE.
++*******************************************************************************/
++
++#ifndef ROCFFT_HIP_OBJ_WRAPPER_H
++#define ROCFFT_HIP_OBJ_WRAPPER_H
++
++#include "rocfft_hip.h"
++
++// RAII wrapper around HIP objects: created via TCreate, destroyed via TDestroy
++template <typename T, auto TCreate, auto TDestroy>
++struct hip_object_wrapper_t
++{
++    hip_object_wrapper_t()
++        : obj(nullptr)
++    {
++    }
++    // create the object unless one is already held; throws if TCreate fails
++    void alloc()
++    {
++        if(obj == nullptr && TCreate(&obj) != hipSuccess)
++            throw std::runtime_error("hip create failure"); // NOTE(review): <stdexcept> is not included here
++    }
++    // destroy the held object, if any; the status returned by TDestroy is ignored
++    void free()
++    {
++        if(obj)
++        {
++            (void)TDestroy(obj);
++            obj = nullptr;
++        }
++    }
++    // implicit conversions so the wrapper can be used where a T is expected
++    operator const T&() const
++    {
++        return obj;
++    }
++    operator T&()
++    {
++        return obj;
++    }
++    // true while an object is held
++    operator bool() const
++    {
++        return obj != nullptr;
++    }
++
++    ~hip_object_wrapper_t()
++    {
++        free();
++    }
++    // non-copyable; only move construction is provided (no move assignment is declared)
++    hip_object_wrapper_t(const hip_object_wrapper_t&) = delete;
++    hip_object_wrapper_t& operator=(const hip_object_wrapper_t&) = delete;
++    hip_object_wrapper_t(hip_object_wrapper_t&& other)
++        : obj(other.obj)
++    {
++        other.obj = nullptr; // source gives up the handle so its destructor does nothing
++    }
++
++private:
++    T obj; // the wrapped handle; nullptr when nothing is held
++};
++
++typedef hip_object_wrapper_t<hipStream_t, hipStreamCreate, hipStreamDestroy> hipStream_wrapper_t;
++typedef hip_object_wrapper_t<hipEvent_t, hipEventCreate, hipEventDestroy>    hipEvent_wrapper_t;
++
++#endif // ROCFFT_HIP_OBJ_WRAPPER_H
+diff --git a/shared/hostbuf.h b/shared/hostbuf.h
+new file mode 100644
+index 0000000..0a96c7d
+--- /dev/null
++++ b/shared/hostbuf.h
+@@ -0,0 +1,158 @@
++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_HOSTBUF_H
++#define ROCFFT_HOSTBUF_H
++
++#include "arithmetic.h"
++#include <cstdlib>
++#include <cstring>
++
++#ifndef WIN32
++#include <stdlib.h>
++#include <sys/mman.h>
++#endif
++
++// Move-only owner of an aligned host allocation, released by the
++// destructor.  T is the type of pointer that data() returns
++template <class T = void>
++class hostbuf_t
++{
++public:
++    hostbuf_t() {}
++    // buffers are movable but not copyable
++    hostbuf_t(hostbuf_t&& other)
++    {
++        std::swap(buf, other.buf); // other ends up with this object's empty defaults
++        std::swap(bsize, other.bsize);
++    }
++    hostbuf_t& operator=(hostbuf_t&& other)
++    {
++        std::swap(buf, other.buf); // other now owns the old buffer and frees it on destruction
++        std::swap(bsize, other.bsize);
++        return *this;
++    }
++    hostbuf_t(const hostbuf_t&) = delete;
++    hostbuf_t& operator=(const hostbuf_t&) = delete;
++
++    ~hostbuf_t()
++    {
++        free();
++    }
++
++    void alloc(size_t size)
++    {
++        // free() zeroes bsize, so the requested size is recorded after it
++        free();
++        bsize = size;
++
++        // we're aligning to multiples of 64 bytes, so round the
++        // allocation size up to the nearest 64 to keep ASAN happy
++        if(size % 64)
++            size += 64 - size % 64;
++
++        // FFTW requires aligned allocations to use faster SIMD instructions.
++        // If enabling hugepages, align to 2 MiB. Otherwise, aligning to
++        // 64 bytes is enough for AVX instructions up to AVX512.
++#ifdef WIN32
++        buf = _aligned_malloc(size, 64);
++#else
++        // On Linux, ask for hugepages to reduce TLB pressure and
++        // improve performance.  Allocations need to be aligned to
++        // the hugepage size, and rounded up to the next whole hugepage.
++        static const size_t TWO_MiB = 2 * 1024 * 1024;
++        if(size >= TWO_MiB)
++        {
++            size_t rounded_size = DivRoundingUp(size, TWO_MiB) * TWO_MiB;
++            buf                 = aligned_alloc(TWO_MiB, rounded_size);
++            madvise(buf, rounded_size, MADV_HUGEPAGE);
++        }
++        else
++            buf = aligned_alloc(64, size);
++#endif
++        if(buf == nullptr)
++            bsize = 0; // allocation failed: do not report a size for a null buffer
++    }
++
++    size_t size() const
++    {
++        return bsize;
++    }
++
++    void free()
++    {
++        if(buf != nullptr)
++        {
++#ifdef WIN32
++            _aligned_free(buf);
++#else
++            std::free(buf);
++#endif
++            buf   = nullptr;
++            bsize = 0;
++        }
++    }
++
++    T* data() const
++    {
++        return static_cast<T*>(buf);
++    }
++
++    // Copy method: returns a new buffer holding the first size() bytes of this one
++    hostbuf_t copy() const
++    {
++        hostbuf_t copy;
++        copy.alloc(bsize);
++        memcpy(copy.buf, buf, bsize);
++        return copy;
++    }
++
++    // shrink the buffer to fit the new size; throws if new_size is larger
++    void shrink(size_t new_size)
++    {
++        if(new_size > bsize)
++            throw std::runtime_error("can't shrink hostbuf to larger size");
++        // just pretend the buffer is now that size
++        bsize = new_size;
++    }
++
++    // equality/bool tests
++    bool operator==(std::nullptr_t n) const
++    {
++        return buf == n;
++    }
++    bool operator!=(std::nullptr_t n) const
++    {
++        return buf != n;
++    }
++    operator bool() const
++    {
++        return buf;
++    }
++
++private:
++    // The host buffer
++    void*  buf   = nullptr;
++    size_t bsize = 0; // size in bytes as requested by the caller, before rounding up
++};
++
++// default hostbuf that gives out void* pointers
++typedef hostbuf_t<> hostbuf;
++#endif
+diff --git a/shared/increment.h b/shared/increment.h
+new file mode 100644
+index 0000000..90bba1d
+--- /dev/null
++++ b/shared/increment.h
+@@ -0,0 +1,100 @@
++// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_INCREMENT_H
++#define ROCFFT_INCREMENT_H
++
++#include <algorithm>
++#include <tuple>
++#include <vector>
++
++// Helper functions to iterate over a buffer in row-major order.
++// Indexes may be given as either a tuple or vector of sizes.  They
++// return true if the index was successfully incremented to move to
++// the next element in the buffer.
++
++template <typename T1, typename T2>
++static bool increment_base(T1& index, const T2& length)
++{
++    static_assert(std::is_integral<T1>::value, "Integral required.");
++    static_assert(std::is_integral<T2>::value, "Integral required.");
++    // Step index within [0, length); returns false (index reset to 0) on rollover.
++    const bool advanced = index + 1 < length; // not "length - 1": wraps for an unsigned 0
++    if(advanced)
++        ++index;
++    else
++        index = 0; // rolled over: restart this dimension
++    return advanced;
++}
++
++// Increment the index (row-major) for 1, 2, or 3 dimensions; this overload is the 1D case.
++template <typename T1, typename T2>
++static bool increment_rowmajor(T1& index, const T2& length)
++{
++    static_assert(std::is_integral<T1>::value, "Integral required.");
++    static_assert(std::is_integral<T2>::value, "Integral required.");
++    return increment_base(index, length); // false once the index has wrapped back to 0
++}
++
++template <typename T1, typename T2>
++static bool increment_rowmajor(std::tuple<T1, T1>& index, const std::tuple<T2, T2>& length)
++{
++    // Row-major: the last component varies fastest.  || short-circuits, so
++    // component 0 is only advanced after component 1 has rolled over to 0.
++    const bool moved = increment_base(std::get<1>(index), std::get<1>(length))
++                       || increment_base(std::get<0>(index), std::get<0>(length));
++    return moved;
++}
++
++template <typename T1, typename T2>
++static bool increment_rowmajor(std::tuple<T1, T1, T1>& index, const std::tuple<T2, T2, T2>& length)
++{
++    // Row-major: component 2 varies fastest, then 1, then 0.  The chain
++    // short-circuits at the first component that advances without rolling
++    // over; each component that did roll over has been reset to 0.
++    const bool moved = increment_base(std::get<2>(index), std::get<2>(length))
++                       || increment_base(std::get<1>(index), std::get<1>(length))
++                       || increment_base(std::get<0>(index), std::get<0>(length));
++    // false only when every component rolled over
++    return moved;
++}
++
++// Increment a row-major index of arbitrary rank (last dimension fastest).
++template <typename T1, typename T2>
++bool increment_rowmajor(std::vector<T1>& index, const std::vector<T2>& length)
++{
++    for(int idim = length.size(); idim-- > 0;) // walk from the last dimension to the first
++    {
++        if(index[idim] < length[idim]) // entries already at or past their length are skipped
++        {
++            if((++index[idim]) == length[idim])
++            {
++                index[idim] = 0; // rolled over: carry into the previous dimension
++                continue;
++            }
++            // we know we were able to increment something and didn't hit the end
++            return true;
++        }
++    }
++    // false once every entry is back at zero; NOTE(review): the lambda narrows T1 to int
++    return !std::all_of(index.begin(), index.end(), [](int i) { return i == 0; });
++}
++
++#endif
+diff --git a/shared/precision_type.h b/shared/precision_type.h
+new file mode 100644
+index 0000000..526fc9a
+--- /dev/null
++++ b/shared/precision_type.h
+@@ -0,0 +1,70 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_PRECISION_TYPE_H
++#define ROCFFT_PRECISION_TYPE_H
++
++#include "array_predicate.h"
++#include "rocfft/rocfft.h"
++
++// Size in bytes of one real scalar of the given precision; 0 if the value
++// is not one of the rocfft_precision enumerators handled below.
++static size_t real_type_size(rocfft_precision precision)
++{
++    switch(precision)
++    {
++    case rocfft_precision_half: return 2;
++    case rocfft_precision_single: return 4;
++    case rocfft_precision_double: return 8;
++    }
++    return 0; // previously control flowed off the end here (undefined behavior)
++}
++
++static size_t complex_type_size(rocfft_precision precision)
++{
++    return real_type_size(precision) * 2; // twice the size of one real scalar
++}
++
++// Human-readable name of a precision; "unknown" if the value is not one of
++// the rocfft_precision enumerators handled below.
++static const char* precision_name(rocfft_precision precision)
++{
++    switch(precision)
++    {
++    case rocfft_precision_half: return "half";
++    case rocfft_precision_single: return "single";
++    case rocfft_precision_double: return "double";
++    }
++    return "unknown"; // previously control flowed off the end here (undefined behavior)
++}
++
++static size_t element_size(rocfft_precision precision, rocfft_array_type array_type)
++{
++    const bool is_complex = array_type_is_complex(array_type); // selects complex vs real size
++    return is_complex ? complex_type_size(precision) : real_type_size(precision);
++}
++
++// advance p by elems elements; the element size comes from element_size(precision, type)
++static void* ptr_offset(void* p, size_t elems, rocfft_precision precision, rocfft_array_type type)
++{
++    const size_t byte_offset = elems * element_size(precision, type);
++    return static_cast<char*>(p) + byte_offset;
++}
++#endif
+diff --git a/shared/printbuffer.h b/shared/printbuffer.h
+new file mode 100644
+index 0000000..5ae0b64
+--- /dev/null
++++ b/shared/printbuffer.h
+@@ -0,0 +1,108 @@
++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef PRINTBUFFER_H
++#define PRINTBUFFER_H
++
++#include "hostbuf.h"
++#include "increment.h"
++#include <algorithm>
++#include <vector>
++
++// Output a formatted general-dimensional array with given length and stride in batches
++// separated by dist.  Elements are written to stream separated by spaces.
++template <typename Toutput, typename T1, typename T2, typename Tsize, typename Tstream>
++inline void printbuffer(const Toutput*         output,
++                        const std::vector<T1>& length,
++                        const std::vector<T2>& stride,
++                        const Tsize            nbatch,
++                        const Tsize            dist,
++                        const size_t           offset,
++                        Tstream&               stream)
++{
++    size_t i_base = 0; // element offset of the current batch (was int: overflowed on big buffers)
++    for(unsigned int b = 0; b < nbatch; b++, i_base += dist)
++    {
++        // value-initialised by the constructor, so every coordinate starts at zero
++        std::vector<size_t> index(length.size());
++        do
++        {
++            const size_t i // flat position; an int here truncated the size_t sum
++                = std::inner_product(index.begin(), index.end(), stride.begin(), i_base + offset);
++            stream << output[i] << " ";
++            for(int li = index.size(); li-- > 0;)
++            {
++                if(index[li] == (length[li] - 1))
++                {
++                    stream << "\n"; // one line break per trailing dimension that just ended
++                }
++                else
++                {
++                    break;
++                }
++            }
++        } while(increment_rowmajor(index, length));
++        stream << std::endl;
++    }
++}
++
++template <typename Telem>
++class buffer_printer
++{
++    // Prints hostbuf contents as Telem.  The scalar versions might be part of a planar format.
++public:
++    template <typename Tint1, typename Tint2, typename Tsize, typename Tstream = std::ostream>
++    static void print_buffer(const std::vector<hostbuf>& buf,
++                             const std::vector<Tint1>&   length,
++                             const std::vector<Tint2>&   stride,
++                             const Tsize                 nbatch,
++                             const Tsize                 dist,
++                             const std::vector<size_t>&  offset,
++                             Tstream&                    stream = std::cout)
++    {
++        for(const auto& vec : buf) // every buffer is printed with the same layout arguments
++        {
++            printbuffer(reinterpret_cast<const Telem*>(vec.data()),
++                        length,
++                        stride,
++                        nbatch,
++                        dist,
++                        offset[0], // NOTE(review): offset[0] is used for every buffer - confirm intended
++                        stream);
++        }
++    };
++    template <typename Tstream = std::ostream>
++    static void print_buffer_flat(const std::vector<hostbuf>& buf,
++                                  const std::vector<size_t>&  size,
++                                  const std::vector<size_t>&  offset, // not referenced in the body
++                                  Tstream&                    stream = std::cout)
++    {
++        for(const auto& vec : buf)
++        {
++            auto data = reinterpret_cast<const Telem*>(vec.data());
++            stream << "idx " << 0; // NOTE(review): always prints 0, not the buffer's position
++            for(size_t i = 0; i < size[0]; ++i) // NOTE(review): size[0] bounds every buffer
++                stream << " " << data[i];
++            stream << std::endl;
++        }
++    };
++};
++
++#endif
+diff --git a/shared/ptrdiff.h b/shared/ptrdiff.h
+new file mode 100644
+index 0000000..3bd15de
+--- /dev/null
++++ b/shared/ptrdiff.h
+@@ -0,0 +1,40 @@
++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++
++// Element count from the base pointer through the farthest element touched:
++// 1 + sum((length[d] - 1) * stride[d]) + (nbatch - 1) * dist; 0 without dimensions.
++static size_t compute_ptrdiff(const std::vector<size_t>& length,
++                              const std::vector<size_t>& stride,
++                              const size_t               nbatch,
++                              const size_t               dist)
++{
++    // no dimensions at all: nothing is addressed
++    if(length.empty())
++        return 0;
++
++    // start with the first element plus the shift to the final batch,
++    // then add the reach of the last index along every dimension
++    size_t span = 1 + (nbatch - 1) * dist;
++    for(size_t dim = 0; dim != length.size(); ++dim)
++        span += (length[dim] - 1) * stride[dim];
++    return span;
++}
+diff --git a/shared/rocfft_accuracy_test.h b/shared/rocfft_accuracy_test.h
+new file mode 100644
+index 0000000..4ce3059
+--- /dev/null
++++ b/shared/rocfft_accuracy_test.h
+@@ -0,0 +1,29 @@
++// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_ACCURACY_TEST
++#define ROCFFT_ACCURACY_TEST
++
++#include "accuracy_test.h"
++#include "rocfft_params.h"
++
++void fft_vs_reference(rocfft_params& params, bool round_trip = false);
++
++#endif
+diff --git a/shared/rocfft_against_fftw.h b/shared/rocfft_against_fftw.h
+new file mode 100644
+index 0000000..d03754c
+--- /dev/null
++++ b/shared/rocfft_against_fftw.h
+@@ -0,0 +1,231 @@
++// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++#ifndef ROCFFT_AGAINST_FFTW
++#define ROCFFT_AGAINST_FFTW
++
++#include <gtest/gtest.h>
++#include <math.h>
++#include <stdexcept>
++#include <vector>
++
++#include "fftw_transform.h"
++
++// Map a floating-point type to the matching fft_precision enumerator.
++template <typename Tfloat>
++inline fft_precision precision_selector(); // no generic body here; see the specialisations below
++template <>
++inline fft_precision precision_selector<float>()
++{
++    return fft_precision_single;
++}
++template <>
++inline fft_precision precision_selector<double>()
++{
++    return fft_precision_double;
++}
++
++extern bool use_fftw_wisdom;
++
++// construct and return an FFTW plan with the specified type, precision,
++// and dimensions; throws std::runtime_error on an unknown transform type.
++// cpu_out is required if we're using wisdom, which runs actual FFTs.
++template <typename Tfloat>
++static typename fftw_trait<Tfloat>::fftw_plan_type
++    fftw_plan_with_precision(const std::vector<fftw_iodim64>& dims,
++                             const std::vector<fftw_iodim64>& howmany_dims,
++                             const fft_transform_type         transformType,
++                             const size_t                     isize, // not referenced in the body
++                             void*                            cpu_in,
++                             void*                            cpu_out)
++{
++    using fftw_complex_type = typename fftw_trait<Tfloat>::fftw_complex_type;
++
++    // NB: Using FFTW_MEASURE implies that the input buffer's data
++    // may be destroyed during plan creation.  But if we're wanting
++    // to run FFTW in the first place, we must have just created an
++    // uninitialized input buffer anyway.
++    // FFTW_MEASURE is only requested when use_fftw_wisdom is set; otherwise FFTW_ESTIMATE.
++    switch(transformType)
++    {
++    case fft_transform_type_complex_forward:
++        return fftw_plan_guru64_dft<Tfloat>(dims.size(),
++                                            dims.data(),
++                                            howmany_dims.size(),
++                                            howmany_dims.data(),
++                                            reinterpret_cast<fftw_complex_type*>(cpu_in),
++                                            reinterpret_cast<fftw_complex_type*>(cpu_out),
++                                            -1, // sign argument used for the forward case
++                                            use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE);
++    case fft_transform_type_complex_inverse:
++        return fftw_plan_guru64_dft<Tfloat>(dims.size(),
++                                            dims.data(),
++                                            howmany_dims.size(),
++                                            howmany_dims.data(),
++                                            reinterpret_cast<fftw_complex_type*>(cpu_in),
++                                            reinterpret_cast<fftw_complex_type*>(cpu_out),
++                                            1, // sign argument used for the inverse case
++                                            use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE);
++    case fft_transform_type_real_forward:
++        return fftw_plan_guru64_r2c<Tfloat>(dims.size(),
++                                            dims.data(),
++                                            howmany_dims.size(),
++                                            howmany_dims.data(),
++                                            reinterpret_cast<Tfloat*>(cpu_in), // real input
++                                            reinterpret_cast<fftw_complex_type*>(cpu_out),
++                                            use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE);
++    case fft_transform_type_real_inverse:
++        return fftw_plan_guru64_c2r<Tfloat>(dims.size(),
++                                            dims.data(),
++                                            howmany_dims.size(),
++                                            howmany_dims.data(),
++                                            reinterpret_cast<fftw_complex_type*>(cpu_in),
++                                            reinterpret_cast<Tfloat*>(cpu_out), // real output
++                                            use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE);
++    default:
++        throw std::runtime_error("Invalid transform type");
++    }
++}
++
++// construct an FFTW plan, given rocFFT parameters.  output is
++// required if planning with wisdom.
++template <typename Tfloat>
++static typename fftw_trait<Tfloat>::fftw_plan_type
++    fftw_plan_via_rocfft(const std::vector<size_t>& length,
++                         const std::vector<size_t>& istride,
++                         const std::vector<size_t>& ostride,
++                         const size_t               nbatch,
++                         const size_t               idist,
++                         const size_t               odist,
++                         const fft_transform_type   transformType,
++                         std::vector<hostbuf>&      input,
++                         std::vector<hostbuf>&      output)
++{
++    // Dimension configuration:
++    std::vector<fftw_iodim64> dims(length.size());
++    for(unsigned int idx = 0; idx < length.size(); ++idx)
++    {
++        dims[idx].n  = length[idx];
++        dims[idx].is = istride[idx]; // assumes istride has length.size() entries - TODO confirm
++        dims[idx].os = ostride[idx]; // assumes ostride has length.size() entries - TODO confirm
++    }
++
++    // Batch configuration:
++    std::vector<fftw_iodim64> howmany_dims(1);
++    howmany_dims[0].n  = nbatch;
++    howmany_dims[0].is = idist;
++    howmany_dims[0].os = odist;
++    // only the first buffer of input/output is handed to the planner
++    return fftw_plan_with_precision<Tfloat>(dims,
++                                            howmany_dims,
++                                            transformType,
++                                            idist * nbatch, // passed as isize
++                                            input.front().data(), // NOTE(review): input is not checked for empty
++                                            output.empty() ? nullptr : output.front().data());
++}
++
++// Run cpu_plan on cpu_in, writing to cpu_out.
++// Throws std::runtime_error if transformType is not one of the four
++// transform types handled below.
++template <typename Tfloat>
++void fftw_run(fft_transform_type                          transformType,
++              typename fftw_trait<Tfloat>::fftw_plan_type cpu_plan,
++              std::vector<hostbuf>&                       cpu_in,
++              std::vector<hostbuf>&                       cpu_out)
++{
++    // Dispatch on the transform type to the matching FFTW execute wrapper.
++    // The direction of a complex transform was fixed when the plan was
++    // created, so forward and inverse share the same c2c call.
++    switch(transformType)
++    {
++    case fft_transform_type_complex_forward:
++    case fft_transform_type_complex_inverse:
++        fftw_plan_execute_c2c<Tfloat>(cpu_plan, cpu_in, cpu_out);
++        break;
++    case fft_transform_type_real_forward:
++        fftw_plan_execute_r2c<Tfloat>(cpu_plan, cpu_in, cpu_out);
++        break;
++    case fft_transform_type_real_inverse:
++        fftw_plan_execute_c2r<Tfloat>(cpu_plan, cpu_in, cpu_out);
++        break;
++    default:
++        // previously an unknown type returned without running anything;
++        // fail the same way the plan and array-type helpers in this header do
++        throw std::runtime_error("Invalid transform type");
++    }
++}
++
++// Given a transform type, return the contiguous input type; throws on any other value.
++inline fft_array_type contiguous_itype(const fft_transform_type transformType)
++{
++    switch(transformType)
++    {
++    case fft_transform_type_complex_forward:
++    case fft_transform_type_complex_inverse:
++        return fft_array_type_complex_interleaved;
++    case fft_transform_type_real_forward:
++        return fft_array_type_real;
++    case fft_transform_type_real_inverse:
++        return fft_array_type_hermitian_interleaved;
++    default:
++        throw std::runtime_error("Invalid transform type");
++    }
++    return fft_array_type_complex_interleaved; // not reached: every switch path returns or throws
++}
++
++// Given a transform type, return the contiguous output type; throws on any other value.
++inline fft_array_type contiguous_otype(const fft_transform_type transformType)
++{
++    switch(transformType)
++    {
++    case fft_transform_type_complex_forward:
++    case fft_transform_type_complex_inverse:
++        return fft_array_type_complex_interleaved;
++    case fft_transform_type_real_forward:
++        return fft_array_type_hermitian_interleaved;
++    case fft_transform_type_real_inverse:
++        return fft_array_type_real;
++    default:
++        throw std::runtime_error("Invalid transform type");
++    }
++    return fft_array_type_complex_interleaved; // not reached: every switch path returns or throws
++}
++
++// Given a precision, return the acceptable tolerance; throws on an unknown precision.
++inline double type_epsilon(const fft_precision precision)
++{
++    switch(precision)
++    {
++    case fft_precision_half:
++        return type_epsilon<_Float16>();
++        break; // not reached (follows a return)
++    case fft_precision_single:
++        return type_epsilon<float>();
++        break; // not reached (follows a return)
++    case fft_precision_double:
++        return type_epsilon<double>();
++        break; // not reached (follows a return)
++    default:
++        throw std::runtime_error("Invalid precision");
++    }
++}
++
++#endif
+diff --git a/shared/rocfft_complex.h b/shared/rocfft_complex.h
+new file mode 100644
+index 0000000..efa0290
+--- /dev/null
++++ b/shared/rocfft_complex.h
+@@ -0,0 +1,346 @@
++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_COMPLEX_H
++#define ROCFFT_COMPLEX_H
++
++#include <hip/hip_fp16.h>
++#if !defined(__HIPCC_RTC__)
++#include <iostream>
++#endif
++#include <math.h>
++#include <type_traits>
++
++#ifdef __HIP_PLATFORM_NVIDIA__
++typedef __half _Float16;
++#endif
++
++template <typename Treal>
++struct rocfft_complex
++{
++
++    Treal x; // Real part
++    Treal y; // Imaginary part
++
++    // Constructors
++    // Do not initialize the members x or y by default, to ensure that it can
++    // be used in __shared__ and that it is a trivial class compatible with C.
++    __device__ __host__ rocfft_complex()                      = default;
++    __device__ __host__ rocfft_complex(const rocfft_complex&) = default;
++    __device__ __host__ rocfft_complex(rocfft_complex&&)      = default;
++    __device__ __host__ rocfft_complex& operator=(const rocfft_complex& rhs) & = default;
++    __device__ __host__ rocfft_complex& operator=(rocfft_complex&& rhs) & = default;
++    __device__                          __host__ ~rocfft_complex()        = default;
++
++    // Constructor from real and imaginary parts
++    __device__ __host__ constexpr rocfft_complex(Treal real, Treal imag)
++        : x{real}
++        , y{imag}
++    {
++    }
++
++    // Conversion from different precision
++    template <typename U>
++    __device__ __host__ explicit constexpr rocfft_complex(const rocfft_complex<U>& z)
++        : x(z.x)
++        , y(z.y)
++    {
++    }
++
++    // Accessors
++    __device__ __host__ constexpr Treal real() const
++    {
++        return x;
++    }
++
++    __device__ __host__ constexpr Treal imag() const
++    {
++        return y;
++    }
++
++    // Unary operations
++    __forceinline__ __device__ __host__ rocfft_complex operator-() const
++    {
++        return {-x, -y};
++    }
++
++    __forceinline__ __device__ __host__ rocfft_complex operator+() const
++    {
++        return *this;
++    }
++
++    __device__ __host__ Treal asum(const rocfft_complex& z)
++    {
++        return abs(z.x) + abs(z.y);
++    }
++
++    // Internal real functions
++    static __forceinline__ __device__ __host__ Treal abs(Treal x)
++    {
++        return x < 0 ? -x : x;
++    }
++
++    static __forceinline__ __device__ __host__ float sqrt(float x)
++    {
++        return ::sqrtf(x);
++    }
++
++    static __forceinline__ __device__ __host__ double sqrt(double x)
++    {
++        return ::sqrt(x);
++    }
++
++    // Addition operators
++    __device__ __host__ auto& operator+=(const rocfft_complex& rhs)
++    {
++        return *this = {x + rhs.x, y + rhs.y};
++    }
++
++    __device__ __host__ auto operator+(const rocfft_complex& rhs) const
++    {
++        auto lhs = *this;
++        return lhs += rhs;
++    }
++
++    // Subtraction operators
++    __device__ __host__ auto& operator-=(const rocfft_complex& rhs)
++    {
++        return *this = {x - rhs.x, y - rhs.y};
++    }
++
++    __device__ __host__ auto operator-(const rocfft_complex& rhs) const
++    {
++        auto lhs = *this;
++        return lhs -= rhs;
++    }
++
++    // Multiplication operators
++    __device__ __host__ auto& operator*=(const rocfft_complex& rhs)
++    {
++        return *this = {x * rhs.x - y * rhs.y, y * rhs.x + x * rhs.y};
++    }
++
++    __device__ __host__ auto operator*(const rocfft_complex& rhs) const
++    {
++        auto lhs = *this;
++        return lhs *= rhs;
++    }
++
++    // Division operators
++    __device__ __host__ auto& operator/=(const rocfft_complex& rhs)
++    {
++        // Form of Robert L. Smith's Algorithm 116
++        if(abs(rhs.x) > abs(rhs.y))
++        {
++            Treal ratio = rhs.y / rhs.x;
++            Treal scale = 1 / (rhs.x + rhs.y * ratio);
++            *this       = {(x + y * ratio) * scale, (y - x * ratio) * scale};
++        }
++        else
++        {
++            Treal ratio = rhs.x / rhs.y;
++            Treal scale = 1 / (rhs.x * ratio + rhs.y);
++            *this       = {(y + x * ratio) * scale, (y * ratio - x) * scale};
++        }
++        return *this;
++    }
++
++    __device__ __host__ auto operator/(const rocfft_complex& rhs) const
++    {
++        auto lhs = *this;
++        return lhs /= rhs;
++    }
++
++    // Comparison operators
++    __device__ __host__ constexpr bool operator==(const rocfft_complex& rhs) const
++    {
++        return x == rhs.x && y == rhs.y;
++    }
++
++    __device__ __host__ constexpr bool operator!=(const rocfft_complex& rhs) const
++    {
++        return !(*this == rhs);
++    }
++
++    // Operators for complex-real computations
++    template <typename U>
++    __device__ __host__ auto& operator+=(const U& rhs)
++    {
++        return (x += Treal(rhs)), *this;
++    }
++
++    template <typename U>
++    __device__ __host__ auto& operator-=(const U& rhs)
++    {
++        return (x -= Treal(rhs)), *this;
++    }
++
++    __device__ __host__ auto operator+(const Treal& rhs) const // complex + real; const like the other binary operators
++    {
++        auto lhs = *this; // work on a copy so *this is left unmodified
++        return lhs += rhs; // the real-valued operator+= adds rhs to the real part only
++    }
++
++    __device__ __host__ auto operator-(const Treal& rhs) const // complex - real; const like the other binary operators
++    {
++        auto lhs = *this; // work on a copy so *this is left unmodified
++        return lhs -= rhs; // the real-valued operator-= subtracts rhs from the real part only
++    }
++
++    template <typename U>
++    __device__ __host__ auto& operator*=(const U& rhs)
++    {
++        return (x *= Treal(rhs)), (y *= Treal(rhs)), *this;
++    }
++
++    template <typename U>
++    __device__ __host__ auto operator*(const U& rhs) const
++    {
++        auto lhs = *this;
++        return lhs *= Treal(rhs);
++    }
++
++    template <typename U>
++    __device__ __host__ auto& operator/=(const U& rhs)
++    {
++        return (x /= Treal(rhs)), (y /= Treal(rhs)), *this;
++    }
++
++    template <typename U>
++    __device__ __host__ auto operator/(const U& rhs) const
++    {
++        auto lhs = *this;
++        return lhs /= Treal(rhs);
++    }
++
++    template <typename U>
++    __device__ __host__ constexpr bool operator==(const U& rhs) const
++    {
++        return x == Treal(rhs) && y == 0;
++    }
++
++    template <typename U>
++    __device__ __host__ constexpr bool operator!=(const U& rhs) const
++    {
++        return !(*this == rhs);
++    }
++};
++
++// Stream operators
++#if !defined(__HIPCC_RTC__)
++static std::ostream& operator<<(std::ostream& stream, const _Float16& f)
++{
++    return stream << static_cast<double>(f);
++}
++
++template <typename Treal>
++std::ostream& operator<<(std::ostream& out, const rocfft_complex<Treal>& z)
++{
++    return out << '(' << static_cast<double>(z.x) << ',' << static_cast<double>(z.y) << ')';
++}
++#endif
++
++// Operators for real-complex computations
++template <typename U, typename Treal>
++__device__ __host__ rocfft_complex<Treal> operator+(const U& lhs, const rocfft_complex<Treal>& rhs)
++{
++    return {Treal(lhs) + rhs.x, rhs.y};
++}
++
++template <typename U, typename Treal>
++__device__ __host__ rocfft_complex<Treal> operator-(const U& lhs, const rocfft_complex<Treal>& rhs)
++{
++    return {Treal(lhs) - rhs.x, -rhs.y};
++}
++
++template <typename U, typename Treal>
++__device__ __host__ rocfft_complex<Treal> operator*(const U& lhs, const rocfft_complex<Treal>& rhs)
++{
++    return {Treal(lhs) * rhs.x, Treal(lhs) * rhs.y};
++}
++
++template <typename U, typename Treal>
++__device__ __host__ rocfft_complex<Treal> operator/(const U& lhs, const rocfft_complex<Treal>& rhs)
++{
++    // Form of Robert L. Smith's Algorithm 116
++    if(rocfft_complex<Treal>::abs(rhs.x) > rocfft_complex<Treal>::abs(rhs.y))
++    {
++        Treal ratio = rhs.y / rhs.x;
++        Treal scale = Treal(lhs) / (rhs.x + rhs.y * ratio);
++        return {scale, -scale * ratio};
++    }
++    else
++    {
++        Treal ratio = rhs.x / rhs.y;
++        Treal scale = Treal(lhs) / (rhs.x * ratio + rhs.y);
++        return {ratio * scale, -scale};
++    }
++}
++
++template <typename U, typename Treal>
++__device__ __host__ constexpr bool operator==(const U& lhs, const rocfft_complex<Treal>& rhs)
++{
++    return Treal(lhs) == rhs.x && 0 == rhs.y;
++}
++
++template <typename U, typename Treal>
++__device__ __host__ constexpr bool operator!=(const U& lhs, const rocfft_complex<Treal>& rhs)
++{
++    return !(lhs == rhs);
++}
++
++// Extending std namespace to handle rocfft_complex datatype
++namespace std
++{
++    template <typename Treal>
++    __device__ __host__ constexpr Treal real(const rocfft_complex<Treal>& z)
++    {
++        return z.x;
++    }
++
++    template <typename Treal>
++    __device__ __host__ constexpr Treal imag(const rocfft_complex<Treal>& z)
++    {
++        return z.y;
++    }
++
++    template <typename Treal>
++    __device__ __host__ constexpr rocfft_complex<Treal> conj(const rocfft_complex<Treal>& z)
++    {
++        return {z.x, -z.y};
++    }
++
++    template <typename Treal>
++    __device__ __host__ inline Treal norm(const rocfft_complex<Treal>& z)
++    {
++        return (z.x * z.x) + (z.y * z.y);
++    }
++
++    template <typename Treal>
++    __device__ __host__ inline Treal abs(const rocfft_complex<Treal>& z)
++    {
++        Treal tr = rocfft_complex<Treal>::abs(z.x), ti = rocfft_complex<Treal>::abs(z.y);
++        return tr > ti ? (ti /= tr, tr * rocfft_complex<Treal>::sqrt(ti * ti + 1))
++               : ti    ? (tr /= ti, ti * rocfft_complex<Treal>::sqrt(tr * tr + 1))
++                       : 0;
++    }
++}
++
++#endif // ROCFFT_COMPLEX_H
+diff --git a/shared/rocfft_hip.h b/shared/rocfft_hip.h
+new file mode 100644
+index 0000000..e086cab
+--- /dev/null
++++ b/shared/rocfft_hip.h
+@@ -0,0 +1,52 @@
++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef __ROCFFT_HIP_H__
++#define __ROCFFT_HIP_H__
++
++#include <hip/hip_runtime_api.h>
++#include <stdexcept>
++
++class rocfft_scoped_device // selects a HIP device on construction, re-selects the previous one on destruction
++{
++public:
++    rocfft_scoped_device(int device)
++    {
++        if(hipGetDevice(&orig_device) != hipSuccess) // remember the currently selected device first
++            throw std::runtime_error("hipGetDevice failure");
++
++        if(hipSetDevice(device) != hipSuccess)
++            throw std::runtime_error("hipSetDevice failure");
++    }
++    ~rocfft_scoped_device()
++    {
++        (void)hipSetDevice(orig_device); // (void): the returned status is not checked here
++    }
++
++    // not copyable or movable
++    rocfft_scoped_device(const rocfft_scoped_device&) = delete;
++    rocfft_scoped_device(rocfft_scoped_device&&)      = delete;
++    rocfft_scoped_device& operator=(const rocfft_scoped_device&) = delete;
++
++private:
++    int orig_device; // device reported by hipGetDevice in the constructor
++};
++
++#endif // __ROCFFT_HIP_H__
+diff --git a/shared/rocfft_params.h b/shared/rocfft_params.h
+new file mode 100644
+index 0000000..bf9b728
+--- /dev/null
++++ b/shared/rocfft_params.h
+@@ -0,0 +1,585 @@
++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#ifndef ROCFFT_PARAMS_H
++#define ROCFFT_PARAMS_H
++
++#include "../shared/fft_params.h"
++#include "../shared/gpubuf.h"
++#include "rocfft/rocfft.h"
++
++// Return the string of the rocfft_status code
++static std::string rocfft_status_to_string(const rocfft_status ret)
++{
++    switch(ret)
++    {
++    case rocfft_status_success:
++        return "rocfft_status_success";
++    case rocfft_status_failure:
++        return "rocfft_status_failure";
++    case rocfft_status_invalid_arg_value:
++        return "rocfft_status_invalid_arg_value";
++    case rocfft_status_invalid_dimensions:
++        return "rocfft_status_invalid_dimensions";
++    case rocfft_status_invalid_array_type:
++        return "rocfft_status_invalid_array_type";
++    case rocfft_status_invalid_strides:
++        return "rocfft_status_invalid_strides";
++    case rocfft_status_invalid_distance:
++        return "rocfft_status_invalid_distance";
++    case rocfft_status_invalid_offset:
++        return "rocfft_status_invalid_offset";
++    case rocfft_status_invalid_work_buffer:
++        return "rocfft_status_invalid_work_buffer";
++    default:
++        throw std::runtime_error("unknown rocfft_status");
++    }
++}
++
++inline fft_status fft_status_from_rocfftparams(const rocfft_status val)
++{
++    switch(val)
++    {
++    case rocfft_status_success:
++        return fft_status_success;
++    case rocfft_status_failure:
++        return fft_status_failure;
++    case rocfft_status_invalid_arg_value:
++        return fft_status_invalid_arg_value;
++    case rocfft_status_invalid_dimensions:
++        return fft_status_invalid_dimensions;
++    case rocfft_status_invalid_array_type:
++        return fft_status_invalid_array_type;
++    case rocfft_status_invalid_strides:
++        return fft_status_invalid_strides;
++    case rocfft_status_invalid_distance:
++        return fft_status_invalid_distance;
++    case rocfft_status_invalid_offset:
++        return fft_status_invalid_offset;
++    case rocfft_status_invalid_work_buffer:
++        return fft_status_invalid_work_buffer;
++    default:
++        throw std::runtime_error("Invalid status");
++    }
++}
++
++inline rocfft_precision rocfft_precision_from_fftparams(const fft_precision val)
++{
++    switch(val)
++    {
++    case fft_precision_single:
++        return rocfft_precision_single;
++    case fft_precision_double:
++        return rocfft_precision_double;
++    case fft_precision_half:
++        return rocfft_precision_half;
++    default:
++        throw std::runtime_error("Invalid precision");
++    }
++}
++
++inline rocfft_array_type rocfft_array_type_from_fftparams(const fft_array_type val)
++{
++    switch(val) // no default case: a value matching no case reaches the return after the switch
++    {
++    case fft_array_type_complex_interleaved:
++        return rocfft_array_type_complex_interleaved;
++    case fft_array_type_complex_planar:
++        return rocfft_array_type_complex_planar;
++    case fft_array_type_real:
++        return rocfft_array_type_real;
++    case fft_array_type_hermitian_interleaved:
++        return rocfft_array_type_hermitian_interleaved;
++    case fft_array_type_hermitian_planar:
++        return rocfft_array_type_hermitian_planar;
++    case fft_array_type_unset:
++        return rocfft_array_type_unset;
++    }
++    return rocfft_array_type_unset; // fallback for values not handled above (does not throw)
++}
++
++inline rocfft_transform_type rocfft_transform_type_from_fftparams(const fft_transform_type val)
++{
++    switch(val)
++    {
++    case fft_transform_type_complex_forward:
++        return rocfft_transform_type_complex_forward;
++    case fft_transform_type_complex_inverse:
++        return rocfft_transform_type_complex_inverse;
++    case fft_transform_type_real_forward:
++        return rocfft_transform_type_real_forward;
++    case fft_transform_type_real_inverse:
++        return rocfft_transform_type_real_inverse;
++    default:
++        throw std::runtime_error("Invalid transform type");
++    }
++}
++
++inline rocfft_result_placement
++    rocfft_result_placement_from_fftparams(const fft_result_placement val)
++{
++    switch(val)
++    {
++    case fft_placement_inplace:
++        return rocfft_placement_inplace;
++    case fft_placement_notinplace:
++        return rocfft_placement_notinplace;
++    default:
++        throw std::runtime_error("Invalid result placement");
++    }
++}
++
++class rocfft_params : public fft_params
++{
++public:
++    rocfft_plan             plan = nullptr;
++    rocfft_execution_info   info = nullptr;
++    rocfft_plan_description desc = nullptr;
++    gpubuf_t<void>          wbuffer;
++
++    explicit rocfft_params(){};
++
++    explicit rocfft_params(const fft_params& p)
++        : fft_params(p){};
++
++    rocfft_params(const rocfft_params&) = delete;
++    rocfft_params& operator=(const rocfft_params&) = delete;
++
++    ~rocfft_params()
++    {
++        free();
++    };
++
++    void free()
++    {
++        if(plan != nullptr)
++        {
++            rocfft_plan_destroy(plan);
++            plan = nullptr;
++        }
++        if(info != nullptr)
++        {
++            rocfft_execution_info_destroy(info);
++            info = nullptr;
++        }
++        if(desc != nullptr)
++        {
++            rocfft_plan_description_destroy(desc);
++            desc = nullptr;
++        }
++        wbuffer.free();
++    }
++
++    void validate_fields() const override // throws std::runtime_error on the first invalid brick found
++    {
++        // row-major lengths including batch (i.e. batch is at the front)
++        std::vector<size_t> length_with_batch{nbatch};
++        std::copy(length.begin(), length.end(), std::back_inserter(length_with_batch));
++
++        auto validate_field = [&](const fft_field& f) {
++            for(const auto& b : f.bricks)
++            {
++                // bricks must have same dim as FFT, including batch
++                if(b.lower.size() != length.size() + 1 || b.upper.size() != length.size() + 1
++                   || b.stride.size() != length.size() + 1)
++                    throw std::runtime_error(
++                        "brick dimension does not match FFT + batch dimension");
++
++                // ensure lower < upper (compared lexicographically), and that both fit in the FFT + batch dims
++                if(!std::lexicographical_compare(
++                       b.lower.begin(), b.lower.end(), b.upper.begin(), b.upper.end()))
++                    throw std::runtime_error("brick lower index is not less than upper index");
++
++                if(!std::lexicographical_compare(b.lower.begin(),
++                                                 b.lower.end(),
++                                                 length_with_batch.begin(),
++                                                 length_with_batch.end()))
++                    throw std::runtime_error(
++                        "brick lower index is not less than FFT + batch length");
++
++                if(!std::lexicographical_compare(b.upper.begin(),
++                                                 b.upper.end(),
++                                                 length_with_batch.begin(),
++                                                 length_with_batch.end())
++                   && b.upper != length_with_batch)
++                    throw std::runtime_error("brick upper index is not <= FFT + batch length");
++            }
++        };
++
++        for(const auto& ifield : ifields)
++            validate_field(ifield);
++        for(const auto& ofield : ofields)
++            validate_field(ofield);
++    }
++
++    rocfft_precision get_rocfft_precision()
++    {
++        return rocfft_precision_from_fftparams(precision);
++    }
++
++    size_t vram_footprint() override
++    {
++        size_t val = fft_params::vram_footprint();
++        if(setup_structs() != fft_status_success)
++        {
++            throw std::runtime_error("Struct setup failed");
++        }
++        val += workbuffersize;
++
++        return val;
++    }
++
++    // Convert the generic fft_field structure to a rocfft_field that can
++    // be passed to rocFFT, reversing each brick's index order (row-major to
++    // column-major).  Returns nullptr when the field has no bricks.
++    static rocfft_field fft_field_to_rocfft_field(const fft_field& f)
++    {
++        rocfft_field rfield = nullptr;
++        if(f.bricks.empty())
++            return rfield;
++
++        if(rocfft_field_create(&rfield) != rocfft_status_success)
++            throw std::runtime_error("rocfft_field_create failed");
++        for(const auto& b : f.bricks)
++        {
++            // rocFFT wants column-major bricks and fft_params stores
++            // row-major
++            std::vector<size_t> lower_cm;
++            std::copy(b.lower.rbegin(), b.lower.rend(), std::back_inserter(lower_cm));
++            std::vector<size_t> upper_cm;
++            std::copy(b.upper.rbegin(), b.upper.rend(), std::back_inserter(upper_cm));
++            std::vector<size_t> stride_cm;
++            std::copy(b.stride.rbegin(), b.stride.rend(), std::back_inserter(stride_cm));
++
++            rocfft_brick rbrick = nullptr;
++            if(rocfft_brick_create(&rbrick,
++                                   lower_cm.data(), // field_lower
++                                   upper_cm.data(), // field_upper
++                                   stride_cm.data(), // brick_stride
++                                   lower_cm.size(), // dim
++                                   b.device) // deviceID
++               != rocfft_status_success)
++                throw std::runtime_error("rocfft_brick_create failed");
++
++            if(rocfft_field_add_brick(rfield, rbrick) != rocfft_status_success)
++                throw std::runtime_error("rocfft_field_add_brick failed");
++
++            rocfft_brick_destroy(rbrick); // NOTE(review): skipped on the throw paths above, as is any cleanup of rfield
++        }
++        return rfield; // a non-null result is destroyed by the callers in setup_structs
++    }
++
++    fft_status setup_structs() // creates desc, plan and info if still null, then queries the work buffer size
++    {
++        rocfft_status fft_status = rocfft_status_success;
++        if(desc == nullptr)
++        {
++            fft_status = rocfft_plan_description_create(&desc); // keep the status so the check below can fire
++            if(fft_status != rocfft_status_success)
++                return fft_status_from_rocfftparams(fft_status);
++
++            fft_status
++                = rocfft_plan_description_set_data_layout(desc,
++                                                          rocfft_array_type_from_fftparams(itype),
++                                                          rocfft_array_type_from_fftparams(otype),
++                                                          ioffset.data(),
++                                                          ooffset.data(),
++                                                          istride_cm().size(),
++                                                          istride_cm().data(),
++                                                          idist,
++                                                          ostride_cm().size(),
++                                                          ostride_cm().data(),
++                                                          odist);
++            if(fft_status != rocfft_status_success)
++            {
++                throw std::runtime_error("rocfft_plan_description_set_data_layout failed");
++            }
++
++            if(scale_factor != 1.0) // the scale factor is only set when it differs from 1.0
++            {
++                fft_status = rocfft_plan_description_set_scale_factor(desc, scale_factor);
++                if(fft_status != rocfft_status_success)
++                {
++                    throw std::runtime_error("rocfft_plan_description_set_scale_factor failed");
++                }
++            }
++
++            for(const auto& ifield : ifields)
++            {
++                rocfft_field infield = fft_field_to_rocfft_field(ifield);
++                if(rocfft_plan_description_add_infield(desc, infield) != rocfft_status_success)
++                    throw std::runtime_error("rocfft_description_add_infield failed");
++                rocfft_field_destroy(infield);
++            }
++
++            for(const auto& ofield : ofields)
++            {
++                rocfft_field outfield = fft_field_to_rocfft_field(ofield);
++                if(rocfft_plan_description_add_outfield(desc, outfield) != rocfft_status_success)
++                    throw std::runtime_error("rocfft_description_add_outfield failed");
++                rocfft_field_destroy(outfield);
++            }
++        }
++
++        if(plan == nullptr)
++        {
++            fft_status = rocfft_plan_create(&plan,
++                                            rocfft_result_placement_from_fftparams(placement),
++                                            rocfft_transform_type_from_fftparams(transform_type),
++                                            get_rocfft_precision(),
++                                            length_cm().size(),
++                                            length_cm().data(),
++                                            nbatch,
++                                            desc);
++            if(fft_status != rocfft_status_success)
++            {
++                throw std::runtime_error("rocfft_plan_create failed");
++            }
++        }
++
++        if(info == nullptr)
++        {
++            fft_status = rocfft_execution_info_create(&info);
++            if(fft_status != rocfft_status_success)
++            {
++                throw std::runtime_error("rocfft_execution_info_create failed");
++            }
++        }
++
++        fft_status = rocfft_plan_get_work_buffer_size(plan, &workbuffersize); // queried on every call
++        if(fft_status != rocfft_status_success)
++        {
++            throw std::runtime_error("rocfft_plan_get_work_buffer_size failed");
++        }
++
++        return fft_status_from_rocfftparams(fft_status);
++    }
++
++    fft_status create_plan() override // runs setup_structs, then allocates and registers the work buffer if needed
++    {
++        fft_status ret = setup_structs();
++        if(ret != fft_status_success)
++        {
++            return ret;
++        }
++        if(workbuffersize > 0) // a zero-sized work buffer needs no allocation or registration
++        {
++            hipError_t hip_status = hipSuccess;
++            hip_status            = wbuffer.alloc(workbuffersize);
++            if(hip_status != hipSuccess)
++            {
++                std::ostringstream oss;
++                oss << "work buffer allocation failed (" << workbuffersize << " requested) "; // trailing space separates the detail appended below
++                size_t mem_free  = 0;
++                size_t mem_total = 0;
++                hip_status       = hipMemGetInfo(&mem_free, &mem_total);
++                if(hip_status == hipSuccess)
++                {
++                    oss << "free vram: " << mem_free << " total vram: " << mem_total;
++                }
++                else
++                {
++                    oss << "hipMemGetInfo also failed";
++                }
++                throw work_buffer_alloc_failure(oss.str());
++            }
++
++            auto rocret
++                = rocfft_execution_info_set_work_buffer(info, wbuffer.data(), workbuffersize);
++            if(rocret != rocfft_status_success)
++            {
++                throw std::runtime_error("rocfft_execution_info_set_work_buffer failed");
++            }
++        }
++
++        return ret;
++    }
++
++    // Register the load and store callbacks on info; does nothing unless run_callbacks is set.
++    fft_status set_callbacks(void* load_cb_host,
++                             void* load_cb_data,
++                             void* store_cb_host,
++                             void* store_cb_data) override
++    {
++        if(!run_callbacks)
++            return fft_status_success;
++        // each pointer parameter is handed over by address, first load then store
++        auto load_status
++            = rocfft_execution_info_set_load_callback(info, &load_cb_host, &load_cb_data, 0);
++        if(load_status != rocfft_status_success)
++            return fft_status_from_rocfftparams(load_status);
++        auto store_status
++            = rocfft_execution_info_set_store_callback(info, &store_cb_host, &store_cb_data, 0);
++        if(store_status != rocfft_status_success)
++            return fft_status_from_rocfftparams(store_status);
++        return fft_status_success;
++    }
++
++    // Run the plan on the given input/output buffers and translate the rocFFT status.
++    fft_status execute(void** in, void** out) override
++    {
++        return fft_status_from_rocfftparams(rocfft_execute(plan, in, out, info));
++    }
++
++    // Scatter the input to the devices named by the field bricks and repoint the I/O
++    // pointer lists at the per-brick allocations, which are stored in multi_gpu_data.
++    void multi_gpu_prepare(std::vector<gpubuf>& ibuffer,
++                           std::vector<void*>&  pibuffer,
++                           std::vector<void*>&  pobuffer) override
++    {
++        auto alloc_fields = [&](const fft_params::fft_field& field,
++                                fft_array_type               array_type,
++                                std::vector<void*>&          pbuffer,
++                                bool                         copy_input) {
++            if(field.bricks.empty())
++                return;
++
++            // we have a field defined, clear the list of buffers as
++            // we'll be allocating new ones for each brick
++            pbuffer.clear();
++
++            for(const auto& b : field.bricks)
++            {
++                // get brick's length - note that this includes batch
++                // dimension
++                const auto brick_len = b.length();
++
++                const size_t brick_size_elems = product(brick_len.begin(), brick_len.end());
++                const size_t elem_size_bytes  = var_size<size_t>(precision, array_type);
++                const size_t brick_size_bytes = brick_size_elems * elem_size_bytes;
++
++                // set device for the alloc, but we want to return to the
++                // default device as the source of a following memcpy
++                {
++                    rocfft_scoped_device dev(b.device);
++                    multi_gpu_data.emplace_back();
++                    if(multi_gpu_data.back().alloc(brick_size_bytes) != hipSuccess)
++                        throw std::runtime_error("device allocation failure");
++                    pbuffer.push_back(multi_gpu_data.back().data());
++                }
++
++                if(copy_input)
++                {
++                    // For now, assume we're only splitting on highest FFT
++                    // dimension, lower-dimensional FFT data is all
++                    // contiguous, and batches are contiguous in each brick.
++                    //
++                    // That means we can express this as a 2D memcpy.
++                    const size_t unbatched_elems_per_brick
++                        = product(brick_len.begin() + 1, brick_len.end());
++                    const size_t unbatched_elems_per_fft = product(length.begin(), length.end());
++
++                    // get this brick's starting offset in the field
++                    const size_t brick_offset
++                        = b.lower_field_offset(istride, idist) * elem_size_bytes;
++
++                    // copy from original input - note that we're
++                    // assuming interleaved data so ibuffer has only one
++                    // gpubuf
++                    if(hipMemcpy2D(pbuffer.back(), // dst: the brick just allocated
++                                   unbatched_elems_per_brick * elem_size_bytes, // dst pitch
++                                   ibuffer.front().data_offset(brick_offset), // src
++                                   unbatched_elems_per_fft * elem_size_bytes, // src pitch
++                                   unbatched_elems_per_brick * elem_size_bytes, // row width
++                                   brick_len.front(), // row count
++                                   hipMemcpyHostToDevice) // NOTE(review): src is a gpubuf; confirm kind
++                       != hipSuccess)
++                        throw std::runtime_error("hipMemcpy failure");
++                }
++            }
++
++            // if we copied the input to all the other devices, and
++            // this is an out-of-place transform, we no longer
++            // need the original input
++            if(copy_input && placement == fft_placement_notinplace)
++                ibuffer.clear();
++        };
++
++        // assume one input, one output field for simple cases
++        if(!ifields.empty())
++            alloc_fields(ifields.front(), itype, pibuffer, true);
++        if(!ofields.empty())
++        {
++            if(!ifields.empty() && placement == fft_placement_inplace)
++                pobuffer = pibuffer; // in-place: output reuses the per-brick input pointers
++            else
++                alloc_fields(ofields.front(), otype, pobuffer, false);
++        }
++    }
++
++    // when preparing for multi-GPU transform, we need to allocate data
++    // on each GPU.  This vector remembers all of those allocations.
++    std::vector<gpubuf> multi_gpu_data;
++
++    // Gather the per-brick output of a multi-GPU FFT into obuffer.front() for
++    // verification, then leave pobuffer pointing at that single gathered buffer.
++    void multi_gpu_finalize(std::vector<gpubuf>& obuffer, std::vector<void*>& pobuffer) override
++    {
++        if(ofields.empty())
++            return;
++
++        for(size_t i = 0; i < ofields.front().bricks.size(); ++i)
++        {
++            const auto& b         = ofields.front().bricks[i];
++            const auto& brick_ptr = pobuffer[i]; // assumes one pointer per brick, in brick order
++
++            const auto brick_len = b.length();
++
++            const size_t elem_size_bytes = var_size<size_t>(precision, otype);
++
++            // get this brick's starting offset in the field
++            const size_t brick_offset = b.lower_field_offset(ostride, odist) * elem_size_bytes;
++
++            // switch device to where we're copying from
++            rocfft_scoped_device dev(b.device);
++
++            // For now, assume we're only splitting on highest FFT
++            // dimension, lower-dimensional FFT data is all
++            // contiguous, and batches are contiguous in each brick.
++            //
++            // That means we can express this as a 2D memcpy.
++            const size_t unbatched_elems_per_brick
++                = product(brick_len.begin() + 1, brick_len.end());
++            const auto   output_length = olength();
++            const size_t unbatched_elems_per_fft
++                = product(output_length.begin(), output_length.end());
++
++            // copy to original output buffer - note that we're assuming
++            // interleaved data so obuffer has only one gpubuf
++            if(hipMemcpy2D(obuffer.front().data_offset(brick_offset), // dst
++                           unbatched_elems_per_fft * elem_size_bytes, // dst pitch
++                           brick_ptr, // src: this brick's buffer
++                           unbatched_elems_per_brick * elem_size_bytes, // src pitch
++                           unbatched_elems_per_brick * elem_size_bytes, // row width
++                           brick_len.front(), // row count
++                           hipMemcpyDeviceToDevice)
++               != hipSuccess)
++                throw std::runtime_error("hipMemcpy failure");
++
++            // device-to-device transfers don't synchronize with the
++            // host, add explicit sync
++            (void)hipDeviceSynchronize(); // return value is discarded via the cast
++        }
++        pobuffer.clear();
++        pobuffer.push_back(obuffer.front().data());
++    }
++};
++
++#endif
+diff --git a/shared/test_params.h b/shared/test_params.h
+new file mode 100644
+index 0000000..8d8f6f7
+--- /dev/null
++++ b/shared/test_params.h
+@@ -0,0 +1,51 @@
++// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++#ifndef TESTCONSTANTS_H
++#define TESTCONSTANTS_H
++
++#include <stdexcept>
++
++extern int    verbose;
++extern size_t ramgb;
++extern size_t vramgb;
++
++extern size_t n_random_tests;
++
++extern size_t random_seed;
++extern double planar_prob;
++extern double callback_prob;
++
++extern double half_epsilon;
++extern double single_epsilon;
++extern double double_epsilon;
++extern bool   skip_runtime_fails;
++
++extern double max_linf_eps_double;
++extern double max_l2_eps_double;
++extern double max_linf_eps_single;
++extern double max_l2_eps_single;
++extern double max_linf_eps_half;
++extern double max_l2_eps_half;
++
++extern int n_hip_failures;
++
++#endif
+diff --git a/shared/work_queue.h b/shared/work_queue.h
+new file mode 100644
+index 0000000..e13fc41
+--- /dev/null
++++ b/shared/work_queue.h
+@@ -0,0 +1,49 @@
++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
++//
++// Permission is hereby granted, free of charge, to any person obtaining a copy
++// of this software and associated documentation files (the "Software"), to deal
++// in the Software without restriction, including without limitation the rights
++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++// copies of the Software, and to permit persons to whom the Software is
++// furnished to do so, subject to the following conditions:
++//
++// The above copyright notice and this permission notice shall be included in
++// all copies or substantial portions of the Software.
++//
++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++// THE SOFTWARE.
++
++#pragma once
++
++#include <condition_variable>
++#include <mutex>
++#include <queue>
++// Mutex-guarded FIFO of work items: push() enqueues, pop() blocks until one is available.
++template <typename _WorkItem>
++struct WorkQueue
++{
++    void push(_WorkItem&& i)
++    {
++        std::unique_lock<std::mutex> lock(queueMutex);
++        items.emplace(std::move(i));
++        emptyWait.notify_all();
++    }
++    _WorkItem pop()
++    {
++        std::unique_lock<std::mutex> lock(queueMutex);
++        emptyWait.wait(lock, [this] { return !items.empty(); }); // re-checks after every wakeup
++        _WorkItem item(std::move(items.front())); // move out: no copy, supports move-only items
++        items.pop();
++        return item;
++    }
++
++private:
++    std::queue<_WorkItem>   items;
++    std::mutex              queueMutex;
++    std::condition_variable emptyWait;
++};
diff --git a/var/spack/repos/builtin/packages/hipfft/package.py b/var/spack/repos/builtin/packages/hipfft/package.py
index 818a9c4935..f5749749ac 100644
--- a/var/spack/repos/builtin/packages/hipfft/package.py
+++ b/var/spack/repos/builtin/packages/hipfft/package.py
@@ -14,9 +14,9 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage):
     It sits between the application and the backend FFT library, marshalling
     inputs into the backend and results back to the application."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/hipFFT"
-    git = "https://github.com/ROCmSoftwarePlatform/hipFFT.git"
-    url = "https://github.com/ROCmSoftwarePlatform/hipfft/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/hipFFT"
+    git = "https://github.com/ROCm/hipFFT.git"
+    url = "https://github.com/ROCm/hipfft/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("renjithravindrankannath", "srekolam")
@@ -24,6 +24,7 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage):
     license("MIT")
 
     version("master", branch="master")
+    version("6.0.0", sha256="44f328b7862c066459089dfe62833cb7d626c6ceb71c57d8c7d6bba45dad491e")
     version("5.7.1", sha256="33452576649df479f084076c47d0b30f6f1da34864094bce767dd9bf609f04aa")
     version("5.7.0", sha256="daa5dc44580145e85ff8ffa7eb40a3d1ef41f3217549c01281715ff696a31588")
     version("5.6.1", sha256="d2ae36b8eacd39b865e8a7972b8eb86bcea2de4ac90711bba7e29b39b01eaa74")
@@ -125,6 +126,7 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
         depends_on("rocfft@" + ver, when="+rocm @" + ver)
@@ -133,6 +135,8 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage):
         depends_on(
             "rocfft amdgpu_target={0}".format(tgt), when="+rocm amdgpu_target={0}".format(tgt)
         )
+    # https://github.com/ROCm/rocFFT/pull/85
+    patch("001-remove-submodule-and-sync-shared-files-from-rocFFT.patch", when="@6.0.0")
 
     def cmake_args(self):
         args = [self.define("BUILD_CLIENTS_SAMPLES", "OFF")]
diff --git a/var/spack/repos/builtin/packages/hipfort/package.py b/var/spack/repos/builtin/packages/hipfort/package.py
index be1819bf50..8e8ea5a0a6 100644
--- a/var/spack/repos/builtin/packages/hipfort/package.py
+++ b/var/spack/repos/builtin/packages/hipfort/package.py
@@ -9,14 +9,15 @@ from spack.package import *
 class Hipfort(CMakePackage):
     """Radeon Open Compute Parallel Primitives Library"""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/hipfort"
-    git = "https://github.com/ROCmSoftwarePlatform/hipfort.git"
-    url = "https://github.com/ROCmSoftwarePlatform/hipfort/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/hipfort"
+    git = "https://github.com/ROCm/hipfort.git"
+    url = "https://github.com/ROCm/hipfort/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     license("MIT")
 
     maintainers("cgmb", "srekolam", "renjithravindrankannath")
+    version("6.0.0", sha256="151cf11648885db799aade0d00a7882589e7195643b02beaa251f1b2a43aceed")
     version("5.7.1", sha256="859fac509e195f3ab97c555b5f63afea325a61aae0f281cb19a970a1b533dead")
     version("5.7.0", sha256="57b04d59f61683a1b141d6d831d10c9fdecea483991ec02d14c14e441e935c05")
     version("5.6.1", sha256="a55345cc9ccaf0cd69d306b8eb9ec2a02c220a57e9c396443cc7273aa3377adc")
@@ -127,6 +128,7 @@ class Hipfort(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hip@" + ver, type="build", when="@" + ver)
 
diff --git a/var/spack/repos/builtin/packages/hipify-clang/package.py b/var/spack/repos/builtin/packages/hipify-clang/package.py
index ab15e479d4..b1c5f2a7fb 100644
--- a/var/spack/repos/builtin/packages/hipify-clang/package.py
+++ b/var/spack/repos/builtin/packages/hipify-clang/package.py
@@ -10,9 +10,9 @@ class HipifyClang(CMakePackage):
     """hipify-clang is a clang-based tool for translation CUDA
     sources into HIP sources"""
 
-    homepage = "https://github.com/ROCm-Developer-Tools/HIPIFY"
-    git = "https://github.com/ROCm-Developer-Tools/HIPIFY.git"
-    url = "https://github.com/ROCm-Developer-Tools/HIPIFY/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/HIPIFY"
+    git = "https://github.com/ROCm/HIPIFY.git"
+    url = "https://github.com/ROCm/HIPIFY/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
@@ -20,6 +20,7 @@ class HipifyClang(CMakePackage):
     license("MIT")
 
     version("master", branch="master")
+    version("6.0.0", sha256="91bed2b72a6684a04e078e50b12b36b93f64ff96523283f4e5d9a33c11e6b967")
     version("5.7.1", sha256="43121e62233dab010ab686d6805bc2d3163f0dc5e89cc503d50c4bcd59eeb394")
     version("5.7.0", sha256="10e4386727e102fba166f012147120a6ec776e8d95fbcac3af93e243205d80a6")
     version("5.6.1", sha256="ec3a4f276556f9fd924ea3c89be11b6c6ddf999cdd4387f669e38e41ee0042e8")
@@ -143,11 +144,12 @@ class HipifyClang(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
         "master",
     ]:
         depends_on("llvm-amdgpu@" + ver, when="@" + ver)
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
     def setup_run_environment(self, env):
diff --git a/var/spack/repos/builtin/packages/hiprand/package.py b/var/spack/repos/builtin/packages/hiprand/package.py
index 0d8666f884..acc3629762 100644
--- a/var/spack/repos/builtin/packages/hiprand/package.py
+++ b/var/spack/repos/builtin/packages/hiprand/package.py
@@ -12,9 +12,9 @@ class Hiprand(CMakePackage, CudaPackage, ROCmPackage):
     """The hipRAND project provides an interface for generating pseudo-random
     and quasi-random numbers with either cuRAND or rocRAND backends."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/hipRAND"
-    git = "https://github.com/ROCmSoftwarePlatform/hipRAND.git"
-    url = "https://github.com/ROCmSoftwarePlatform/hipRAND/archive/rocm-5.7.1.tar.gz"
+    homepage = "https://github.com/ROCm/hipRAND"
+    git = "https://github.com/ROCm/hipRAND.git"
+    url = "https://github.com/ROCm/hipRAND/archive/rocm-5.7.1.tar.gz"
     tags = ["rocm"]
 
     maintainers("cgmb", "srekolam", "renjithravindrankannath")
@@ -24,6 +24,7 @@ class Hiprand(CMakePackage, CudaPackage, ROCmPackage):
 
     version("develop", branch="develop")
     version("master", branch="master")
+    version("6.0.0", sha256="7e06c98f9da7c0b20b55b2106cf3a48b9ef6577a79549a455667ae97bd15b61d")
     version("5.7.1", sha256="81a9f5f0960dce125ce1ab1c7eb58bb07c8756346f9e46a1cc65aa61d5a114f8")
     version("5.7.0", sha256="4dee76719839503b02ce7d38e1c61bbdb2da18da7f63a7ef7012c84c71aa0a9d")
     version("5.6.1", sha256="a73d5578bc7f8dff0b8960e4bff97bc4fc28f508a19ed6acd1cfd4d3e76b47ee")
@@ -88,6 +89,7 @@ class Hiprand(CMakePackage, CudaPackage, ROCmPackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
         "master",
         "develop",
     ]:
diff --git a/var/spack/repos/builtin/packages/hipsolver/package.py b/var/spack/repos/builtin/packages/hipsolver/package.py
index f39755d03d..81c956334c 100644
--- a/var/spack/repos/builtin/packages/hipsolver/package.py
+++ b/var/spack/repos/builtin/packages/hipsolver/package.py
@@ -16,9 +16,9 @@ class Hipsolver(CMakePackage, CudaPackage, ROCmPackage):
     regardless of the chosen backend. Currently, hipSOLVER supports rocSOLVER
     and cuSOLVER as backends."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/hipSOLVER"
-    git = "https://github.com/ROCmSoftwarePlatform/hipSOLVER.git"
-    url = "https://github.com/ROCmSoftwarePlatform/hipSOLVER/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/hipSOLVER"
+    git = "https://github.com/ROCm/hipSOLVER.git"
+    url = "https://github.com/ROCm/hipSOLVER/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("cgmb", "srekolam", "renjithravindrankannath")
@@ -28,6 +28,7 @@ class Hipsolver(CMakePackage, CudaPackage, ROCmPackage):
 
     version("develop", branch="develop")
     version("master", branch="master")
+    version("6.0.0", sha256="385849db02189d5e62096457e52ae899ae5c1ae7d409dc1da61f904d8861b48c")
     version("5.7.1", sha256="5592e965c0dc5722931302289643d1ece370220af2c7afc58af97b3395295658")
     version("5.7.0", sha256="0e35795bfbcb57ed8e8437471209fb7d230babcc31d9a4a0b3640c3ee639f4a7")
     version("5.6.1", sha256="2e546bc7771f7bf0aa7892b69cded725941573e8b70614759c3d03c21eb78dde")
@@ -115,6 +116,7 @@ class Hipsolver(CMakePackage, CudaPackage, ROCmPackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
         "master",
         "develop",
     ]:
diff --git a/var/spack/repos/builtin/packages/hipsparse/package.py b/var/spack/repos/builtin/packages/hipsparse/package.py
index a195356fa4..0473a3ea3d 100644
--- a/var/spack/repos/builtin/packages/hipsparse/package.py
+++ b/var/spack/repos/builtin/packages/hipsparse/package.py
@@ -12,9 +12,9 @@ class Hipsparse(CMakePackage, CudaPackage, ROCmPackage):
     """hipSPARSE is a SPARSE marshalling library, with
     multiple supported backends"""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/hipSPARSE"
-    git = "https://github.com/ROCmSoftwarePlatform/hipSPARSE.git"
-    url = "https://github.com/ROCmSoftwarePlatform/hipSPARSE/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/hipSPARSE"
+    git = "https://github.com/ROCm/hipSPARSE.git"
+    url = "https://github.com/ROCm/hipSPARSE/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie")
@@ -22,6 +22,7 @@ class Hipsparse(CMakePackage, CudaPackage, ROCmPackage):
 
     license("MIT")
 
+    version("6.0.0", sha256="718a5f03b6a579c0542a60d00f5688bec53a181b429b7ee8ce3c8b6c4a78d754")
     version("5.7.1", sha256="16c3818260611226c3576d8d55ad8f51e0890d2473503edf2c9313250ae65ca7")
     version("5.7.0", sha256="729b749b5340034639873a99e6091963374f6f0456c8f36d076c96f03fe43888")
     version("5.6.1", sha256="d636d0c5d1e38cc0c09b1e95380199ec82bd465b94bd6661f0c8d9374d9b565d")
@@ -160,6 +161,7 @@ class Hipsparse(CMakePackage, CudaPackage, ROCmPackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
         depends_on("rocsparse@" + ver, when="+rocm @" + ver)
diff --git a/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py b/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py
index d0a153a595..6b64a0129b 100644
--- a/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py
+++ b/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py
@@ -15,15 +15,16 @@ class HsaRocrDev(CMakePackage):
     HSA ROCm kernel agents.AMD Heterogeneous System Architecture HSA -
     Linux HSA Runtime for Boltzmann (ROCm) platforms."""
 
-    homepage = "https://github.com/RadeonOpenCompute/ROCR-Runtime"
-    git = "https://github.com/RadeonOpenCompute/ROCR-Runtime.git"
-    url = "https://github.com/RadeonOpenCompute/ROCR-Runtime/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/ROCR-Runtime"
+    git = "https://github.com/ROCm/ROCR-Runtime.git"
+    url = "https://github.com/ROCm/ROCR-Runtime/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath", "haampie")
     libraries = ["libhsa-runtime64"]
 
     version("master", branch="master")
+    version("6.0.0", sha256="99e8fa1af52d0bf382f28468e1a345af1ff3452c35914a6a7b5eeaf69fc568db")
     version("5.7.1", sha256="655e9bfef4b0b6ad3f9b89c934dc0a8377273bb0bccbda6c399ac5d5d2c1c04c")
     version("5.7.0", sha256="2c56ec5c78a36f2b847afd4632cb25dbf6ecc58661eb2ae038c2552342e6ce23")
     version("5.6.1", sha256="4de9a57c2092edf9398d671c8a2c60626eb7daf358caf710da70d9c105490221")
@@ -154,6 +155,7 @@ class HsaRocrDev(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
         "master",
     ]:
         depends_on("hsakmt-roct@" + ver, when="@" + ver)
@@ -163,7 +165,7 @@ class HsaRocrDev(CMakePackage):
             "rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver)
         )
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
     # Both 3.5.0 and 3.7.0 force INSTALL_RPATH in different ways
@@ -210,4 +212,7 @@ class HsaRocrDev(CMakePackage):
 
         if self.spec.satisfies("@5.6:"):
             args.append("-DCMAKE_INSTALL_LIBDIR=lib")
+        if self.spec.satisfies("@6.0:"):
+            args.append(self.define("ROCM_PATCH_VERSION", "60000"))
+
         return args
diff --git a/var/spack/repos/builtin/packages/hsakmt-roct/package.py b/var/spack/repos/builtin/packages/hsakmt-roct/package.py
index e087ea6519..89be71a9ea 100644
--- a/var/spack/repos/builtin/packages/hsakmt-roct/package.py
+++ b/var/spack/repos/builtin/packages/hsakmt-roct/package.py
@@ -14,14 +14,15 @@ class HsakmtRoct(CMakePackage):
     Thunk Interface is a user-mode API interfaces used to interact
     with the ROCk driver."""
 
-    homepage = "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface"
-    git = "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface.git"
-    url = "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/ROCT-Thunk-Interface"
+    git = "https://github.com/ROCm/ROCT-Thunk-Interface.git"
+    url = "https://github.com/ROCm/ROCT-Thunk-Interface/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
 
     version("master", branch="master")
+    version("6.0.0", sha256="9f4e80bd0a714ce45326941b906a62298c62025eff186dc6c48282ce84c787c7")
     version("5.7.1", sha256="38bc3732886a52ca9cd477ec6fcde3ab17a0ba5dc8e2f7ac34c4de597bd00e8b")
     version("5.7.0", sha256="52293e40c4ba0c653d796e2f6109f5fb4c79f5fb82310ecbfd9a5432acf9da43")
     version("5.6.1", sha256="d60b355bfd21a08e0e36270fd56f98d052c3c6edca47da887fa32bf32759c29b")
@@ -119,11 +120,11 @@ class HsakmtRoct(CMakePackage):
     for ver in ["5.3.0", "5.4.0", "5.4.3"]:
         depends_on("llvm-amdgpu@" + ver, type="test", when="@" + ver)
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
         depends_on("llvm-amdgpu@" + ver, type="test", when="@" + ver)
 
-    # See https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/issues/72
+    # See https://github.com/ROCm/ROCT-Thunk-Interface/issues/72
     # and https://github.com/spack/spack/issues/28398
     patch("0001-Remove-compiler-support-libraries-and-libudev-as-req.patch", when="@4.5.0:5.2")
     patch("0002-Remove-compiler-support-libraries-and-libudev-as-req-5.3.patch", when="@5.3.0:5.4")
diff --git a/var/spack/repos/builtin/packages/legion/package.py b/var/spack/repos/builtin/packages/legion/package.py
index 2840d577de..7cc446cded 100644
--- a/var/spack/repos/builtin/packages/legion/package.py
+++ b/var/spack/repos/builtin/packages/legion/package.py
@@ -74,6 +74,7 @@ class Legion(CMakePackage, ROCmPackage):
 
     # https://github.com/spack/spack/issues/37232#issuecomment-1553376552
     patch("hip-offload-arch.patch", when="@23.03.0 +rocm")
+    patch("update-hip-path-legion-23.06.0.patch", when="@23.06.0 ^hip@6.0.0 +rocm")
 
     def patch(self):
         if "network=gasnet conduit=ofi-slingshot11 ^cray-mpich+wrappers" in self.spec:
@@ -349,6 +350,10 @@ class Legion(CMakePackage, ROCmPackage):
             options.append(from_variant("Legion_HIP_ARCH", "amdgpu_target"))
             options.append(from_variant("Legion_HIJACK_HIP", "hip_hijack"))
             options.append(self.define("HIP_PATH", "{0}/hip".format(spec["hip"].prefix)))
+            if "^hip@:5.7" in spec:
+                options.append(self.define("HIP_PATH", "{0}/hip".format(spec["hip"].prefix)))
+            elif "^hip@6.0:" in spec:
+                options.append(self.define("HIP_PATH", "{0}".format(spec["hip"].prefix)))
 
         if "+fortran" in spec:
             # default is off.
diff --git a/var/spack/repos/builtin/packages/legion/update-hip-path-legion-23.06.0.patch b/var/spack/repos/builtin/packages/legion/update-hip-path-legion-23.06.0.patch
new file mode 100644
index 0000000000..9f7f6a7a86
--- /dev/null
+++ b/var/spack/repos/builtin/packages/legion/update-hip-path-legion-23.06.0.patch
@@ -0,0 +1,13 @@
+diff --git a/cmake/FindHIP.cmake b/cmake/FindHIP.cmake
+index f86edd2..24492ad 100644
+--- a/cmake/FindHIP.cmake
++++ b/cmake/FindHIP.cmake
+@@ -22,7 +22,7 @@ if(NOT DEFINED HIP_PATH)
+       set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to where HIP has been installed")
+   endif()
+ endif()
+-include(${HIP_PATH}/cmake/FindHIP.cmake)
++include(${HIP_PATH}/lib/cmake/hip/FindHIP.cmake)
+ 
+ if(NOT HIP_INCLUDE_DIRS)
+   list(APPEND HIP_INCLUDE_DIRS
diff --git a/var/spack/repos/builtin/packages/llvm-amdgpu/package.py b/var/spack/repos/builtin/packages/llvm-amdgpu/package.py
index f8cddebf84..99a2e67488 100644
--- a/var/spack/repos/builtin/packages/llvm-amdgpu/package.py
+++ b/var/spack/repos/builtin/packages/llvm-amdgpu/package.py
@@ -14,9 +14,9 @@ class LlvmAmdgpu(CMakePackage):
     """Toolkit for the construction of highly optimized compilers,
     optimizers, and run-time environments."""
 
-    homepage = "https://github.com/RadeonOpenCompute/llvm-project"
-    git = "https://github.com/RadeonOpenCompute/llvm-project.git"
-    url = "https://github.com/RadeonOpenCompute/llvm-project/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/llvm-project"
+    git = "https://github.com/ROCm/llvm-project.git"
+    url = "https://github.com/ROCm/llvm-project/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
     executables = [r"amdclang", r"amdclang\+\+", r"amdflang", r"clang.*", r"flang.*", "llvm-.*"]
     generator("ninja")
@@ -26,6 +26,7 @@ class LlvmAmdgpu(CMakePackage):
     license("Apache-2.0")
 
     version("master", branch="amd-stg-open")
+    version("6.0.0", sha256="c673708d413d60ca8606ee75c77e9871b6953c59029c987b92f2f6e85f683626")
     version("5.7.1", sha256="6b54c422e45ad19c9bf5ab090ec21753e7f7d854ca78132c30eb146657b168eb")
     version("5.7.0", sha256="4abdf00b297a77c5886cedb37e63acda2ba11cb9f4c0a64e133b05800aadfcf0")
     version("5.6.1", sha256="045e43c0c4a3f4f2f1db9fb603a4f1ea3d56e128147e19ba17909eb57d7f08e5")
@@ -167,12 +168,12 @@ class LlvmAmdgpu(CMakePackage):
     # as per 5.2.0 llvm code. It used to be llvm/bin/../lib/libdevice.
     # Below patch is to look in the old path.
     patch("adjust-openmp-bitcode-directory-for-llvm-link.patch", when="@5.2.0:5.6")
-    patch("0001-update-HIP_PATH-deduction-for-5.7.0.patch", when="@5.7.0:5.7")
+    patch("0001-update-HIP_PATH-deduction-for-5.7.0.patch", when="@5.7.0:6.0")
 
     # Below patch is to set the flag -mcode-object-version=none until
     # the below fix is available in device-libs release code.
-    # https://github.com/RadeonOpenCompute/ROCm-Device-Libs/commit/f0356159dbdc93ea9e545f9b61a7842f9c881fdf
-    patch("patch-llvm-5.5.0.patch", when="@5.5: +rocm-device-libs")
+    # https://github.com/ROCm/ROCm-Device-Libs/commit/f0356159dbdc93ea9e545f9b61a7842f9c881fdf
+    patch("patch-llvm-5.5.0.patch", when="@5.5:5.7 +rocm-device-libs")
 
     # i1 muls can sometimes happen after SCEV.
     # They resulted in ISel failures because we were missing the patterns for them.
@@ -188,6 +189,7 @@ class LlvmAmdgpu(CMakePackage):
 
     # Add device libs sources so they can be an external LLVM project
     for d_version, d_shasum in [
+        ("6.0.0", "198df4550d4560537ba60ac7af9bde31d59779c8ec5d6309627f77a43ab6ef6f"),
         ("5.7.1", "703de8403c0bd0d80f37c970a698f10f148daf144d34f982e4484d04f7c7bbef"),
         ("5.7.0", "0f8780b9098573f1c456bdc84358de924dcf00604330770a383983e1775bf61e"),
         ("5.6.1", "f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c"),
@@ -221,7 +223,7 @@ class LlvmAmdgpu(CMakePackage):
         resource(
             name="rocm-device-libs",
             placement="rocm-device-libs",
-            url="https://github.com/RadeonOpenCompute/ROCm-Device-Libs/archive/rocm-{0}.tar.gz".format(
+            url="https://github.com/ROCm/ROCm-Device-Libs/archive/rocm-{0}.tar.gz".format(
                 d_version
             ),
             sha256=d_shasum,
@@ -231,11 +233,12 @@ class LlvmAmdgpu(CMakePackage):
     resource(
         name="rocm-device-libs",
         placement="rocm-device-libs",
-        git="https://github.com/RadeonOpenCompute/ROCm-Device-Libs.git",
+        git="https://github.com/ROCm/ROCm-Device-Libs.git",
         branch="amd-stg-open",
         when="@master +rocm-device-libs",
     )
     for d_version, d_shasum in [
+        ("6.0.0", "99e8fa1af52d0bf382f28468e1a345af1ff3452c35914a6a7b5eeaf69fc568db"),
         ("5.7.1", "655e9bfef4b0b6ad3f9b89c934dc0a8377273bb0bccbda6c399ac5d5d2c1c04c"),
         ("5.7.0", "2c56ec5c78a36f2b847afd4632cb25dbf6ecc58661eb2ae038c2552342e6ce23"),
         ("5.6.1", "4de9a57c2092edf9398d671c8a2c60626eb7daf358caf710da70d9c105490221"),
@@ -244,19 +247,20 @@ class LlvmAmdgpu(CMakePackage):
         resource(
             name="hsa-runtime",
             placement="hsa-runtime",
-            url=f"https://github.com/RadeonOpenCompute/ROCR-Runtime/archive/rocm-{d_version}.tar.gz",
+            url=f"https://github.com/ROCm/ROCR-Runtime/archive/rocm-{d_version}.tar.gz",
             sha256=d_shasum,
             when="@{0}".format(d_version),
         )
     resource(
         name="hsa-runtime",
         placement="hsa-runtime",
-        git="https://github.com/RadeonOpenCompute/ROCR-Runtime.git",
+        git="https://github.com/ROCm/ROCR-Runtime.git",
         branch="master",
         when="@master",
     )
 
     for d_version, d_shasum in [
+        ("6.0.0", "04353d27a512642a5e5339532a39d0aabe44e0964985de37b150a2550385800a"),
         ("5.7.1", "3b9433b4a0527167c3e9dfc37a3c54e0550744b8d4a8e1be298c8d4bcedfee7c"),
         ("5.7.0", "e234bcb93d602377cfaaacb59aeac5796edcd842a618162867b7e670c3a2c42c"),
         ("5.6.1", "0a85d84619f98be26ca7a32c71f94ed3c4e9866133789eabb451be64ce739300"),
@@ -265,14 +269,14 @@ class LlvmAmdgpu(CMakePackage):
         resource(
             name="comgr",
             placement="comgr",
-            url=f"https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/archive/rocm-{d_version}.tar.gz",
+            url=f"https://github.com/ROCm/ROCm-CompilerSupport/archive/rocm-{d_version}.tar.gz",
             sha256=d_shasum,
             when="@{0}".format(d_version),
         )
     resource(
         name="comgr",
         placement="comgr",
-        git="https://github.com/RadeonOpenCompute/ROCm-CompilerSupport.git",
+        git="https://github.com/ROCm/ROCm-CompilerSupport.git",
         branch="amd-stg-open",
         when="@master",
     )
diff --git a/var/spack/repos/builtin/packages/magma/0001-fix-magma-build-error-with-rocm-6.0.0.patch b/var/spack/repos/builtin/packages/magma/0001-fix-magma-build-error-with-rocm-6.0.0.patch
new file mode 100644
index 0000000000..accc271419
--- /dev/null
+++ b/var/spack/repos/builtin/packages/magma/0001-fix-magma-build-error-with-rocm-6.0.0.patch
@@ -0,0 +1,99 @@
+From 4f7d9ff22996ba3000ee344a0f84f73c27257f47 Mon Sep 17 00:00:00 2001
+From: sreenivasa murthy kolam <sreenivasamurthy.kolam@amd.com>
+Date: Wed, 17 Jan 2024 11:44:32 +0000
+Subject: [PATCH] Fix build failure with rocm-6.0.0. Add extra parameter for
+ hipblasZtrmm(), hipblasCtrmm(), etc.
+
+---
+ interface_hip/blas_c_v2.cpp | 3 ++-
+ interface_hip/blas_d_v2.cpp | 3 ++-
+ interface_hip/blas_s_v2.cpp | 3 ++-
+ interface_hip/blas_z_v2.cpp | 3 ++-
+ interface_hip/interface.cpp | 5 ++---
+ 5 files changed, 10 insertions(+), 7 deletions(-)
+
+diff --git a/interface_hip/blas_c_v2.cpp b/interface_hip/blas_c_v2.cpp
+index 6147857..a406faf 100644
+--- a/interface_hip/blas_c_v2.cpp
++++ b/interface_hip/blas_c_v2.cpp
+@@ -1858,7 +1858,8 @@ magma_ctrmm(
+                     hipblas_diag_const( diag ),
+ 		    int(m), int(n),
+ 		    (hipblasComplex*)&alpha, (const hipblasComplex*)dA, int(ldda),
+-		    (hipblasComplex*)dB, int(lddb) );
++		    (hipblasComplex*)dB, int(lddb),
++		    (hipblasComplex*)dB, int(lddb) ); /* C same as B; less efficient */
+     #else
+         hipblasCtrmm(
+                     queue->hipblas_handle(),
+diff --git a/interface_hip/blas_d_v2.cpp b/interface_hip/blas_d_v2.cpp
+index 340f0b2..8c1ecd4 100644
+--- a/interface_hip/blas_d_v2.cpp
++++ b/interface_hip/blas_d_v2.cpp
+@@ -1858,7 +1858,8 @@ magma_dtrmm(
+                     hipblas_diag_const( diag ),
+ 		    int(m), int(n),
+ 		    (double*)&alpha, (const double*)dA, int(ldda),
+-		    (double*)dB, int(lddb) );
++		    (double*)dB, int(lddb),
++		    (double*)dB, int(lddb) ); /* C same as B; less efficient */
+     #else
+         hipblasDtrmm(
+                     queue->hipblas_handle(),
+diff --git a/interface_hip/blas_s_v2.cpp b/interface_hip/blas_s_v2.cpp
+index 87aeba3..a2cfc02 100644
+--- a/interface_hip/blas_s_v2.cpp
++++ b/interface_hip/blas_s_v2.cpp
+@@ -1858,7 +1858,8 @@ magma_strmm(
+                     hipblas_diag_const( diag ),
+ 		    int(m), int(n),
+ 		    (float*)&alpha, (const float*)dA, int(ldda),
+-		    (float*)dB, int(lddb) );
++		    (float*)dB, int(lddb),
++		    (float*)dB, int(lddb) ); /* C same as B; less efficient */
+     #else
+         hipblasStrmm(
+                     queue->hipblas_handle(),
+diff --git a/interface_hip/blas_z_v2.cpp b/interface_hip/blas_z_v2.cpp
+index 3c7e87a..eb9e2e6 100644
+--- a/interface_hip/blas_z_v2.cpp
++++ b/interface_hip/blas_z_v2.cpp
+@@ -1858,7 +1858,8 @@ magma_ztrmm(
+                     hipblas_diag_const( diag ),
+ 		    int(m), int(n),
+ 		    (hipblasDoubleComplex*)&alpha, (const hipblasDoubleComplex*)dA, int(ldda),
+-		    (hipblasDoubleComplex*)dB, int(lddb) );
++		    (hipblasDoubleComplex*)dB, int(lddb),
++		    (hipblasDoubleComplex*)dB, int(lddb) ); /* C same as B; less efficient */
+     #else
+         hipblasZtrmm(
+                     queue->hipblas_handle(),
+diff --git a/interface_hip/interface.cpp b/interface_hip/interface.cpp
+index 2b35b34..7c76426 100644
+--- a/interface_hip/interface.cpp
++++ b/interface_hip/interface.cpp
+@@ -209,11 +209,10 @@ magma_init()
+                 else {
+                     g_magma_devices[dev].memory          = prop.totalGlobalMem;
+                     g_magma_devices[dev].shmem_block     = prop.sharedMemPerBlock;
+-                    #ifdef MAGMA_HAVE_CUDA
+                     g_magma_devices[dev].cuda_arch       = prop.major*100 + prop.minor*10;
++                    #ifdef MAGMA_HAVE_CUDA
+                     g_magma_devices[dev].shmem_multiproc = prop.sharedMemPerMultiprocessor;
+                     #elif defined(MAGMA_HAVE_HIP)
+-                    g_magma_devices[dev].cuda_arch       = prop.gcnArch;
+                     g_magma_devices[dev].shmem_multiproc = prop.maxSharedMemoryPerMultiProcessor;
+                     #endif
+ 
+@@ -464,7 +463,7 @@ magma_print_environment()
+                 prop.name,
+                 prop.clockRate / 1000.,
+                 prop.totalGlobalMem / (1024.*1024.),
+-                prop.gcnArch );
++                prop.gcnArchName );
+         #endif
+     }
+ 
+-- 
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/magma/package.py b/var/spack/repos/builtin/packages/magma/package.py
index 26ed916e72..585f360a7b 100644
--- a/var/spack/repos/builtin/packages/magma/package.py
+++ b/var/spack/repos/builtin/packages/magma/package.py
@@ -78,6 +78,7 @@ class Magma(CMakePackage, CudaPackage, ROCmPackage):
     patch("magma-2.5.0.patch", when="@2.5.0")
     patch("magma-2.5.0-cmake.patch", when="@2.5.0")
     patch("cmake-W.patch", when="@2.5.0:%nvhpc")
+    patch("0001-fix-magma-build-error-with-rocm-6.0.0.patch", when="@2.7.2 ^hip@6.0.0 + rocm")
 
     @run_before("cmake")
     def generate_gpu_config(self):
@@ -146,7 +147,7 @@ class Magma(CMakePackage, CudaPackage, ROCmPackage):
         if "+rocm" in spec:
             options.append(define("MAGMA_ENABLE_HIP", True))
             options.append(define("CMAKE_CXX_COMPILER", spec["hip"].hipcc))
-            # See https://github.com/ROCmSoftwarePlatform/rocFFT/issues/322
+            # See https://github.com/ROCm/rocFFT/issues/322
             if spec.satisfies("^cmake@3.21.0:3.21.2"):
                 options.append(define("__skip_rocmclang", True))
         else:
diff --git a/var/spack/repos/builtin/packages/mfem/mfem-hip.patch b/var/spack/repos/builtin/packages/mfem/mfem-hip.patch
new file mode 100644
index 0000000000..565bae348c
--- /dev/null
+++ b/var/spack/repos/builtin/packages/mfem/mfem-hip.patch
@@ -0,0 +1,24 @@
+From 93ab69cac72cc2d13cfd4b7efcc235bdbca2b9f5 Mon Sep 17 00:00:00 2001
+From: Afzal Patel <afzal.patel@amd.com>
+Date: Wed, 17 Jan 2024 11:44:18 -0800
+Subject: [PATCH] Add hip library path to ghv flags so libamdhip64 can be found
+
+---
+ config/makefile | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/config/makefile b/config/makefile
+index 627d117..a453865 100644
+--- a/config/makefile
++++ b/config/makefile
+@@ -38,7 +38,7 @@ all: header config-mk
+ MPI = $(MFEM_USE_MPI:NO=)
+ GHV_CXX ?= $(MFEM_CXX)
+ GHV = get_hypre_version
+-GHV_FLAGS = $(subst @MFEM_DIR@,$(if $(MFEM_DIR),$(MFEM_DIR),..),$(HYPRE_OPT))
++GHV_FLAGS = $(subst @MFEM_DIR@,$(if $(MFEM_DIR),$(MFEM_DIR),..),$(HYPRE_OPT)) $(HIP_LIB)
+ SMX = $(if $(MFEM_USE_PUMI:NO=),MFEM_USE_SIMMETRIX)
+ SMX_PATH = $(PUMI_DIR)/include/gmi_sim.h
+ SMX_FILE = $(subst @MFEM_DIR@,$(if $(MFEM_DIR),$(MFEM_DIR),..),$(SMX_PATH))
+--
+2.25.1
+\ No newline at end of file
diff --git a/var/spack/repos/builtin/packages/mfem/package.py b/var/spack/repos/builtin/packages/mfem/package.py
index 618b397181..ddd7be363d 100644
--- a/var/spack/repos/builtin/packages/mfem/package.py
+++ b/var/spack/repos/builtin/packages/mfem/package.py
@@ -480,6 +480,7 @@ class Mfem(Package, CudaPackage, ROCmPackage):
         when="@4.6.0 +gslib+shared+miniapps",
         sha256="2a31682d876626529e2778a216d403648b83b90997873659a505d982d0e65beb",
     )
+    patch("mfem-hip.patch", when="+rocm ^hip@6.0:")
 
     phases = ["configure", "build", "install"]
 
@@ -954,6 +955,7 @@ class Mfem(Package, CudaPackage, ROCmPackage):
             options += ["HIP_CXX=%s" % spec["hip"].hipcc, "HIP_ARCH=%s" % amdgpu_target]
             hip_headers = HeaderList([])
             hip_libs = LibraryList([])
+            hip_libs += find_libraries("libamdhip64", spec["hip"].prefix.lib)
             # To use a C++ compiler that supports -xhip flag one can use
             # something like this:
             #   options += [
diff --git a/var/spack/repos/builtin/packages/migraphx/package.py b/var/spack/repos/builtin/packages/migraphx/package.py
index 1245a48109..efc4280521 100644
--- a/var/spack/repos/builtin/packages/migraphx/package.py
+++ b/var/spack/repos/builtin/packages/migraphx/package.py
@@ -11,9 +11,9 @@ from spack.package import *
 class Migraphx(CMakePackage):
     """AMD's graph optimization engine."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/AMDMIGraphX"
-    git = "https://github.com/ROCmSoftwarePlatform/AMDMIGraphX.git"
-    url = "https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/AMDMIGraphX"
+    git = "https://github.com/ROCm/AMDMIGraphX.git"
+    url = "https://github.com/ROCm/AMDMIGraphX/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
@@ -21,6 +21,7 @@ class Migraphx(CMakePackage):
 
     license("MIT")
 
+    version("6.0.0", sha256="7bb3f5011da9b1f3b79707b06118c523c1259215f650c2ffa5622a7e1d88868f")
     version("5.7.1", sha256="3e58c043a5a7d1357ee05725fd6cd41e190b070f1ba57f61300128429902089c")
     version("5.7.0", sha256="14f13554367d2d6490d66f8b5b739203225e7acce25085559e7c4acf29e2a4d5")
     version("5.6.1", sha256="b108c33f07572ffd880b20f6de06f1934ab2a1b41ae69095612322ac412fa91c")
@@ -108,7 +109,7 @@ class Migraphx(CMakePackage):
     )
 
     def url_for_version(self, version):
-        url = "https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/archive/"
+        url = "https://github.com/ROCm/AMDMIGraphX/archive/"
         if version <= Version("3.5.0"):
             url += "{0}.tar.gz".format(version)
         else:
@@ -168,6 +169,7 @@ class Migraphx(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
         depends_on("hip@" + ver, when="@" + ver)
@@ -175,7 +177,7 @@ class Migraphx(CMakePackage):
         depends_on("rocblas@" + ver, when="@" + ver)
         depends_on("miopen-hip@" + ver, when="@" + ver)
 
-    for ver in ["5.7.0", "5.7.1"]:
+    for ver in ["5.7.0", "5.7.1", "6.0.0"]:
         depends_on("composable-kernel@" + ver, when="@" + ver)
 
     @property
diff --git a/var/spack/repos/builtin/packages/miopen-hip/package.py b/var/spack/repos/builtin/packages/miopen-hip/package.py
index ee3b78a5ff..8bafc28701 100644
--- a/var/spack/repos/builtin/packages/miopen-hip/package.py
+++ b/var/spack/repos/builtin/packages/miopen-hip/package.py
@@ -12,9 +12,9 @@ from spack.pkg.builtin.boost import Boost
 class MiopenHip(CMakePackage):
     """AMD's library for high performance machine learning primitives."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/MIOpen"
-    git = "https://github.com/ROCmSoftwarePlatform/MIOpen.git"
-    url = "https://github.com/ROCmSoftwarePlatform/MIOpen/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/MIOpen"
+    git = "https://github.com/ROCm/MIOpen.git"
+    url = "https://github.com/ROCm/MIOpen/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
@@ -22,6 +22,7 @@ class MiopenHip(CMakePackage):
 
     license("MIT")
 
+    version("6.0.0", sha256="a0718a48353be30ff98118ade511f0c1b454e394d8f934aefe7dd6946562b2e9")
     version("5.7.1", sha256="912a658fe21ce6f1982b0f2ff251c3f7bb618f2e7e9876d983bcb54e3cd7129e")
     version("5.7.0", sha256="5cd0b62254469e1c246d5890d2b78f8aedcf42cf8a327eabc1a391b83bcd14e1")
     version("5.6.1", sha256="ff627d68ed9e52433a3c808b5d3ff179a398b77ce81b00cfea7b2c4da5162c6c")
@@ -124,7 +125,7 @@ class MiopenHip(CMakePackage):
     patch("0001-Add-rocm-path-and-rocm-device-lib-path-flags.patch", when="@3.9.0:5.0.2")
     patch("miopen-hip-include-nlohmann-include-directory.patch", when="@5.4.0:")
     patch(
-        "https://github.com/ROCmSoftwarePlatform/MIOpen/pull/2276/commits/f60aa1ff89f8fb596b4a6a4c70aa7d557803db87.patch?full_index=1",
+        "https://github.com/ROCm/MIOpen/pull/2276/commits/f60aa1ff89f8fb596b4a6a4c70aa7d557803db87.patch?full_index=1",
         sha256="c777d9f4cd2bbfec632b38620c0f70bb0cce8da1",
         when="@5.7:",
     )
@@ -159,6 +160,7 @@ class MiopenHip(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
         depends_on("hip@" + ver, when="@" + ver)
@@ -168,7 +170,7 @@ class MiopenHip(CMakePackage):
     for ver in ["5.1.0", "5.1.3", "5.2.0", "5.2.1", "5.2.3", "5.3.0", "5.3.3"]:
         depends_on("mlirmiopen@" + ver, when="@" + ver)
 
-    for ver in ["5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("nlohmann-json", type="link")
         depends_on("composable-kernel@" + ver, when="@" + ver)
     for ver in ["5.4.0", "5.4.3", "5.5.0"]:
diff --git a/var/spack/repos/builtin/packages/miopen-opencl/package.py b/var/spack/repos/builtin/packages/miopen-opencl/package.py
index ec5eac8a96..5ec89b243d 100644
--- a/var/spack/repos/builtin/packages/miopen-opencl/package.py
+++ b/var/spack/repos/builtin/packages/miopen-opencl/package.py
@@ -12,9 +12,9 @@ from spack.pkg.builtin.boost import Boost
 class MiopenOpencl(CMakePackage):
     """AMD's library for high performance machine learning primitives."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/MIOpen"
-    git = "https://github.com/ROCmSoftwarePlatform/MIOpen.git"
-    url = "https://github.com/ROCmSoftwarePlatform/MIOpen/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/MIOpen"
+    git = "https://github.com/ROCm/MIOpen.git"
+    url = "https://github.com/ROCm/MIOpen/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
diff --git a/var/spack/repos/builtin/packages/miopen-tensile/package.py b/var/spack/repos/builtin/packages/miopen-tensile/package.py
index 11dece2143..1d64b792d4 100644
--- a/var/spack/repos/builtin/packages/miopen-tensile/package.py
+++ b/var/spack/repos/builtin/packages/miopen-tensile/package.py
@@ -12,9 +12,9 @@ class MiopenTensile(CMakePackage):
     """MIOpenTensile provides host-callable interfaces to Tensile library.
     MIOpenTensile supports one programming model: HIP"""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/MIOpenTensile"
-    git = "https://github.com/ROCmSoftwarePlatform/MIOpenTensile.git"
-    url = "https://github.com/ROCmSoftwarePlatform/MIOpentensile/archive/rocm-5.0.0.tar.gz"
+    homepage = "https://github.com/ROCm/MIOpenTensile"
+    git = "https://github.com/ROCm/MIOpenTensile.git"
+    url = "https://github.com/ROCm/MIOpentensile/archive/rocm-5.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam")
@@ -72,7 +72,7 @@ class MiopenTensile(CMakePackage):
 
     resource(
         name="Tensile",
-        git="https://github.com/ROCmSoftwarePlatform/Tensile.git",
+        git="https://github.com/ROCm/Tensile.git",
         commit="9cbabb07f81e932b9c98bf5ae48fbd7fcef615cf",
         when="@4.5.0:",
     )
diff --git a/var/spack/repos/builtin/packages/miopengemm/package.py b/var/spack/repos/builtin/packages/miopengemm/package.py
index 937210ec77..e67185563e 100644
--- a/var/spack/repos/builtin/packages/miopengemm/package.py
+++ b/var/spack/repos/builtin/packages/miopengemm/package.py
@@ -12,9 +12,9 @@ class Miopengemm(CMakePackage):
     """An OpenCL general matrix multiplication (GEMM) API
     and kernel generator"""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM"
-    git = "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM.git"
-    url = "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/MIOpenGEMM"
+    git = "https://github.com/ROCm/MIOpenGEMM.git"
+    url = "https://github.com/ROCm/MIOpenGEMM/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
@@ -22,8 +22,8 @@ class Miopengemm(CMakePackage):
 
     def url_for_version(self, version):
         if version == Version("1.1.6"):
-            return "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/archive/1.1.6.tar.gz"
-        url = "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/archive/rocm-{0}.tar.gz"
+            return "https://github.com/ROCm/MIOpenGEMM/archive/1.1.6.tar.gz"
+        url = "https://github.com/ROCm/MIOpenGEMM/archive/rocm-{0}.tar.gz"
         return url.format(version)
 
     license("MIT")
diff --git a/var/spack/repos/builtin/packages/mivisionx/package.py b/var/spack/repos/builtin/packages/mivisionx/package.py
index 153469f16e..5e2549631f 100644
--- a/var/spack/repos/builtin/packages/mivisionx/package.py
+++ b/var/spack/repos/builtin/packages/mivisionx/package.py
@@ -13,7 +13,7 @@ class Mivisionx(CMakePackage):
 
     homepage = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX"
     git = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX.git"
-    url = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/archive/rocm-5.5.0.tar.gz"
+    url = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/archive/rocm-6.0.0.tar.gz"
 
     maintainers("srekolam", "renjithravindrankannath")
     tags = ["rocm"]
@@ -27,6 +27,7 @@ class Mivisionx(CMakePackage):
 
     license("MIT")
 
+    version("6.0.0", sha256="01324a12f21ea0e29a4d7d7c60498ba9231723569fedcdd90f28ddffb5e0570e")
     version("5.7.1", sha256="bfc074bc32ebe84c72149ee6abb30b5b6499023d5b98269232de82e35d0505a8")
     version("5.7.0", sha256="07e4ec8a8c06a9a8bb6394a043c9c3e7176acd3b462a16de91ef9518a64df9ba")
     version("5.6.1", sha256="b2ff95c1488e244f379482631dae4f9ab92d94a513d180e03607aa1e184b5b0a")
@@ -369,6 +370,7 @@ class Mivisionx(CMakePackage):
             "5.6.1",
             "5.7.0",
             "5.7.1",
+            "6.0.0",
         ]:
             depends_on("miopen-hip@" + ver, when="@" + ver)
         for ver in [
@@ -381,11 +383,12 @@ class Mivisionx(CMakePackage):
             "5.6.1",
             "5.7.0",
             "5.7.1",
+            "6.0.0",
         ]:
             depends_on("migraphx@" + ver, when="@" + ver)
             depends_on("hip@" + ver, when="@" + ver)
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
         depends_on("python@3.5:", type="build")
 
diff --git a/var/spack/repos/builtin/packages/mlirmiopen/package.py b/var/spack/repos/builtin/packages/mlirmiopen/package.py
index eeed27450d..7cfe466a83 100644
--- a/var/spack/repos/builtin/packages/mlirmiopen/package.py
+++ b/var/spack/repos/builtin/packages/mlirmiopen/package.py
@@ -10,9 +10,9 @@ from spack.package import *
 class Mlirmiopen(CMakePackage):
     """Multi-Level Intermediate Representation for rocm miopen project."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/llvm-project-mlir"
-    url = "https://github.com/ROCmSoftwarePlatform/llvm-project-mlir/archive/refs/tags/rocm-5.4.0.tar.gz"
-    git = "https://github.com/ROCmSoftwarePlatform/llvm-project-mlir.git"
+    homepage = "https://github.com/ROCm/llvm-project-mlir"
+    url = "https://github.com/ROCm/llvm-project-mlir/archive/refs/tags/rocm-5.4.0.tar.gz"
+    git = "https://github.com/ROCm/llvm-project-mlir.git"
     tags = ["rocm"]
 
     maintainers("srekolam")
diff --git a/var/spack/repos/builtin/packages/petsc/Handle-hipsparse-api-changes-for-rocm-6.0.patch b/var/spack/repos/builtin/packages/petsc/Handle-hipsparse-api-changes-for-rocm-6.0.patch
new file mode 100644
index 0000000000..674c083f51
--- /dev/null
+++ b/var/spack/repos/builtin/packages/petsc/Handle-hipsparse-api-changes-for-rocm-6.0.patch
@@ -0,0 +1,70 @@
+From 3c9aaca12a1ae6000ff3cfd0564f7b2ab45396d2 Mon Sep 17 00:00:00 2001
+From: Renjith Ravindran <Renjith.RavindranKannath@amd.com>
+Date: Thu, 18 Jan 2024 07:38:25 +0000
+Subject: [PATCH] Handle the hipsparse api changes for rocm 6.0
+
+---
+ .../impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp  | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/src/mat/impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp b/src/mat/impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp
+index e6f878f..4bf52cd 100644
+--- a/src/mat/impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp
++++ b/src/mat/impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp
+@@ -1258,7 +1258,7 @@ static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
+   /* Solve L*y = b */
+   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
+   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
+-  #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)                                                                                                       // i.e., 5.6.0
++  #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830))                                                // i.e., 5.6.0
+   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L,                   /* L Y = X */
+                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()!
+   #else
+@@ -1267,7 +1267,7 @@ static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
+   #endif
+   /* Solve U*x = y */
+   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
+-  #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)                                                                                     // i.e., 5.6.0
++  #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830))                                                // i.e., 5.6.0
+   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
+                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
+   #else
+@@ -1316,7 +1316,7 @@ static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Ve
+   /* Solve Ut*y = b */
+   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
+   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
+-  #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)                                                                                 // i.e., 5.6.0
++  #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830))                                                // i.e., 5.6.0
+   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
+                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
+   #else
+@@ -1325,7 +1325,7 @@ static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Ve
+   #endif
+   /* Solve Lt*x = y */
+   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
+-  #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)                                                                                 // i.e., 5.6.0
++  #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830))                                                // i.e., 5.6.0
+   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
+                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
+   #else
+@@ -1559,7 +1559,7 @@ static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
+   /* Solve L*y = b */
+   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
+   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
+-  #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)                                                                                     // i.e., 5.6.0
++  #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830))                                                // i.e., 5.6.0
+   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
+                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
+   #else
+@@ -1568,7 +1568,7 @@ static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
+   #endif
+   /* Solve Lt*x = y */
+   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
+-  #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)                                                                                 // i.e., 5.6.0
++  #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830))                                                // i.e., 5.6.0
+   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
+                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
+   #else
+-- 
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/petsc/package.py b/var/spack/repos/builtin/packages/petsc/package.py
index 5a4c011002..67a872ea8b 100644
--- a/var/spack/repos/builtin/packages/petsc/package.py
+++ b/var/spack/repos/builtin/packages/petsc/package.py
@@ -21,7 +21,7 @@ class Petsc(Package, CudaPackage, ROCmPackage):
     tags = ["e4s"]
 
     version("main", branch="main")
-
+    version("3.20.3", sha256="75a94fb44df0512f51ad093fa784e56b61f51b7ead5956fbe49185c203f8c245")
     version("3.20.2", sha256="2a2d08b5f0e3d0198dae2c42ce1fd036f25c153ef2bb4a2d320ca141ac7cd30b")
     version("3.20.1", sha256="3d54f13000c9c8ceb13ca4f24f93d838319019d29e6de5244551a3ec22704f32")
     version("3.20.0", sha256="c152ccb12cb2353369d27a65470d4044a0c67e0b69814368249976f5bb232bd4")
@@ -172,6 +172,9 @@ class Petsc(Package, CudaPackage, ROCmPackage):
         )
         patch("hip-5.6.0-for-3.18.diff", when="@3.18:3.19 ^hipsparse@5.6.0")
         patch("hip-5.7-plus-for-3.18.diff", when="@3.18:3.19 ^hipsparse@5.7:")
+        patch(
+            "Handle-hipsparse-api-changes-for-rocm-6.0.patch", when="@3.20.2:3.20.3 ^hipsparse@6.0"
+        )
 
     # 3.8.0 has a build issue with MKL - so list this conflict explicitly
     conflicts("^intel-mkl", when="@3.8.0")
diff --git a/var/spack/repos/builtin/packages/raja/package.py b/var/spack/repos/builtin/packages/raja/package.py
index fb67631779..9bb463412f 100644
--- a/var/spack/repos/builtin/packages/raja/package.py
+++ b/var/spack/repos/builtin/packages/raja/package.py
@@ -114,6 +114,14 @@ class Raja(CachedCMakePackage, CudaPackage, ROCmPackage):
         when="@:0.13.0 ^blt@0.4:",
     )
 
+    # Backward compatibility was dropped starting with ROCm 6.0.
+    # A future release will include the change from PR https://github.com/LLNL/RAJA/pull/1568
+    patch(
+        "https://github.com/LLNL/RAJA/commit/406eb8dee05a41eb32c421c375688a4863b60642.patch?full_index=1",
+        sha256="d9ce5ef038555cbccb330a9016b7be77e56ae0660583cba955dab9d0297a4b07",
+        when="^hip@6.0.0",
+    )
+
     variant("openmp", default=True, description="Build OpenMP backend")
     variant("shared", default=True, description="Build Shared Libs")
     variant("plugins", default=False, description="Enable runtime plugins")
diff --git a/var/spack/repos/builtin/packages/rccl-tests/package.py b/var/spack/repos/builtin/packages/rccl-tests/package.py
index 18131077e4..a27bebac07 100644
--- a/var/spack/repos/builtin/packages/rccl-tests/package.py
+++ b/var/spack/repos/builtin/packages/rccl-tests/package.py
@@ -10,9 +10,9 @@ class RcclTests(MakefilePackage):
     """These tests check both the performance and the correctness of RCCL
     operations. They can be compiled against RCCL."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/rccl-tests"
-    git = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git"
-    url = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git"
+    homepage = "https://github.com/ROCm/rccl-tests"
+    git = "https://github.com/ROCm/rccl-tests.git"
+    url = "https://github.com/ROCm/rccl-tests.git"
     tags = ["rocm"]
 
     maintainers("bvanessen")
diff --git a/var/spack/repos/builtin/packages/rccl/0004-Set-rocm-core-path-for-version-file.patch b/var/spack/repos/builtin/packages/rccl/0004-Set-rocm-core-path-for-version-file.patch
new file mode 100644
index 0000000000..fd03def3ee
--- /dev/null
+++ b/var/spack/repos/builtin/packages/rccl/0004-Set-rocm-core-path-for-version-file.patch
@@ -0,0 +1,13 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 5384287..ea6fd4b 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -121,7 +121,7 @@ message(STATUS "hipcc version:    ${hipcc_version_string}")
+ 
+ ## Check for ROCm version
+ execute_process(
+-  COMMAND         bash "-c" "cat ${ROCM_PATH}/.info/version"
++  COMMAND         bash "-c" "cat $ENV{ROCMCORE_PATH}/.info/version"
+   OUTPUT_VARIABLE rocm_version_string
+ )
+ string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" rocm_version_matches ${rocm_version_string})
diff --git a/var/spack/repos/builtin/packages/rccl/package.py b/var/spack/repos/builtin/packages/rccl/package.py
index 9b388d1a27..52519c0194 100644
--- a/var/spack/repos/builtin/packages/rccl/package.py
+++ b/var/spack/repos/builtin/packages/rccl/package.py
@@ -14,13 +14,14 @@ class Rccl(CMakePackage):
     implementing all-reduce, all-gather, reduce, broadcast,
     and reduce-scatter."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/rccl"
-    git = "https://github.com/ROCmSoftwarePlatform/rccl.git"
-    url = "https://github.com/ROCmSoftwarePlatform/rccl/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rccl"
+    git = "https://github.com/ROCm/rccl.git"
+    url = "https://github.com/ROCm/rccl/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
     libraries = ["librccl"]
+    version("6.0.0", sha256="0496d5a5f2e48c92cd390ab318df31a53cf7ec590988c2574c9f3d99c38b0fa7")
     version("5.7.1", sha256="fb4c1f0084196d1226ce8a726d0f012d3890b54508a06ca87bbda619be8b90b1")
     version("5.7.0", sha256="4c2825a3e4323ef3c2f8855ef445c1a81cf1992fb37e3e8a07a50db354aa3954")
     version("5.6.1", sha256="27ec6b86a1a329684d808f728c1fce134517ac8e6e7047689f95dbf8386c077e")
@@ -119,6 +120,7 @@ class Rccl(CMakePackage):
     patch("0001-Fix-numactl-path-issue.patch", when="@3.7.0:4.3.2")
     patch("0002-Fix-numactl-rocm-smi-path-issue.patch", when="@4.5.0:5.2.1")
     patch("0003-Fix-numactl-rocm-smi-path-issue.patch", when="@5.2.3:5.6")
+    patch("0004-Set-rocm-core-path-for-version-file.patch", when="@6.0:")
 
     depends_on("cmake@3.5:", type="build")
     for ver in [
@@ -151,6 +153,7 @@ class Rccl(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
         depends_on("hip@" + ver, when="@" + ver)
@@ -186,6 +189,7 @@ class Rccl(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("numactl@2:", when="@" + ver)
     for ver in [
@@ -208,12 +212,14 @@ class Rccl(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocm-smi-lib@" + ver, when="@" + ver)
         depends_on("chrpath", when="@5.3.0:")
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
+
     depends_on("googletest@1.11.0:", when="@5.3:")
 
     @classmethod
@@ -229,6 +235,7 @@ class Rccl(CMakePackage):
 
     def setup_build_environment(self, env):
         env.set("CXX", self.spec["hip"].hipcc)
+        env.set("ROCMCORE_PATH", self.spec["rocm-core"].prefix)
 
     def cmake_args(self):
         args = []
diff --git a/var/spack/repos/builtin/packages/rdc/package.py b/var/spack/repos/builtin/packages/rdc/package.py
index fbcb130fb2..f4466bc991 100644
--- a/var/spack/repos/builtin/packages/rdc/package.py
+++ b/var/spack/repos/builtin/packages/rdc/package.py
@@ -12,8 +12,8 @@ from spack.package import *
 class Rdc(CMakePackage):
     """ROCm Data Center Tool"""
 
-    homepage = "https://github.com/RadeonOpenCompute/rdc"
-    url = "https://github.com/RadeonOpenCompute/rdc/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rdc"
+    url = "https://github.com/ROCm/rdc/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
@@ -21,13 +21,14 @@ class Rdc(CMakePackage):
 
     def url_for_version(self, version):
         if version == Version("3.9.0"):
-            return "https://github.com/RadeonOpenCompute/rdc/archive/rdc_so_ver-0.3.tar.gz"
+            return "https://github.com/ROCm/rdc/archive/rdc_so_ver-0.3.tar.gz"
 
-        url = "https://github.com/RadeonOpenCompute/rdc/archive/rocm-{0}.tar.gz"
+        url = "https://github.com/ROCm/rdc/archive/rocm-{0}.tar.gz"
         return url.format(version)
 
     license("MIT")
 
+    version("6.0.0", sha256="5e3847a919d5f7efe99d8d76c96e78401659eccd1fb234b1b8cb4304096d6e89")
     version("5.7.1", sha256="5251eb3085f2019246b332e9552dfae1572cf64ddf58306b81cbe7108019ffee")
     version("5.7.0", sha256="924e94f14f6390d7a6ff7863fb4e2085c1ff5f9c12b8bd46471eb31f001c4f14")
     version("5.6.1", sha256="9e9f57cebbc5ae386a405957ed2c17344cdb42db5e1a71285f2c9bc09eea6519")
@@ -140,6 +141,7 @@ class Rdc(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocm-smi-lib@" + ver, type=("build", "link"), when="@" + ver)
 
@@ -161,10 +163,11 @@ class Rdc(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
     def patch(self):
diff --git a/var/spack/repos/builtin/packages/rocalution/package.py b/var/spack/repos/builtin/packages/rocalution/package.py
index 103fcd7373..d04530e77b 100644
--- a/var/spack/repos/builtin/packages/rocalution/package.py
+++ b/var/spack/repos/builtin/packages/rocalution/package.py
@@ -17,9 +17,9 @@ class Rocalution(CMakePackage):
      generic and flexible design that allows seamless integration with
     other scientific software packages."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/rocALUTION"
-    git = "https://github.com/ROCmSoftwarePlatform/rocALUTION.git"
-    url = "https://github.com/ROCmSoftwarePlatform/rocALUTION/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocALUTION"
+    git = "https://github.com/ROCm/rocALUTION.git"
+    url = "https://github.com/ROCm/rocALUTION/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("cgmb", "srekolam", "renjithravindrankannath")
@@ -27,6 +27,7 @@ class Rocalution(CMakePackage):
 
     license("MIT")
 
+    version("6.0.0", sha256="cabf37691b8db00c82bda49c7dcfaefd9b9067b7d097afa43b7a5f86c45bff99")
     version("5.7.1", sha256="b95afa1285759843c5fea1ad6e1c1edf283922e0d448db03a3e1f42b6942bc24")
     version("5.7.0", sha256="48232a0d1250debce89e39a233bd0b5d52324a2454c078b99c9d44965cbbc0e9")
     version("5.6.1", sha256="7197b3617a0c91e90adaa32003c04d247a5f585d216e77493d20984ba215addb")
@@ -165,6 +166,7 @@ class Rocalution(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hip@" + ver, when="@" + ver)
         depends_on("rocprim@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocblas/package.py b/var/spack/repos/builtin/packages/rocblas/package.py
index 1012b89a17..854d897e5d 100644
--- a/var/spack/repos/builtin/packages/rocblas/package.py
+++ b/var/spack/repos/builtin/packages/rocblas/package.py
@@ -11,9 +11,9 @@ from spack.package import *
 class Rocblas(CMakePackage):
     """Radeon Open Compute BLAS library"""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/rocBLAS/"
-    git = "https://github.com/ROCmSoftwarePlatform/rocBLAS.git"
-    url = "https://github.com/ROCmSoftwarePlatform/rocBLAS/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocBLAS/"
+    git = "https://github.com/ROCm/rocBLAS.git"
+    url = "https://github.com/ROCm/rocBLAS/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie")
@@ -23,6 +23,7 @@ class Rocblas(CMakePackage):
 
     version("develop", branch="develop")
     version("master", branch="master")
+    version("6.0.0", sha256="befa4a75f1de0ea37f2358d4c2de5406d7bce671ca9936e2294b64d3b3bafb60")
     version("5.7.1", sha256="2984a5ed0ea5a05d40996ee3fddecb24399cbe8ea3e4921fc254e54d8f52fe4f")
     version("5.7.0", sha256="024edd98de9687ee5394badc4dd4c543eef4eb3f71c96ff64100705d851e1744")
     version("5.6.1", sha256="73896ebd445162a69af97f9fd462684609b4e0cf617eab450cd4558b4a23941e")
@@ -131,8 +132,8 @@ class Rocblas(CMakePackage):
     conflicts("amdgpu_target=gfx1012", when="@:4.2.1")
     conflicts("amdgpu_target=gfx1030", when="@:4.2.1")
     # https://reviews.llvm.org/D124866
-    # https://github.com/ROCm-Developer-Tools/HIP/issues/2678
-    # https://github.com/ROCm-Developer-Tools/hipamd/blob/rocm-5.2.x/include/hip/amd_detail/host_defines.h#L50
+    # https://github.com/ROCm/HIP/issues/2678
+    # https://github.com/ROCm/hipamd/blob/rocm-5.2.x/include/hip/amd_detail/host_defines.h#L50
     conflicts("%gcc@12", when="@5.2")
 
     depends_on("cmake@3.16.8:", type="build", when="@4.2.0:")
@@ -182,6 +183,7 @@ class Rocblas(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hip@" + ver, when="@" + ver)
         depends_on("llvm-amdgpu@" + ver, type="build", when="@" + ver)
@@ -232,10 +234,11 @@ class Rocblas(CMakePackage):
         ("@5.6.1", "7d0a9d040c3bbae893df7ecef6a19d9cd1c304aa"),
         ("@5.7.0", "97e0cfc2c8cb87a1e38901d99c39090dc4181652"),
         ("@5.7.1", "97e0cfc2c8cb87a1e38901d99c39090dc4181652"),
+        ("@6.0.0", "17df881bde80fc20f997dfb290f4bb4b0e05a7e9"),
     ]:
         resource(
             name="Tensile",
-            git="https://github.com/ROCmSoftwarePlatform/Tensile.git",
+            git="https://github.com/ROCm/Tensile.git",
             commit=t_commit,
             when="{} +tensile".format(t_version),
         )
@@ -243,12 +246,12 @@ class Rocblas(CMakePackage):
     for ver in ["master", "develop"]:
         resource(
             name="Tensile",
-            git="https://github.com/ROCmSoftwarePlatform/Tensile.git",
+            git="https://github.com/ROCm/Tensile.git",
             branch=ver,
             when="@{} +tensile".format(ver),
         )
 
-    # Status: https://github.com/ROCmSoftwarePlatform/Tensile/commit/a488f7dadba34f84b9658ba92ce9ec5a0615a087
+    # Status: https://github.com/ROCm/Tensile/commit/a488f7dadba34f84b9658ba92ce9ec5a0615a087
     # Not yet landed in 3.7.0, nor 3.8.0.
     patch("0001-Fix-compilation-error-with-StringRef-to-basic-string.patch", when="@:3.8")
     patch("0002-Fix-rocblas-clients-blas.patch", when="@4.2.0:4.3.1")
@@ -256,7 +259,7 @@ class Rocblas(CMakePackage):
     # Finding Python package and set command python as python3
     patch("0004-Find-python.patch", when="@5.2.0:5.4")
     patch("0006-Guard-use-of-OpenMP-to-make-it-optional-5.4.patch", when="@5.4")
-    patch("0007-add-rocm-openmp-extras-include-dir.patch", when="@5.6:")
+    patch("0007-add-rocm-openmp-extras-include-dir.patch", when="@5.6:5.7")
 
     def setup_build_environment(self, env):
         env.set("CXX", self.spec["hip"].hipcc)
@@ -309,14 +312,14 @@ class Rocblas(CMakePackage):
             # Restrict the number of jobs Tensile can spawn.
             # If we don't specify otherwise, Tensile creates a job per available core,
             # and that consumes a lot of system memory.
-            # https://github.com/ROCmSoftwarePlatform/Tensile/blob/93e10678a0ced7843d9332b80bc17ebf9a166e8e/Tensile/Parallel.py#L38
+            # https://github.com/ROCm/Tensile/blob/93e10678a0ced7843d9332b80bc17ebf9a166e8e/Tensile/Parallel.py#L38
             args.append(self.define("Tensile_CPU_THREADS", min(16, make_jobs)))
 
-        # See https://github.com/ROCmSoftwarePlatform/rocBLAS/commit/c1895ba4bb3f4f5947f3818ebd155cf71a27b634
+        # See https://github.com/ROCm/rocBLAS/commit/c1895ba4bb3f4f5947f3818ebd155cf71a27b634
         if "auto" not in self.spec.variants["amdgpu_target"]:
             args.append(self.define_from_variant(arch_define_name, "amdgpu_target"))
 
-        # See https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1196
+        # See https://github.com/ROCm/rocBLAS/issues/1196
         if self.spec.satisfies("^cmake@3.21.0:3.21.2"):
             args.append(self.define("__skip_rocmclang", "ON"))
 
diff --git a/var/spack/repos/builtin/packages/rocfft/package.py b/var/spack/repos/builtin/packages/rocfft/package.py
index 229dd4bdb0..815bb03132 100644
--- a/var/spack/repos/builtin/packages/rocfft/package.py
+++ b/var/spack/repos/builtin/packages/rocfft/package.py
@@ -11,16 +11,16 @@ from spack.package import *
 class Rocfft(CMakePackage):
     """Radeon Open Compute FFT library"""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/rocFFT/"
-    git = "https://github.com/ROCmSoftwarePlatform/rocFFT.git"
-    url = "https://github.com/ROCmSoftwarePlatform/rocfft/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocFFT/"
+    git = "https://github.com/ROCm/rocFFT.git"
+    url = "https://github.com/ROCm/rocfft/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie")
     libraries = ["librocfft"]
 
     license("MIT")
-
+    version("6.0.0", sha256="fb8ba56572702e77e4383d922cd1fee4ad3fa5f63a5ebdb3d9c354439a446992")
     version("5.7.1", sha256="202f11f60dc8738e29bbd1b397d419e032794f8bffb7f48f2b31f09cc5f08bc2")
     version("5.7.0", sha256="3c4a1537a6ec76dc9b622644fe3890647306bf9f28f61c5d2028259c31bb964f")
     version("5.6.1", sha256="a65861e453587c3e6393da75b0b1976508c61f968aecda77fbec920fea48489e")
@@ -167,6 +167,7 @@ class Rocfft(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hip@" + ver, when="@" + ver)
         depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
@@ -178,6 +179,14 @@ class Rocfft(CMakePackage):
     # Patch to add install prefix header location for sqlite for 5.4
     patch("0004-fix-missing-sqlite-include-paths.patch", when="@5.4.0:5.5")
 
+    # Set LD_LIBRARY_PATH for executing the binaries from the build directory; fix missing type
+    # (https://github.com/ROCm/rocFFT/pull/449)
+    patch(
+        "https://github.com/ROCm/rocFFT/commit/0ec78f1daac2d7fa1415f4deff0d129252c1c9de.patch?full_index=1",
+        sha256="bac7873185ac60f2aaa50e278f0b8d52b4d79d586bf7f52db1da33559569ba54",
+        when="@6.0.0",
+    )
+
     def setup_build_environment(self, env):
         env.set("CXX", self.spec["hip"].hipcc)
 
@@ -214,7 +223,7 @@ class Rocfft(CMakePackage):
                 self.define_from_variant("AMDGPU_TARGETS_SRAM_ECC", "amdgpu_target_sram_ecc")
             )
 
-        # See https://github.com/ROCmSoftwarePlatform/rocFFT/issues/322
+        # See https://github.com/ROCm/rocFFT/issues/322
         if self.spec.satisfies("^cmake@3.21.0:3.21.2"):
             args.append(self.define("__skip_rocmclang", "ON"))
 
diff --git a/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py b/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py
index 27806866a4..ffb8f927f0 100644
--- a/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py
+++ b/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py
@@ -10,14 +10,15 @@ from spack.package import *
 class RocmBandwidthTest(CMakePackage):
     """Test to measure PciE bandwidth on ROCm platforms"""
 
-    homepage = "https://github.com/RadeonOpenCompute/rocm_bandwidth_test"
-    git = "https://github.com/RadeonOpenCompute/rocm_bandwidth_test.git"
-    url = "https://github.com/RadeonOpenCompute/rocm_bandwidth_test/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocm_bandwidth_test"
+    git = "https://github.com/ROCm/rocm_bandwidth_test.git"
+    url = "https://github.com/ROCm/rocm_bandwidth_test/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
 
     version("master", branch="master")
+    version("6.0.0", sha256="9023401bd6a896059545b8e6263c6730afd89d7d45c0f5866261c300415532a6")
     version("5.7.1", sha256="7426ef1e317b8293e4d6389673cfa8c63efb3f7d061e2f50a6f0b1b706e2a2a7")
     version("5.7.0", sha256="fa95c28488ab4bb6d920b9f3c316554ca340f44c87ec2efb4cf8fa488e63ddd9")
     version("5.6.1", sha256="849af715d08dfd89e7aa5e4453b624151db1cafaa567ab5fa36a77948b90bf0d")
@@ -136,12 +137,13 @@ class RocmBandwidthTest(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
         "master",
     ]:
         depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
         depends_on("hsakmt-roct@" + ver, when="@" + ver)
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
     build_targets = ["package"]
diff --git a/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py b/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py
index 6961c15b80..aeca0c39a2 100644
--- a/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py
+++ b/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py
@@ -9,15 +9,16 @@ from spack.package import *
 class RocmClangOcl(CMakePackage):
     """OpenCL compilation with clang compiler"""
 
-    homepage = "https://github.com/RadeonOpenCompute/clang-ocl"
-    git = "https://github.com/RadeonOpenCompute/clang-ocl.git"
-    url = "https://github.com/RadeonOpenCompute/clang-ocl/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/clang-ocl"
+    git = "https://github.com/ROCm/clang-ocl.git"
+    url = "https://github.com/ROCm/clang-ocl/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     license("MIT")
 
     maintainers("srekolam", "renjithravindrankannath")
     version("master", branch="master")
+    version("6.0.0", sha256="74b5a64c32f3c57e7e4de638fffabbf448ecdb3dd8e65678b7ba0633352b4ca3")
     version("5.7.1", sha256="32e4430d009cbbf5404ca9cbbb549b36897fa1826bc2285372e293cfe7531bf8")
     version("5.7.0", sha256="c9ca80bfee674e740039256a846107373f1cf6554dc28398599976d8646a0392")
     version("5.6.1", sha256="c41deb1b564d939fc897b2bbdb13570b2234fa4c052a39783f5ad2dd1052f901")
@@ -136,6 +137,7 @@ class RocmClangOcl(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
         "master",
     ]:
         depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
@@ -145,7 +147,7 @@ class RocmClangOcl(CMakePackage):
         depends_on(
             "rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver)
         )
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
     test_src_dir = "test"
diff --git a/var/spack/repos/builtin/packages/rocm-cmake/package.py b/var/spack/repos/builtin/packages/rocm-cmake/package.py
index c14999a989..a5cbb03c5b 100644
--- a/var/spack/repos/builtin/packages/rocm-cmake/package.py
+++ b/var/spack/repos/builtin/packages/rocm-cmake/package.py
@@ -11,9 +11,9 @@ class RocmCmake(CMakePackage):
     """rocm-cmake provides CMake modules for common build tasks
     in the ROCm software stack"""
 
-    homepage = "https://github.com/RadeonOpenCompute/rocm-cmake"
-    git = "https://github.com/RadeonOpenCompute/rocm-cmake.git"
-    url = "https://github.com/RadeonOpenCompute/rocm-cmake/archive/rocm-5.6.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocm-cmake"
+    git = "https://github.com/ROCm/rocm-cmake.git"
+    url = "https://github.com/ROCm/rocm-cmake/archive/rocm-5.6.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
@@ -21,6 +21,7 @@ class RocmCmake(CMakePackage):
     license("MIT")
 
     version("master", branch="master")
+    version("6.0.0", sha256="82bd97ba23d1883ef38bb667e92f7367fedc50d6c11c82f54cced4ab04b0412d")
     version("5.7.1", sha256="4a4c6aa09576ccb834f869bdcb49e98cc0f0bac3678b802358065d1179a9d6f1")
     version("5.7.0", sha256="93b98144201a1143eeca32744a9927d063f4685189f132ba52a6f3bba158a86b")
     version("5.6.1", sha256="98bf5fe2e6e12f55d122807d0060f1bb19c80d63d2c2f6fee579c40bfd244fa6")
@@ -110,7 +111,7 @@ class RocmCmake(CMakePackage):
     depends_on("cmake@3:", type="build")
     depends_on("cmake@3.6:", type="build", when="@4.1.0:")
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
     test_src_dir = "test"
diff --git a/var/spack/repos/builtin/packages/rocm-core/package.py b/var/spack/repos/builtin/packages/rocm-core/package.py
index 9d6bca46b6..54c1a526dd 100644
--- a/var/spack/repos/builtin/packages/rocm-core/package.py
+++ b/var/spack/repos/builtin/packages/rocm-core/package.py
@@ -12,8 +12,8 @@ class RocmCore(CMakePackage):
     It also provides the Lmod modules files for the ROCm release.
     getROCmVersion function provides the ROCm version."""
 
-    homepage = "https://github.com/RadeonOpenCompute/rocm-core"
-    url = "https://github.com/RadeonOpenCompute/rocm-core/archive/refs/tags/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocm-core"
+    url = "https://github.com/ROCm/rocm-core/archive/refs/tags/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
@@ -21,6 +21,7 @@ class RocmCore(CMakePackage):
 
     license("MIT")
 
+    version("6.0.0", sha256="d950ee4b63336f34579b6e1dda2d05966b7afa9c84bcdc13874991d1147dc788")
     version("5.7.1", sha256="fc4915019ddfd126e8ef6a15006bce3aa7bd5fd11dc8eb04ce2ee6bdf9c6ae7f")
     version("5.7.0", sha256="722689bfec46c35f5428a41c5aacfc31efec2294fc3b0112861c562f8a71ac93")
     version("5.6.1", sha256="eeef75e16e05380ccbc8df17a02dc141a66dddaadb444a97f7278f78067c498c")
diff --git a/var/spack/repos/builtin/packages/rocm-dbgapi/package.py b/var/spack/repos/builtin/packages/rocm-dbgapi/package.py
index 92b4ec72a9..d068de3456 100644
--- a/var/spack/repos/builtin/packages/rocm-dbgapi/package.py
+++ b/var/spack/repos/builtin/packages/rocm-dbgapi/package.py
@@ -14,9 +14,9 @@ class RocmDbgapi(CMakePackage):
     control of the execution and inspection of execution state of
     AMD's commercially available GPU architectures."""
 
-    homepage = "https://github.com/ROCm-Developer-Tools/ROCdbgapi"
-    git = "https://github.com/ROCm-Developer-Tools/ROCdbgapi.git"
-    url = "https://github.com/ROCm-Developer-Tools/ROCdbgapi/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/ROCdbgapi"
+    git = "https://github.com/ROCm/ROCdbgapi.git"
+    url = "https://github.com/ROCm/ROCdbgapi/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
@@ -25,6 +25,7 @@ class RocmDbgapi(CMakePackage):
     license("MIT")
 
     version("master", branch="amd-master")
+    version("6.0.0", sha256="4e823eba255e46b93aff05fd5938ef2a51693ffd74debebffc1aabfce613805c")
     version("5.7.1", sha256="0ee9c2f083868849f2ea0cec7010e0270c27e7679ccbbadd12072cc0ef6c8a6f")
     version("5.7.0", sha256="285ddded8e7f1981d8861ffc1cd7770b78129e4955da08ad55a4779945699716")
     version("5.6.1", sha256="c7241bf94bdb97a4cf1befbf25b8c35720797710da6f6b5b9d6a4094c1bc9c8b")
@@ -144,12 +145,13 @@ class RocmDbgapi(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
         "master",
     ]:
         depends_on("hsa-rocr-dev@" + ver, type="build", when="@" + ver)
         depends_on("comgr@" + ver, type=("build", "link"), when="@" + ver)
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
     @classmethod
diff --git a/var/spack/repos/builtin/packages/rocm-debug-agent/package.py b/var/spack/repos/builtin/packages/rocm-debug-agent/package.py
index 5f95ebf8e4..a397fb6f56 100644
--- a/var/spack/repos/builtin/packages/rocm-debug-agent/package.py
+++ b/var/spack/repos/builtin/packages/rocm-debug-agent/package.py
@@ -11,13 +11,14 @@ from spack.package import *
 class RocmDebugAgent(CMakePackage):
     """Radeon Open Compute (ROCm) debug agent"""
 
-    homepage = "https://github.com/ROCm-Developer-Tools/rocr_debug_agent"
-    git = "https://github.com/ROCm-Developer-Tools/rocr_debug_agent.git"
-    url = "https://github.com/ROCm-Developer-Tools/rocr_debug_agent/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocr_debug_agent"
+    git = "https://github.com/ROCm/rocr_debug_agent.git"
+    url = "https://github.com/ROCm/rocr_debug_agent/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
     libraries = ["librocm-debug-agent"]
+    version("6.0.0", sha256="705be2c2bd0f5c7d1e286eb9b94045b2bd017ff323f07bca9aa7c81f2d168524")
     version("5.7.1", sha256="3b8d2835935da98f41e7cfc5b808c596ac06dd705b9a07bb70283e002f8dea6a")
     version("5.7.0", sha256="d9344ed02e82a01140f2162e901e6a519e5fee6b498e2f49417730ee2660c5c1")
     version("5.6.1", sha256="d3b1d5d757489ed3cc66d351cec56b7b850aaa7ecf6a55b0350b89c3dee3153a")
@@ -105,7 +106,7 @@ class RocmDebugAgent(CMakePackage):
     )
 
     def url_for_version(self, version):
-        url = "https://github.com/ROCm-Developer-Tools/rocr_debug_agent/archive/"
+        url = "https://github.com/ROCm/rocr_debug_agent/archive/"
         if version <= Version("3.7.0"):
             url += "roc-{0}.tar.gz".format(version)
         else:
@@ -146,6 +147,7 @@ class RocmDebugAgent(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
         depends_on("hsakmt-roct@" + ver, when="@" + ver)
@@ -179,14 +181,15 @@ class RocmDebugAgent(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocm-dbgapi@" + ver, when="@" + ver)
         depends_on("hip@" + ver, when="@" + ver)
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
-    # https://github.com/ROCm-Developer-Tools/rocr_debug_agent/pull/4
+    # https://github.com/ROCm/rocr_debug_agent/pull/4
     patch("0001-Drop-overly-strict-Werror-flag.patch", when="@3.7.0:")
     patch("0002-add-hip-architecture.patch", when="@3.9.0:")
 
diff --git a/var/spack/repos/builtin/packages/rocm-device-libs/package.py b/var/spack/repos/builtin/packages/rocm-device-libs/package.py
index b83682d120..6ba87f4dab 100644
--- a/var/spack/repos/builtin/packages/rocm-device-libs/package.py
+++ b/var/spack/repos/builtin/packages/rocm-device-libs/package.py
@@ -10,14 +10,15 @@ from spack.package import *
 class RocmDeviceLibs(CMakePackage):
     """set of AMD specific device-side language runtime libraries"""
 
-    homepage = "https://github.com/RadeonOpenCompute/ROCm-Device-Libs"
-    git = "https://github.com/RadeonOpenCompute/ROCm-Device-Libs.git"
-    url = "https://github.com/RadeonOpenCompute/ROCm-Device-Libs/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/ROCm-Device-Libs"
+    git = "https://github.com/ROCm/ROCm-Device-Libs.git"
+    url = "https://github.com/ROCm/ROCm-Device-Libs/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath", "haampie")
 
     version("master", branch="amd-stg-open")
+    version("6.0.0", sha256="198df4550d4560537ba60ac7af9bde31d59779c8ec5d6309627f77a43ab6ef6f")
     version("5.7.1", sha256="703de8403c0bd0d80f37c970a698f10f148daf144d34f982e4484d04f7c7bbef")
     version("5.7.0", sha256="0f8780b9098573f1c456bdc84358de924dcf00604330770a383983e1775bf61e")
     version("5.6.1", sha256="f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c")
@@ -146,11 +147,12 @@ class RocmDeviceLibs(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
         "master",
     ]:
         depends_on("llvm-amdgpu@" + ver, when="@" + ver)
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
     def cmake_args(self):
diff --git a/var/spack/repos/builtin/packages/rocm-gdb/package.py b/var/spack/repos/builtin/packages/rocm-gdb/package.py
index 8c29704b29..5a7c06d8eb 100644
--- a/var/spack/repos/builtin/packages/rocm-gdb/package.py
+++ b/var/spack/repos/builtin/packages/rocm-gdb/package.py
@@ -11,13 +11,14 @@ class RocmGdb(AutotoolsPackage):
     """This is ROCmgdb, the ROCm source-level debugger for Linux,
     based on GDB, the GNU source-level debugger."""
 
-    homepage = "https://github.com/ROCm-Developer-Tools/ROCgdb/"
-    url = "https://github.com/ROCm-Developer-Tools/ROCgdb/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/ROCgdb"
+    url = "https://github.com/ROCm/ROCgdb/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     license("LGPL-2.0-or-later")
 
     maintainers("srekolam", "renjithravindrankannath")
+    version("6.0.0", sha256="0db4ab32ca729e69688cdb238df274ce5cf58b5cb2538584662cca4358708c2b")
     version("5.7.1", sha256="5cd150b5796aea9d77efd43b89d30a34fa4125338179eb87c6053abcac9f3c62")
     version("5.7.0", sha256="94fba57b2f17b593de61f7593b404fabc00b054d38567be57d12cf7654b7969a")
     version("5.6.1", sha256="d2b40d4c5aa41a6ce2a84307627b30d16a458672e03e13f9d27c12f2dc3f21d6")
@@ -145,11 +146,12 @@ class RocmGdb(AutotoolsPackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocm-dbgapi@" + ver, type="link", when="@" + ver)
         depends_on("comgr@" + ver, type="link", when="@" + ver)
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
     build_directory = "spack-build"
@@ -160,7 +162,7 @@ class RocmGdb(AutotoolsPackage):
             # Distributor options
             "--program-prefix=roc",
             "--enable-64-bit-bfd",
-            "--with-bugurl=https://github.com/ROCm-Developer-Tools/ROCgdb/issues",
+            "--with-bugurl=https://github.com/ROCm/ROCgdb/issues",
             "--with-pkgversion=-ROCm",
             "--enable-targets=x86_64-linux-gnu,amdgcn-amd-amdhsa",
             "--disable-ld",
diff --git a/var/spack/repos/builtin/packages/rocm-opencl/package.py b/var/spack/repos/builtin/packages/rocm-opencl/package.py
index 9435c1a8ec..8aa0b0a391 100644
--- a/var/spack/repos/builtin/packages/rocm-opencl/package.py
+++ b/var/spack/repos/builtin/packages/rocm-opencl/package.py
@@ -12,8 +12,8 @@ from spack.package import *
 class RocmOpencl(CMakePackage):
     """OpenCL: Open Computing Language on ROCclr"""
 
-    homepage = "https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime"
-    git = "https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git"
+    homepage = "https://github.com/ROCm/ROCm-OpenCL-Runtime"
+    git = "https://github.com/ROCm/ROCm-OpenCL-Runtime.git"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
@@ -146,9 +146,7 @@ class RocmOpencl(CMakePackage):
     ]:
         resource(
             name="rocclr",
-            url="https://github.com/ROCm-Developer-Tools/ROCclr/archive/rocm-{0}.tar.gz".format(
-                d_version
-            ),
+            url="https://github.com/ROCm/ROCclr/archive/rocm-{0}.tar.gz".format(d_version),
             sha256=d_shasum,
             expand=True,
             destination="",
diff --git a/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py b/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py
index 836698b92b..d23a487914 100644
--- a/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py
+++ b/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py
@@ -8,8 +8,8 @@ import re
 
 from spack.package import *
 
-tools_url = "https://github.com/ROCm-Developer-Tools"
-compute_url = "https://github.com/RadeonOpenCompute"
+tools_url = "https://github.com/ROCm"
+compute_url = "https://github.com/ROCm"
 
 # Arrays of hashes are in order of the versions array below
 # For example array[0] = 3.9.0, array[1] = 3.10.0, etc.
@@ -41,6 +41,7 @@ aomp = [
     "6c051bf7625f682ba3d2ea80b46a38ca2cbcd20f5d89ae3433602d3e7ef0403a",
     "4f34fa02db410808c5e629f30f8804210b42c4ff7d31aa80606deaed43054c3c",
     "ed7bbf92230b6535a353ed032a39a9f16e9987397798100392fc25e40c8a1a4e",
+    "1b2c0934ef16e17b2377944fae8c9b3db6dc64b7e43932ddfe2eeefdf6821410",
 ]
 
 devlib = [
@@ -70,6 +71,7 @@ devlib = [
     "f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c",
     "0f8780b9098573f1c456bdc84358de924dcf00604330770a383983e1775bf61e",
     "703de8403c0bd0d80f37c970a698f10f148daf144d34f982e4484d04f7c7bbef",
+    "198df4550d4560537ba60ac7af9bde31d59779c8ec5d6309627f77a43ab6ef6f",
 ]
 
 llvm = [
@@ -99,6 +101,7 @@ llvm = [
     "045e43c0c4a3f4f2f1db9fb603a4f1ea3d56e128147e19ba17909eb57d7f08e5",
     "4abdf00b297a77c5886cedb37e63acda2ba11cb9f4c0a64e133b05800aadfcf0",
     "6b54c422e45ad19c9bf5ab090ec21753e7f7d854ca78132c30eb146657b168eb",
+    "c673708d413d60ca8606ee75c77e9871b6953c59029c987b92f2f6e85f683626",
 ]
 
 flang = [
@@ -128,6 +131,7 @@ flang = [
     "5ebcbca2e03bd0686e677f44ea551e97bd9395c6b119f832fa784818733aa652",
     "cc4f1973b1b8e7bcc4f09e3381bae4e1a2e51ea4e2598fc1b520ccb8bf24d28c",
     "8fd618d81af092416b267c4d00c801731f7a00c0f8d4aedb795e52a4ec1bf183",
+    "fcb319ddb2aa3004a6ae60370ab4425f529336b1cee50f29200e697e61b53586",
 ]
 
 extras = [
@@ -157,6 +161,7 @@ extras = [
     "437e2017cfe2ab73b15ada0fc1ea88f794f0b108cc5410f457268ae7e4e8985a",
     "be59433dd85d4b8f0eaff87e0cc424a814152c67f3a682d1343c4bd61dd49a0f",
     "8060c6879708faf5f7d417b19a479dec9b7b9583a1b885f12d247faf831f7f0b",
+    "f37e1107e4da5b083e794244f3d0c9fd073ccb6fd6015e635349d8f0d679c4b8",
 ]
 
 versions = [
@@ -186,6 +191,7 @@ versions = [
     "5.6.1",
     "5.7.0",
     "5.7.1",
+    "6.0.0",
 ]
 versions_dict = dict()  # type: Dict[str,Dict[str,str]]
 components = ["aomp", "devlib", "llvm", "flang", "extras"]
@@ -203,12 +209,13 @@ class RocmOpenmpExtras(Package):
     """OpenMP support for ROCm LLVM."""
 
     homepage = tools_url + "/aomp"
-    url = tools_url + "/aomp/archive/rocm-5.5.0.tar.gz"
+    url = tools_url + "/aomp/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     license("Apache-2.0")
 
     maintainers("srekolam", "renjithravindrankannath", "estewart08")
+    version("6.0.0", sha256=versions_dict["6.0.0"]["aomp"])
     version("5.7.1", sha256=versions_dict["5.7.1"]["aomp"])
     version("5.7.0", sha256=versions_dict["5.7.0"]["aomp"])
     version("5.6.1", sha256=versions_dict["5.6.1"]["aomp"])
@@ -243,8 +250,8 @@ class RocmOpenmpExtras(Package):
     depends_on("awk", type="build")
     depends_on("elfutils", type=("build", "link"))
     depends_on("libffi", type=("build", "link"))
-    depends_on("libdrm", when="@5.7")
-    depends_on("numactl", when="@5.7")
+    depends_on("libdrm", when="@5.7:6.0")
+    depends_on("numactl", when="@5.7:6.0")
 
     for ver in [
         "3.9.0",
@@ -273,13 +280,14 @@ class RocmOpenmpExtras(Package):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hsakmt-roct@" + ver, when="@" + ver)
         depends_on("comgr@" + ver, when="@" + ver)
         depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
         depends_on("llvm-amdgpu@{0} ~openmp".format(ver), when="@" + ver)
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
         # tag changed to 'rocm-' in 4.0.0
@@ -327,7 +335,7 @@ class RocmOpenmpExtras(Package):
             placement="llvm-project",
             when="@" + ver,
         )
-    patch("0001-Linking-hsakmt-libdrm-and-numactl-libraries.patch", when="@5.7")
+    patch("0001-Linking-hsakmt-libdrm-and-numactl-libraries.patch", when="@5.7:6.0")
 
     def setup_run_environment(self, env):
         devlibs_prefix = self.spec["llvm-amdgpu"].prefix
@@ -497,7 +505,7 @@ class RocmOpenmpExtras(Package):
         devlibs_src = "{0}/rocm-openmp-extras/rocm-device-libs".format(src)
         hsa_prefix = self.spec["hsa-rocr-dev"].prefix
         hsakmt_prefix = self.spec["hsakmt-roct"].prefix
-        if self.spec.satisfies("@5.7"):
+        if self.spec.satisfies("@5.7:6.0"):
             libdrm_prefix = self.spec["libdrm"].prefix
             numactl_prefix = self.spec["numactl"].prefix
         comgr_prefix = self.spec["comgr"].prefix
@@ -576,7 +584,7 @@ class RocmOpenmpExtras(Package):
             "-DCMAKE_CXX_FLAGS=-isystem{0} -I{1}".format(elfutils_inc, ffi_inc),
             "-DNEW_BC_PATH=1",
         ]
-        if self.spec.satisfies("@5.7"):
+        if self.spec.satisfies("@5.7:6.0"):
             openmp_common_args += [
                 "-DLIBDRM_LIB={0}/lib".format(libdrm_prefix),
                 "-DHSAKMT_INC_PATH={0}/include".format(hsakmt_prefix),
diff --git a/var/spack/repos/builtin/packages/rocm-smi-lib/package.py b/var/spack/repos/builtin/packages/rocm-smi-lib/package.py
index fdd2bf216c..23af4a7653 100644
--- a/var/spack/repos/builtin/packages/rocm-smi-lib/package.py
+++ b/var/spack/repos/builtin/packages/rocm-smi-lib/package.py
@@ -16,15 +16,16 @@ class RocmSmiLib(CMakePackage):
     """It is a C library for Linux that provides a user space interface
     for applications to monitor and control GPU applications."""
 
-    homepage = "https://github.com/RadeonOpenCompute/rocm_smi_lib"
-    git = "https://github.com/RadeonOpenCompute/rocm_smi_lib.git"
-    url = "https://github.com/RadeonOpenCompute/rocm_smi_lib/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocm_smi_lib"
+    git = "https://github.com/ROCm/rocm_smi_lib.git"
+    url = "https://github.com/ROCm/rocm_smi_lib/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
     libraries = ["librocm_smi64"]
 
     version("master", branch="master")
+    version("6.0.0", sha256="0053b42402fd007e5ca9b3186c70f2c6f1b3026558f328722adadc2838c51309")
     version("5.7.1", sha256="4d79cb0482b2f801cc7824172743e3dd2b44b9f6784d1ca2e5067f2fbb4ef803")
     version("5.7.0", sha256="a399db3d9fc113ce2dd1ab5608a1cf9129ec4b6a2a79ab7922b1d9f43c454640")
     version("5.6.1", sha256="9e94f9a941202c3d7ce917fd1cd78c4e0f06f48d6c929f3aa916378ccef1e02c")
@@ -116,7 +117,7 @@ class RocmSmiLib(CMakePackage):
     depends_on("cmake@3:", type="build")
     depends_on("python@3:", type=("build", "run"), when="@3.9.0:")
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
     patch("disable_pdf_generation_with_doxygen_and_latex.patch", when="@4.5.2:5.6")
 
diff --git a/var/spack/repos/builtin/packages/rocm-smi/package.py b/var/spack/repos/builtin/packages/rocm-smi/package.py
index 0cc265c849..4e927b1f01 100644
--- a/var/spack/repos/builtin/packages/rocm-smi/package.py
+++ b/var/spack/repos/builtin/packages/rocm-smi/package.py
@@ -14,11 +14,11 @@ class RocmSmi(MakefilePackage):
     management of your ROCm enabled system
 
     Note: After ROCm 3.9, this project moved to
-          https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/master/python_smi_tools
+          https://github.com/ROCm/rocm_smi_lib/tree/master/python_smi_tools
           The spack package is called: rocm-smi-lib"""
 
-    homepage = "https://github.com/RadeonOpenCompute/ROC-smi"
-    url = "https://github.com/RadeonOpenCompute/ROC-smi/archive/rocm-4.1.0.tar.gz"
+    homepage = "https://github.com/ROCm/ROC-smi"
+    url = "https://github.com/ROCm/ROC-smi/archive/rocm-4.1.0.tar.gz"
 
     maintainers("srekolam", "renjithravindrankannath")
     tags = ["rocm"]
diff --git a/var/spack/repos/builtin/packages/rocm-tensile/package.py b/var/spack/repos/builtin/packages/rocm-tensile/package.py
index c92e4b34d6..8b869452cc 100644
--- a/var/spack/repos/builtin/packages/rocm-tensile/package.py
+++ b/var/spack/repos/builtin/packages/rocm-tensile/package.py
@@ -11,14 +11,15 @@ from spack.pkg.builtin.boost import Boost
 class RocmTensile(CMakePackage):
     """Radeon Open Compute Tensile library"""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/Tensile/"
-    git = "https://github.com/ROCmSoftwarePlatform/Tensile.git"
-    url = "https://github.com/ROCmSoftwarePlatform/Tensile/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/Tensile/"
+    git = "https://github.com/ROCm/Tensile.git"
+    url = "https://github.com/ROCm/Tensile/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     license("MIT")
 
     maintainers("srekolam", "renjithravindrankannath", "haampie")
+    version("6.0.0", sha256="5d90add62d1439b7daf0527316e950e454e5d8beefb4f723865fe9ab26c7aa42")
     version("5.7.1", sha256="9211a51b23c22b7a79e4e494e8ff3c31e90bf21adb8cce260acc57891fb2c917")
     version("5.7.0", sha256="fe2ae067c1c579f33d7a1e26da3fe6b4ed44befa08f9dfce2ceae586f184b816")
     version("5.6.1", sha256="3e78c933563fade8781a1dca2079bff135af2f5d2c6eb0147797d2c1f24d006c")
@@ -166,6 +167,7 @@ class RocmTensile(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocm-cmake@" + ver, type="build", when="@" + ver)
         depends_on("hip@" + ver, when="@" + ver)
@@ -187,6 +189,7 @@ class RocmTensile(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocm-openmp-extras@" + ver, when="@" + ver)
 
@@ -218,11 +221,12 @@ class RocmTensile(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocm-smi-lib@" + ver, type="build", when="@" + ver)
 
     root_cmakelists_dir = "Tensile/Source"
-    # Status: https://github.com/ROCmSoftwarePlatform/Tensile/commit/a488f7dadba34f84b9658ba92ce9ec5a0615a087
+    # Status: https://github.com/ROCm/Tensile/commit/a488f7dadba34f84b9658ba92ce9ec5a0615a087
     # Not yet landed in 3.7.0, nor 3.8.0.
     patch("0001-fix-compile-error.patch", when="@3.7.0:3.8.0")
     patch("0002-require-openmp-when-tensile-use-openmp-is-on.patch", when="@3.9.0:4.0.0")
diff --git a/var/spack/repos/builtin/packages/rocm-validation-suite/009-replacing-rocm-path-with-package-path.patch b/var/spack/repos/builtin/packages/rocm-validation-suite/009-replacing-rocm-path-with-package-path.patch
new file mode 100644
index 0000000000..ae21de8c82
--- /dev/null
+++ b/var/spack/repos/builtin/packages/rocm-validation-suite/009-replacing-rocm-path-with-package-path.patch
@@ -0,0 +1,636 @@
+From 7bb26280b6da667573a581780f97856985b44e4e Mon Sep 17 00:00:00 2001
+From: Renjith Ravindran <Renjith.RavindranKannath@amd.com>
+Date: Fri, 12 Jan 2024 09:31:21 +0000
+Subject: [PATCH] Updating cmake with include and library path for spack
+
+---
+ CMakeLists.txt                 | 21 +++++++++++----------
+ babel.so/CMakeLists.txt        | 18 +++++++++---------
+ cmake_modules/tests_unit.cmake |  3 ++-
+ edp.so/CMakeLists.txt          |  6 +++---
+ gm.so/CMakeLists.txt           |  6 +++---
+ gpup.so/CMakeLists.txt         |  8 ++++----
+ gst.so/CMakeLists.txt          | 10 +++++-----
+ iet.so/CMakeLists.txt          |  6 +++---
+ mem.so/CMakeLists.txt          |  6 +++---
+ pbqt.so/CMakeLists.txt         |  6 +++---
+ pebb.so/CMakeLists.txt         |  4 ++--
+ peqt.so/CMakeLists.txt         |  6 +++---
+ perf.so/CMakeLists.txt         |  8 ++++----
+ pesm.so/CMakeLists.txt         |  8 ++++----
+ rcqt.so/CMakeLists.txt         |  6 +++---
+ rvs/CMakeLists.txt             | 15 ++++++++-------
+ rvs/tests.cmake                |  6 ++++--
+ rvslib/CMakeLists.txt          |  2 +-
+ smqt.so/CMakeLists.txt         |  6 +++---
+ testif.so/CMakeLists.txt       | 20 ++++++++++----------
+ 20 files changed, 88 insertions(+), 83 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index b25eca4..eeee55d 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -70,13 +70,14 @@ endif(rocblas_FOUND)
+ # variables since we will pass them as cmake params appropriately, and 
+ # all find_packages relevant to this build will be in ROCM path hence appending it to CMAKE_PREFIX_PATH 
+ set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCM install path")
+-set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "CMAKE installation directory")
+-set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Prefix used in built packages")
++set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
++set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
++set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+ list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}")
+-set(ROCR_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Runtime" FORCE)
+-set(ROCR_LIB_DIR "${ROCM_PATH}/lib" CACHE PATH "Contains library files exported by ROC Runtime" FORCE)
+-set(HIP_INC_DIR "${ROCM_PATH}" CACHE PATH "Contains header files exported by ROC Runtime")
+-set(ROCT_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Trunk" FORCE)
++set(ROCR_INC_DIR "${HSA_PATH}/include" CACHE PATH "Contains header files exported by ROC Runtime")
++set(ROCR_LIB_DIR "${HSA_PATH}/lib" CACHE PATH "Contains library files exported by ROC Runtime")
++set(HIP_INC_DIR "${HIP_PATH}" CACHE PATH "Contains header files exported by ROC Runtime")
++set(ROCT_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Trunk")
+ 
+ add_definitions(-DROCM_PATH="${ROCM_PATH}")
+ add_definitions(-DRVS_LIB_PATH="${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/rvs")
+@@ -420,8 +421,8 @@ if (RVS_ROCBLAS EQUAL 1)
+   set(ROCBLAS_INC_DIR "${CMAKE_BINARY_DIR}/rvs_rblas-src/build/release/rocblas-install")
+   set(ROCBLAS_LIB_DIR "${CMAKE_BINARY_DIR}/rvs_rblas-src/build/release/rocblas-install/lib/")
+ else()
+-  set(ROCBLAS_INC_DIR "${ROCM_PATH}/include")
+-  set(ROCBLAS_LIB_DIR "${ROCM_PATH}/lib")
++  set(ROCBLAS_INC_DIR "${ROCBLAS_DIR}/include")
++  set(ROCBLAS_LIB_DIR "${ROCBLAS_DIR}/lib")
+ endif()
+ 
+ if (RVS_ROCMSMI EQUAL 1)
+@@ -436,8 +437,8 @@ else()
+     set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
+   else()
+     message( STATUS "ROCBLAS REORG Enabled Version: ${RVS_ROCBLAS_VERSION_FLAT}" )
+-    set(ROCM_SMI_INC_DIR "${ROCM_PATH}/include")
+-    set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/lib")
++    set(ROCM_SMI_INC_DIR "${ROCM_SMI_DIR}/include")
++    set(ROCM_SMI_LIB_DIR "${ROCM_SMI_DIR}/lib")
+   endif()
+ endif()
+ set(ROCM_SMI_LIB "rocm_smi64" CACHE STRING "rocm_smi library name")
+diff --git a/babel.so/CMakeLists.txt b/babel.so/CMakeLists.txt
+index f163dae..fa85b38 100644
+--- a/babel.so/CMakeLists.txt
++++ b/babel.so/CMakeLists.txt
+@@ -107,13 +107,13 @@ set(HIP_HCC_LIB "amdhip64")
+ add_compile_options(-DRVS_ROCBLAS_VERSION_FLAT=${RVS_ROCBLAS_VERSION_FLAT})
+ 
+ # Determine Roc Runtime header files are accessible
+-if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime.h)
+-  message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR})
++if(NOT EXISTS ${HIP_PATH}/include/hip/hip_runtime.h)
++	message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH})
+   RETURN()
+ endif()
+ 
+-if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime_api.h)
+-  message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR})
++if(NOT EXISTS ${HIP_PATH}/include/hip/hip_runtime_api.h)
++	message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH})
+   RETURN()
+ endif()
+ 
+@@ -133,16 +133,16 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+ 
+ 
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
+-  message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
++if(NOT EXISTS "${HIP_PATH}/lib/lib${HIP_HCC_LIB}.so")
++	message("ERROR: ROC Runtime libraries can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH})
+   RETURN()
+ endif()
+ 
+ ## define include directories
+-include_directories(./ ../ ${ROCR_INC_DIR} ${HIP_INC_DIR})
++include_directories(./ ../ ${HIP_PATH})
+ 
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${HIP_PATH}/lib/ ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
+ 
+@@ -154,7 +154,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB})
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB} ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/cmake_modules/tests_unit.cmake b/cmake_modules/tests_unit.cmake
+index e0e9f88..7321e0a 100644
+--- a/cmake_modules/tests_unit.cmake
++++ b/cmake_modules/tests_unit.cmake
+@@ -27,7 +27,7 @@
+ ## define additional unit testing include directories
+ include_directories(${UT_INC})
+ ## define additional unit testing lib directories
+-link_directories(${UT_LIB} ${RVS_LIB_DIR})
++link_directories(${UT_LIB} ${RVS_LIB_DIR} ${ROCM_SMI_LIB_DIR})
+ 
+ file(GLOB TESTSOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} test/test*.cpp )
+ #message ( "TESTSOURCES: ${TESTSOURCES}" )
+@@ -45,6 +45,7 @@ FOREACH(SINGLE_TEST ${TESTSOURCES})
+   )
+   target_link_libraries(${TEST_NAME}
+     ${UT_LINK_LIBS}  rvslibut rvslib gtest_main gtest pthread pci
++    ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so
+   )
+   target_compile_definitions(${TEST_NAME} PUBLIC RVS_UNIT_TEST)
+   if(DEFINED tcd.${TEST_NAME})
+diff --git a/edp.so/CMakeLists.txt b/edp.so/CMakeLists.txt
+index 7dd34ea..7978abe 100644
+--- a/edp.so/CMakeLists.txt
++++ b/edp.so/CMakeLists.txt
+@@ -134,11 +134,11 @@ if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
+ endif()
+ 
+ ## define include directories
+-include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR})
++include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR} ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpciaccess.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpciaccess.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ 
+ ## define source files
+ set (SOURCES src/rvs_module.cpp src/action.cpp src/edp_worker.cpp )
+diff --git a/gm.so/CMakeLists.txt b/gm.so/CMakeLists.txt
+index d3caa84..73b83ce 100644
+--- a/gm.so/CMakeLists.txt
++++ b/gm.so/CMakeLists.txt
+@@ -118,11 +118,11 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+ 
+ ## define include directories
+-include_directories(./ ../ ${ROCM_SMI_INC_DIR})
++include_directories(./ ../ ${ROCM_SMI_INC_DIR} ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+ link_directories(${RVS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so librocm_smi64.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ 
+ ## define source files
+ set(SOURCES  src/rvs_module.cpp src/action.cpp src/worker.cpp)
+@@ -133,7 +133,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${ROCM_SMI_LIB})
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/gpup.so/CMakeLists.txt b/gpup.so/CMakeLists.txt
+index 43d337a..a234feb 100644
+--- a/gpup.so/CMakeLists.txt
++++ b/gpup.so/CMakeLists.txt
+@@ -109,11 +109,11 @@ else()
+ endif()
+ 
+ ## define include directories
+-include_directories(./ ../ include ../include)
++include_directories(./ ../ include ../include ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) 
+ 
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp)
+@@ -124,7 +124,7 @@ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+ 
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/gst.so/CMakeLists.txt b/gst.so/CMakeLists.txt
+index fd346ce..cb8c4b6 100644
+--- a/gst.so/CMakeLists.txt
++++ b/gst.so/CMakeLists.txt
+@@ -137,17 +137,17 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+ 
+ 
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so")
+   message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+   RETURN()
+ endif()
+ 
+ ## define include directories
+-include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR})
++include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR} ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib/ ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ 
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/gst_worker.cpp)
+@@ -157,7 +157,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB})
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB} )
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/iet.so/CMakeLists.txt b/iet.so/CMakeLists.txt
+index a85ca98..252e565 100644
+--- a/iet.so/CMakeLists.txt
++++ b/iet.so/CMakeLists.txt
+@@ -140,7 +140,7 @@ if(DEFINED RVS_ROCMSMI)
+   endif()
+ endif()
+ 
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so")
+   message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+   RETURN()
+ endif()
+@@ -159,7 +159,7 @@ include_directories(./ ../ ${ROCM_SMI_INC_DIR} ${ROCBLAS_INC_DIR} ${ROCR_INC_DIR
+ # Add directories to look for library files to link
+ link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so librocm_smi64.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ 
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/iet_worker.cpp )
+ 
+@@ -168,7 +168,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB} ${ROCM_SMI_LIB})
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_INC_DIR}/lib/ ${HIP_HCC_LIB} ${ROCBLAS_LIB})
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/mem.so/CMakeLists.txt b/mem.so/CMakeLists.txt
+index 5133337..2462bbc 100644
+--- a/mem.so/CMakeLists.txt
++++ b/mem.so/CMakeLists.txt
+@@ -134,7 +134,7 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+ 
+ 
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so")
+   message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+   RETURN()
+ endif()
+@@ -143,9 +143,9 @@ endif()
+ include_directories(./ ../ ${ROCR_INC_DIR} ${HIP_INC_DIR})
+ 
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ 
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/rvs_memtest.cpp src/rvs_memworker.cpp)
+diff --git a/pbqt.so/CMakeLists.txt b/pbqt.so/CMakeLists.txt
+index 5ae675a..892b6ac 100644
+--- a/pbqt.so/CMakeLists.txt
++++ b/pbqt.so/CMakeLists.txt
+@@ -136,11 +136,11 @@ if(NOT EXISTS ${ROCR_LIB_DIR}/${CORE_RUNTIME_LIBRARY}.so)
+ endif()
+ 
+ ## define include directories
+-include_directories(./ ../ pci ${ROCR_INC_DIR})
++include_directories(./ ../ pci ${ROCR_INC_DIR} ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HSAKMT_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ 
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/action_run.cpp
+diff --git a/pebb.so/CMakeLists.txt b/pebb.so/CMakeLists.txt
+index c4e2964..7a6b368 100644
+--- a/pebb.so/CMakeLists.txt
++++ b/pebb.so/CMakeLists.txt
+@@ -139,9 +139,9 @@ endif()
+ ## define include directories
+ include_directories(./ ../ pci ${ROCR_INC_DIR})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HSAKMT_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR} )
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ 
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/action_run.cpp
+diff --git a/peqt.so/CMakeLists.txt b/peqt.so/CMakeLists.txt
+index ead507d..567358b 100644
+--- a/peqt.so/CMakeLists.txt
++++ b/peqt.so/CMakeLists.txt
+@@ -107,9 +107,9 @@ else()
+ endif()
+ 
+ ## define include directories
+-include_directories(./ ../)
++include_directories(./ ../ ${HSA_PATH})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${HSA_PATH}/lib/ ${HSAKMT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR} ${YAML_CPP_INCLUDE_DIRS})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslib libpci.so libm.so)
+ 
+@@ -121,7 +121,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/perf.so/CMakeLists.txt b/perf.so/CMakeLists.txt
+index 518dac9..02d2245 100644
+--- a/perf.so/CMakeLists.txt
++++ b/perf.so/CMakeLists.txt
+@@ -137,7 +137,7 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+ 
+ 
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so")
+   message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+   RETURN()
+ endif()
+@@ -145,9 +145,9 @@ endif()
+ ## define include directories
+ include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ 
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/perf_worker.cpp)
+@@ -157,7 +157,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB})
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/pesm.so/CMakeLists.txt b/pesm.so/CMakeLists.txt
+index 1f27f34..20a8bed 100644
+--- a/pesm.so/CMakeLists.txt
++++ b/pesm.so/CMakeLists.txt
+@@ -107,11 +107,11 @@ else()
+ endif()
+ 
+ ## define include directories
+-include_directories(./ ../ pci)
++include_directories(./ ../ pci ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so ${PROJECT_LINK_LIBS} ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ 
+ ## define source files
+ set(SOURCES  src/rvs_module.cpp src/action.cpp src/worker.cpp)
+@@ -121,7 +121,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/rcqt.so/CMakeLists.txt b/rcqt.so/CMakeLists.txt
+index c0099ab..8d92982 100644
+--- a/rcqt.so/CMakeLists.txt
++++ b/rcqt.so/CMakeLists.txt
+@@ -108,11 +108,11 @@ else()
+ endif()
+ 
+ ## define include directories
+-include_directories(./ ../)
++include_directories(./ ../ ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ASAN_LIB_PATH} ${HSAKMT_LIB_DIR} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib)
++set (PROJECT_LINK_LIBS rvslib ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ 
+ ## define source files
+ set(SOURCES 
+diff --git a/rvs/CMakeLists.txt b/rvs/CMakeLists.txt
+index 527d474..76a5efd 100644
+--- a/rvs/CMakeLists.txt
++++ b/rvs/CMakeLists.txt
+@@ -113,21 +113,22 @@ else()
+ endif()
+ 
+ ## define include directories
+-include_directories(./ ../ ${YAML_CPP_INCLUDE_DIRS})
++include_directories(./ ../ ${YAML_INC_DIR})
+ ## define lib directories
+-link_directories(${CMAKE_CURRENT_BINARY_DIR} ${RVS_LIB_DIR} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${CMAKE_CURRENT_BINARY_DIR} ${RVS_LIB_DIR} ${RVS_LIB_DIR}/.. ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR} ${YAML_CPP_LIBRARIES} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR} )
+ 
+ ## additional libraries
+-set(ROCBLAS_LIB "rocblas")
+-set(ROC_THUNK_NAME "hsakmt")
+-set(CORE_RUNTIME_NAME "hsa-runtime")
++set(ROCBLAS_LIB "${ROCBLAS_LIB_DIR}/librocblas.so")
++set(ROC_THUNK_NAME "${HSAKMT_LIB_DIR}/libhsakmt.a")
++set(CORE_RUNTIME_NAME "${HSA_PATH}/lib/libhsa-runtime64.so")
++set(YAML_CPP_LIB "${YAML_INC_DIR}/../lib64/libyaml-cpp.a")
+ set(CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64")
+-set(PROJECT_LINK_LIBS libdl.so libpthread.so libpci.so ${YAML_CPP_LIBRARIES})
++set(PROJECT_LINK_LIBS libdl.so libpthread.so libpci.so)
+ 
+ ## define target
+ add_executable(${RVS_TARGET} src/rvs.cpp)
+ target_link_libraries(${RVS_TARGET} rvslib
+-  ${ROCBLAS_LIB} ${ROCM_SMI_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${PROJECT_LINK_LIBS})
++  ${ROCBLAS_LIB} ${ROCM_SMI_LIB} ${ROC_THUNK_NAME} ${PROJECT_LINK_LIBS} ${CORE_RUNTIME_NAME} ${YAML_CPP_LIB})
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ install(TARGETS ${RVS_TARGET}
+diff --git a/rvs/tests.cmake b/rvs/tests.cmake
+index 38ae3fb..0d62675 100644
+--- a/rvs/tests.cmake
++++ b/rvs/tests.cmake
+@@ -41,7 +41,8 @@ link_directories(${RVS_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ROCT_LI
+ ## define target for "test-to-fail"
+ add_executable(${RVS_TARGET}fail src/rvs.cpp)
+ target_link_libraries(${RVS_TARGET}fail rvslib rvslibut ${PROJECT_LINK_LIBS}
+-  ${ROCM_SMI_LIB} ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET})
++  ${ROCM_SMI_LIB} ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET}
++  ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ 
+ target_compile_definitions(${RVS_TARGET}fail PRIVATE RVS_INVERT_RETURN_STATUS)
+ set_target_properties(${RVS_TARGET}fail PROPERTIES
+@@ -187,7 +188,7 @@ add_test(NAME unit.ttf.rvs.config.noconfig
+ )
+ 
+ ## define include directories
+-include_directories(${UT_INC})
++include_directories(${UT_INC} ${YAML_INC_DIR})
+ ## define lib directories
+ link_directories(${UT_LIB} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ROCT_LIB_DIR})
+ ## additional libraries for unit tests
+@@ -211,6 +212,7 @@ FOREACH(SINGLE_TEST ${TESTSOURCES})
+     ${PROJECT_TEST_LINK_LIBS}
+     rvslib rvslibut gtest_main gtest pthread
+     ${ROCM_SMI_LIB} ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET}
++    ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so
+   )
+   add_dependencies(${TEST_NAME} rvs_gtest_target)
+ 
+diff --git a/rvslib/CMakeLists.txt b/rvslib/CMakeLists.txt
+index 8d29590..d52aee3 100644
+--- a/rvslib/CMakeLists.txt
++++ b/rvslib/CMakeLists.txt
+@@ -116,7 +116,7 @@ endif()
+ 
+ ## define include directories
+ include_directories(./ ../ ../rvs
+-  ${ROCM_SMI_INC_DIR} ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR})
++  ${ROCM_SMI_INC_DIR} ${HIP_PATH} ${ROCBLAS_INC_DIR} ${YAML_INC_DIR})
+ 
+ link_directories(${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ 
+diff --git a/smqt.so/CMakeLists.txt b/smqt.so/CMakeLists.txt
+index 042586f..0133c00 100644
+--- a/smqt.so/CMakeLists.txt
++++ b/smqt.so/CMakeLists.txt
+@@ -106,11 +106,11 @@ else()
+ endif()
+ 
+ ## define include directories
+-include_directories(./ ../ pci)
++include_directories(./ ../ pci ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslib libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslib libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ 
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp)
+diff --git a/testif.so/CMakeLists.txt b/testif.so/CMakeLists.txt
+index 4cba0f9..34b491e 100644
+--- a/testif.so/CMakeLists.txt
++++ b/testif.so/CMakeLists.txt
+@@ -108,11 +108,11 @@ endif()
+ 
+ 
+ ## define include directories
+-include_directories(./ ../ pci)
++include_directories(./ ../ pci ${YAML_CPP_INCLUDE_DIRS})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so)
+ 
+ ## define source files
+ ## set(SOURCES  src/rvs_module.cpp src/action.cpp src/worker.cpp)
+@@ -124,7 +124,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if_methods.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+@@ -145,7 +145,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if0.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+@@ -166,7 +166,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if0_methods.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+@@ -187,7 +187,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if1.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+@@ -208,7 +208,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if1_methods.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+@@ -229,7 +229,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_fail_init.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+@@ -250,7 +250,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_fail_create_action.cpp)
+ set_target_properties(${RVS_TARGET} PROPERTIES
+         SUFFIX .so.${LIB_VERSION_STRING}
+         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} )
++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslib)
+ 
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+-- 
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/rocm-validation-suite/package.py b/var/spack/repos/builtin/packages/rocm-validation-suite/package.py
index 52e267f580..03b1c0d45e 100644
--- a/var/spack/repos/builtin/packages/rocm-validation-suite/package.py
+++ b/var/spack/repos/builtin/packages/rocm-validation-suite/package.py
@@ -15,13 +15,14 @@ class RocmValidationSuite(CMakePackage):
     computing environment, enabled using the ROCm software stack on a
     compatible platform."""
 
-    homepage = "https://github.com/ROCm-Developer-Tools/ROCmValidationSuite"
-    url = "https://github.com/ROCm-Developer-Tools/ROCmValidationSuite/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/ROCmValidationSuite"
+    url = "https://github.com/ROCm/ROCmValidationSuite/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     license("MIT")
 
     maintainers("srekolam", "renjithravindrankannath")
+    version("6.0.0", sha256="a84e36b5e50e70ba033fb6bc6fa99da2e32bf7eaef2098df3164365a77a8f14c")
     version("5.7.1", sha256="202f2b6e014bbbeec40af5d3ec630c042f09a61087a77bd70715d81044ea4d65")
     version("5.7.0", sha256="f049b7786a220e9b6dfe099f17727dd0d9e41be9e680fe8309eae400cc5536ea")
     version("5.6.1", sha256="d5e4100e2d07311dfa101563c15d026a8130442cdee8af9ef861832cd7866c0d")
@@ -122,9 +123,8 @@ class RocmValidationSuite(CMakePackage):
         "007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.6.patch",
         when="@5.6",
     )
-    patch(
-        "008-correcting-library-and-include-path-WITHOUT-RVS-BUILD-TESTS.patch", when="@5.7.0:5.7"
-    )
+    patch("008-correcting-library-and-include-path-WITHOUT-RVS-BUILD-TESTS.patch", when="@5.7")
+    patch("009-replacing-rocm-path-with-package-path.patch", when="@6.0")
     depends_on("cmake@3.5:", type="build")
     depends_on("zlib-api", type="link")
     depends_on("yaml-cpp~shared")
@@ -165,6 +165,7 @@ class RocmValidationSuite(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hip@" + ver, when="@" + ver)
         depends_on("rocminfo@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocminfo/package.py b/var/spack/repos/builtin/packages/rocminfo/package.py
index 3d70c7024b..a71259914a 100644
--- a/var/spack/repos/builtin/packages/rocminfo/package.py
+++ b/var/spack/repos/builtin/packages/rocminfo/package.py
@@ -10,14 +10,15 @@ from spack.package import *
 class Rocminfo(CMakePackage):
     """Radeon Open Compute (ROCm) Runtime rocminfo tool"""
 
-    homepage = "https://github.com/RadeonOpenCompute/rocminfo"
-    git = "https://github.com/RadeonOpenCompute/rocminfo.git"
-    url = "https://github.com/RadeonOpenCompute/rocminfo/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocminfo"
+    git = "https://github.com/ROCm/rocminfo.git"
+    url = "https://github.com/ROCm/rocminfo/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath", "haampie")
 
     version("master", branch="master")
+    version("6.0.0", sha256="bc29f1798644b6dea73895353dffada9db7366d0058274e587ebd3291a4d3844")
     version("5.7.1", sha256="642dc2ec4254b3c30c43064e6690861486db820b25f4906ec78bdb47e68dcd0b")
     version("5.7.0", sha256="a5a3c19513bf26f17f163a03ba5288c5c761619ef55f0cb9e15472771748b93e")
     version("5.6.1", sha256="780b186ac7410a503eca1060f4bbc35db1b7b4d1d714d15c7534cd26d8af7b54")
@@ -136,12 +137,13 @@ class Rocminfo(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
         "master",
     ]:
         depends_on("hsakmt-roct@" + ver, when="@" + ver)
         depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
     def cmake_args(self):
diff --git a/var/spack/repos/builtin/packages/rocmlir/package.py b/var/spack/repos/builtin/packages/rocmlir/package.py
index e7be5107d6..0c57ef3b4f 100644
--- a/var/spack/repos/builtin/packages/rocmlir/package.py
+++ b/var/spack/repos/builtin/packages/rocmlir/package.py
@@ -12,9 +12,9 @@ class Rocmlir(CMakePackage):
     targetting AMD hardware. This generator is mainly used from MIOpen and MIGraphX,
     but it can be used on a standalone basis."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/rocMLIR"
-    git = "https://github.com/ROCmSoftwarePlatform/rocMLIR.git"
-    url = "https://github.com/ROCmSoftwarePlatform/rocMLIR/archive/refs/tags/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocMLIR"
+    git = "https://github.com/ROCm/rocMLIR.git"
+    url = "https://github.com/ROCm/rocMLIR/archive/refs/tags/rocm-6.0.0.tar.gz"
 
     maintainers("srekolam")
     version("5.5.1", commit="8c29325e7e68e3248e863172bf0e7f97055d45ee")
diff --git a/var/spack/repos/builtin/packages/rocprim/package.py b/var/spack/repos/builtin/packages/rocprim/package.py
index a6fd4806c1..fc0e594d15 100644
--- a/var/spack/repos/builtin/packages/rocprim/package.py
+++ b/var/spack/repos/builtin/packages/rocprim/package.py
@@ -9,14 +9,15 @@ from spack.package import *
 class Rocprim(CMakePackage):
     """Radeon Open Compute Parallel Primitives Library"""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/rocPRIM"
-    git = "https://github.com/ROCmSoftwarePlatform/rocPRIM.git"
-    url = "https://github.com/ROCmSoftwarePlatform/rocPRIM/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocPRIM"
+    git = "https://github.com/ROCm/rocPRIM.git"
+    url = "https://github.com/ROCm/rocPRIM/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     license("MIT")
 
     maintainers("cgmb", "srekolam", "renjithravindrankannath")
+    version("6.0.0", sha256="51f26c9f891a64c8db8df51d75d86d404d682092fd9d243e966ac6b2a6de381a")
     version("5.7.1", sha256="15d820a0f61aed60efbba88b6efe6942878b02d912f523f9cf8f33a4583d6cd7")
     version("5.7.0", sha256="a1bf94bbad13a0410b49476771270606d8a9d257188ee3ec3a37eee80540fe9b")
     version("5.6.1", sha256="e9ec1b0039c07cf3096653a04224fe5fe755afc6ba000f6838b3a8bc84df27de")
@@ -147,6 +148,7 @@ class Rocprim(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hip@" + ver, when="@" + ver)
         depends_on("comgr@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocprofiler-dev/package.py b/var/spack/repos/builtin/packages/rocprofiler-dev/package.py
index d87dd3ad2d..b9375fd7ac 100644
--- a/var/spack/repos/builtin/packages/rocprofiler-dev/package.py
+++ b/var/spack/repos/builtin/packages/rocprofiler-dev/package.py
@@ -11,9 +11,9 @@ from spack.package import *
 class RocprofilerDev(CMakePackage):
     """ROCPROFILER library for AMD HSA runtime API extension support"""
 
-    homepage = "https://github.com/ROCm-Developer-Tools/rocprofiler"
-    git = "https://github.com/ROCm-Developer-Tools/rocprofiler.git"
-    url = "https://github.com/ROCm-Developer-Tools/rocprofiler/archive/refs/tags/rocm-5.4.3.tar.gz"
+    homepage = "https://github.com/ROCm/rocprofiler"
+    git = "https://github.com/ROCm/rocprofiler.git"
+    url = "https://github.com/ROCm/rocprofiler/archive/refs/tags/rocm-5.4.3.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
@@ -133,7 +133,7 @@ class RocprofilerDev(CMakePackage):
         depends_on("roctracer-dev-api@" + ver, when="@" + ver)
 
     depends_on("numactl", type="link", when="@4.3.1")
-    # See https://github.com/ROCm-Developer-Tools/rocprofiler/pull/50
+    # See https://github.com/ROCm/rocprofiler/pull/50
     patch("fix-includes.patch")
     patch("0001-Continue-build-in-absence-of-aql-profile-lib.patch", when="@5.3:")
 
diff --git a/var/spack/repos/builtin/packages/rocrand/package.py b/var/spack/repos/builtin/packages/rocrand/package.py
index 775f1eee69..d83857f346 100644
--- a/var/spack/repos/builtin/packages/rocrand/package.py
+++ b/var/spack/repos/builtin/packages/rocrand/package.py
@@ -14,9 +14,9 @@ class Rocrand(CMakePackage):
     """The rocRAND project provides functions that generate
     pseudo-random and quasi-random numbers."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/rocRAND"
-    git = "https://github.com/ROCmSoftwarePlatform/rocRAND.git"
-    url = "https://github.com/ROCmSoftwarePlatform/rocRAND/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocRAND"
+    git = "https://github.com/ROCm/rocRAND.git"
+    url = "https://github.com/ROCm/rocRAND/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("cgmb", "srekolam", "renjithravindrankannath")
@@ -26,6 +26,7 @@ class Rocrand(CMakePackage):
 
     version("develop", branch="develop")
     version("master", branch="master")
+    version("6.0.0", sha256="cee93231c088be524bb2cb0e6093ec47e62e61a55153486bebbc2ca5b3d49360")
     version("5.7.1", sha256="885cd905bbd23d02ba8f3f87d5c0b79bc44bd020ea9af190f3959cf5aa33d07d")
     version("5.7.0", sha256="d6053d986821e5cbc6cfec0778476efb1411ef943f11e7a8b973b1814a259dcf")
     version("5.6.1", sha256="6bf71e687ffa0fcc1b00e3567dd43da4147a82390f1b2db5e6f1f594dee6066d")
@@ -149,7 +150,7 @@ class Rocrand(CMakePackage):
     ]:
         resource(
             name="hipRAND",
-            git="https://github.com/ROCmSoftwarePlatform/hipRAND.git",
+            git="https://github.com/ROCm/hipRAND.git",
             commit=d_commit,
             destination="",
             placement="hiprand",
@@ -157,7 +158,7 @@ class Rocrand(CMakePackage):
         )
     resource(
         name="hipRAND",
-        git="https://github.com/ROCmSoftwarePlatform/hipRAND.git",
+        git="https://github.com/ROCm/hipRAND.git",
         branch="master",
         destination="",
         placement="hiprand",
@@ -165,7 +166,7 @@ class Rocrand(CMakePackage):
     )
     resource(
         name="hipRAND",
-        git="https://github.com/ROCmSoftwarePlatform/hipRAND.git",
+        git="https://github.com/ROCm/hipRAND.git",
         branch="develop",
         destination="",
         placement="hiprand",
@@ -202,6 +203,7 @@ class Rocrand(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hip@" + ver, when="@" + ver)
         depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocsolver/package.py b/var/spack/repos/builtin/packages/rocsolver/package.py
index ea85a69965..576675a371 100644
--- a/var/spack/repos/builtin/packages/rocsolver/package.py
+++ b/var/spack/repos/builtin/packages/rocsolver/package.py
@@ -13,9 +13,9 @@ class Rocsolver(CMakePackage):
     """rocSOLVER is a work-in-progress implementation of a
     subset of LAPACK functionality on the ROCm platform."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/rocSOLVER"
-    git = "https://github.com/ROCmSoftwarePlatform/rocSOLVER.git"
-    url = "https://github.com/ROCmSoftwarePlatform/rocSOLVER/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocSOLVER"
+    git = "https://github.com/ROCm/rocSOLVER.git"
+    url = "https://github.com/ROCm/rocSOLVER/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie")
@@ -41,6 +41,7 @@ class Rocsolver(CMakePackage):
 
     version("develop", branch="develop")
     version("master", branch="master")
+    version("6.0.0", sha256="5fcaba96f3efafc2ecc3f4ec104095d96545c16e1b9f95410bd571cb0fc643ae")
     version("5.7.1", sha256="83e0c137b8690dbeb2e85d9e25415d96bd06979f09f2b10b2aff8e4c9f833fa4")
     version("5.7.0", sha256="bb16d360f14b34fe6e8a6b8ddc6e631672a5ffccbdcb25f0ce319edddd7f9682")
     version("5.6.1", sha256="6a8f366218aee599a0e56755030f94ee690b34f30e6d602748632226c5dc21bb")
@@ -136,7 +137,7 @@ class Rocsolver(CMakePackage):
     depends_on("netlib-lapack@3.7.1:", type="test")
 
     patch("link-clients-blas.patch", when="@4.3.0:4.3.2")
-    # Backport https://github.com/ROCmSoftwarePlatform/rocSOLVER/commit/2bbfb8976f6e4d667499c77e41a6433850063e88
+    # Backport https://github.com/ROCm/rocSOLVER/commit/2bbfb8976f6e4d667499c77e41a6433850063e88
     patch("fmt-8.1-compatibility.patch", when="@4.5.0:5.1.3")
     # Maximize compatibility with other libraries that are using fmt.
     patch("fmt-9-compatibility.patch", when="@5.2.0:5.5")
@@ -180,10 +181,11 @@ class Rocsolver(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hip@" + ver, when="@" + ver)
         depends_on("rocblas@" + ver, when="@" + ver)
-    for ver in ["5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocsparse@5.2:", when="@5.6:")
 
     for tgt in itertools.chain(["auto"], amdgpu_targets):
diff --git a/var/spack/repos/builtin/packages/rocsparse/package.py b/var/spack/repos/builtin/packages/rocsparse/package.py
index 98c02e8807..211afb0d36 100644
--- a/var/spack/repos/builtin/packages/rocsparse/package.py
+++ b/var/spack/repos/builtin/packages/rocsparse/package.py
@@ -15,9 +15,9 @@ class Rocsparse(CMakePackage):
     and toolchains. rocSPARSE is created using the HIP programming
     language and optimized for AMD's latest discrete GPUs."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/rocSPARSE"
-    git = "https://github.com/ROCmSoftwarePlatform/rocSPARSE.git"
-    url = "https://github.com/ROCmSoftwarePlatform/rocSPARSE/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocSPARSE"
+    git = "https://github.com/ROCm/rocSPARSE.git"
+    url = "https://github.com/ROCm/rocSPARSE/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("cgmb", "srekolam", "renjithravindrankannath")
@@ -34,7 +34,7 @@ class Rocsparse(CMakePackage):
     variant("test", default=False, description="Build rocsparse-test client")
 
     license("MIT")
-
+    version("6.0.0", sha256="bdc618677ec78830c6af315d61194d6ab8532345b8daeeb115aca96f274d4ca4")
     version("5.7.1", sha256="4c09b182b371124675d4057246021b5ed45e2833fdbf265b37a9b06b668baf0a")
     version("5.7.0", sha256="a42f0eb531b015b719e2bdcdff0cfb214e9894f73107966260f26931f982ecbc")
     version("5.6.1", sha256="6a50a64354507f1374e1a86aa7f5c07d1aaa96ac193ac292c279153087bb5d54")
@@ -153,6 +153,7 @@ class Rocsparse(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hip@" + ver, when="@" + ver)
         depends_on("rocprim@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocthrust/package.py b/var/spack/repos/builtin/packages/rocthrust/package.py
index c5e8dd1acc..01da0551b1 100644
--- a/var/spack/repos/builtin/packages/rocthrust/package.py
+++ b/var/spack/repos/builtin/packages/rocthrust/package.py
@@ -12,12 +12,13 @@ class Rocthrust(CMakePackage):
     HIP/ROCm platform, which uses the rocPRIM library. The HIP ported
     library works on HIP/ROCm platforms"""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/rocThrust"
-    git = "https://github.com/ROCmSoftwarePlatform/rocThrust.git"
-    url = "https://github.com/ROCmSoftwarePlatform/rocThrust/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocThrust"
+    git = "https://github.com/ROCm/rocThrust.git"
+    url = "https://github.com/ROCm/rocThrust/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("cgmb", "srekolam", "renjithravindrankannath")
+    version("6.0.0", sha256="a3fdafe4b6124118e07f23a3b0270d91740da324f61aaa3e8c034da08d9312b1")
     version("5.7.1", sha256="b7cb9ea6c42b2c6b610c34d2c438443e0f99245bd391aff18591949bf1cd53ee")
     version("5.7.0", sha256="64e10f071acfc5b8e3c168b9178289cf1afc7b168bf1962793fc256b25074d3a")
     version("5.6.1", sha256="63df61d5ab46d4cfda6066d748274bacecc77151692e372e6f7df5e91852bdc2")
@@ -149,6 +150,7 @@ class Rocthrust(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hip@" + ver, when="@" + ver)
         depends_on("rocprim@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/roctracer-dev-api/package.py b/var/spack/repos/builtin/packages/roctracer-dev-api/package.py
index a944ff3970..e93c202ccf 100644
--- a/var/spack/repos/builtin/packages/roctracer-dev-api/package.py
+++ b/var/spack/repos/builtin/packages/roctracer-dev-api/package.py
@@ -11,14 +11,15 @@ class RoctracerDevApi(Package):
     package, mainly to avoid circular dependencies in the ROCm ecosystem.
     For the ROC-tracer library, please check out roctracer-dev."""
 
-    homepage = "https://github.com/ROCm-Developer-Tools/roctracer"
-    git = "https://github.com/ROCm-Developer-Tools/roctracer.git"
-    url = "https://github.com/ROCm-Developer-Tools/roctracer/archive/refs/tags/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/roctracer"
+    git = "https://github.com/ROCm/roctracer.git"
+    url = "https://github.com/ROCm/roctracer/archive/refs/tags/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     license("MIT")
 
     maintainers("srekolam", "renjithravindrankannath")
+    version("6.0.0", sha256="941166a0363c5689bfec118d54e986c43fb1ec8cbf18d95721d9a824bd52c0f8")
     version("5.7.1", sha256="ec0453adac7e62b142eb0df1e1e2506863aac4c3f2ce9d117c3184c08c0c6b48")
     version("5.7.0", sha256="40bb757920488466e29df90bb80a975cc340bf7f8771fb1d754dfbb6b688d78e")
     version("5.6.1", sha256="007c498be25b067ad9a7631a2b0892f9129150ee9714e471a921225875d45e69")
diff --git a/var/spack/repos/builtin/packages/roctracer-dev/package.py b/var/spack/repos/builtin/packages/roctracer-dev/package.py
index aa15dca00e..3c5f81e643 100644
--- a/var/spack/repos/builtin/packages/roctracer-dev/package.py
+++ b/var/spack/repos/builtin/packages/roctracer-dev/package.py
@@ -13,16 +13,16 @@ class RoctracerDev(CMakePackage, ROCmPackage):
     The goal of the implementation is to provide a generic independent from
     specific runtime profiler to trace API and asyncronous activity."""
 
-    homepage = "https://github.com/ROCm-Developer-Tools/roctracer"
-    git = "https://github.com/ROCm-Developer-Tools/roctracer.git"
-    url = "https://github.com/ROCm-Developer-Tools/roctracer/archive/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/roctracer"
+    git = "https://github.com/ROCm/roctracer.git"
+    url = "https://github.com/ROCm/roctracer/archive/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     maintainers("srekolam", "renjithravindrankannath")
     libraries = ["libroctracer64"]
 
     license("MIT")
-
+    version("6.0.0", sha256="941166a0363c5689bfec118d54e986c43fb1ec8cbf18d95721d9a824bd52c0f8")
     version("5.7.1", sha256="ec0453adac7e62b142eb0df1e1e2506863aac4c3f2ce9d117c3184c08c0c6b48")
     version("5.7.0", sha256="40bb757920488466e29df90bb80a975cc340bf7f8771fb1d754dfbb6b688d78e")
     version("5.6.1", sha256="007c498be25b067ad9a7631a2b0892f9129150ee9714e471a921225875d45e69")
@@ -83,6 +83,7 @@ class RoctracerDev(CMakePackage, ROCmPackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("hsakmt-roct@" + ver, when="@" + ver)
         depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
@@ -105,7 +106,7 @@ class RoctracerDev(CMakePackage, ROCmPackage):
     ]:
         depends_on("rocprofiler-dev@" + ver, when="@" + ver)
 
-    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-core@" + ver, when="@" + ver)
 
     patch("0001-include-rocprofiler-dev-path.patch", when="@5.3:5.4")
diff --git a/var/spack/repos/builtin/packages/rocwmma/package.py b/var/spack/repos/builtin/packages/rocwmma/package.py
index 8d5a9fdbea..ee5418b1c8 100644
--- a/var/spack/repos/builtin/packages/rocwmma/package.py
+++ b/var/spack/repos/builtin/packages/rocwmma/package.py
@@ -19,14 +19,15 @@ class Rocwmma(CMakePackage):
     generation of kernel assembly, and does not incur additional overhead costs of
     linking to external runtime libraries or having to launch separate kernels."""
 
-    homepage = "https://github.com/ROCmSoftwarePlatform/rocWMMA"
-    git = "https://github.com/ROCmSoftwarePlatform/rocWMMA.git"
-    url = "https://github.com/ROCmSoftwarePlatform/rocWMMA/archive/refs/tags/rocm-5.5.0.tar.gz"
+    homepage = "https://github.com/ROCm/rocWMMA"
+    git = "https://github.com/ROCm/rocWMMA.git"
+    url = "https://github.com/ROCm/rocWMMA/archive/refs/tags/rocm-6.0.0.tar.gz"
     tags = ["rocm"]
 
     license("MIT")
 
     maintainers("srekolam", "renjithravindrankannath")
+    version("6.0.0", sha256="f9e97e7c6c552d43ef8c7348e4402bead2cd978d0f81a9657d6a0f6c83a6139b")
     version("5.7.1", sha256="a998a1385e6ad7062707ddb9ff82bef727ca48c39a10b4d861667024e3ffd2a3")
     version("5.7.0", sha256="a8f1b090e9e504a149a924c80cfb6aca817359b43833a6512ba32e178245526f")
     version("5.6.1", sha256="41a5159ee1ad5fc411fe6220f37bd754e26d3883c24c0f2378f50ef628bc1b8f")
@@ -78,6 +79,7 @@ class Rocwmma(CMakePackage):
         "5.6.1",
         "5.7.0",
         "5.7.1",
+        "6.0.0",
     ]:
         depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
         depends_on("llvm-amdgpu@" + ver, type="build", when="@" + ver)
@@ -85,7 +87,7 @@ class Rocwmma(CMakePackage):
         depends_on("rocblas@" + ver, type="build", when="@" + ver)
         depends_on("rocm-openmp-extras@" + ver, type="build", when="@" + ver)
 
-    for ver in ["5.6.0", "5.6.1", "5.7.0", "5.7.1"]:
+    for ver in ["5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]:
         depends_on("rocm-smi-lib@" + ver, when="@" + ver)
 
     for tgt in itertools.chain(["auto"], amdgpu_targets):
diff --git a/var/spack/repos/builtin/packages/rpp/0003-include-half-through-spack-package.patch b/var/spack/repos/builtin/packages/rpp/0003-include-half-through-spack-package.patch
new file mode 100644
index 0000000000..2e7e08c2ac
--- /dev/null
+++ b/var/spack/repos/builtin/packages/rpp/0003-include-half-through-spack-package.patch
@@ -0,0 +1,61 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 137896e..ca82e98 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -129,6 +129,9 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
+
+ # OpenMP
+ find_package(OpenMP REQUIRED)
++find_path(HALF_INCLUDE_DIR half.hpp)
++message(STATUS "HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")
++
+ if(APPLE)
+     if(CMAKE_C_COMPILER_ID MATCHES "Clang")
+         set(OpenMP_C "${CMAKE_C_COMPILER}")
+@@ -278,6 +281,7 @@ target_include_directories(${PROJECT_NAME}
+     PUBLIC
+         ${CMAKE_CURRENT_SOURCE_DIR}/include
+         ${ROCM_PATH}/include
++        ${HALF_INCLUDE_DIR}
+     PRIVATE
+         ${CMAKE_CURRENT_SOURCE_DIR}/src/include/cpu
+         ${CMAKE_CURRENT_SOURCE_DIR}/src/include/common
+diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt
+index 2a64d77..80c5686 100644
+--- a/src/modules/CMakeLists.txt
++++ b/src/modules/CMakeLists.txt
+@@ -81,6 +81,8 @@ if("${TIME_INFO}" STREQUAL "1")
+ endif()
+
+ # Backend specific settings
++find_path(HALF_INCLUDE_DIR half.hpp)
++message(STATUS "HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")
+
+ if( "${BACKEND}" STREQUAL "HIP")
+     # Add HIP kernels
+@@ -99,7 +101,7 @@ if( "${BACKEND}" STREQUAL "HIP")
+     # Add HIP specific includes
+     set(ROCM_INC ${ROCM_PATH}/include/)
+     list(APPEND HIP_LOCAL_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/src/include/hip/ ${CMAKE_SOURCE_DIR}/src/include/common/)
+-    set(INCLUDE_LIST ${ROCM_INC} ${HIP_LOCAL_INCLUDE_DIRS} ${INCLUDE_LIST})
++    set(INCLUDE_LIST ${ROCM_INC} ${HIP_LOCAL_INCLUDE_DIRS} ${INCLUDE_LIST} ${HALF_INCLUDE_DIR})
+ elseif( "${BACKEND}" STREQUAL "OCL")
+     # Add OpenCL kernels
+     file(GLOB MOD_CL_CPP "cl/*.cpp" )
+@@ -114,7 +116,7 @@ elseif( "${BACKEND}" STREQUAL "OCL")
+     # Add OpenCL specific includes
+     set(ROCM_INC ${ROCM_PATH}/include/)
+     list(APPEND OCL_LOCAL_INCLUDE_LIST ${CMAKE_SOURCE_DIR}/src/include/cl/ ${CMAKE_SOURCE_DIR}/src/include/common/)
+-    set(INCLUDE_LIST ${ROCM_INC} ${OCL_LOCAL_INCLUDE_LIST} ${INCLUDE_LIST})
++    set(INCLUDE_LIST ${ROCM_INC} ${OCL_LOCAL_INCLUDE_LIST} ${INCLUDE_LIST} ${HALF_INCLUDE_DIR})
+ elseif( "${BACKEND}" STREQUAL "CPU")
+     # Add CPU specific includes
+     set(INCLUDE_LIST ${CMAKE_SOURCE_DIR}/src/include/common/)
+@@ -136,6 +138,7 @@ target_include_directories( ${PROJECT_NAME}
+     PUBLIC
+         ${CMAKE_SOURCE_DIR}/include
+         ${ROCM_INC}
++        ${HALF_INCLUDE_DIR}
+     PRIVATE
+         ${CMAKE_SOURCE_DIR}/src/include/cpu
+         ${CMAKE_SOURCE_DIR}/src/include/common
+\ No newline at end of file
diff --git a/var/spack/repos/builtin/packages/rpp/package.py b/var/spack/repos/builtin/packages/rpp/package.py
index 116fa90328..7049b342cd 100644
--- a/var/spack/repos/builtin/packages/rpp/package.py
+++ b/var/spack/repos/builtin/packages/rpp/package.py
@@ -29,6 +29,7 @@ class Rpp(CMakePackage):
 
     license("MIT")
 
+    version("6.0.0", sha256="3626a648bc773520f5cd5ca15f494de6e74b422baf32491750ce0737c3367f15")
     version("5.7.1", sha256="36fff5f1c52d969c3e2e0c75b879471f731770f193c9644aa6ab993fb8fa4bbf")
     version("5.7.0", sha256="1c612cde3c3d3840ae75ee5c1ee59bd8d61b1fdbf84421ae535cda863470fc06")
     version("1.2.0", sha256="660a11e1bd8706967835597b26daa874fd1507459bfebe22818149444bec540c")
@@ -54,8 +55,9 @@ class Rpp(CMakePackage):
         description="add utilities folder which contains rpp unit tests",
     )
 
-    patch("0001-include-half-openmp-through-spack-package.patch")
+    patch("0001-include-half-openmp-through-spack-package.patch", when="@:5.7")
     patch("0002-declare-handle-in-header.patch")
+    patch("0003-include-half-through-spack-package.patch", when="@6.0:")
 
     # adds half.hpp include directory and modifies how the libjpegturbo
     # library is linked for the rpp unit test
@@ -118,7 +120,11 @@ class Rpp(CMakePackage):
     conflicts("+opencl+hip")
 
     with when("+hip"):
-        depends_on("hip@5:")
+        with when("@5.7:"):
+            for ver in ["5.7.0", "5.7.1", "6.0.0"]:
+                depends_on("hip@" + ver, when="@" + ver)
+        with when("@:1.2"):
+            depends_on("hip@5:")
     with when("~hip"):
         depends_on("rocm-opencl@5:")
 
diff --git a/var/spack/repos/builtin/packages/sundials/Change-HIP_PLATFORM-from-HCC-to-AMD-and-NVCC-to-NVIDIA.patch b/var/spack/repos/builtin/packages/sundials/Change-HIP_PLATFORM-from-HCC-to-AMD-and-NVCC-to-NVIDIA.patch
new file mode 100644
index 0000000000..4dd9dc7a47
--- /dev/null
+++ b/var/spack/repos/builtin/packages/sundials/Change-HIP_PLATFORM-from-HCC-to-AMD-and-NVCC-to-NVIDIA.patch
@@ -0,0 +1,28 @@
+From d4afbed86fc4f9925e55367267b3796a522ba5d5 Mon Sep 17 00:00:00 2001
+From: Renjith Ravindran <Renjith.RavindranKannath@amd.com>
+Date: Sun, 14 Jan 2024 10:20:21 +0000
+Subject: [PATCH] Change HIP_PLATFORM from HCC to AMD and NVCC to NVIDIA
+
+---
+ include/sundials/sundials_hip_policies.hpp | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/include/sundials/sundials_hip_policies.hpp b/include/sundials/sundials_hip_policies.hpp
+index d759bbc..f6dfe41 100644
+--- a/include/sundials/sundials_hip_policies.hpp
++++ b/include/sundials/sundials_hip_policies.hpp
+@@ -30,9 +30,9 @@ namespace sundials
+ namespace hip
+ {
+ 
+-#if defined(__HIP_PLATFORM_HCC__)
++#if defined(__HIP_PLATFORM_AMD__)
+ constexpr const sunindextype WARP_SIZE = 64;
+-#elif defined(__HIP_PLATFORM_NVCC__)
++#elif defined(__HIP_PLATFORM_NVIDIA__)
+ constexpr const sunindextype WARP_SIZE = 32;
+ #endif
+ constexpr const sunindextype MAX_BLOCK_SIZE = 1024;
+-- 
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/sundials/package.py b/var/spack/repos/builtin/packages/sundials/package.py
index 48f5ec65d7..3a906e6c2c 100644
--- a/var/spack/repos/builtin/packages/sundials/package.py
+++ b/var/spack/repos/builtin/packages/sundials/package.py
@@ -285,6 +285,10 @@ class Sundials(CMakePackage, CudaPackage, ROCmPackage):
     # https://github.com/spack/spack/issues/29526
     patch("nvector-pic.patch", when="@6.1.0:6.2.0 +rocm")
 
+    # ROCm 6.0 drops backward compatibility for __HIP_PLATFORM_HCC__/__HIP_PLATFORM_NVCC__;
+    # use the AMD/NVIDIA macro names instead, as in https://github.com/LLNL/RAJA/pull/1568
+    patch("Change-HIP_PLATFORM-from-HCC-to-AMD-and-NVCC-to-NVIDIA.patch", when="^hip@6.0.0 +rocm")
+
     # remove OpenMP header file and function from hypre vector test code
     patch("test_nvector_parhyp.patch", when="@2.7.0:3.0.0")
     patch("FindPackageMultipass.cmake.patch", when="@5.0.0")
diff --git a/var/spack/repos/builtin/packages/trilinos/0001-use-the-gcnArchName-inplace-of-gcnArch-as-gcnArch-is.patch b/var/spack/repos/builtin/packages/trilinos/0001-use-the-gcnArchName-inplace-of-gcnArch-as-gcnArch-is.patch
new file mode 100644
index 0000000000..ea2b8b98a4
--- /dev/null
+++ b/var/spack/repos/builtin/packages/trilinos/0001-use-the-gcnArchName-inplace-of-gcnArch-as-gcnArch-is.patch
@@ -0,0 +1,26 @@
+From e7fa7ea37423d3d17d77334ac849c5df00feb20e Mon Sep 17 00:00:00 2001
+From: sreenivasa murthy kolam <sreenivasamurthy.kolam@amd.com>
+Date: Tue, 16 Jan 2024 10:09:34 +0000
+Subject: [PATCH] use the gcnArchName inplace of gcnArch as gcnArch is
+ deprecated from rocm-6.0.0
+
+---
+ packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
+index 7840ad9..882d143 100644
+--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
++++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
+@@ -86,7 +86,7 @@ void HIPInternal::print_configuration(std::ostream &s) const {
+     KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i));
+ 
+     s << "Kokkos::HIP[ " << i << " ] "
+-      << "gcnArch " << hipProp.gcnArch << ", Total Global Memory: "
++      << "gcnArchName " << hipProp.gcnArchName << ", Total Global Memory: "
+       << ::Kokkos::Impl::human_memory_size(hipProp.totalGlobalMem)
+       << ", Shared Memory per Block: "
+       << ::Kokkos::Impl::human_memory_size(hipProp.sharedMemPerBlock);
+-- 
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/trilinos/package.py b/var/spack/repos/builtin/packages/trilinos/package.py
index d1de74f11c..e015bb7f4e 100644
--- a/var/spack/repos/builtin/packages/trilinos/package.py
+++ b/var/spack/repos/builtin/packages/trilinos/package.py
@@ -489,6 +489,11 @@ class Trilinos(CMakePackage, CudaPackage, ROCmPackage):
     # workaround an NVCC bug with c++14 (https://github.com/trilinos/Trilinos/issues/6954)
     # avoid calling deprecated functions with CUDA-11
     patch("fix_cxx14_cuda11.patch", when="@13.0.0:13.0.1 cxxstd=14 ^cuda@11:")
+    patch(
+        "0001-use-the-gcnArchName-inplace-of-gcnArch-as-gcnArch-is.patch",
+        when="@15.0.0 ^hip@6.0.0 +rocm",
+    )
+
     # Allow building with +teko gotype=long
     patch(
         "https://github.com/trilinos/Trilinos/commit/b17f20a0b91e0b9fc5b1b0af3c8a34e2a4874f3f.patch?full_index=1",
author	renjithravindrankannath <94420380+renjithravindrankannath@users.noreply.github.com>	2024-01-22 10:19:28 -0800
committer	GitHub <noreply@github.com>	2024-01-22 10:19:28 -0800
commit	c673979feeaadcf03fc8803e2261809c40df8362 (patch)
tree	f496d602a3bb56d9648db4755a8f7096cc41bb05
parent	7acd5bdc7f0fa646cf4ac1dd7acf7c85d62e3193 (diff)
download	spack-c673979feeaadcf03fc8803e2261809c40df8362.tar.gz spack-c673979feeaadcf03fc8803e2261809c40df8362.tar.bz2 spack-c673979feeaadcf03fc8803e2261809c40df8362.tar.xz spack-c673979feeaadcf03fc8803e2261809c40df8362.zip