diff options
author | renjithravindrankannath <94420380+renjithravindrankannath@users.noreply.github.com> | 2024-01-22 10:19:28 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-22 10:19:28 -0800 |
commit | c673979feeaadcf03fc8803e2261809c40df8362 (patch) | |
tree | f496d602a3bb56d9648db4755a8f7096cc41bb05 | |
parent | 7acd5bdc7f0fa646cf4ac1dd7acf7c85d62e3193 (diff) | |
download | spack-c673979feeaadcf03fc8803e2261809c40df8362.tar.gz spack-c673979feeaadcf03fc8803e2261809c40df8362.tar.bz2 spack-c673979feeaadcf03fc8803e2261809c40df8362.tar.xz spack-c673979feeaadcf03fc8803e2261809c40df8362.zip |
Bump up the version for ROCm-6.0.0 (#42026)
* Bump up the version for ROCm-6.0.0
* Adding patch files
* Style check failure fix
* Style check fixes
* Style check error fixes
* Patch to remove hipblas client file installation in 6.0
* Patch needs to be applied on all 5.7 releases
* 6.0 update for math libs and other packages, new github url etc
* Correct package-audit failures
* Correcting shasum for rocfft patch and limiting patch in rocblas
* Reverting updates in rocprofiler-dev due to ci-gitlab failure
* Fixes for ci-gitlab failure due to disabling hip backward compatibility
* Adding patch file to Change HIP_PLATFORM from HCC to AMD and NVCC to NVIDIA
* Use gcnArchName in place of gcnArch, as gcnArch is deprecated as of rocm-6.0.0
* Patches to fix magma and blaspp build error with rocm 6.0.0
* Patch for mfem and arborx for rocm 6.0
* Style check error fix
* Correcting style check errors
* Updating dependent version
* Update for petsc to build with rocm 6.0
Need reverting-operator-mixup-fix-for-slate.patch for rocm 6.0
* Reverting the change in url for 2.7.4-rocm-enhanced
* hip-tensor 6.0.0 update
86 files changed, 13036 insertions, 280 deletions
diff --git a/var/spack/repos/builtin/packages/amdsmi/package.py b/var/spack/repos/builtin/packages/amdsmi/package.py index ecd2ca1f1d..e7543fdb8b 100644 --- a/var/spack/repos/builtin/packages/amdsmi/package.py +++ b/var/spack/repos/builtin/packages/amdsmi/package.py @@ -12,8 +12,8 @@ class Amdsmi(CMakePackage): is a C library for Linux that provides a user space interface for applications to monitor and control AMD device.""" - homepage = "https://github.com/RadeonOpenCompute/amdsmi" - url = "https://github.com/RadeonOpenCompute/amdsmi/archive/refs/tags/rocm-5.6.0.tar.gz" + homepage = "https://github.com/ROCm/amdsmi" + url = "https://github.com/ROCm/amdsmi/archive/refs/tags/rocm-5.6.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") diff --git a/var/spack/repos/builtin/packages/aomp/package.py b/var/spack/repos/builtin/packages/aomp/package.py index e32dc705e3..6a9603a9de 100644 --- a/var/spack/repos/builtin/packages/aomp/package.py +++ b/var/spack/repos/builtin/packages/aomp/package.py @@ -7,8 +7,8 @@ import re from spack.package import * -tools_url = "https://github.com/ROCm-Developer-Tools" -compute_url = "https://github.com/RadeonOpenCompute" +tools_url = "https://github.com/ROCm" +compute_url = "https://github.com/ROCm" aomp = [ @@ -368,7 +368,7 @@ class Aomp(Package): "-DCMAKE_C_COMPILER={0}".format(self.compiler.cc), "-DCMAKE_CXX_COMPILER={0}".format(self.compiler.cxx), "-DCMAKE_ASM_COMPILER={0}".format(self.compiler.cc), - "-DBUG_REPORT_URL=https://github.com/ROCm-Developer-Tools/aomp", + "-DBUG_REPORT_URL=https://github.com/ROCm/aomp", "-DLLVM_ENABLE_BINDINGS=OFF", "-DLLVM_INCLUDE_BENCHMARKS=OFF", "-DLLVM_BUILD_TESTS=OFF", diff --git a/var/spack/repos/builtin/packages/arborx/0001-update-major-version-required-for-rocm-6.0.patch b/var/spack/repos/builtin/packages/arborx/0001-update-major-version-required-for-rocm-6.0.patch new file mode 100644 index 0000000000..009a40f984 --- /dev/null +++ 
b/var/spack/repos/builtin/packages/arborx/0001-update-major-version-required-for-rocm-6.0.patch @@ -0,0 +1,24 @@ +From a31d3766f5a7a3a3e20d5bc0c315ad6295a82298 Mon Sep 17 00:00:00 2001
+From: Afzal Patel <afzal.patel@amd.com>
+Date: Wed, 17 Jan 2024 11:50:18 -0800
+Subject: [PATCH] Changed required version of rocthrust to 3 for rocm 6.0
+
+---
+ CMakeLists.txt | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 8c3c99a..1af6d13 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -22,7 +22,7 @@ if(Kokkos_ENABLE_HIP AND ARBORX_ENABLE_ROCTHRUST)
+ # Require at least rocThrust-2.10.5 (that comes with ROCm 3.9) because
+ # rocPRIM dependency is not set properly in exported configuration for
+ # earlier versions
+- find_package(rocthrust 2.10.5 REQUIRED CONFIG)
++ find_package(rocthrust 3 REQUIRED CONFIG)
+ target_link_libraries(ArborX INTERFACE roc::rocthrust)
+ endif()
+
+--
+2.25.1
diff --git a/var/spack/repos/builtin/packages/arborx/package.py b/var/spack/repos/builtin/packages/arborx/package.py index 6eb003252c..1414a22d7a 100644 --- a/var/spack/repos/builtin/packages/arborx/package.py +++ b/var/spack/repos/builtin/packages/arborx/package.py @@ -96,6 +96,7 @@ class Arborx(CMakePackage, CudaPackage, ROCmPackage): depends_on("trilinos@13.4.0:", when="@1.3+trilinos") depends_on("trilinos@14.0.0:", when="@1.4:+trilinos") patch("trilinos14.0-kokkos-major-version.patch", when="@1.4+trilinos ^trilinos@14.0.0") + patch("0001-update-major-version-required-for-rocm-6.0.patch", when="+rocm ^hip@6.0:") conflicts("~serial", when="+trilinos") conflicts("+cuda", when="+trilinos") diff --git a/var/spack/repos/builtin/packages/atmi/package.py b/var/spack/repos/builtin/packages/atmi/package.py index 98fc5999f5..96c588174f 100644 --- a/var/spack/repos/builtin/packages/atmi/package.py +++ b/var/spack/repos/builtin/packages/atmi/package.py @@ -13,9 +13,9 @@ class Atmi(CMakePackage): consistent, declarative API to create task graphs on CPUs and GPUs (integrated and discrete).""" - homepage = "https://github.com/RadeonOpenCompute/atmi" - git = "https://github.com/RadeonOpenCompute/atmi.git" - url = "https://github.com/RadeonOpenCompute/atmi/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/atmi" + git = "https://github.com/ROCm/atmi.git" + url = "https://github.com/ROCm/atmi/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] license("MIT") diff --git a/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py b/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py index 28045fd8ef..f831c88537 100644 --- a/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py +++ b/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py @@ -11,9 +11,9 @@ class AwsOfiRccl(AutotoolsPackage): libfabric as a network provider while running AMD's RCCL based applications.""" - homepage = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl" - git = 
"https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git" - url = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git" + homepage = "https://github.com/ROCm/aws-ofi-rccl" + git = "https://github.com/ROCm/aws-ofi-rccl.git" + url = "https://github.com/ROCm/aws-ofi-rccl.git" tags = ["rocm"] maintainers("bvanessen") diff --git a/var/spack/repos/builtin/packages/babelstream/package.py b/var/spack/repos/builtin/packages/babelstream/package.py index 0d09e2f5d1..4b2a1c5857 100644 --- a/var/spack/repos/builtin/packages/babelstream/package.py +++ b/var/spack/repos/builtin/packages/babelstream/package.py @@ -157,7 +157,7 @@ class Babelstream(CMakePackage, CudaPackage, ROCmPackage): when="+thrust", msg="Which Thrust implementation to use, supported options include:\ - CUDA (via https://github.com/NVIDIA/thrust)\ - - ROCM (via https://github.com/ROCmSoftwarePlatform/rocThrust)", + - ROCM (via https://github.com/ROCm/rocThrust)", ) # This applies to all diff --git a/var/spack/repos/builtin/packages/blaspp/0001-fix-blaspp-build-error-with-rocm-6.0.0.patch b/var/spack/repos/builtin/packages/blaspp/0001-fix-blaspp-build-error-with-rocm-6.0.0.patch new file mode 100644 index 0000000000..3ce15f0859 --- /dev/null +++ b/var/spack/repos/builtin/packages/blaspp/0001-fix-blaspp-build-error-with-rocm-6.0.0.patch @@ -0,0 +1,50 @@ +From a75f399bfa77680e7736d126ef3e5a520e1a1702 Mon Sep 17 00:00:00 2001 +From: sreenivasa murthy kolam <sreenivasamurthy.kolam@amd.com> +Date: Wed, 17 Jan 2024 12:55:06 +0000 +Subject: [PATCH] fix build error with rocm-6.0.0 by adding extra parameters + for rocblas function calls rocblas_ztrmm() ,rocblas_strmm(), + rocblas_ctrmm(),rocblas_dtrmm() + +--- + src/rocblas_wrappers.cc | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/src/rocblas_wrappers.cc b/src/rocblas_wrappers.cc +index 0e01a95..44ab150 100644 +--- a/src/rocblas_wrappers.cc ++++ b/src/rocblas_wrappers.cc +@@ -667,6 +667,7 @@ void trmm( + m, n, + &alpha, + dA, ldda, ++ dB, lddb, + dB, 
lddb ) ); + } + +@@ -686,6 +687,7 @@ void trmm( + m, n, + &alpha, + dA, ldda, ++ dB, lddb, + dB, lddb ) ); + } + +@@ -705,6 +707,7 @@ void trmm( + m, n, + (rocblas_float_complex*) &alpha, + (rocblas_float_complex*) dA, ldda, ++ (rocblas_float_complex*) dB, lddb, + (rocblas_float_complex*) dB, lddb ) ); + } + +@@ -724,6 +727,7 @@ void trmm( + m, n, + (rocblas_double_complex*) &alpha, + (rocblas_double_complex*) dA, ldda, ++ (rocblas_double_complex*) dB, lddb, + (rocblas_double_complex*) dB, lddb ) ); + } + +-- +2.39.3 + diff --git a/var/spack/repos/builtin/packages/blaspp/package.py b/var/spack/repos/builtin/packages/blaspp/package.py index e0de779540..78a2fce1d6 100644 --- a/var/spack/repos/builtin/packages/blaspp/package.py +++ b/var/spack/repos/builtin/packages/blaspp/package.py @@ -22,6 +22,9 @@ class Blaspp(CMakePackage, CudaPackage, ROCmPackage): version("master", branch="master") version( + "2023.11.05", sha256="62dfc03ec07c0826e0466dc2c204b460caa929d53ad4f050cb132d92670be7ce" + ) + version( "2023.08.25", sha256="1d9c7227a6d8776944aa866592142b7b51c6e4ba5529d168eb8ae2b329c47401" ) version( @@ -76,6 +79,10 @@ class Blaspp(CMakePackage, CudaPackage, ROCmPackage): requires("%oneapi", when="+sycl", msg="blaspp+sycl must be compiled with %oneapi") + patch( + "0001-fix-blaspp-build-error-with-rocm-6.0.0.patch", when="@2023.06.00: ^hip@6.0.0 +rocm" + ) + def cmake_args(self): spec = self.spec backend_config = "-Duse_cuda=%s" % ("+cuda" in spec) diff --git a/var/spack/repos/builtin/packages/comgr/package.py b/var/spack/repos/builtin/packages/comgr/package.py index f713ccba6d..f8bbd4e526 100644 --- a/var/spack/repos/builtin/packages/comgr/package.py +++ b/var/spack/repos/builtin/packages/comgr/package.py @@ -12,9 +12,9 @@ class Comgr(CMakePackage): """This provides various Lightning Compiler related services. 
It currently contains one library, the Code Object Manager (Comgr)""" - homepage = "https://github.com/RadeonOpenCompute/ROCm-CompilerSupport" - git = "https://github.com/RadeonOpenCompute/ROCm-CompilerSupport.git" - url = "https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/ROCm-CompilerSupport" + git = "https://github.com/ROCm/ROCm-CompilerSupport.git" + url = "https://github.com/ROCm/ROCm-CompilerSupport/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath", "haampie") @@ -23,6 +23,7 @@ class Comgr(CMakePackage): license("NCSA") version("master", branch="amd-stg-open") + version("6.0.0", sha256="04353d27a512642a5e5339532a39d0aabe44e0964985de37b150a2550385800a") version("5.7.1", sha256="3b9433b4a0527167c3e9dfc37a3c54e0550744b8d4a8e1be298c8d4bcedfee7c") version("5.7.0", sha256="e234bcb93d602377cfaaacb59aeac5796edcd842a618162867b7e670c3a2c42c") version("5.6.1", sha256="0a85d84619f98be26ca7a32c71f94ed3c4e9866133789eabb451be64ce739300") @@ -152,6 +153,7 @@ class Comgr(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", "master", ]: # llvm libs are linked statically, so this *could* be a build dep @@ -163,7 +165,7 @@ class Comgr(CMakePackage): "rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver) ) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) root_cmakelists_dir = join_path("lib", "comgr") diff --git a/var/spack/repos/builtin/packages/composable-kernel/package.py b/var/spack/repos/builtin/packages/composable-kernel/package.py index afbb86f01f..10bdf7183c 100644 --- a/var/spack/repos/builtin/packages/composable-kernel/package.py +++ b/var/spack/repos/builtin/packages/composable-kernel/package.py @@ -11,14 +11,15 @@ class ComposableKernel(CMakePackage): """Composable Kernel: Performance 
Portable Programming Model for Machine Learning Tensor Operators.""" - homepage = "https://github.com/ROCmSoftwarePlatform/composable_kernel" - git = "https://github.com/ROCmSoftwarePlatform/composable_kernel.git" - url = "https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/refs/tags/rocm-5.7.1.tar.gz" + homepage = "https://github.com/ROCm/composable_kernel" + git = "https://github.com/ROCm/composable_kernel.git" + url = "https://github.com/ROCm/composable_kernel/archive/refs/tags/rocm-5.7.1.tar.gz" maintainers("srekolam", "afzpatel") license("MIT") version("master", branch="develop") + version("6.0.0", sha256="a8f736f2f2a8afa4cddd06301205be27774d85f545429049b4a2bbbe6fcd67df") version("5.7.1", sha256="75f66e023c2e31948e91fa26366eaeac72d871fc2e5188361d4465179f13876e") version("5.7.0", sha256="d9624dbaef04e0138f9f73596c49b4fe9ded69974bae7236354baa32649bf21a") version("5.6.1", commit="f5ec04f091fa5c48c67d7bacec36a414d0be06a5") @@ -46,7 +47,18 @@ class ComposableKernel(CMakePackage): depends_on("pkgconfig", type="build") depends_on("cmake@3.16:", type="build") - for ver in ["master", "5.7.1", "5.7.0", "5.6.1", "5.6.0", "5.5.1", "5.5.0", "5.4.3", "5.4.0"]: + for ver in [ + "master", + "6.0.0", + "5.7.1", + "5.7.0", + "5.6.1", + "5.6.0", + "5.5.1", + "5.5.0", + "5.4.3", + "5.4.0", + ]: depends_on("hip@" + ver, when="@" + ver) depends_on("llvm-amdgpu@" + ver, when="@" + ver) depends_on("rocm-cmake@" + ver, when="@" + ver, type="build") diff --git a/var/spack/repos/builtin/packages/heffte/package.py b/var/spack/repos/builtin/packages/heffte/package.py index 228e813973..1472116be9 100644 --- a/var/spack/repos/builtin/packages/heffte/package.py +++ b/var/spack/repos/builtin/packages/heffte/package.py @@ -114,7 +114,7 @@ class Heffte(CMakePackage, CudaPackage, ROCmPackage): if "none" not in rocm_arch: args.append("-DCMAKE_CXX_FLAGS={0}".format(self.hip_flags(rocm_arch))) - # See https://github.com/ROCmSoftwarePlatform/rocFFT/issues/322 + # See 
https://github.com/ROCm/rocFFT/issues/322 if self.spec.satisfies("^cmake@3.21.0:3.21.2"): args.append(self.define("__skip_rocmclang", "ON")) diff --git a/var/spack/repos/builtin/packages/hip-examples/package.py b/var/spack/repos/builtin/packages/hip-examples/package.py index c2e8aaa97e..22f5705389 100644 --- a/var/spack/repos/builtin/packages/hip-examples/package.py +++ b/var/spack/repos/builtin/packages/hip-examples/package.py @@ -11,9 +11,9 @@ from spack.package import * class HipExamples(Package): """Examples for HIP""" - homepage = "https://github.com/ROCm-Developer-Tools/HIP-Examples/" - git = "https://github.com/ROCm-Developer-Tools/HIP-Examples.git" - url = "https://github.com/ROCm-Developer-Tools/HIP-Examples/archive/rocm-5.4.3.tar.gz" + homepage = "https://github.com/ROCm/HIP-Examples/" + git = "https://github.com/ROCm/HIP-Examples.git" + url = "https://github.com/ROCm/HIP-Examples/archive/rocm-5.4.3.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath", "afzpatel") diff --git a/var/spack/repos/builtin/packages/hip-rocclr/package.py b/var/spack/repos/builtin/packages/hip-rocclr/package.py index 2ae9e375e0..22c1232e2c 100644 --- a/var/spack/repos/builtin/packages/hip-rocclr/package.py +++ b/var/spack/repos/builtin/packages/hip-rocclr/package.py @@ -12,8 +12,8 @@ class HipRocclr(CMakePackage): with to different backends such as ROCr or PAL This abstraction allows runtimes to work on Windows as well as on Linux without much effort.""" - homepage = "https://github.com/ROCm-Developer-Tools/ROCclr" - git = "https://github.com/ROCm-Developer-Tools/ROCclr.git" + homepage = "https://github.com/ROCm/ROCclr" + git = "https://github.com/ROCm/ROCclr.git" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") @@ -21,9 +21,9 @@ class HipRocclr(CMakePackage): def url_for_version(self, version): # Fix up a typo in the 3.5.0 release. 
if version == Version("3.5.0"): - return "https://github.com/ROCm-Developer-Tools/ROCclr/archive/roc-3.5.0.tar.gz" + return "https://github.com/ROCm/ROCclr/archive/roc-3.5.0.tar.gz" - url = "https://github.com/ROCm-Developer-Tools/ROCclr/archive/rocm-{0}.tar.gz" + url = "https://github.com/ROCm/ROCclr/archive/rocm-{0}.tar.gz" return url.format(version) license("MIT") @@ -152,13 +152,13 @@ class HipRocclr(CMakePackage): depends_on("hsa-rocr-dev@" + ver, when="@" + ver) depends_on("comgr@" + ver, when="@" + ver) - # See: https://github.com/ROCm-Developer-Tools/ROCclr/pull/16 + # See: https://github.com/ROCm/ROCclr/pull/16 # In 3.7.0 the find opengl things have changed slightly. patch("opengl.patch", when="@3.5.0") resource( name="opencl-on-vdi", - url="https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/archive/roc-3.5.0.tar.gz", + url="https://github.com/ROCm/ROCm-OpenCL-Runtime/archive/roc-3.5.0.tar.gz", sha256="511b617d5192f2d4893603c1a02402b2ac9556e9806ff09dd2a91d398abf39a0", expand=True, destination="", @@ -197,7 +197,7 @@ class HipRocclr(CMakePackage): ]: resource( name="opencl-on-vdi", - url="https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/archive/rocm-{0}.tar.gz".format( + url="https://github.com/ROCm/ROCm-OpenCL-Runtime/archive/rocm-{0}.tar.gz".format( d_version ), sha256=d_shasum, @@ -209,7 +209,7 @@ class HipRocclr(CMakePackage): resource( name="opencl-on-vdi", - git="https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git", + git="https://github.com/ROCm/ROCm-OpenCL-Runtime.git", destination="", placement="opencl-on-vdi", branch="main", diff --git a/var/spack/repos/builtin/packages/hip-tensor/package.py b/var/spack/repos/builtin/packages/hip-tensor/package.py index e925031945..86fd4e385d 100644 --- a/var/spack/repos/builtin/packages/hip-tensor/package.py +++ b/var/spack/repos/builtin/packages/hip-tensor/package.py @@ -17,10 +17,11 @@ class HipTensor(CMakePackage, ROCmPackage): maintainers("srekolam", "afzpatel") version("master", 
branch="master") + version("6.0.0", sha256="268d7f114784b7e824f89c21c65c2efedbb5486f09a356a56dca1b89bde1ef7a") version("5.7.1", sha256="96743d4e695fe865aef4097ae31d9b4e42a2d5a92135a005b0d187d9c0b17645") version("5.7.0", sha256="4b17f6d43b17fe2dc1d0c61e9663d4752006f7898cc94231206444a1663eb252") - for ver in ["5.7.0", "5.7.1", "master"]: + for ver in ["5.7.0", "5.7.1", "6.0.0", "master"]: depends_on(f"composable-kernel@{ver}", when=f"@{ver}") depends_on(f"rocm-cmake@{ver}", when=f"@{ver}") diff --git a/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.6.0.patch b/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.6.0.patch new file mode 100644 index 0000000000..597baa2e5d --- /dev/null +++ b/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.6.0.patch @@ -0,0 +1,61 @@ +diff --git a/clr/hipamd/CMakeLists.txt b/clr/hipamd/CMakeLists.txt +index 7ad3001..aaf6ad0 100755 +--- a/clr/hipamd/CMakeLists.txt ++++ b/clr/hipamd/CMakeLists.txt +@@ -297,16 +297,6 @@ if(HIP_RUNTIME STREQUAL "rocclr") + add_subdirectory(src) + endif() + +-# Download libamdhip64.so.5 +-if(HIP_PLATFORM STREQUAL "amd") +- if(NOT WIN32) +- execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/download_libamhip64_v5.sh" WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND_ECHO STDERR RESULT_VARIABLE DWLD_HIP_SO_RC) +- if (DWLD_HIP_SO_RC AND NOT DWLD_HIP_SO_RC EQUAL 0) +- message(FATAL_ERROR "Failed to download libamdhip64.so.5") +- endif() +- endif() +-endif() +- + # Build doxygen documentation + find_program(DOXYGEN_EXE doxygen) + if(DOXYGEN_EXE) +@@ -408,8 +398,6 @@ if (NOT ${HIPCC_BIN_DIR} STREQUAL "") + install(PROGRAMS ${HIPCC_BIN_DIR}/hipcc.pl DESTINATION bin) + install(PROGRAMS ${HIPCC_BIN_DIR}/hipconfig.pl DESTINATION bin) + install(PROGRAMS ${HIPCC_BIN_DIR}/hipvars.pm DESTINATION bin) +- install(PROGRAMS ${HIPCC_BIN_DIR}/hipcc.bat DESTINATION bin) +- install(PROGRAMS ${HIPCC_BIN_DIR}/hipconfig.bat DESTINATION 
bin) + endif() + + ############################# +diff --git a/hipcc/bin/hipcc.pl b/hipcc/bin/hipcc.pl +index 513a427..cd2d6ac 100755 +--- a/hipcc/bin/hipcc.pl ++++ b/hipcc/bin/hipcc.pl +@@ -160,11 +160,14 @@ if ($HIP_PLATFORM eq "amd") { + if($isWindows) { + $execExtension = ".exe"; + } +- $HIPCC=get_normalized_path("$HIP_CLANG_PATH/clang++" . $execExtension); ++ # llvm_path is set inside the hip recipe ++ $LLVM_PATH= $ENV{'LLVM_PATH'}; ++ $HIPCC="${LLVM_PATH}/bin/clang++" . $execExtension; + + # If $HIPCC clang++ is not compiled, use clang instead + if ( ! -e $HIPCC ) { +- $HIPCC=get_normalized_path("$HIP_CLANG_PATH/clang" . $execExtension); ++ $LLVM_PATH= $ENV{'LLVM_PATH'}; ++ $HIPCC="${LLVM_PATH}/bin/clang" . $execExtension; + $HIPLDFLAGS = "--driver-mode=g++"; + } + # to avoid using dk linker or MSVC linker +@@ -484,7 +487,8 @@ if($HIP_PLATFORM eq "amd"){ + $targetsStr = $ENV{HCC_AMDGPU_TARGET}; + } elsif (not $isWindows) { + # Else try using rocm_agent_enumerator +- $ROCM_AGENT_ENUM = "${ROCM_PATH}/bin/rocm_agent_enumerator"; ++ $ROCMINFO_PATH = $ENV{'ROCMINFO_PATH'} // $ROCMINFO_PATH; ++ $ROCM_AGENT_ENUM = "${ROCMINFO_PATH}/bin/rocm_agent_enumerator"; + $targetsStr = `${ROCM_AGENT_ENUM} -t GPU`; + $targetsStr =~ s/\n/,/g; + } diff --git a/var/spack/repos/builtin/packages/hip/0018-reverting-hipMemoryType-with-memoryType.patch b/var/spack/repos/builtin/packages/hip/0018-reverting-hipMemoryType-with-memoryType.patch new file mode 100644 index 0000000000..c77075d640 --- /dev/null +++ b/var/spack/repos/builtin/packages/hip/0018-reverting-hipMemoryType-with-memoryType.patch @@ -0,0 +1,17 @@ +diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h +index 88e6850..d280504 100644 +--- a/include/hip/hip_runtime_api.h ++++ b/include/hip/hip_runtime_api.h +@@ -259,7 +259,11 @@ typedef enum hipMemoryType { + * Pointer attributes + */ + typedef struct hipPointerAttribute_t { +- enum hipMemoryType type; ++ union { ++ // Deprecated, use instead type ++ 
enum hipMemoryType memoryType; ++ enum hipMemoryType type; ++ }; + int device; + void* devicePointer; + void* hostPointer; diff --git a/var/spack/repos/builtin/packages/hip/package.py b/var/spack/repos/builtin/packages/hip/package.py index 29b23fecca..a6fd946955 100644 --- a/var/spack/repos/builtin/packages/hip/package.py +++ b/var/spack/repos/builtin/packages/hip/package.py @@ -16,9 +16,9 @@ class Hip(CMakePackage): create portable applications for AMD and NVIDIA GPUs from single source code.""" - homepage = "https://github.com/ROCm-Developer-Tools/HIP" - git = "https://github.com/ROCm-Developer-Tools/HIP.git" - url = "https://github.com/ROCm-Developer-Tools/HIP/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/HIP" + git = "https://github.com/ROCm/HIP.git" + url = "https://github.com/ROCm/HIP/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath", "haampie") @@ -27,6 +27,7 @@ class Hip(CMakePackage): license("MIT") version("master", branch="master") + version("6.0.0", sha256="0d575788e0b731124a8489a36652014a165b9ebab92d5456ec3c976e062f3a82") version("5.7.1", sha256="eaa0e14a9ae45c58ed37863797b683a7778b3cbbf92f5b6529ec65fd61d61f3e") version("5.7.0", sha256="cb61234eec7879fb7e20937659ad535b93a6e66fc8de0a543da8b7702474f2fc") version("5.6.1", sha256="4b3c4dfcf8595da0e1b8c3e8067b1ccebeaac337762ff098db14375fa8dd4487") @@ -172,6 +173,7 @@ class Hip(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hsakmt-roct@" + ver, when="@" + ver) depends_on("hsa-rocr-dev@" + ver, when="@" + ver) @@ -180,12 +182,22 @@ class Hip(CMakePackage): depends_on("rocminfo@" + ver, when="@" + ver) depends_on("roctracer-dev-api@" + ver, when="@" + ver) - for ver in ["5.4.0", "5.4.3", "5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in [ + "5.4.0", + "5.4.3", + "5.5.0", + "5.5.1", + "5.6.0", + "5.6.1", + "5.7.0", + "5.7.1", + "6.0.0", + ]: depends_on("hipify-clang", when="@" + ver) - for ver in ["5.5.0", 
"5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) # hipcc likes to add `-lnuma` by default :( - # ref https://github.com/ROCm-Developer-Tools/HIP/pull/2202 + # ref https://github.com/ROCm/HIP/pull/2202 depends_on("numactl", when="@3.7.0:") # roc-obj-ls requirements @@ -212,9 +224,7 @@ class Hip(CMakePackage): ]: resource( name="hipamd", - url="https://github.com/ROCm-Developer-Tools/hipamd/archive/rocm-{0}.tar.gz".format( - d_version - ), + url="https://github.com/ROCm/hipamd/archive/rocm-{0}.tar.gz".format(d_version), sha256=d_shasum, expand=True, destination="", @@ -241,7 +251,7 @@ class Hip(CMakePackage): ]: resource( name="opencl", - url="https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime/archive/rocm-{0}.tar.gz".format( + url="https://github.com/ROCm/ROCm-OpenCL-Runtime/archive/rocm-{0}.tar.gz".format( d_version ), sha256=d_shasum, @@ -269,9 +279,7 @@ class Hip(CMakePackage): ]: resource( name="rocclr", - url="https://github.com/ROCm-Developer-Tools/ROCclr/archive/rocm-{0}.tar.gz".format( - d_version - ), + url="https://github.com/ROCm/ROCclr/archive/rocm-{0}.tar.gz".format(d_version), sha256=d_shasum, expand=True, destination="", @@ -280,6 +288,7 @@ class Hip(CMakePackage): ) # Add hip-clr sources thru the below for d_version, d_shasum in [ + ("6.0.0", "798b55b5b5fb90dd19db54f136d8d8e1da9ae1e408d5b12b896101d635f97e50"), ("5.7.1", "c78490335233a11b4d8a5426ace7417c555f5e2325de10422df06c0f0f00f7eb"), ("5.7.0", "bc2447cb6fd86dff6a333b04e77ce85755104d9011a14a044af53caf02449573"), ("5.6.1", "0b88af1e99643899d11b1c8cf8a3c46601051b328a5e0ffbd44ee88b7eb0db33"), @@ -287,9 +296,7 @@ class Hip(CMakePackage): ]: resource( name="clr", - url="https://github.com/ROCm-Developer-Tools/clr/archive/refs/tags/rocm-{0}.tar.gz".format( - d_version - ), + url="https://github.com/ROCm/clr/archive/refs/tags/rocm-{0}.tar.gz".format(d_version), sha256=d_shasum, 
expand=True, destination="", @@ -299,6 +306,7 @@ class Hip(CMakePackage): # Add hipcc sources thru the below for d_version, d_shasum in [ + ("6.0.0", "e9cfaaecaf0e6ed363946439197f340c115e8e1189f96dbd716cf20245c29255"), ("5.7.1", "d47d27ef2b5de7f49cdfd8547832ac9b437a32e6fc6f0e9c1646f4b704c90aee"), ("5.7.0", "9f839bf7226e5e26f3150f8ba6eca507ab9a668e68b207736301b3bb9040c973"), ("5.6.1", "5800fac92b841ef6f52acda78d9bf86f83970bec0fb848a6265d239bdb7eb51a"), @@ -306,7 +314,7 @@ class Hip(CMakePackage): ]: resource( name="hipcc", - url="https://github.com/ROCm-Developer-Tools/HIPCC/archive/refs/tags/rocm-{0}.tar.gz".format( + url="https://github.com/ROCm/HIPCC/archive/refs/tags/rocm-{0}.tar.gz".format( d_version ), sha256=d_shasum, @@ -317,6 +325,7 @@ class Hip(CMakePackage): ) # Add hiptests sources thru the below for d_version, d_shasum in [ + ("6.0.0", "e8f92a0f5d1f6093ca1fb24ff1b7140128900fcdc6e9f01f153d6907e5c2d807"), ("5.7.1", "28fbdf49f405adfee903bc0f05a43ac392c55b34c514c3582dfb7d6d67e79985"), ("5.7.0", "b1dae3cfc715e71dce92ac1da94265a9398944c76cee85ffab8f0c93665a48d6"), ("5.6.1", "5b3002ddfafda162329e4d9e6ac1200eeb48ff08e666b342aa8aeca30750f48b"), @@ -324,7 +333,7 @@ class Hip(CMakePackage): ]: resource( name="hip-tests", - url="https://github.com/ROCm-Developer-Tools/hip-tests/archive/refs/tags/rocm-{0}.tar.gz".format( + url="https://github.com/ROCm/hip-tests/archive/refs/tags/rocm-{0}.tar.gz".format( d_version ), sha256=d_shasum, @@ -366,10 +375,10 @@ class Hip(CMakePackage): ) patch("0013-remove-compiler-rt-linkage-for-host.5.3.0.patch", when="@5.3.0:5.4") - # See https://github.com/ROCm-Developer-Tools/HIP/pull/2141 + # See https://github.com/ROCm/HIP/pull/2141 patch("0002-Fix-detection-of-HIP_CLANG_ROOT.patch", when="@:3.9.0") - # See https://github.com/ROCm-Developer-Tools/HIP/pull/2218 + # See https://github.com/ROCm/HIP/pull/2218 patch("0003-Improve-compilation-without-git-repo.3.7.0.patch", when="@3.7.0:3.9.0") 
patch("0003-Improve-compilation-without-git-repo.3.10.0.patch", when="@3.10.0:4.0.0") patch("0003-Improve-compilation-without-git-repo.4.1.0.patch", when="@4.1.0") @@ -383,7 +392,7 @@ class Hip(CMakePackage): "_disabletests.4.5.0.patch", when="@4.5.0:4.5.3", ) - # See https://github.com/ROCm-Developer-Tools/HIP/pull/2219 + # See https://github.com/ROCm/HIP/pull/2219 patch("0004-Drop-clang-rt-builtins-linking-on-hip-host.3.7.0.patch", when="@3.7.0:3.9.0") patch("0004-Drop-clang-rt-builtins-linking-on-hip-host.3.10.0.patch", when="@3.10.0:4.1.0") @@ -400,14 +409,16 @@ class Hip(CMakePackage): patch("0014-remove-compiler-rt-linkage-for-host.5.5.0.patch", when="@5.5") patch("0014-remove-compiler-rt-linkage-for-host.5.6.0.patch", when="@5.6.0:5.6") patch("0014-Remove-compiler-rt-linkage-for-host-for-5.7.0.patch", when="@5.7.0:5.7") - patch("0015-reverting-operator-mixup-fix-for-slate.patch", when="@5.6:") - patch("0017-Set-PARAMETERS_MIN_ALIGNMENT-to-the-native-alignment.patch", when="@5.7") + patch("0014-remove-compiler-rt-linkage-for-host.6.0.patch", when="@6.0:") + patch("0015-reverting-operator-mixup-fix-for-slate.patch", when="@5.6:6.0") + patch("0017-Set-PARAMETERS_MIN_ALIGNMENT-to-the-native-alignment.patch", when="@5.7:6.0") + patch("0018-reverting-hipMemoryType-with-memoryType.patch", when="@6.0") - # See https://github.com/ROCm-Developer-Tools/HIP/pull/3206 + # See https://github.com/ROCm/HIP/pull/3206 patch( - "https://github.com/ROCm-Developer-Tools/HIP/commit/50ee82f6bc4aad10908ce09198c9f7ebfb2a3561.patch?full_index=1", + "https://github.com/ROCm/HIP/commit/50ee82f6bc4aad10908ce09198c9f7ebfb2a3561.patch?full_index=1", sha256="c2ee21cdc55262c7c6ba65546b5ca5f65ea89730", - when="@5.2:", + when="@5.2:5.7", ) @property @@ -533,7 +544,7 @@ class Hip(CMakePackage): # This is a variable that does not exist in hipcc but was introduced # in a patch of ours since 3.5.0 to locate rocm_agent_enumerator: - # https://github.com/ROCm-Developer-Tools/HIP/pull/2138 + # 
https://github.com/ROCm/HIP/pull/2138 env.set("ROCMINFO_PATH", paths["rocminfo"]) # This one is used in hipcc to run `clang --hip-device-lib-path=...` @@ -548,7 +559,7 @@ class Hip(CMakePackage): # Used in comgr and seems necessary when using the JIT compiler, e.g. # hiprtcCreateProgram: - # https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/blob/rocm-4.0.0/lib/comgr/src/comgr-env.cpp + # https://github.com/ROCm/ROCm-CompilerSupport/blob/rocm-4.0.0/lib/comgr/src/comgr-env.cpp env.set("LLVM_PATH", paths["llvm-amdgpu"]) env.set("COMGR_PATH", paths["comgr"]) @@ -560,7 +571,7 @@ class Hip(CMakePackage): # and parsing of the <prefix>/bin/.hipVersion file. Let's just set this # to the hip prefix directory for non-external builds so that the # bin/.hipVersion file can still be parsed. - # See also https://github.com/ROCm-Developer-Tools/HIP/issues/2223 + # See also https://github.com/ROCm/HIP/issues/2223 if "@3.8.0:" in self.spec: env.append_path( "HIPCC_COMPILE_FLAGS_APPEND", diff --git a/var/spack/repos/builtin/packages/hipblas/package.py b/var/spack/repos/builtin/packages/hipblas/package.py index e05dfd3768..46b02ad352 100644 --- a/var/spack/repos/builtin/packages/hipblas/package.py +++ b/var/spack/repos/builtin/packages/hipblas/package.py @@ -12,9 +12,9 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage): """hipBLAS is a BLAS marshalling library, with multiple supported backends""" - homepage = "https://github.com/ROCmSoftwarePlatform/hipBLAS" - git = "https://github.com/ROCmSoftwarePlatform/hipBLAS.git" - url = "https://github.com/ROCmSoftwarePlatform/hipBLAS/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/hipBLAS" + git = "https://github.com/ROCm/hipBLAS.git" + url = "https://github.com/ROCm/hipBLAS/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie") @@ -24,6 +24,7 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage): version("develop", branch="develop") version("master", 
branch="master") + version("6.0.0", sha256="8fbd0c244fe82eded866e06d2399b1d91ab5d43d2ebcb73382c7ce1ae48d9cb3") version("5.7.1", sha256="794e9298f48ffbe3bd1c1ab87a5c2c2b953713500155fdec9ef8cbb11f81fc8a") version("5.7.0", sha256="8c6cd2ffa4ce6ab03e05feffe074685b5525610870aebe9d78f817b3037f33a4") version("5.6.1", sha256="f9da82fbefc68b84081ea0ed0139b91d2a540357fcf505c7f1d57eab01eb327c") @@ -136,11 +137,14 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage): patch("link-clients-blas.patch", when="@4.3.0:4.3.2") patch("link-clients-blas-4.5.0.patch", when="@4.5.0:4.5.2") patch("hipblas-link-clients-blas-5.0.0.patch", when="@5.0.0:5.0.2") - patch("remove-hipblas-clients-file-installation.patch", when="@5.5:") + patch("remove-hipblas-clients-file-installation.patch", when="@5.5:5.7.1") + patch("remove-hipblas-clients-file-installation-6.0.patch", when="@6.0:") - depends_on("rocm-cmake@5.2.0:", type="build", when="@5.2.0:") + depends_on("rocm-cmake@5.2.0:", type="build", when="@5.2.0:5.7") depends_on("rocm-cmake@4.5.0:", type="build", when="@4.5.0:") depends_on("rocm-cmake@3.5.0:", type="build") + for ver in ["6.0.0"]: + depends_on("rocm-cmake@" + ver, when="+rocm @" + ver) depends_on("hip +cuda", when="+cuda") @@ -174,12 +178,12 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", "master", "develop", ]: depends_on("rocsolver@" + ver, when="+rocm @" + ver) depends_on("rocblas@" + ver, when="+rocm @" + ver) - for tgt in ROCmPackage.amdgpu_targets: depends_on( "rocblas amdgpu_target={0}".format(tgt), when="+rocm amdgpu_target={0}".format(tgt) diff --git a/var/spack/repos/builtin/packages/hipblas/remove-hipblas-clients-file-installation-6.0.patch b/var/spack/repos/builtin/packages/hipblas/remove-hipblas-clients-file-installation-6.0.patch new file mode 100644 index 0000000000..ca6fa8f413 --- /dev/null +++ b/var/spack/repos/builtin/packages/hipblas/remove-hipblas-clients-file-installation-6.0.patch @@ -0,0 +1,32 @@ +From 
120af1b2483868ebdc2ee5f137418d23c14178ad Mon Sep 17 00:00:00 2001 +From: Renjith Ravindran <Renjith.RavindranKannath@amd.com> +Date: Wed, 10 Jan 2024 04:28:15 +0000 +Subject: [PATCH] Remove hipblas clients file installation + +--- + clients/CMakeLists.txt | 12 ------------ + 1 file changed, 12 deletions(-) + +diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt +index 8206ad7..6a59808 100644 +--- a/clients/CMakeLists.txt ++++ b/clients/CMakeLists.txt +@@ -135,15 +135,3 @@ add_custom_command( OUTPUT "${HIPBLAS_GENTEST}" + + add_custom_target( hipblas-common DEPENDS "${HIPBLAS_COMMON}" "${HIPBLAS_TEMPLATE}" "${HIPBLAS_SMOKE}" "${HIPBLAS_GENTEST}" ) + +-if( BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_BENCHMARKS ) +- rocm_install( +- FILES ${HIPBLAS_COMMON} ${HIPBLAS_TEMPLATE} ${HIPBLAS_SMOKE} +- DESTINATION "${CMAKE_INSTALL_BINDIR}" +- COMPONENT clients-common +- ) +- rocm_install( +- PROGRAMS ${HIPBLAS_GENTEST} +- DESTINATION "${CMAKE_INSTALL_BINDIR}" +- COMPONENT clients-common +- ) +-endif() +-- +2.39.3 + diff --git a/var/spack/repos/builtin/packages/hipcub/package.py b/var/spack/repos/builtin/packages/hipcub/package.py index 61c05e7431..34e16cd4bc 100644 --- a/var/spack/repos/builtin/packages/hipcub/package.py +++ b/var/spack/repos/builtin/packages/hipcub/package.py @@ -9,14 +9,15 @@ from spack.package import * class Hipcub(CMakePackage, CudaPackage, ROCmPackage): """Radeon Open Compute Parallel Primitives Library""" - homepage = "https://github.com/ROCmSoftwarePlatform/hipCUB" - git = "https://github.com/ROCmSoftwarePlatform/hipCUB.git" - url = "https://github.com/ROCmSoftwarePlatform/hipCUB/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/hipCUB" + git = "https://github.com/ROCm/hipCUB.git" + url = "https://github.com/ROCm/hipCUB/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] license("BSD-3-Clause") maintainers("srekolam", "renjithravindrankannath") + version("6.0.0", sha256="8d9f6e1e3f8433a2ceae1b0efd6727c21383980077e264725d00d5fee165bd30") 
version("5.7.1", sha256="9b23a58408bc4c549d3c754196cb3e2c1a50e177ab0a286101cbea2f7f173945") version("5.7.0", sha256="899356867f662d9a6f3870bb4a496f605a3143c6ad4d1fa9e9faead68fa8d13b") version("5.6.1", sha256="4b9479daa40424c9ddbc14ce967aa170680f8ca1ed01a514e6e30ccfa22552ce") @@ -157,6 +158,7 @@ class Hipcub(CMakePackage, CudaPackage, ROCmPackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocprim@" + ver, when="+rocm @" + ver) depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) diff --git a/var/spack/repos/builtin/packages/hipfft/001-remove-submodule-and-sync-shared-files-from-rocFFT.patch b/var/spack/repos/builtin/packages/hipfft/001-remove-submodule-and-sync-shared-files-from-rocFFT.patch new file mode 100644 index 0000000000..537794d3cc --- /dev/null +++ b/var/spack/repos/builtin/packages/hipfft/001-remove-submodule-and-sync-shared-files-from-rocFFT.patch @@ -0,0 +1,11431 @@ +From 27ae15a459f45f1acfcb1a9b1c8d491d9f731fd4 Mon Sep 17 00:00:00 2001 +From: Steve Leung <Steve.Leung@amd.com> +Date: Thu, 4 Jan 2024 16:36:08 -0700 +Subject: [PATCH] remove submodule and sync shared files from rocFFT, update + CHANGELOG.md + +--- + clients/CMakeLists.txt | 15 - + clients/bench/CMakeLists.txt | 4 +- + clients/bench/bench.cpp | 2 +- + clients/hipfft_params.h | 2 +- + clients/tests/CMakeLists.txt | 11 +- + clients/tests/accuracy_test_1D.cpp | 8 +- + clients/tests/accuracy_test_2D.cpp | 8 +- + clients/tests/accuracy_test_3D.cpp | 8 +- + clients/tests/accuracy_test_callback.cpp | 2 +- + clients/tests/gtest_main.cpp | 6 +- + clients/tests/hipfft_accuracy_test.cpp | 11 +- + clients/tests/hipfft_accuracy_test.h | 2 +- + clients/tests/multi_device_test.cpp | 2 +- + cmake/dependencies.cmake | 3 - + library/src/amd_detail/hipfft.cpp | 8 +- + shared/accuracy_test.h | 1949 +++++++++++++ + shared/arithmetic.h | 61 + + shared/array_predicate.h | 47 + + shared/array_validator.cpp | 549 ++++ + shared/array_validator.h | 31 + + shared/concurrency.h | 41 + + 
shared/data_gen_device.h | 1303 +++++++++ + shared/data_gen_host.h | 881 ++++++ + shared/device_properties.h | 74 + + shared/enum_to_string.h | 81 + + shared/environment.h | 97 + + shared/fft_params.h | 3274 ++++++++++++++++++++++ + shared/fftw_transform.h | 493 ++++ + shared/gpubuf.h | 134 + + shared/hip_object_wrapper.h | 86 + + shared/hostbuf.h | 158 ++ + shared/increment.h | 100 + + shared/precision_type.h | 70 + + shared/printbuffer.h | 108 + + shared/ptrdiff.h | 40 + + shared/rocfft_accuracy_test.h | 29 + + shared/rocfft_against_fftw.h | 231 ++ + shared/rocfft_complex.h | 346 +++ + shared/rocfft_hip.h | 52 + + shared/rocfft_params.h | 585 ++++ + shared/test_params.h | 51 + + shared/work_queue.h | 49 + + 46 files changed, 10966 insertions(+), 66 deletions(-) + create mode 100644 shared/accuracy_test.h + create mode 100644 shared/arithmetic.h + create mode 100644 shared/array_predicate.h + create mode 100644 shared/array_validator.cpp + create mode 100644 shared/array_validator.h + create mode 100644 shared/concurrency.h + create mode 100644 shared/data_gen_device.h + create mode 100644 shared/data_gen_host.h + create mode 100644 shared/device_properties.h + create mode 100644 shared/enum_to_string.h + create mode 100644 shared/environment.h + create mode 100644 shared/fft_params.h + create mode 100644 shared/fftw_transform.h + create mode 100644 shared/gpubuf.h + create mode 100644 shared/hip_object_wrapper.h + create mode 100644 shared/hostbuf.h + create mode 100644 shared/increment.h + create mode 100644 shared/precision_type.h + create mode 100644 shared/printbuffer.h + create mode 100644 shared/ptrdiff.h + create mode 100644 shared/rocfft_accuracy_test.h + create mode 100644 shared/rocfft_against_fftw.h + create mode 100644 shared/rocfft_complex.h + create mode 100644 shared/rocfft_hip.h + create mode 100644 shared/rocfft_params.h + create mode 100644 shared/test_params.h + create mode 100644 shared/work_queue.h + +diff --git a/clients/CMakeLists.txt 
b/clients/CMakeLists.txt +index 1db0d9c..b99a9e5 100644 +--- a/clients/CMakeLists.txt ++++ b/clients/CMakeLists.txt +@@ -65,21 +65,6 @@ if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" AND NOT CMAKE_CXX_COMPILER_ID STR + endif() + + +-if( GIT_FOUND AND EXISTS "${CMAKE_SOURCE_DIR}/.git" ) +- message(STATUS "rocFFT submodule update") +- execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive +- WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/clients/rocFFT +- RESULT_VARIABLE GIT_SUBMOD_RESULT) +- if( NOT GIT_SUBMOD_RESULT EQUAL "0" ) +- message(FATAL_ERROR "git submodule update --init --recursive failed with ${GIT_SUBMOD_RESULT}, please checkout submodules manually.") +- endif( ) +-endif( ) +- +-if( NOT EXISTS "${CMAKE_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt" ) +- message(FATAL_ERROR "The rocFFT submodule is not present! Please update git submodules and try again. ${CMAKE_CURRENT_SOURCE_DIR}/clients/rocFFT/CMakeLists.txt") +-endif( ) +- +- + # This option only works for make/nmake and the ninja generators, but no reason it shouldn't be on + # all the time + # This tells cmake to create a compile_commands.json file that can be used with clang tooling or vim +diff --git a/clients/bench/CMakeLists.txt b/clients/bench/CMakeLists.txt +index b5cef9b..ccb8c29 100644 +--- a/clients/bench/CMakeLists.txt ++++ b/clients/bench/CMakeLists.txt +@@ -26,8 +26,8 @@ find_package( Boost COMPONENTS program_options REQUIRED) + set( Boost_USE_STATIC_LIBS OFF ) + + +-set( hipfft_bench_source bench.cpp ../rocFFT/shared/array_validator.cpp ) +-set( hipfft_bench_includes bench.h ../rocFFT/shared/array_validator.h ) ++set( hipfft_bench_source bench.cpp ../../shared/array_validator.cpp ) ++set( hipfft_bench_includes bench.h ../../shared/array_validator.h ) + + add_executable( hipfft-bench ${hipfft_bench_source} ${hipfft_bench_includes} ) + +diff --git a/clients/bench/bench.cpp b/clients/bench/bench.cpp +index 894769c..a906879 100644 +--- a/clients/bench/bench.cpp ++++ 
b/clients/bench/bench.cpp +@@ -29,7 +29,7 @@ + #include <boost/program_options.hpp> + namespace po = boost::program_options; + +-#include "../rocFFT/shared/gpubuf.h" ++#include "../../shared/gpubuf.h" + + int main(int argc, char* argv[]) + { +diff --git a/clients/hipfft_params.h b/clients/hipfft_params.h +index b8b58ac..75d9db9 100644 +--- a/clients/hipfft_params.h ++++ b/clients/hipfft_params.h +@@ -23,9 +23,9 @@ + + #include <optional> + ++#include "../shared/fft_params.h" + #include "hipfft/hipfft.h" + #include "hipfft/hipfftXt.h" +-#include "rocFFT/shared/fft_params.h" + + inline fft_status fft_status_from_hipfftparams(const hipfftResult_t val) + { +diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt +index 9742a45..2d1aac0 100644 +--- a/clients/tests/CMakeLists.txt ++++ b/clients/tests/CMakeLists.txt +@@ -37,14 +37,7 @@ set( hipfft-test_source + accuracy_test_3D.cpp + accuracy_test_callback.cpp + multi_device_test.cpp +- ../rocFFT/shared/array_validator.cpp +- ) +- +-set( hipfft-test_includes +- ../rocFFT/clients/tests/fftw_transform.h +- ../rocFFT/clients/tests/rocfft_against_fftw.h +- ../rocFFT/clients/tests/misc/include/test_exception.h +- ../rocFFT/shared/array_validator.h ++ ../../shared/array_validator.cpp + ) + + add_executable( hipfft-test ${hipfft-test_source} ${hipfft-test_includes} ) +@@ -56,8 +49,6 @@ target_include_directories( + $<BUILD_INTERFACE:${FFTW_INCLUDE_DIRS}> + $<BUILD_INTERFACE:${hip_INCLUDE_DIRS}> + $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../library/include> +- $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../rocFFT/library/include> +- $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../rocFFT/clients/tests> + ) + + +diff --git a/clients/tests/accuracy_test_1D.cpp b/clients/tests/accuracy_test_1D.cpp +index 27e849d..57d846a 100644 +--- a/clients/tests/accuracy_test_1D.cpp ++++ b/clients/tests/accuracy_test_1D.cpp +@@ -23,11 +23,11 @@ + #include <stdexcept> + #include <vector> + +-#include 
"../rocFFT/shared/fft_params.h" ++#include "../../shared/fft_params.h" + +-#include "accuracy_test.h" +-#include "fftw_transform.h" +-#include "rocfft_against_fftw.h" ++#include "../../shared/accuracy_test.h" ++#include "../../shared/fftw_transform.h" ++#include "../../shared/rocfft_against_fftw.h" + + using ::testing::ValuesIn; + +diff --git a/clients/tests/accuracy_test_2D.cpp b/clients/tests/accuracy_test_2D.cpp +index 1674593..6f618c0 100644 +--- a/clients/tests/accuracy_test_2D.cpp ++++ b/clients/tests/accuracy_test_2D.cpp +@@ -23,11 +23,11 @@ + #include <stdexcept> + #include <vector> + +-#include "../rocFFT/shared/fft_params.h" ++#include "../../shared/fft_params.h" + +-#include "accuracy_test.h" +-#include "fftw_transform.h" +-#include "rocfft_against_fftw.h" ++#include "../../shared/accuracy_test.h" ++#include "../../shared/fftw_transform.h" ++#include "../../shared/rocfft_against_fftw.h" + + using ::testing::ValuesIn; + +diff --git a/clients/tests/accuracy_test_3D.cpp b/clients/tests/accuracy_test_3D.cpp +index a87476a..941ec24 100644 +--- a/clients/tests/accuracy_test_3D.cpp ++++ b/clients/tests/accuracy_test_3D.cpp +@@ -23,11 +23,11 @@ + #include <stdexcept> + #include <vector> + +-#include "../rocFFT/shared/fft_params.h" ++#include "../../shared/fft_params.h" + +-#include "accuracy_test.h" +-#include "fftw_transform.h" +-#include "rocfft_against_fftw.h" ++#include "../../shared/accuracy_test.h" ++#include "../../shared/fftw_transform.h" ++#include "../../shared/rocfft_against_fftw.h" + + using ::testing::ValuesIn; + +diff --git a/clients/tests/accuracy_test_callback.cpp b/clients/tests/accuracy_test_callback.cpp +index 4782830..b5cc4a7 100644 +--- a/clients/tests/accuracy_test_callback.cpp ++++ b/clients/tests/accuracy_test_callback.cpp +@@ -18,7 +18,7 @@ + // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + // THE SOFTWARE. 
+ +-#include "accuracy_test.h" ++#include "../../shared/accuracy_test.h" + + std::vector<std::vector<size_t>> callback_sizes = { + // some single kernel sizes +diff --git a/clients/tests/gtest_main.cpp b/clients/tests/gtest_main.cpp +index 1f0ae83..2f7674e 100644 +--- a/clients/tests/gtest_main.cpp ++++ b/clients/tests/gtest_main.cpp +@@ -30,10 +30,10 @@ + #include <streambuf> + #include <string> + ++#include "../../shared/concurrency.h" ++#include "../../shared/environment.h" ++#include "../../shared/work_queue.h" + #include "../hipfft_params.h" +-#include "../rocFFT/shared/concurrency.h" +-#include "../rocFFT/shared/environment.h" +-#include "../rocFFT/shared/work_queue.h" + #include "hipfft/hipfft.h" + #include "hipfft_accuracy_test.h" + #include "hipfft_test_params.h" +diff --git a/clients/tests/hipfft_accuracy_test.cpp b/clients/tests/hipfft_accuracy_test.cpp +index 2abaf74..609239a 100644 +--- a/clients/tests/hipfft_accuracy_test.cpp ++++ b/clients/tests/hipfft_accuracy_test.cpp +@@ -29,11 +29,12 @@ + #include "hipfft/hipfft.h" + + #include "../hipfft_params.h" +-#include "../rocFFT/clients/tests/fftw_transform.h" +-#include "../rocFFT/clients/tests/rocfft_accuracy_test.h" +-#include "../rocFFT/clients/tests/rocfft_against_fftw.h" +-#include "../rocFFT/shared/gpubuf.h" +-#include "../rocFFT/shared/rocfft_complex.h" ++ ++#include "../../shared/accuracy_test.h" ++#include "../../shared/fftw_transform.h" ++#include "../../shared/gpubuf.h" ++#include "../../shared/rocfft_against_fftw.h" ++#include "../../shared/rocfft_complex.h" + + void fft_vs_reference(hipfft_params& params, bool round_trip) + { +diff --git a/clients/tests/hipfft_accuracy_test.h b/clients/tests/hipfft_accuracy_test.h +index 0491bd9..181150e 100644 +--- a/clients/tests/hipfft_accuracy_test.h ++++ b/clients/tests/hipfft_accuracy_test.h +@@ -23,8 +23,8 @@ + #ifndef ROCFFT_ACCURACY_TEST + #define ROCFFT_ACCURACY_TEST + ++#include "../../shared/accuracy_test.h" + #include "../hipfft_params.h" 
+-#include "../rocFFT/clients/tests/accuracy_test.h" + + void fft_vs_reference(hipfft_params& params, bool round_trip = false); + +diff --git a/clients/tests/multi_device_test.cpp b/clients/tests/multi_device_test.cpp +index b3dc4c9..3274b80 100644 +--- a/clients/tests/multi_device_test.cpp ++++ b/clients/tests/multi_device_test.cpp +@@ -18,7 +18,7 @@ + // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + // THE SOFTWARE. + +-#include "accuracy_test.h" ++#include "../../shared/accuracy_test.h" + #include <gtest/gtest.h> + #include <hip/hip_runtime_api.h> + +diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake +index 5810e37..bdbf689 100644 +--- a/cmake/dependencies.cmake ++++ b/cmake/dependencies.cmake +@@ -21,9 +21,6 @@ + # + # ############################################################################# + +-# Git +-find_package(Git REQUIRED) +- + # HIP + if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" ) + if( NOT BUILD_WITH_LIB STREQUAL "CUDA" ) +diff --git a/library/src/amd_detail/hipfft.cpp b/library/src/amd_detail/hipfft.cpp +index c2f7036..3d4f61f 100644 +--- a/library/src/amd_detail/hipfft.cpp ++++ b/library/src/amd_detail/hipfft.cpp +@@ -27,10 +27,10 @@ + #include <string> + #include <vector> + +-#include "../../../clients/rocFFT/shared/arithmetic.h" +-#include "../../../clients/rocFFT/shared/gpubuf.h" +-#include "../../../clients/rocFFT/shared/ptrdiff.h" +-#include "../../../clients/rocFFT/shared/rocfft_hip.h" ++#include "../../../shared/arithmetic.h" ++#include "../../../shared/gpubuf.h" ++#include "../../../shared/ptrdiff.h" ++#include "../../../shared/rocfft_hip.h" + + #define ROC_FFT_CHECK_ALLOC_FAILED(ret) \ + { \ +diff --git a/shared/accuracy_test.h b/shared/accuracy_test.h +new file mode 100644 +index 0000000..362a7c1 +--- /dev/null ++++ b/shared/accuracy_test.h +@@ -0,0 +1,1949 @@ ++// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#pragma once ++ ++#ifndef ACCURACY_TEST ++#define ACCURACY_TEST ++ ++#include <algorithm> ++#include <functional> ++#include <future> ++#include <iterator> ++#include <string> ++#include <vector> ++ ++#include "enum_to_string.h" ++#include "fft_params.h" ++#include "fftw_transform.h" ++#include "gpubuf.h" ++#include "rocfft_against_fftw.h" ++#include "test_params.h" ++ ++extern int verbose; ++extern size_t ramgb; ++extern bool fftw_compare; ++ ++static const size_t ONE_GiB = 1 << 30; ++ ++inline size_t bytes_to_GiB(const size_t bytes) ++{ ++ return bytes == 0 ? 0 : (bytes - 1 + ONE_GiB) / ONE_GiB; ++} ++ ++typedef std::tuple<fft_transform_type, fft_result_placement, fft_array_type, fft_array_type> ++ type_place_io_t; ++ ++// Remember the results of the last FFT we computed with FFTW. Tests ++// are ordered so that later cases can often reuse this result. 
++struct last_cpu_fft_cache ++{ ++ // keys to the cache ++ std::vector<size_t> length; ++ size_t nbatch = 0; ++ fft_transform_type transform_type = fft_transform_type_complex_forward; ++ bool run_callbacks = false; ++ fft_precision precision = fft_precision_single; ++ ++ // FFTW input/output ++ std::vector<hostbuf> cpu_input; ++ std::vector<hostbuf> cpu_output; ++}; ++extern last_cpu_fft_cache last_cpu_fft_data; ++ ++struct system_memory ++{ ++ size_t total_bytes = 0; ++ size_t free_bytes = 0; ++}; ++extern system_memory start_memory; ++ ++system_memory get_system_memory(); ++ ++// Estimate the amount of host memory needed for buffers. ++inline size_t needed_ram_buffers(const fft_params& params, const int verbose) ++{ ++ // This calculation is assuming contiguous data but noncontiguous buffers ++ // are assumed to require a close enough amount of space for the purposes ++ // of this estimate. ++ ++ size_t needed_ram = 6 ++ * std::accumulate(params.length.begin(), ++ params.length.end(), ++ static_cast<size_t>(1), ++ std::multiplies<size_t>()); ++ ++ // Account for precision and data type: ++ if(params.transform_type != fft_transform_type_real_forward ++ && params.transform_type != fft_transform_type_real_inverse) ++ { ++ needed_ram *= 2; ++ } ++ switch(params.precision) ++ { ++ case fft_precision_half: ++ needed_ram *= 2; ++ break; ++ case fft_precision_single: ++ needed_ram *= 4; ++ break; ++ case fft_precision_double: ++ needed_ram *= 8; ++ break; ++ } ++ ++ needed_ram *= params.nbatch; ++ ++ if(verbose) ++ { ++ std::cout << "required host memory for buffers (GiB): " << bytes_to_GiB(needed_ram) << "\n"; ++ } ++ ++ return needed_ram; ++} ++ ++template <typename Tfloat> ++bool fftw_plan_uses_bluestein(const typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan) ++{ ++#ifdef FFTW_HAVE_SPRINT_PLAN ++ char* print_plan_c_str = fftw_sprint_plan<Tfloat>(cpu_plan); ++ std::string print_plan(print_plan_c_str); ++ free(print_plan_c_str); ++ return 
print_plan.find("bluestein") != std::string::npos; ++#else ++ // assume worst case (bluestein is always used) ++ return true; ++#endif ++} ++ ++// Estimate the amount of host memory needed for fftw. ++template <typename Tfloat> ++inline size_t needed_ram_fftw(const fft_params& contiguous_params, ++ const typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan, ++ const int verbose) ++{ ++ size_t total_length = std::accumulate(contiguous_params.length.begin(), ++ contiguous_params.length.end(), ++ static_cast<size_t>(1), ++ std::multiplies<size_t>()); ++ size_t needed_ram = 0; ++ // Detect Bluestein in plan ++ if(fftw_plan_uses_bluestein<Tfloat>(cpu_plan)) ++ { ++ for(size_t dim : contiguous_params.length) ++ { ++ unsigned int needed_ram_dim = dim; ++ ++ // Next-plus-one-power-of-two multiplied any other lengths ++ needed_ram_dim--; ++ ++ needed_ram_dim |= needed_ram_dim >> 2; ++ needed_ram_dim |= needed_ram_dim >> 4; ++ needed_ram_dim |= needed_ram_dim >> 8; ++ needed_ram_dim |= needed_ram_dim >> 16; ++ ++ needed_ram_dim++; ++ ++ needed_ram_dim *= 2 * (total_length / dim); ++ ++ if(needed_ram_dim > needed_ram) ++ { ++ needed_ram = needed_ram_dim; ++ } ++ } ++ } ++ ++ // Account for precision and data type: ++ if(contiguous_params.transform_type != fft_transform_type_real_forward ++ && contiguous_params.transform_type != fft_transform_type_real_inverse) ++ { ++ needed_ram *= 2; ++ } ++ switch(contiguous_params.precision) ++ { ++ case fft_precision_half: ++ needed_ram *= 2; ++ break; ++ case fft_precision_single: ++ needed_ram *= 4; ++ break; ++ case fft_precision_double: ++ needed_ram *= 8; ++ break; ++ } ++ ++ needed_ram *= contiguous_params.nbatch; ++ ++ if(verbose) ++ { ++ std::cout << "required host memory for FFTW (GiB): " << bytes_to_GiB(needed_ram) << "\n"; ++ } ++ ++ return needed_ram; ++} ++ ++// Base gtest class for comparison with FFTW. 
++class accuracy_test : public ::testing::TestWithParam<fft_params> ++{ ++protected: ++ void SetUp() override {} ++ void TearDown() override {} ++ ++public: ++ static std::string TestName(const testing::TestParamInfo<accuracy_test::ParamType>& info) ++ { ++ return info.param.token(); ++ } ++}; ++ ++const static std::vector<size_t> batch_range = {2, 1}; ++ ++const static std::vector<fft_precision> precision_range_full ++ = {fft_precision_double, fft_precision_single, fft_precision_half}; ++const static std::vector<fft_precision> precision_range_sp_dp ++ = {fft_precision_double, fft_precision_single}; ++ ++const static std::vector<fft_result_placement> place_range ++ = {fft_placement_inplace, fft_placement_notinplace}; ++const static std::vector<fft_transform_type> trans_type_range ++ = {fft_transform_type_complex_forward, fft_transform_type_real_forward}; ++const static std::vector<fft_transform_type> trans_type_range_complex ++ = {fft_transform_type_complex_forward}; ++const static std::vector<fft_transform_type> trans_type_range_real ++ = {fft_transform_type_real_forward}; ++ ++// Given a vector of vector of lengths, generate all unique permutations. ++// Add an optional vector of ad-hoc lengths to the result. 
++inline std::vector<std::vector<size_t>> ++ generate_lengths(const std::vector<std::vector<size_t>>& inlengths) ++{ ++ std::vector<std::vector<size_t>> output; ++ if(inlengths.size() == 0) ++ { ++ return output; ++ } ++ const size_t dim = inlengths.size(); ++ std::vector<size_t> looplength(dim); ++ for(unsigned int i = 0; i < dim; ++i) ++ { ++ looplength[i] = inlengths[i].size(); ++ } ++ for(unsigned int idx = 0; idx < inlengths.size(); ++idx) ++ { ++ std::vector<size_t> index(dim); ++ do ++ { ++ std::vector<size_t> length(dim); ++ for(unsigned int i = 0; i < dim; ++i) ++ { ++ length[i] = inlengths[i][index[i]]; ++ } ++ output.push_back(length); ++ } while(increment_rowmajor(index, looplength)); ++ } ++ // uniquify the result ++ std::sort(output.begin(), output.end()); ++ output.erase(std::unique(output.begin(), output.end()), output.end()); ++ return output; ++} ++ ++// Return the valid rocFFT input and output types for a given transform type. ++inline std::vector<std::pair<fft_array_type, fft_array_type>> ++ iotypes(const fft_transform_type transformType, ++ const fft_result_placement place, ++ const bool planar = true) ++{ ++ std::vector<std::pair<fft_array_type, fft_array_type>> iotypes; ++ switch(transformType) ++ { ++ case fft_transform_type_complex_forward: ++ case fft_transform_type_complex_inverse: ++ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>( ++ fft_array_type_complex_interleaved, fft_array_type_complex_interleaved)); ++ if(planar) ++ { ++ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>( ++ fft_array_type_complex_planar, fft_array_type_complex_planar)); ++ if(place == fft_placement_notinplace) ++ { ++ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>( ++ fft_array_type_complex_planar, fft_array_type_complex_interleaved)); ++ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>( ++ fft_array_type_complex_interleaved, fft_array_type_complex_planar)); ++ } ++ } ++ break; ++ case 
fft_transform_type_real_forward: ++ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>( ++ fft_array_type_real, fft_array_type_hermitian_interleaved)); ++ if(planar && place == fft_placement_notinplace) ++ { ++ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>( ++ fft_array_type_real, fft_array_type_hermitian_planar)); ++ } ++ break; ++ case fft_transform_type_real_inverse: ++ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>( ++ fft_array_type_hermitian_interleaved, fft_array_type_real)); ++ if(planar && place == fft_placement_notinplace) ++ { ++ iotypes.push_back(std::make_pair<fft_array_type, fft_array_type>( ++ fft_array_type_hermitian_planar, fft_array_type_real)); ++ } ++ break; ++ default: ++ throw std::runtime_error("Invalid transform type"); ++ } ++ return iotypes; ++} ++ ++// Generate all combinations of input/output types, from combinations of transform and placement ++// types. ++static std::vector<type_place_io_t> ++ generate_types(fft_transform_type transform_type, ++ const std::vector<fft_result_placement>& place_range, ++ const bool planar) ++{ ++ std::vector<type_place_io_t> ret; ++ for(auto place : place_range) ++ { ++ for(auto iotype : iotypes(transform_type, place, planar)) ++ { ++ ret.push_back(std::make_tuple(transform_type, place, iotype.first, iotype.second)); ++ } ++ } ++ return ret; ++} ++ ++struct stride_generator ++{ ++ struct stride_dist ++ { ++ stride_dist(const std::vector<size_t>& s, size_t d) ++ : stride(s) ++ , dist(d) ++ { ++ } ++ std::vector<size_t> stride; ++ size_t dist; ++ }; ++ ++ // NOTE: allow for this ctor to be implicit, so it's less typing for a test writer ++ // ++ // cppcheck-suppress noExplicitConstructor ++ stride_generator(const std::vector<std::vector<size_t>>& stride_list_in) ++ : stride_list(stride_list_in) ++ { ++ } ++ virtual std::vector<stride_dist> generate(const std::vector<size_t>& lengths, ++ size_t batch) const ++ { ++ std::vector<stride_dist> ret; ++ for(const 
auto& s : stride_list) ++ ret.emplace_back(s, 0); ++ return ret; ++ } ++ std::vector<std::vector<size_t>> stride_list; ++}; ++ ++// Generate strides such that batch is essentially the innermost dimension ++// e.g. given a batch-2 4x3x2 transform which logically looks like: ++// ++// batch0: ++// A B A B ++// A B A B ++// A B A B ++// ++// A B A B ++// A B A B ++// A B A B ++// ++// batch1: ++// A B A B ++// A B A B ++// A B A B ++// ++// A B A B ++// A B A B ++// A B A B ++// ++// we instead do stride-2 4x3x2 transform where first batch is the ++// A's and second batch is the B's. ++struct stride_generator_3D_inner_batch : public stride_generator ++{ ++ explicit stride_generator_3D_inner_batch(const std::vector<std::vector<size_t>>& stride_list_in) ++ : stride_generator(stride_list_in) ++ { ++ } ++ std::vector<stride_dist> generate(const std::vector<size_t>& lengths, ++ size_t batch) const override ++ { ++ std::vector<stride_dist> ret = stride_generator::generate(lengths, batch); ++ std::vector<size_t> strides{lengths[1] * lengths[2] * batch, lengths[2] * batch, batch}; ++ ret.emplace_back(strides, 1); ++ return ret; ++ } ++}; ++ ++// Create an array of parameters to pass to gtest. Base generator ++// that allows choosing transform type. 
++inline auto param_generator_base(const std::vector<fft_transform_type>& type_range, ++ const std::vector<std::vector<size_t>>& v_lengths, ++ const std::vector<fft_precision>& precision_range, ++ const std::vector<size_t>& batch_range, ++ decltype(generate_types) types_generator, ++ const stride_generator& istride, ++ const stride_generator& ostride, ++ const std::vector<std::vector<size_t>>& ioffset_range, ++ const std::vector<std::vector<size_t>>& ooffset_range, ++ const std::vector<fft_result_placement>& place_range, ++ const bool planar = true, ++ const bool run_callbacks = false) ++{ ++ ++ std::vector<fft_params> params; ++ ++ // For any length, we compute double-precision CPU reference ++ // for largest batch size first and reuse for smaller batch ++ // sizes, then convert to single-precision. ++ ++ for(auto& transform_type : type_range) ++ { ++ for(const auto& lengths : v_lengths) ++ { ++ // try to ensure that we are given literal lengths, not ++ // something to be passed to generate_lengths ++ if(lengths.empty() || lengths.size() > 3) ++ { ++ continue; ++ } ++ { ++ for(const auto precision : precision_range) ++ { ++ for(const auto batch : batch_range) ++ { ++ for(const auto& types : ++ types_generator(transform_type, place_range, planar)) ++ { ++ for(const auto& istride_dist : istride.generate(lengths, batch)) ++ { ++ for(const auto& ostride_dist : ostride.generate(lengths, batch)) ++ { ++ for(const auto& ioffset : ioffset_range) ++ { ++ for(const auto& ooffset : ooffset_range) ++ { ++ fft_params param; ++ ++ param.length = lengths; ++ param.istride = istride_dist.stride; ++ param.ostride = ostride_dist.stride; ++ param.nbatch = batch; ++ param.precision = precision; ++ param.transform_type = std::get<0>(types); ++ param.placement = std::get<1>(types); ++ param.idist = istride_dist.dist; ++ param.odist = ostride_dist.dist; ++ param.itype = std::get<2>(types); ++ param.otype = std::get<3>(types); ++ param.ioffset = ioffset; ++ param.ooffset = ooffset; ++ ++ 
if(run_callbacks) ++ { ++ // add a test if both input and output support callbacks ++ if(param.itype != fft_array_type_complex_planar ++ && param.itype != fft_array_type_hermitian_planar ++ && param.otype != fft_array_type_complex_planar ++ && param.otype ++ != fft_array_type_hermitian_planar) ++ { ++ param.run_callbacks = true; ++ } ++ else ++ { ++ continue; ++ } ++ } ++ param.validate(); ++ ++ // Keeping the random number generator here ++ // allows one to run the same tests for a given ++ // random seed; ie the test suite is repeatable. ++ std::hash<std::string> hasher; ++ std::ranlux24_base gen(random_seed ++ + hasher(param.token())); ++ std::uniform_real_distribution<> dis(0.0, 1.0); ++ ++ if(param.is_planar()) ++ { ++ const double roll = dis(gen); ++ if(roll > planar_prob) ++ { ++ if(verbose > 4) ++ { ++ std::cout << "Planar transform skipped " ++ "(planar_prob: " ++ << planar_prob << " > " << roll ++ << ")\n"; ++ } ++ continue; ++ } ++ } ++ if(run_callbacks) ++ { ++ const double roll = dis(gen); ++ if(roll > callback_prob) ++ { ++ ++ if(verbose > 4) ++ { ++ std::cout << "Callback transform skipped " ++ "(planar_prob: " ++ << planar_prob << " > " << roll ++ << ")\n"; ++ } ++ continue; ++ } ++ } ++ ++ if(param.valid(0)) ++ { ++ params.push_back(param); ++ } ++ } ++ } ++ } ++ } ++ } ++ } ++ } ++ } ++ } ++ } ++ return params; ++} ++ ++// Create an array of parameters to pass to gtest. Default generator ++// that picks all transform types. 
++inline auto param_generator(const std::vector<std::vector<size_t>>& v_lengths, ++ const std::vector<fft_precision>& precision_range, ++ const std::vector<size_t>& batch_range, ++ const stride_generator& istride, ++ const stride_generator& ostride, ++ const std::vector<std::vector<size_t>>& ioffset_range, ++ const std::vector<std::vector<size_t>>& ooffset_range, ++ const std::vector<fft_result_placement>& place_range, ++ const bool planar, ++ const bool run_callbacks = false) ++{ ++ return param_generator_base(trans_type_range, ++ v_lengths, ++ precision_range, ++ batch_range, ++ generate_types, ++ istride, ++ ostride, ++ ioffset_range, ++ ooffset_range, ++ place_range, ++ planar, ++ run_callbacks); ++} ++ ++// Create an array of parameters to pass to gtest. Only tests complex-type transforms ++inline auto param_generator_complex(const std::vector<std::vector<size_t>>& v_lengths, ++ const std::vector<fft_precision>& precision_range, ++ const std::vector<size_t>& batch_range, ++ const stride_generator& istride, ++ const stride_generator& ostride, ++ const std::vector<std::vector<size_t>>& ioffset_range, ++ const std::vector<std::vector<size_t>>& ooffset_range, ++ const std::vector<fft_result_placement>& place_range, ++ const bool planar, ++ const bool run_callbacks = false) ++{ ++ return param_generator_base(trans_type_range_complex, ++ v_lengths, ++ precision_range, ++ batch_range, ++ generate_types, ++ istride, ++ ostride, ++ ioffset_range, ++ ooffset_range, ++ place_range, ++ planar, ++ run_callbacks); ++} ++ ++// Create an array of parameters to pass to gtest. 
++inline auto param_generator_real(const std::vector<std::vector<size_t>>& v_lengths, ++ const std::vector<fft_precision>& precision_range, ++ const std::vector<size_t>& batch_range, ++ const stride_generator& istride, ++ const stride_generator& ostride, ++ const std::vector<std::vector<size_t>>& ioffset_range, ++ const std::vector<std::vector<size_t>>& ooffset_range, ++ const std::vector<fft_result_placement>& place_range, ++ const bool planar, ++ const bool run_callbacks = false) ++{ ++ return param_generator_base(trans_type_range_real, ++ v_lengths, ++ precision_range, ++ batch_range, ++ generate_types, ++ istride, ++ ostride, ++ ioffset_range, ++ ooffset_range, ++ place_range, ++ planar, ++ run_callbacks); ++} ++ ++template <class Tcontainer> ++auto param_generator_token(const Tcontainer& tokens) ++{ ++ std::vector<fft_params> params; ++ params.reserve(tokens.size()); ++ for(auto t : tokens) ++ { ++ params.push_back({}); ++ params.back().from_token(t); ++ } ++ return params; ++} ++ ++struct callback_test_data ++{ ++ // scalar to modify the input/output with ++ double scalar; ++ // base address of input, to ensure that each callback gets an offset from that base ++ void* base; ++}; ++ ++void* get_load_callback_host(fft_array_type itype, ++ fft_precision precision, ++ bool round_trip_inverse); ++void apply_load_callback(const fft_params& params, std::vector<hostbuf>& input); ++void apply_store_callback(const fft_params& params, std::vector<hostbuf>& output); ++void* get_store_callback_host(fft_array_type otype, ++ fft_precision precision, ++ bool round_trip_inverse); ++ ++static auto allocate_cpu_fft_buffer(const fft_precision precision, ++ const fft_array_type type, ++ const std::vector<size_t>& size) ++{ ++ // FFTW does not support half-precision, so we do single instead. ++ // So if we need to do a half-precision FFTW transform, allocate ++ // enough buffer for single-precision instead. ++ return allocate_host_buffer( ++ precision == fft_precision_half ? 
fft_precision_single : precision, type, size); ++} ++ ++template <typename Tfloat> ++inline void execute_cpu_fft(fft_params& params, ++ fft_params& contiguous_params, ++ typename fftw_trait<Tfloat>::fftw_plan_type& cpu_plan, ++ std::vector<hostbuf>& cpu_input, ++ std::vector<hostbuf>& cpu_output) ++{ ++ // CPU output might not be allocated already for us, if FFTW never ++ // needed an output buffer during planning ++ if(cpu_output.empty()) ++ cpu_output = allocate_cpu_fft_buffer( ++ contiguous_params.precision, contiguous_params.otype, contiguous_params.osize); ++ ++ // If this is either C2R or callbacks are enabled, the ++ // input will be modified. So we need to modify the copy instead. ++ std::vector<hostbuf> cpu_input_copy(cpu_input.size()); ++ std::vector<hostbuf>* input_ptr = &cpu_input; ++ if(params.run_callbacks || contiguous_params.transform_type == fft_transform_type_real_inverse) ++ { ++ for(size_t i = 0; i < cpu_input.size(); ++i) ++ { ++ cpu_input_copy[i] = cpu_input[i].copy(); ++ } ++ ++ input_ptr = &cpu_input_copy; ++ } ++ ++ // run FFTW (which may destroy CPU input) ++ apply_load_callback(params, *input_ptr); ++ fftw_run<Tfloat>(contiguous_params.transform_type, cpu_plan, *input_ptr, cpu_output); ++ // clean up ++ fftw_destroy_plan_type(cpu_plan); ++ // ask FFTW to fully clean up, since it tries to cache plan details ++ fftw_cleanup(); ++ cpu_plan = nullptr; ++ apply_store_callback(params, cpu_output); ++} ++ ++// execute the GPU transform ++template <class Tparams> ++inline void execute_gpu_fft(Tparams& params, ++ std::vector<void*>& pibuffer, ++ std::vector<void*>& pobuffer, ++ std::vector<gpubuf>& obuffer, ++ std::vector<hostbuf>& gpu_output, ++ bool round_trip_inverse = false) ++{ ++ gpubuf_t<callback_test_data> load_cb_data_dev; ++ gpubuf_t<callback_test_data> store_cb_data_dev; ++ if(params.run_callbacks) ++ { ++ void* load_cb_host ++ = get_load_callback_host(params.itype, params.precision, round_trip_inverse); ++ ++ callback_test_data 
load_cb_data_host; ++ ++ if(round_trip_inverse) ++ { ++ load_cb_data_host.scalar = params.store_cb_scalar; ++ } ++ else ++ { ++ load_cb_data_host.scalar = params.load_cb_scalar; ++ } ++ ++ load_cb_data_host.base = pibuffer.front(); ++ ++ auto hip_status = hipSuccess; ++ ++ hip_status = load_cb_data_dev.alloc(sizeof(callback_test_data)); ++ if(hip_status != hipSuccess) ++ { ++ ++n_hip_failures; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP(); ++ } ++ else ++ { ++ GTEST_FAIL(); ++ } ++ } ++ hip_status = hipMemcpy(load_cb_data_dev.data(), ++ &load_cb_data_host, ++ sizeof(callback_test_data), ++ hipMemcpyHostToDevice); ++ if(hip_status != hipSuccess) ++ { ++ ++n_hip_failures; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP(); ++ } ++ else ++ { ++ GTEST_FAIL(); ++ } ++ } ++ ++ void* store_cb_host ++ = get_store_callback_host(params.otype, params.precision, round_trip_inverse); ++ ++ callback_test_data store_cb_data_host; ++ ++ if(round_trip_inverse) ++ { ++ store_cb_data_host.scalar = params.load_cb_scalar; ++ } ++ else ++ { ++ store_cb_data_host.scalar = params.store_cb_scalar; ++ } ++ ++ store_cb_data_host.base = pobuffer.front(); ++ ++ hip_status = store_cb_data_dev.alloc(sizeof(callback_test_data)); ++ if(hip_status != hipSuccess) ++ { ++ ++n_hip_failures; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP(); ++ } ++ else ++ { ++ GTEST_FAIL(); ++ } ++ } ++ ++ hip_status = hipMemcpy(store_cb_data_dev.data(), ++ &store_cb_data_host, ++ sizeof(callback_test_data), ++ hipMemcpyHostToDevice); ++ if(hip_status != hipSuccess) ++ { ++ ++n_hip_failures; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP(); ++ } ++ else ++ { ++ GTEST_FAIL(); ++ } ++ } ++ ++ auto fft_status = params.set_callbacks( ++ load_cb_host, load_cb_data_dev.data(), store_cb_host, store_cb_data_dev.data()); ++ if(fft_status != fft_status_success) ++ throw std::runtime_error("set callback failure"); ++ } ++ ++ // Execute the transform: ++ auto fft_status = params.execute(pibuffer.data(), pobuffer.data()); ++ 
if(fft_status != fft_status_success) ++ throw std::runtime_error("rocFFT plan execution failure"); ++ ++ // if not comparing, then just executing the GPU FFT is all we ++ // need to do ++ if(!fftw_compare) ++ return; ++ ++ // finalize a multi-GPU transform ++ params.multi_gpu_finalize(obuffer, pobuffer); ++ ++ ASSERT_TRUE(!gpu_output.empty()) << "no output buffers"; ++ for(unsigned int idx = 0; idx < gpu_output.size(); ++idx) ++ { ++ ASSERT_TRUE(gpu_output[idx].data() != nullptr) ++ << "output buffer index " << idx << " is empty"; ++ auto hip_status = hipMemcpy(gpu_output[idx].data(), ++ pobuffer.at(idx), ++ gpu_output[idx].size(), ++ hipMemcpyDeviceToHost); ++ if(hip_status != hipSuccess) ++ { ++ ++n_hip_failures; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP() << "hipMemcpy failure"; ++ } ++ else ++ { ++ GTEST_FAIL() << "hipMemcpy failure"; ++ } ++ } ++ } ++ if(verbose > 2) ++ { ++ std::cout << "GPU output:\n"; ++ params.print_obuffer(gpu_output); ++ } ++ if(verbose > 5) ++ { ++ std::cout << "flat GPU output:\n"; ++ params.print_obuffer_flat(gpu_output); ++ } ++} ++ ++template <typename Tfloat> ++static void assert_init_value(const std::vector<hostbuf>& output, ++ const size_t idx, ++ const Tfloat orig_value); ++ ++template <> ++void assert_init_value(const std::vector<hostbuf>& output, const size_t idx, const float orig_value) ++{ ++ float actual_value = reinterpret_cast<const float*>(output.front().data())[idx]; ++ ASSERT_EQ(actual_value, orig_value) << "index " << idx; ++} ++ ++template <> ++void assert_init_value(const std::vector<hostbuf>& output, ++ const size_t idx, ++ const double orig_value) ++{ ++ double actual_value = reinterpret_cast<const double*>(output.front().data())[idx]; ++ ASSERT_EQ(actual_value, orig_value) << "index " << idx; ++} ++ ++template <> ++void assert_init_value(const std::vector<hostbuf>& output, ++ const size_t idx, ++ const rocfft_complex<float> orig_value) ++{ ++ // if this is interleaved, check directly ++ if(output.size() == 1) 
++ { ++ rocfft_complex<float> actual_value ++ = reinterpret_cast<const rocfft_complex<float>*>(output.front().data())[idx]; ++ ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ++ ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; ++ } ++ else ++ { ++ // planar ++ rocfft_complex<float> actual_value{ ++ reinterpret_cast<const float*>(output.front().data())[idx], ++ reinterpret_cast<const float*>(output.back().data())[idx]}; ++ ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ++ ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; ++ } ++} ++ ++template <> ++void assert_init_value(const std::vector<hostbuf>& output, ++ const size_t idx, ++ const rocfft_complex<double> orig_value) ++{ ++ // if this is interleaved, check directly ++ if(output.size() == 1) ++ { ++ rocfft_complex<double> actual_value ++ = reinterpret_cast<const rocfft_complex<double>*>(output.front().data())[idx]; ++ ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ++ ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; ++ } ++ else ++ { ++ // planar ++ rocfft_complex<double> actual_value{ ++ reinterpret_cast<const double*>(output.front().data())[idx], ++ reinterpret_cast<const double*>(output.back().data())[idx]}; ++ ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ++ ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; ++ } ++} ++ ++static const int OUTPUT_INIT_PATTERN = 0xcd; ++template <class Tfloat> ++void check_single_output_stride(const std::vector<hostbuf>& output, ++ const size_t offset, ++ const std::vector<size_t>& length, ++ const std::vector<size_t>& stride, ++ const size_t i) ++{ ++ Tfloat orig; ++ memset(static_cast<void*>(&orig), OUTPUT_INIT_PATTERN, sizeof(Tfloat)); ++ ++ size_t curLength = length[i]; ++ size_t curStride = stride[i]; ++ size_t nextSmallerLength = i == length.size() - 1 ? 0 : length[i + 1]; ++ size_t nextSmallerStride = i == stride.size() - 1 ? 
0 : stride[i + 1]; ++ ++ if(nextSmallerLength == 0) ++ { ++ // this is the fastest dim, indexes that are not multiples of ++ // the stride should be the initial value ++ for(size_t idx = 0; idx < (curLength - 1) * curStride; ++idx) ++ { ++ if(idx % curStride != 0) ++ assert_init_value<Tfloat>(output, idx, orig); ++ } ++ } ++ else ++ { ++ for(size_t lengthIdx = 0; lengthIdx < curLength; ++lengthIdx) ++ { ++ // check that the space after the next smaller dim and the ++ // end of this dim is initial value ++ for(size_t idx = nextSmallerLength * nextSmallerStride; idx < curStride; ++idx) ++ assert_init_value<Tfloat>(output, idx, orig); ++ ++ check_single_output_stride<Tfloat>( ++ output, offset + lengthIdx * curStride, length, stride, i + 1); ++ } ++ } ++} ++ ++template <class Tparams> ++void check_output_strides(const std::vector<hostbuf>& output, Tparams& params) ++{ ++ // treat batch+dist like highest length+stride, if batch > 1 ++ std::vector<size_t> length; ++ std::vector<size_t> stride; ++ if(params.nbatch > 1) ++ { ++ length.push_back(params.nbatch); ++ stride.push_back(params.odist); ++ } ++ ++ auto olength = params.olength(); ++ std::copy(olength.begin(), olength.end(), std::back_inserter(length)); ++ std::copy(params.ostride.begin(), params.ostride.end(), std::back_inserter(stride)); ++ ++ if(params.precision == fft_precision_single) ++ { ++ if(params.otype == fft_array_type_real) ++ check_single_output_stride<float>(output, 0, length, stride, 0); ++ else ++ check_single_output_stride<rocfft_complex<float>>(output, 0, length, stride, 0); ++ } ++ else ++ { ++ if(params.otype == fft_array_type_real) ++ check_single_output_stride<double>(output, 0, length, stride, 0); ++ else ++ check_single_output_stride<rocfft_complex<double>>(output, 0, length, stride, 0); ++ } ++} ++ ++// run rocFFT inverse transform ++template <class Tparams> ++inline void run_round_trip_inverse(Tparams& params, ++ std::vector<gpubuf>& obuffer, ++ std::vector<void*>& pibuffer, ++ 
std::vector<void*>& pobuffer, ++ std::vector<hostbuf>& gpu_output) ++{ ++ params.validate(); ++ ++ // Make sure that the parameters make sense: ++ ASSERT_TRUE(params.valid(verbose)); ++ ++ // Create FFT plan - this will also allocate work buffer, but will throw a ++ // specific exception if that step fails ++ auto plan_status = fft_status_success; ++ try ++ { ++ plan_status = params.create_plan(); ++ } ++ catch(fft_params::work_buffer_alloc_failure& e) ++ { ++ std::stringstream ss; ++ ss << "Failed to allocate work buffer (size: " << params.workbuffersize << ")"; ++ ++n_hip_failures; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP() << ss.str(); ++ } ++ else ++ { ++ GTEST_FAIL() << ss.str(); ++ } ++ } ++ ASSERT_EQ(plan_status, fft_status_success) << "round trip inverse plan creation failed"; ++ ++ auto obuffer_sizes = params.obuffer_sizes(); ++ ++ if(params.placement != fft_placement_inplace) ++ { ++ for(unsigned int i = 0; i < obuffer_sizes.size(); ++i) ++ { ++ // If we're validating output strides, init the ++ // output buffer to a known pattern and we can check ++ // that the pattern is untouched in places that ++ // shouldn't have been touched. 
++ if(params.check_output_strides) ++ { ++ auto hip_status ++ = hipMemset(obuffer[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]); ++ if(hip_status != hipSuccess) ++ { ++ ++n_hip_failures; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP() << "hipMemset failure"; ++ } ++ else ++ { ++ GTEST_FAIL() << "hipMemset failure"; ++ } ++ } ++ } ++ } ++ } ++ ++ // execute GPU transform ++ execute_gpu_fft(params, pibuffer, pobuffer, obuffer, gpu_output, true); ++} ++ ++// compare rocFFT inverse transform with forward transform input ++template <class Tparams> ++inline void compare_round_trip_inverse(Tparams& params, ++ fft_params& contiguous_params, ++ std::vector<hostbuf>& gpu_output, ++ std::vector<hostbuf>& cpu_input, ++ const VectorNorms& cpu_input_norm, ++ size_t total_length) ++{ ++ if(params.check_output_strides) ++ { ++ check_output_strides<Tparams>(gpu_output, params); ++ } ++ ++ // compute GPU output norm ++ std::shared_future<VectorNorms> gpu_norm = std::async(std::launch::async, [&]() { ++ return norm(gpu_output, ++ params.olength(), ++ params.nbatch, ++ params.precision, ++ params.otype, ++ params.ostride, ++ params.odist, ++ params.ooffset); ++ }); ++ ++ // compare GPU inverse output to CPU forward input ++ std::unique_ptr<std::vector<std::pair<size_t, size_t>>> linf_failures; ++ if(verbose > 1) ++ linf_failures = std::make_unique<std::vector<std::pair<size_t, size_t>>>(); ++ const double linf_cutoff ++ = type_epsilon(params.precision) * cpu_input_norm.l_inf * log(total_length); ++ ++ VectorNorms diff = distance(cpu_input, ++ gpu_output, ++ params.olength(), ++ params.nbatch, ++ params.precision, ++ contiguous_params.itype, ++ contiguous_params.istride, ++ contiguous_params.idist, ++ params.otype, ++ params.ostride, ++ params.odist, ++ linf_failures.get(), ++ linf_cutoff, ++ {0}, ++ params.ooffset, ++ 1.0 / total_length); ++ ++ if(verbose > 1) ++ { ++ std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n"; ++ std::cout << "GPU output L2 norm: " << 
gpu_norm.get().l_2 << "\n"; ++ std::cout << "GPU linf norm failures:"; ++ std::sort(linf_failures->begin(), linf_failures->end()); ++ for(const auto& i : *linf_failures) ++ { ++ std::cout << " (" << i.first << "," << i.second << ")"; ++ } ++ std::cout << std::endl; ++ } ++ ++ EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str(); ++ EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str(); ++ ++ switch(params.precision) ++ { ++ case fft_precision_half: ++ max_linf_eps_half ++ = std::max(max_linf_eps_half, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); ++ max_l2_eps_half ++ = std::max(max_l2_eps_half, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); ++ break; ++ case fft_precision_single: ++ max_linf_eps_single ++ = std::max(max_linf_eps_single, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); ++ max_l2_eps_single ++ = std::max(max_l2_eps_single, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); ++ break; ++ case fft_precision_double: ++ max_linf_eps_double ++ = std::max(max_linf_eps_double, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); ++ max_l2_eps_double ++ = std::max(max_l2_eps_double, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); ++ break; ++ } ++ ++ if(verbose > 1) ++ { ++ std::cout << "L2 diff: " << diff.l_2 << "\n"; ++ std::cout << "Linf diff: " << diff.l_inf << "\n"; ++ } ++ ++ EXPECT_TRUE(diff.l_inf <= linf_cutoff) ++ << "Linf test failed. Linf:" << diff.l_inf ++ << "\tnormalized Linf: " << diff.l_inf / cpu_input_norm.l_inf << "\tcutoff: " << linf_cutoff ++ << params.str(); ++ ++ EXPECT_TRUE(diff.l_2 / cpu_input_norm.l_2 ++ < sqrt(log2(total_length)) * type_epsilon(params.precision)) ++ << "L2 test failed. 
L2: " << diff.l_2 ++ << "\tnormalized L2: " << diff.l_2 / cpu_input_norm.l_2 ++ << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision) ++ << params.str(); ++} ++ ++// RAII type to put data into the cache when this object leaves scope ++struct StoreCPUDataToCache ++{ ++ StoreCPUDataToCache(std::vector<hostbuf>& cpu_input, std::vector<hostbuf>& cpu_output) ++ : cpu_input(cpu_input) ++ , cpu_output(cpu_output) ++ { ++ } ++ ~StoreCPUDataToCache() ++ { ++ last_cpu_fft_data.cpu_output.swap(cpu_output); ++ last_cpu_fft_data.cpu_input.swap(cpu_input); ++ } ++ std::vector<hostbuf>& cpu_input; ++ std::vector<hostbuf>& cpu_output; ++}; ++ ++// run CPU + rocFFT transform with the given params and compare ++template <class Tfloat, class Tparams> ++inline void fft_vs_reference_impl(Tparams& params, bool round_trip) ++{ ++ // Call hipGetLastError to reset any errors ++ // returned by previous HIP runtime API calls. ++ hipError_t hip_status = hipGetLastError(); ++ ++ // Make sure that the parameters make sense: ++ ASSERT_TRUE(params.valid(verbose)); ++ ++ size_t needed_ram = needed_ram_buffers(params, verbose); ++ ++ if(ramgb > 0 && needed_ram > ramgb * ONE_GiB) ++ { ++ GTEST_SKIP() << "needed_ramgb: " << bytes_to_GiB(needed_ram) << ", ramgb limit: " << ramgb ++ << ".\n"; ++ } ++ ++ auto ibuffer_sizes = params.ibuffer_sizes(); ++ auto obuffer_sizes = params.obuffer_sizes(); ++ ++ size_t vram_avail = 0; ++ ++ if(vramgb == 0) ++ { ++ // Check free and total available memory: ++ size_t free = 0; ++ size_t total = 0; ++ auto hip_status = hipMemGetInfo(&free, &total); ++ if(hip_status != hipSuccess || total == 0) ++ { ++ ++n_hip_failures; ++ std::stringstream ss; ++ if(total == 0) ++ ss << "hipMemGetInfo claims there there isn't any vram"; ++ else ++ ss << "hipMemGetInfo failure with error " << hip_status; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP() << ss.str(); ++ } ++ else ++ { ++ GTEST_FAIL() << ss.str(); ++ } ++ } ++ vram_avail = total; ++ } ++ else ++ { 
++ vram_avail = vramgb * ONE_GiB; ++ } ++ ++ // First try a quick estimation of vram footprint, to speed up skipping tests ++ // that are too large to fit in the gpu (no plan created with the rocFFT backend) ++ const auto raw_vram_footprint ++ = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params); ++ ++ if(!vram_fits_problem(raw_vram_footprint, vram_avail)) ++ { ++ GTEST_SKIP() << "Raw problem size (" << bytes_to_GiB(raw_vram_footprint) ++ << " GiB) raw data too large for device"; ++ } ++ ++ if(verbose > 2) ++ { ++ std::cout << "Raw problem size: " << raw_vram_footprint << std::endl; ++ } ++ ++ // If it passed the quick estimation test, go for the more ++ // accurate calculation that actually creates the plan and ++ // take into account the work buffer size ++ const auto vram_footprint = params.vram_footprint(); ++ if(!vram_fits_problem(vram_footprint, vram_avail)) ++ { ++ if(verbose) ++ { ++ std::cout << "Problem raw data won't fit on device; skipped." << std::endl; ++ } ++ GTEST_SKIP() << "Problem size (" << bytes_to_GiB(vram_footprint) ++ << " GiB) raw data too large for device"; ++ } ++ ++ // Create FFT plan - this will also allocate work buffer, but ++ // will throw a specific exception if that step fails ++ auto plan_status = fft_status_success; ++ try ++ { ++ plan_status = params.create_plan(); ++ } ++ catch(fft_params::work_buffer_alloc_failure& e) ++ { ++ ++n_hip_failures; ++ std::stringstream ss; ++ ss << "Work buffer allocation failed with size: " << params.workbuffersize; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP() << ss.str(); ++ } ++ else ++ { ++ GTEST_FAIL() << ss.str(); ++ } ++ } ++ ASSERT_EQ(plan_status, fft_status_success) << "plan creation failed"; ++ ++ if(!vram_fits_problem(vram_footprint, vram_avail)) ++ { ++ if(verbose) ++ { ++ std::cout << "Problem won't fit on device; skipped." 
<< std::endl; ++ } ++ GTEST_SKIP() << "Problem size (" << vram_footprint << ") too large for device"; ++ return; ++ } ++ ++ fft_params contiguous_params; ++ contiguous_params.length = params.length; ++ contiguous_params.precision = params.precision; ++ contiguous_params.placement = fft_placement_notinplace; ++ contiguous_params.transform_type = params.transform_type; ++ contiguous_params.nbatch = params.nbatch; ++ contiguous_params.itype = contiguous_itype(params.transform_type); ++ contiguous_params.otype = contiguous_otype(contiguous_params.transform_type); ++ ++ contiguous_params.validate(); ++ ++ if(!contiguous_params.valid(verbose)) ++ { ++ throw std::runtime_error("Invalid contiguous params"); ++ } ++ ++ if(verbose > 3) ++ { ++ std::cout << "CPU params:\n"; ++ std::cout << contiguous_params.str("\n\t") << std::endl; ++ } ++ ++ std::vector<gpubuf> ibuffer(ibuffer_sizes.size()); ++ std::vector<void*> pibuffer(ibuffer_sizes.size()); ++ for(unsigned int i = 0; i < ibuffer.size(); ++i) ++ { ++ hip_status = ibuffer[i].alloc(ibuffer_sizes[i]); ++ if(hip_status != hipSuccess) ++ { ++ std::stringstream ss; ++ ss << "hipMalloc failure for input buffer " << i << " size " << ibuffer_sizes[i] << "(" ++ << bytes_to_GiB(ibuffer_sizes[i]) << " GiB)" ++ << " with code " << hipError_to_string(hip_status); ++ ++n_hip_failures; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP() << ss.str(); ++ } ++ else ++ { ++ GTEST_FAIL() << ss.str(); ++ } ++ } ++ pibuffer[i] = ibuffer[i].data(); ++ } ++ ++ // allocation counts in elements, ibuffer_sizes is in bytes ++ auto ibuffer_sizes_elems = ibuffer_sizes; ++ for(auto& buf : ibuffer_sizes_elems) ++ buf /= var_size<size_t>(params.precision, params.itype); ++ ++ // Check cache first - nbatch is a >= comparison because we compute ++ // the largest batch size and cache it. Smaller batch runs can ++ // compare against the larger data. 
++ std::vector<hostbuf> cpu_input; ++ std::vector<hostbuf> cpu_output; ++ std::shared_future<void> convert_cpu_output_precision; ++ std::shared_future<void> convert_cpu_input_precision; ++ bool run_fftw = true; ++ std::unique_ptr<StoreCPUDataToCache> store_to_cache; ++ if(fftw_compare && last_cpu_fft_data.length == params.length ++ && last_cpu_fft_data.transform_type == params.transform_type ++ && last_cpu_fft_data.run_callbacks == params.run_callbacks) ++ { ++ if(last_cpu_fft_data.nbatch >= params.nbatch) ++ { ++ // use the cached input/output ++ cpu_input.swap(last_cpu_fft_data.cpu_input); ++ cpu_output.swap(last_cpu_fft_data.cpu_output); ++ run_fftw = false; ++ ++ store_to_cache = std::make_unique<StoreCPUDataToCache>(cpu_input, cpu_output); ++ ++ if(params.precision != last_cpu_fft_data.precision) ++ { ++ // Tests should be ordered so we do wider first, then narrower. ++ switch(params.precision) ++ { ++ case fft_precision_double: ++ std::cerr ++ << "test ordering is incorrect: double precision follows a narrower one" ++ << std::endl; ++ abort(); ++ break; ++ case fft_precision_single: ++ if(last_cpu_fft_data.precision != fft_precision_double) ++ { ++ std::cerr ++ << "test ordering is incorrect: float precision follows a narrower one" ++ << std::endl; ++ abort(); ++ } ++ // convert the input/output to single-precision ++ convert_cpu_output_precision = std::async(std::launch::async, [&]() { ++ narrow_precision_inplace<double, float>(cpu_output.front()); ++ }); ++ convert_cpu_input_precision = std::async(std::launch::async, [&]() { ++ narrow_precision_inplace<double, float>(cpu_input.front()); ++ }); ++ break; ++ case fft_precision_half: ++ // convert to half precision ++ if(last_cpu_fft_data.precision == fft_precision_double) ++ { ++ convert_cpu_output_precision = std::async(std::launch::async, [&]() { ++ narrow_precision_inplace<double, _Float16>(cpu_output.front()); ++ }); ++ convert_cpu_input_precision = std::async(std::launch::async, [&]() { ++ 
narrow_precision_inplace<double, _Float16>(cpu_input.front()); ++ }); ++ } ++ else if(last_cpu_fft_data.precision == fft_precision_single) ++ { ++ convert_cpu_output_precision = std::async(std::launch::async, [&]() { ++ narrow_precision_inplace<float, _Float16>(cpu_output.front()); ++ }); ++ convert_cpu_input_precision = std::async(std::launch::async, [&]() { ++ narrow_precision_inplace<float, _Float16>(cpu_input.front()); ++ }); ++ } ++ else ++ { ++ std::cerr << "unhandled previous precision, cannot convert to half" ++ << std::endl; ++ abort(); ++ } ++ break; ++ } ++ last_cpu_fft_data.precision = params.precision; ++ } ++ } ++ // If the last result has a smaller batch than the new ++ // params, that might be a developer error - tests should be ++ // ordered to generate the bigger batch first. But if tests ++ // got filtered or skipped due to insufficient memory, we ++ // might never have tried to generate the bigger batch first. ++ // So just fall through and redo the CPU FFT. ++ } ++ else ++ { ++ // Clear cache explicitly so that even if we didn't get a hit, ++ // we're not uselessly holding on to cached cpu input/output ++ last_cpu_fft_data = last_cpu_fft_cache(); ++ } ++ ++ // Allocate CPU input ++ if(run_fftw) ++ { ++ cpu_input = allocate_cpu_fft_buffer( ++ contiguous_params.precision, contiguous_params.itype, contiguous_params.isize); ++ } ++ ++ // Create FFTW plan - this may write to input, but that's fine ++ // since there's nothing in there right now ++ typename fftw_trait<Tfloat>::fftw_plan_type cpu_plan = nullptr; ++ if(run_fftw) ++ { ++ // Normally, we would want to defer allocation of CPU output ++ // buffer until when we actually do the CPU FFT. But if we're ++ // using FFTW wisdom, FFTW needs an output buffer at plan ++ // creation time. 
++ if(use_fftw_wisdom) ++ { ++ cpu_output = allocate_cpu_fft_buffer( ++ contiguous_params.precision, contiguous_params.otype, contiguous_params.osize); ++ } ++ cpu_plan = fftw_plan_via_rocfft<Tfloat>(contiguous_params.length, ++ contiguous_params.istride, ++ contiguous_params.ostride, ++ contiguous_params.nbatch, ++ contiguous_params.idist, ++ contiguous_params.odist, ++ contiguous_params.transform_type, ++ cpu_input, ++ cpu_output); ++ ++ needed_ram += needed_ram_fftw<Tfloat>(contiguous_params, cpu_plan, verbose); ++ ++ if(ramgb > 0 && needed_ram > ramgb * ONE_GiB) ++ { ++ if(verbose) ++ { ++ std::cout << "Problem exceeds memory limit; skipped [rocfft_transform]." ++ << std::endl; ++ } ++ GTEST_SKIP(); ++ return; ++ } ++ } ++ ++ std::vector<hostbuf> gpu_input_data; ++ ++ // allocate and populate the input buffer (cpu/gpu) ++ if(run_fftw) ++ { ++ gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems); ++ ++ //generate the input directly on the gpu ++ params.compute_input(ibuffer); ++ ++ // Copy the input to CPU ++ if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride ++ || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize) ++ { ++ // Copy input to CPU ++ for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) ++ { ++ hip_status = hipMemcpy(gpu_input_data.at(idx).data(), ++ ibuffer[idx].data(), ++ ibuffer_sizes[idx], ++ hipMemcpyDeviceToHost); ++ if(hip_status != hipSuccess) ++ { ++ ++n_hip_failures; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; ++ } ++ else ++ { ++ GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; ++ } ++ } ++ } ++ ++ copy_buffers(gpu_input_data, ++ cpu_input, ++ params.ilength(), ++ params.nbatch, ++ params.precision, ++ params.itype, ++ params.istride, ++ params.idist, ++ contiguous_params.itype, ++ contiguous_params.istride, ++ contiguous_params.idist, ++ params.ioffset, ++ 
contiguous_params.ioffset); ++ } ++ else ++ { ++ // Copy input to CPU ++ for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) ++ { ++ hip_status = hipMemcpy(cpu_input.at(idx).data(), ++ ibuffer[idx].data(), ++ ibuffer_sizes[idx], ++ hipMemcpyDeviceToHost); ++ if(hip_status != hipSuccess) ++ { ++ ++n_hip_failures; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; ++ } ++ else ++ { ++ GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; ++ } ++ } ++ } ++ } ++ } ++ else if(fftw_compare) ++ { ++ gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems); ++ ++ // In case the cached cpu input needed conversion, wait for it ++ if(convert_cpu_input_precision.valid()) ++ convert_cpu_input_precision.get(); ++ ++ // gets a pre-computed gpu input buffer from the cpu cache ++ std::vector<hostbuf>* gpu_input = &cpu_input; ++ ++ if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride ++ || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize) ++ { ++ copy_buffers(cpu_input, ++ gpu_input_data, ++ params.ilength(), ++ params.nbatch, ++ params.precision, ++ contiguous_params.itype, ++ contiguous_params.istride, ++ contiguous_params.idist, ++ params.itype, ++ params.istride, ++ params.idist, ++ {0}, ++ params.ioffset); ++ gpu_input = &gpu_input_data; ++ } ++ ++ // Copy input to GPU ++ for(unsigned int idx = 0; idx < gpu_input->size(); ++idx) ++ { ++ hip_status = hipMemcpy(ibuffer[idx].data(), ++ gpu_input->at(idx).data(), ++ ibuffer_sizes[idx], ++ hipMemcpyHostToDevice); ++ ++ if(hip_status != hipSuccess) ++ { ++ ++n_hip_failures; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; ++ } ++ else ++ { ++ GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; ++ } ++ } ++ } ++ } ++ ++ if(verbose > 3) ++ { ++ std::cout << "CPU input:\n"; ++ 
contiguous_params.print_ibuffer(cpu_input); ++ } ++ ++ // compute input norm ++ std::shared_future<VectorNorms> cpu_input_norm; ++ if(fftw_compare) ++ cpu_input_norm = std::async(std::launch::async, [&]() { ++ // in case the cached cpu input needed conversion, wait for it ++ if(convert_cpu_input_precision.valid()) ++ convert_cpu_input_precision.get(); ++ ++ auto input_norm = norm(cpu_input, ++ contiguous_params.ilength(), ++ contiguous_params.nbatch, ++ contiguous_params.precision, ++ contiguous_params.itype, ++ contiguous_params.istride, ++ contiguous_params.idist, ++ contiguous_params.ioffset); ++ if(verbose > 2) ++ { ++ std::cout << "CPU Input Linf norm: " << input_norm.l_inf << "\n"; ++ std::cout << "CPU Input L2 norm: " << input_norm.l_2 << "\n"; ++ } ++ return input_norm; ++ }); ++ ++ std::vector<gpubuf> obuffer_data; ++ std::vector<gpubuf>* obuffer = &obuffer_data; ++ std::vector<void*> pobuffer; ++ ++ // allocate the output buffer ++ ++ if(params.placement == fft_placement_inplace) ++ { ++ obuffer = &ibuffer; ++ } ++ else ++ { ++ auto obuffer_sizes = params.obuffer_sizes(); ++ obuffer_data.resize(obuffer_sizes.size()); ++ for(unsigned int i = 0; i < obuffer_data.size(); ++i) ++ { ++ hip_status = obuffer_data[i].alloc(obuffer_sizes[i]); ++ if(hip_status != hipSuccess) ++ { ++ ++n_hip_failures; ++ std::stringstream ss; ++ ss << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i] ++ << "(" << bytes_to_GiB(obuffer_sizes[i]) << " GiB)" ++ << " with code " << hipError_to_string(hip_status); ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP() << ss.str(); ++ } ++ else ++ { ++ GTEST_FAIL() << ss.str(); ++ } ++ } ++ ++ // If we're validating output strides, init the ++ // output buffer to a known pattern and we can check ++ // that the pattern is untouched in places that ++ // shouldn't have been touched. 
++ if(params.check_output_strides) ++ { ++ hip_status ++ = hipMemset(obuffer_data[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]); ++ if(hip_status != hipSuccess) ++ { ++ ++n_hip_failures; ++ if(skip_runtime_fails) ++ { ++ GTEST_SKIP() << "hipMemset failure with error " << hip_status; ++ } ++ else ++ { ++ GTEST_FAIL() << "hipMemset failure with error " << hip_status; ++ } ++ } ++ } ++ } ++ } ++ pobuffer.resize(obuffer->size()); ++ for(unsigned int i = 0; i < obuffer->size(); ++i) ++ { ++ pobuffer[i] = obuffer->at(i).data(); ++ } ++ ++ // Run CPU transform ++ // ++ // NOTE: This must happen after input is copied to GPU and input ++ // norm is computed, since the CPU FFT may overwrite the input. ++ VectorNorms cpu_output_norm; ++ std::shared_future<void> cpu_fft; ++ if(fftw_compare) ++ cpu_fft = std::async(std::launch::async, [&]() { ++ // wait for input norm to finish, since we might overwrite input ++ cpu_input_norm.get(); ++ ++ if(run_fftw) ++ execute_cpu_fft<Tfloat>(params, contiguous_params, cpu_plan, cpu_input, cpu_output); ++ // in case the cached cpu output needed conversion, wait for it ++ else if(convert_cpu_output_precision.valid()) ++ convert_cpu_output_precision.get(); ++ ++ if(verbose > 3) ++ { ++ std::cout << "CPU output:\n"; ++ contiguous_params.print_obuffer(cpu_output); ++ } ++ ++ cpu_output_norm = norm(cpu_output, ++ params.olength(), ++ params.nbatch, ++ params.precision, ++ contiguous_params.otype, ++ contiguous_params.ostride, ++ contiguous_params.odist, ++ contiguous_params.ooffset); ++ if(verbose > 2) ++ { ++ std::cout << "CPU Output Linf norm: " << cpu_output_norm.l_inf << "\n"; ++ std::cout << "CPU Output L2 norm: " << cpu_output_norm.l_2 << "\n"; ++ } ++ }); ++ ++ // scatter data out to multi-GPUs if this is a multi-GPU test ++ params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer); ++ ++ // execute GPU transform ++ std::vector<hostbuf> gpu_output ++ = allocate_host_buffer(params.precision, params.otype, params.osize); ++ ++ 
execute_gpu_fft(params, pibuffer, pobuffer, *obuffer, gpu_output); ++ ++ params.free(); ++ ++ if(params.check_output_strides) ++ { ++ check_output_strides<Tparams>(gpu_output, params); ++ } ++ ++ // compute GPU output norm ++ std::shared_future<VectorNorms> gpu_norm; ++ if(fftw_compare) ++ gpu_norm = std::async(std::launch::async, [&]() { ++ return norm(gpu_output, ++ params.olength(), ++ params.nbatch, ++ params.precision, ++ params.otype, ++ params.ostride, ++ params.odist, ++ params.ooffset); ++ }); ++ ++ // compare output ++ // ++ // Compute the l-infinity and l-2 distance between the CPU and GPU output: ++ // wait for cpu FFT so we can compute cutoff ++ ++ const auto total_length = std::accumulate(params.length.begin(), ++ params.length.end(), ++ static_cast<size_t>(1), ++ std::multiplies<size_t>()); ++ ++ std::unique_ptr<std::vector<std::pair<size_t, size_t>>> linf_failures; ++ if(verbose > 1) ++ linf_failures = std::make_unique<std::vector<std::pair<size_t, size_t>>>(); ++ double linf_cutoff; ++ VectorNorms diff; ++ ++ std::shared_future<void> compare_output; ++ if(fftw_compare) ++ compare_output = std::async(std::launch::async, [&]() { ++ cpu_fft.get(); ++ linf_cutoff ++ = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(total_length); ++ ++ diff = distance(cpu_output, ++ gpu_output, ++ params.olength(), ++ params.nbatch, ++ params.precision, ++ contiguous_params.otype, ++ contiguous_params.ostride, ++ contiguous_params.odist, ++ params.otype, ++ params.ostride, ++ params.odist, ++ linf_failures.get(), ++ linf_cutoff, ++ {0}, ++ params.ooffset); ++ }); ++ ++ // Update the cache if this current transform is different from ++ // what's stored. But if this transform only has a smaller batch ++ // than what's cached, we can still keep the cache around since ++ // the input/output we already have is still valid. 
++ const bool update_last_cpu_fft_data ++ = last_cpu_fft_data.length != params.length ++ || last_cpu_fft_data.transform_type != params.transform_type ++ || last_cpu_fft_data.run_callbacks != params.run_callbacks ++ || last_cpu_fft_data.precision != params.precision ++ || params.nbatch > last_cpu_fft_data.nbatch; ++ ++ // store cpu output in cache ++ if(update_last_cpu_fft_data) ++ { ++ last_cpu_fft_data.length = params.length; ++ last_cpu_fft_data.nbatch = params.nbatch; ++ last_cpu_fft_data.transform_type = params.transform_type; ++ last_cpu_fft_data.run_callbacks = params.run_callbacks; ++ last_cpu_fft_data.precision = params.precision; ++ } ++ ++ if(compare_output.valid()) ++ compare_output.get(); ++ ++ if(!store_to_cache) ++ store_to_cache = std::make_unique<StoreCPUDataToCache>(cpu_input, cpu_output); ++ ++ Tparams params_inverse; ++ ++ if(round_trip) ++ { ++ params_inverse.inverse_from_forward(params); ++ ++ run_round_trip_inverse<Tparams>( ++ params_inverse, ibuffer, pobuffer, pibuffer, gpu_input_data); ++ } ++ ++ if(fftw_compare) ++ { ++ ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_2)); ++ ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_inf)); ++ ++ ASSERT_TRUE(std::isfinite(cpu_output_norm.l_2)); ++ ASSERT_TRUE(std::isfinite(cpu_output_norm.l_inf)); ++ ++ if(verbose > 1) ++ { ++ std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n"; ++ std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n"; ++ std::cout << "GPU linf norm failures:"; ++ std::sort(linf_failures->begin(), linf_failures->end()); ++ for(const auto& i : *linf_failures) ++ { ++ std::cout << " (" << i.first << "," << i.second << ")"; ++ } ++ std::cout << std::endl; ++ } ++ ++ EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str(); ++ EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str(); ++ } ++ ++ switch(params.precision) ++ { ++ case fft_precision_half: ++ max_linf_eps_half ++ = std::max(max_linf_eps_half, diff.l_inf / cpu_output_norm.l_inf / 
log(total_length)); ++ max_l2_eps_half ++ = std::max(max_l2_eps_half, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); ++ break; ++ case fft_precision_single: ++ max_linf_eps_single ++ = std::max(max_linf_eps_single, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); ++ max_l2_eps_single = std::max(max_l2_eps_single, ++ diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); ++ break; ++ case fft_precision_double: ++ max_linf_eps_double ++ = std::max(max_linf_eps_double, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); ++ max_l2_eps_double = std::max(max_l2_eps_double, ++ diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); ++ break; ++ } ++ ++ if(verbose > 1) ++ { ++ std::cout << "L2 diff: " << diff.l_2 << "\n"; ++ std::cout << "Linf diff: " << diff.l_inf << "\n"; ++ } ++ ++ if(fftw_compare) ++ { ++ EXPECT_TRUE(diff.l_inf <= linf_cutoff) ++ << "Linf test failed. Linf:" << diff.l_inf ++ << "\tnormalized Linf: " << diff.l_inf / cpu_output_norm.l_inf ++ << "\tcutoff: " << linf_cutoff << params.str(); ++ ++ EXPECT_TRUE(diff.l_2 / cpu_output_norm.l_2 ++ < sqrt(log2(total_length)) * type_epsilon(params.precision)) ++ << "L2 test failed. L2: " << diff.l_2 ++ << "\tnormalized L2: " << diff.l_2 / cpu_output_norm.l_2 ++ << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision) ++ << params.str(); ++ } ++ ++ if(round_trip && fftw_compare) ++ { ++ compare_round_trip_inverse<Tparams>(params_inverse, ++ contiguous_params, ++ gpu_input_data, ++ cpu_input, ++ cpu_input_norm.get(), ++ total_length); ++ } ++} ++ ++#endif +diff --git a/shared/arithmetic.h b/shared/arithmetic.h +new file mode 100644 +index 0000000..774d342 +--- /dev/null ++++ b/shared/arithmetic.h +@@ -0,0 +1,61 @@ ++/****************************************************************************** ++* Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
++* ++* Permission is hereby granted, free of charge, to any person obtaining a copy ++* of this software and associated documentation files (the "Software"), to deal ++* in the Software without restriction, including without limitation the rights ++* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++* copies of the Software, and to permit persons to whom the Software is ++* furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice shall be included in ++* all copies or substantial portions of the Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++* THE SOFTWARE. 
/******************************************************************************/

#pragma once

#include <functional>
#include <iterator>
#include <numeric>
#include <stddef.h>

// arithmetic helper functions

// Returns true when u is an exact power of two (1, 2, 4, ...).
// Zero is not a power of two.
static inline bool IsPo2(size_t u)
{
    // A power of two has exactly one bit set, so clearing the lowest set bit
    // (u & (u - 1)) must leave nothing behind.
    return (u != 0) && (0 == (u & (u - 1)));
}

// help function: Find the smallest power of 2 that is >= n; return its
//                power of 2 factor
// e.g., CeilPo2 (7) returns 3 : (2^3 >= 7)
//
// CeilPo2(0) and CeilPo2(1) both return 0.  For n above the largest
// representable power of two the result is the bit width of size_t.
static inline size_t CeilPo2(size_t n)
{
    size_t t = 0;
    if(n > 1)
    {
        // The answer is the number of significant bits in (n - 1).  Counting
        // by shifting right cannot overflow, whereas doubling a running power
        // of two wraps to zero (and never terminates) for very large n.
        for(size_t rest = n - 1; rest != 0; rest >>= 1)
            ++t;
    }
    return t;
}

// Integer division of a by b, rounded up.
// NOTE(review): a + (b - 1) is evaluated in T, so it can overflow when a is
// close to the maximum of T; callers are presumably passing small
// non-negative sizes - confirm before using with large values.
template <typename T>
static inline T DivRoundingUp(T a, T b)
{
    return (a + (b - 1)) / b;
}

// Product of all elements in [begin, end); an empty range yields 1.
// The element type is taken from std::iterator_traits so that raw pointers
// work as well as container iterators.
template <typename Titer>
typename std::iterator_traits<Titer>::value_type product(Titer begin, Titer end)
{
    using value_type = typename std::iterator_traits<Titer>::value_type;
    return std::accumulate(begin, end, value_type(1), std::multiplies<value_type>());
}
IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#ifndef ROCFFT_ARRAY_PREDICATE_H ++#define ROCFFT_ARRAY_PREDICATE_H ++ ++#include "rocfft/rocfft.h" ++ ++namespace ++{ ++ bool array_type_is_complex(rocfft_array_type type) ++ { ++ return type == rocfft_array_type_complex_interleaved ++ || type == rocfft_array_type_complex_planar ++ || type == rocfft_array_type_hermitian_interleaved ++ || type == rocfft_array_type_hermitian_planar; ++ } ++ bool array_type_is_interleaved(rocfft_array_type type) ++ { ++ return type == rocfft_array_type_complex_interleaved ++ || type == rocfft_array_type_hermitian_interleaved; ++ } ++ bool array_type_is_planar(rocfft_array_type type) ++ { ++ return type == rocfft_array_type_complex_planar ++ || type == rocfft_array_type_hermitian_planar; ++ } ++} ++ ++#endif +diff --git a/shared/array_validator.cpp b/shared/array_validator.cpp +new file mode 100644 +index 0000000..70abb08 +--- /dev/null ++++ b/shared/array_validator.cpp +@@ -0,0 +1,549 @@ ++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. ++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. 
++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#include <iostream> ++#include <numeric> ++#include <unordered_set> ++ ++#include "array_validator.h" ++#include "increment.h" ++ ++// Check a 2D array for collisions. ++// The 2D case can be determined via a number-theoretic argument. ++bool valid_length_stride_2d(const size_t l0, const size_t l1, const size_t s0, const size_t s1) ++{ ++ if(s0 == s1) ++ return false; ++ const auto c = std::lcm(s0, s1); ++ return !((s0 * (l0 - 1) >= c) && (s1 * (l1 - 1) >= c)); ++} ++ ++// Compare a 1D direction with a multi-index hyperface for collisions. ++bool valid_length_stride_1d_multi(const unsigned int idx, ++ const std::vector<size_t> l, ++ const std::vector<size_t> s, ++ const int verbose) ++{ ++ size_t l0{0}, s0{0}; ++ std::vector<size_t> l1{}, s1{}; ++ for(unsigned int i = 0; i < l.size(); ++i) ++ { ++ if(i == idx) ++ { ++ l0 = l[i]; ++ s0 = s[i]; ++ } ++ else ++ { ++ l1.push_back(l[i]); ++ s1.push_back(s[i]); ++ } ++ } ++ ++ if(verbose > 4) ++ { ++ std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl; ++ } ++ ++ // We only need to go to the maximum pointer offset for (l1,s1). 
++ const auto max_offset ++ = std::accumulate(l1.begin(), l1.end(), (size_t)1, std::multiplies<size_t>()) ++ - std ::inner_product(l1.begin(), l1.end(), s1.begin(), (size_t)0); ++ std::unordered_set<size_t> a0{}; ++ for(size_t i = 1; i < l0; ++i) ++ { ++ const auto val = i * s0; ++ if(val <= max_offset) ++ a0.insert(val); ++ else ++ break; ++ } ++ ++ if(verbose > 5) ++ { ++ std::cout << "a0:"; ++ for(auto i : a0) ++ std::cout << " " << i; ++ std::cout << std::endl; ++ ++ std::cout << "l1:"; ++ for(auto i : l1) ++ std::cout << " " << i; ++ std::cout << std::endl; ++ ++ std::cout << "s1:"; ++ for(auto i : s1) ++ std::cout << " " << i; ++ std::cout << std::endl; ++ } ++ ++ // TODO: this can be multi-threaded, since find(...) is thread-safe. ++ std::vector<size_t> index(l1.size()); ++ std::fill(index.begin(), index.end(), 0); ++ do ++ { ++ const int i = std::inner_product(index.begin(), index.end(), s1.begin(), (size_t)0); ++ if(i > 0 && (i % s0 == 0)) ++ { ++ // TODO: use an ordered set and binary search ++ if(verbose > 6) ++ std::cout << i << std::endl; ++ if(a0.find(i) != a0.end()) ++ { ++ if(verbose > 4) ++ { ++ std::cout << "l0: " << l0 << "\ts0: " << s0 << std::endl; ++ std::cout << "l1:"; ++ for(const auto li : l1) ++ std::cout << " " << li; ++ std::cout << " s1:"; ++ for(const auto si : s1) ++ std::cout << " " << si; ++ std::cout << std::endl; ++ std::cout << "Found duplicate: " << i << std::endl; ++ } ++ return false; ++ } ++ } ++ } while(increment_rowmajor(index, l1)); ++ ++ return true; ++} ++ ++// Compare a hyperface with another hyperface for collisions. 
++bool valid_length_stride_multi_multi(const std::vector<size_t> l0, ++ const std::vector<size_t> s0, ++ const std::vector<size_t> l1, ++ const std::vector<size_t> s1) ++{ ++ std::unordered_set<size_t> a0{}; ++ ++ const auto max_offset ++ = std::accumulate(l1.begin(), l1.end(), (size_t)1, std::multiplies<size_t>()) ++ - std::inner_product(l1.begin(), l1.end(), s1.begin(), (size_t)0); ++ std::vector<size_t> index0(l0.size()); // TODO: check this ++ std::fill(index0.begin(), index0.end(), 0); ++ do ++ { ++ const auto i = std::inner_product(index0.begin(), index0.end(), s0.begin(), (size_t)0); ++ if(i > max_offset) ++ a0.insert(i); ++ } while(increment_rowmajor(index0, l0)); ++ ++ std::vector<size_t> index1(l1.size()); ++ std::fill(index1.begin(), index1.end(), 0); ++ do ++ { ++ const auto i = std::inner_product(index1.begin(), index1.end(), s1.begin(), (size_t)0); ++ if(i > 0) ++ { ++ // TODO: use an ordered set and binary search ++ if(a0.find(i) != a0.end()) ++ { ++ ++ return false; ++ } ++ } ++ } while(increment_rowmajor(index1, l1)); ++ ++ return true; ++} ++ ++bool valid_length_stride_3d(const std::vector<size_t>& l, ++ const std::vector<size_t>& s, ++ const int verbose) ++{ ++ // Check that 2D faces are valid: ++ if(!valid_length_stride_2d(l[0], l[1], s[0], s[1])) ++ return false; ++ if(!valid_length_stride_2d(l[0], l[2], s[0], s[2])) ++ return false; ++ if(!valid_length_stride_2d(l[1], l[2], s[1], s[2])) ++ return false; ++ ++ // If the 2D faces are valid, check an axis vs a face for collisions: ++ bool invalid = false; ++#ifdef _OPENMP ++#pragma omp parallel for ++#endif ++ for(int idx = 0; idx < 3; ++idx) ++ { ++ if(!valid_length_stride_1d_multi(idx, l, s, verbose)) ++ { ++#ifdef _OPENMP ++#pragma omp cancel for ++#endif ++ invalid = true; ++ } ++ } ++ if(invalid) ++ return false; ++ return true; ++} ++ ++bool valid_length_stride_4d(const std::vector<size_t>& l, ++ const std::vector<size_t>& s, ++ const int verbose) ++{ ++ if(l.size() != 4) ++ { ++ throw 
std::runtime_error("Incorrect dimensions for valid_length_stride_4d"); ++ } ++ ++ // Check that 2D faces are valid: ++ for(int idx0 = 0; idx0 < 3; ++idx0) ++ { ++ for(int idx1 = idx0 + 1; idx1 < 4; ++idx1) ++ { ++ if(!valid_length_stride_2d(l[idx0], l[idx1], s[idx0], s[idx1])) ++ return false; ++ } ++ } ++ ++ bool invalid = false; ++ // Check that 1D vs 3D faces are valid: ++#ifdef _OPENMP ++#pragma omp parallel for ++#endif ++ for(int idx0 = 0; idx0 < 4; ++idx0) ++ { ++ if(!valid_length_stride_1d_multi(idx0, l, s, verbose)) ++ { ++#ifdef _OPENMP ++#pragma omp cancel for ++#endif ++ invalid = true; ++ } ++ } ++ if(invalid) ++ return false; ++ ++ // Check that 2D vs 2D faces are valid: ++ ++ // First, get all the permutations ++ std::vector<std::vector<size_t>> perms; ++ std::vector<size_t> v(l.size()); ++ std::fill(v.begin(), v.begin() + 2, 0); ++ std::fill(v.begin() + 2, v.end(), 1); ++ do ++ { ++ perms.push_back(v); ++ if(verbose > 3) ++ { ++ std::cout << "v:"; ++ for(const auto i : v) ++ { ++ std::cout << " " << i; ++ } ++ std::cout << "\n"; ++ } ++ } while(std::next_permutation(v.begin(), v.end())); ++ ++ // Then loop over all of the permutations. 
++#ifdef _OPENMP ++#pragma omp parallel for ++#endif ++ for(size_t iperm = 0; iperm < perms.size(); ++iperm) ++ { ++ std::vector<size_t> l0(2); ++ std::vector<size_t> s0(2); ++ std::vector<size_t> l1(2); ++ std::vector<size_t> s1(2); ++ for(size_t i = 0; i < l.size(); ++i) ++ { ++ if(perms[iperm][i] == 0) ++ { ++ l0.push_back(l[i]); ++ s0.push_back(s[i]); ++ } ++ else ++ { ++ l1.push_back(l[i]); ++ s1.push_back(s[i]); ++ } ++ } ++ ++ if(verbose > 3) ++ { ++ std::cout << "\tl0:"; ++ for(const auto i : l0) ++ { ++ std::cout << " " << i; ++ } ++ std::cout << "\n"; ++ std::cout << "\ts0:"; ++ for(const auto i : s0) ++ { ++ std::cout << " " << i; ++ } ++ std::cout << "\n"; ++ std::cout << "\tl1:"; ++ for(const auto i : l1) ++ { ++ std::cout << " " << i; ++ } ++ std::cout << "\n"; ++ std::cout << "\ts1:"; ++ for(const auto i : s1) ++ { ++ std::cout << " " << i; ++ } ++ std::cout << "\n"; ++ } ++ ++ if(!valid_length_stride_multi_multi(l0, s0, l1, s1)) ++ { ++#ifdef _OPENMP ++#pragma omp cancel for ++#endif ++ invalid = true; ++ } ++ } ++ if(invalid) ++ return false; ++ ++ return true; ++} ++ ++bool valid_length_stride_generald(const std::vector<size_t> l, ++ const std::vector<size_t> s, ++ const int verbose) ++{ ++ if(verbose > 2) ++ { ++ std::cout << "checking dimension " << l.size() << std::endl; ++ } ++ ++ // Recurse on d-1 hyper-faces: ++ for(unsigned int idx = 0; idx < l.size(); ++idx) ++ { ++ std::vector<size_t> l0{}; ++ std::vector<size_t> s0{}; ++ for(size_t i = 0; i < l.size(); ++i) ++ { ++ if(i != idx) ++ { ++ l0.push_back(l[i]); ++ s0.push_back(s[i]); ++ } ++ } ++ if(!array_valid(l0, s0, verbose)) ++ return false; ++ } ++ ++ // Handle the 1D vs (N-1) case: ++ for(unsigned int idx = 0; idx < l.size(); ++idx) ++ { ++ if(!valid_length_stride_1d_multi(idx, l, s, verbose)) ++ return false; ++ } ++ ++ for(size_t dim0 = 2; dim0 <= l.size() / 2; ++dim0) ++ { ++ const size_t dim1 = l.size() - dim0; ++ if(verbose > 2) ++ std::cout << "dims: " << dim0 << " " << dim1 << 
std::endl; ++ ++ // We iterate over all permutations of an array of length l.size() which contains dim0 zeros ++ // and dim1 ones. We start with {0, ..., 0, 1, ... 1} to guarantee that we hit all the ++ // possibilities. ++ ++ // First, get all the permutations ++ std::vector<std::vector<size_t>> perms; ++ std::vector<size_t> v(l.size()); ++ std::fill(v.begin(), v.begin() + dim1, 0); ++ std::fill(v.begin() + dim1, v.end(), 1); ++ do ++ { ++ perms.push_back(v); ++ if(verbose > 3) ++ { ++ std::cout << "v:"; ++ for(const auto i : v) ++ { ++ std::cout << " " << i; ++ } ++ std::cout << "\n"; ++ } ++ ++ } while(std::next_permutation(v.begin(), v.end())); ++ ++ bool invalid = false; ++ // Then loop over all of the permutations. ++#ifdef _OPENMP ++#pragma omp parallel for ++#endif ++ for(size_t iperm = 0; iperm < perms.size(); ++iperm) ++ { ++ std::vector<size_t> l0(dim0); ++ std::vector<size_t> s0(dim0); ++ std::vector<size_t> l1(dim1); ++ std::vector<size_t> s1(dim1); ++ ++ for(size_t i = 0; i < l.size(); ++i) ++ { ++ if(v[i] == 0) ++ { ++ l0.push_back(l[i]); ++ s0.push_back(s[i]); ++ } ++ else ++ { ++ l1.push_back(l[i]); ++ s1.push_back(s[i]); ++ } ++ } ++ ++ if(verbose > 3) ++ { ++ std::cout << "\tl0:"; ++ for(const auto i : l0) ++ { ++ std::cout << " " << i; ++ } ++ std::cout << "\n"; ++ std::cout << "\ts0:"; ++ for(const auto i : s0) ++ { ++ std::cout << " " << i; ++ } ++ std::cout << "\n"; ++ std::cout << "\tl1:"; ++ for(const auto i : l1) ++ { ++ std::cout << " " << i; ++ } ++ std::cout << "\n"; ++ std::cout << "\ts1:"; ++ for(const auto i : s1) ++ { ++ std::cout << " " << i; ++ } ++ std::cout << "\n"; ++ } ++ ++ if(!valid_length_stride_multi_multi(l0, s0, l1, s1)) ++ { ++#ifdef _OPENMP ++#pragma omp cancel for ++#endif ++ invalid = true; ++ } ++ } ++ if(invalid) ++ return false; ++ } ++ ++ return true; ++} ++ ++bool sort_by_stride(const std::pair<size_t, size_t>& ls0, const std::pair<size_t, size_t>& ls1) ++{ ++ return ls0.second < ls1.second; ++} ++ ++bool 
array_valid(const std::vector<size_t>& length, ++ const std::vector<size_t>& stride, ++ const int verbose) ++{ ++ if(length.size() != stride.size()) ++ return false; ++ ++ // If a length is 1, then the stride is irrelevant. ++ // If a length is > 1, then the corresponding stride must be > 0. ++ std::vector<size_t> l{}, s{}; ++ for(unsigned int i = 0; i < length.size(); ++i) ++ { ++ if(length[i] > 1) ++ { ++ if(stride[i] == 0) ++ return false; ++ l.push_back(length[i]); ++ s.push_back(stride[i]); ++ } ++ } ++ ++ if(length.size() > 1) ++ { ++ // Check happy path. ++ bool happy_path = true; ++ std::vector<std::pair<size_t, size_t>> ls; ++ for(size_t idx = 0; idx < length.size(); ++idx) ++ { ++ ls.push_back(std::pair(length[idx], stride[idx])); ++ } ++ std::sort(ls.begin(), ls.end(), sort_by_stride); ++ ++ if(verbose > 2) ++ { ++ for(size_t idx = 0; idx < ls.size(); ++idx) ++ { ++ std::cout << ls[idx].first << "\t" << ls[idx].second << "\n"; ++ } ++ } ++ ++ for(size_t idx = 1; idx < ls.size(); ++idx) ++ { ++ if(ls[idx].second < ls[idx - 1].first * ls[idx - 1].second) ++ { ++ happy_path = false; ++ break; ++ } ++ } ++ if(happy_path) ++ { ++ if(verbose > 2) ++ { ++ std::cout << "happy path\n"; ++ } ++ return true; ++ } ++ } ++ ++ switch(l.size()) ++ { ++ case 0: ++ return true; ++ break; ++ case 1: ++ return s[0] != 0; ++ break; ++ case 2: ++ { ++ return valid_length_stride_2d(l[0], l[1], s[0], s[1]); ++ break; ++ } ++ case 3: ++ { ++ return valid_length_stride_3d(l, s, verbose); ++ break; ++ } ++ case 4: ++ { ++ return valid_length_stride_4d(l, s, verbose); ++ break; ++ } ++ default: ++ return valid_length_stride_generald(l, s, verbose); ++ return true; ++ } ++ ++ return true; ++} +diff --git a/shared/array_validator.h b/shared/array_validator.h +new file mode 100644 +index 0000000..ce85173 +--- /dev/null ++++ b/shared/array_validator.h +@@ -0,0 +1,31 @@ ++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#ifndef ARRAY_VALIDATOR_H ++#define ARRAY_VALIDATOR_H ++ ++#include <vector> ++ ++// Checks whether the array with given length and stride has multi-index collisions. ++bool array_valid(const std::vector<size_t>& length, ++ const std::vector<size_t>& stride, ++ const int verbose = 0); ++ ++#endif +diff --git a/shared/concurrency.h b/shared/concurrency.h +new file mode 100644 +index 0000000..a36c7c1 +--- /dev/null ++++ b/shared/concurrency.h +@@ -0,0 +1,41 @@ ++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. 
++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#pragma once ++ ++#include <thread> ++ ++#ifndef WIN32 ++#include <sched.h> ++#endif ++ ++// work out how many parallel tasks to run, based on available ++// resources. on Linux, this will look at the cpu affinity mask (if ++// available) which might be restricted in a container. otherwise, ++// return std::thread::hardware_concurrency(). ++static unsigned int rocfft_concurrency() ++{ ++#ifndef WIN32 ++ cpu_set_t cpuset; ++ if(sched_getaffinity(0, sizeof(cpuset), &cpuset) == 0) ++ return CPU_COUNT(&cpuset); ++#endif ++ return std::thread::hardware_concurrency(); ++} +diff --git a/shared/data_gen_device.h b/shared/data_gen_device.h +new file mode 100644 +index 0000000..77fb012 +--- /dev/null ++++ b/shared/data_gen_device.h +@@ -0,0 +1,1303 @@ ++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. 
++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#ifndef DATA_GEN_DEVICE_H ++#define DATA_GEN_DEVICE_H ++ ++// rocRAND can generate warnings if inline asm is not available for ++// some architectures. data generation isn't performance-critical, ++// so just disable inline asm to prevent the warnings. 
++#define ROCRAND_DISABLE_INLINE_ASM ++ ++#include "../shared/arithmetic.h" ++#include "../shared/device_properties.h" ++#include "../shared/gpubuf.h" ++#include "../shared/increment.h" ++#include "../shared/rocfft_complex.h" ++#include <hip/hip_runtime.h> ++#include <hip/hip_runtime_api.h> ++#include <hiprand/hiprand.h> ++#include <hiprand/hiprand_kernel.h> ++#include <limits> ++#include <vector> ++ ++static const unsigned int DATA_GEN_THREADS = 8; ++static const unsigned int DATA_GEN_GRID_Y_MAX = 64; ++ ++template <typename T> ++struct input_val_1D ++{ ++ T val1; ++}; ++ ++template <typename T> ++struct input_val_2D ++{ ++ T val1; ++ T val2; ++}; ++ ++template <typename T> ++struct input_val_3D ++{ ++ T val1; ++ T val2; ++ T val3; ++}; ++ ++template <typename T> ++static input_val_1D<T> get_input_val(const T& val) ++{ ++ return input_val_1D<T>{val}; ++} ++ ++template <typename T> ++static input_val_2D<T> get_input_val(const std::tuple<T, T>& val) ++{ ++ return input_val_2D<T>{std::get<0>(val), std::get<1>(val)}; ++} ++ ++template <typename T> ++static input_val_3D<T> get_input_val(const std::tuple<T, T, T>& val) ++{ ++ return input_val_3D<T>{std::get<0>(val), std::get<1>(val), std::get<2>(val)}; ++} ++ ++template <typename T> ++__device__ static size_t ++ compute_index(const input_val_1D<T>& length, const input_val_1D<T>& stride, size_t base) ++{ ++ return (length.val1 * stride.val1) + base; ++} ++ ++template <typename T> ++__device__ static size_t ++ compute_index(const input_val_2D<T>& length, const input_val_2D<T>& stride, size_t base) ++{ ++ return (length.val1 * stride.val1) + (length.val2 * stride.val2) + base; ++} ++ ++template <typename T> ++__device__ static size_t ++ compute_index(const input_val_3D<T>& length, const input_val_3D<T>& stride, size_t base) ++{ ++ return (length.val1 * stride.val1) + (length.val2 * stride.val2) + (length.val3 * stride.val3) ++ + base; ++} ++ ++template <typename T> ++static inline input_val_1D<T> make_zero_length(const 
input_val_1D<T>& whole_length) ++{ ++ return input_val_1D<T>{0}; ++} ++ ++template <typename T> ++static inline input_val_2D<T> make_zero_length(const input_val_2D<T>& whole_length) ++{ ++ return input_val_2D<T>{0, 0}; ++} ++ ++template <typename T> ++static inline input_val_3D<T> make_zero_length(const input_val_3D<T>& whole_length) ++{ ++ return input_val_3D<T>{0, 0, 0}; ++} ++ ++template <typename T> ++static inline input_val_1D<T> make_unit_stride(const input_val_1D<T>& whole_length) ++{ ++ return input_val_1D<T>{1}; ++} ++ ++template <typename T> ++static inline input_val_2D<T> make_unit_stride(const input_val_2D<T>& whole_length) ++{ ++ return input_val_2D<T>{1, whole_length.val1}; ++} ++ ++template <typename T> ++static inline input_val_3D<T> make_unit_stride(const input_val_3D<T>& whole_length) ++{ ++ return input_val_3D<T>{1, whole_length.val1, whole_length.val1 * whole_length.val2}; ++} ++ ++template <typename T> ++__device__ static input_val_1D<T> get_length(const size_t i, const input_val_1D<T>& whole_length) ++{ ++ auto xlen = whole_length.val1; ++ ++ auto xidx = i % xlen; ++ ++ return input_val_1D<T>{xidx}; ++} ++ ++template <typename T> ++__device__ static input_val_2D<T> get_length(const size_t i, const input_val_2D<T>& whole_length) ++{ ++ auto xlen = whole_length.val1; ++ auto ylen = whole_length.val2; ++ ++ auto xidx = i % xlen; ++ auto yidx = i / xlen % ylen; ++ ++ return input_val_2D<T>{xidx, yidx}; ++} ++ ++template <typename T> ++__device__ static input_val_3D<T> get_length(const size_t i, const input_val_3D<T>& whole_length) ++{ ++ auto xlen = whole_length.val1; ++ auto ylen = whole_length.val2; ++ auto zlen = whole_length.val3; ++ ++ auto xidx = i % xlen; ++ auto yidx = i / xlen % ylen; ++ auto zidx = i / xlen / ylen % zlen; ++ ++ return input_val_3D<T>{xidx, yidx, zidx}; ++} ++ ++template <typename T> ++__device__ static size_t get_batch(const size_t i, const input_val_1D<T>& whole_length) ++{ ++ auto xlen = whole_length.val1; ++ ++ auto 
yidx = i / xlen; ++ ++ return yidx; ++} ++ ++template <typename T> ++__device__ static size_t get_batch(const size_t i, const input_val_2D<T>& whole_length) ++{ ++ auto xlen = whole_length.val1; ++ auto ylen = whole_length.val2; ++ ++ auto zidx = i / xlen / ylen; ++ ++ return zidx; ++} ++ ++template <typename T> ++__device__ static size_t get_batch(const size_t i, const input_val_3D<T>& length) ++{ ++ auto xlen = length.val1; ++ auto ylen = length.val2; ++ auto zlen = length.val3; ++ ++ auto widx = i / xlen / ylen / zlen; ++ ++ return widx; ++} ++ ++__device__ static double make_random_val(hiprandStatePhilox4_32_10* gen_state, double offset) ++{ ++ return hiprand_uniform_double(gen_state) + offset; ++} ++ ++__device__ static float make_random_val(hiprandStatePhilox4_32_10* gen_state, float offset) ++{ ++ return hiprand_uniform(gen_state) + offset; ++} ++ ++__device__ static _Float16 make_random_val(hiprandStatePhilox4_32_10* gen_state, _Float16 offset) ++{ ++ return static_cast<_Float16>(hiprand_uniform(gen_state)) + offset; ++} ++ ++template <typename Tcomplex> ++__device__ static void set_imag_zero(const size_t pos, Tcomplex* x) ++{ ++ x[pos].y = 0.0; ++} ++ ++template <typename Tfloat> ++__device__ static void set_imag_zero(const size_t pos, Tfloat* xreal, Tfloat* ximag) ++{ ++ ximag[pos] = 0.0; ++} ++ ++template <typename Tcomplex> ++__device__ static void conjugate(const size_t pos, const size_t cpos, Tcomplex* x) ++{ ++ x[pos].x = x[cpos].x; ++ x[pos].y = -x[cpos].y; ++} ++ ++template <typename Tfloat> ++__device__ static void conjugate(const size_t pos, const size_t cpos, Tfloat* xreal, Tfloat* ximag) ++{ ++ xreal[pos] = xreal[cpos]; ++ ximag[pos] = -ximag[cpos]; ++} ++ ++template <typename Tint, typename Treal> ++__global__ static void __launch_bounds__(DATA_GEN_THREADS) ++ generate_random_interleaved_data_kernel(const Tint whole_length, ++ const Tint zero_length, ++ const size_t idist, ++ const size_t isize, ++ const Tint istride, ++ rocfft_complex<Treal>* 
data) ++{ ++ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x ++ + blockIdx.y * gridDim.x * DATA_GEN_THREADS; ++ static_assert(sizeof(i) >= sizeof(isize)); ++ if(i < isize) ++ { ++ auto i_length = get_length(i, whole_length); ++ auto i_batch = get_batch(i, whole_length); ++ auto i_base = i_batch * idist; ++ ++ auto seed = compute_index(zero_length, istride, i_base); ++ auto idx = compute_index(i_length, istride, i_base); ++ ++ hiprandStatePhilox4_32_10 gen_state; ++ hiprand_init(seed, idx, 0, &gen_state); ++ ++ data[idx].x = make_random_val(&gen_state, static_cast<Treal>(-0.5)); ++ data[idx].y = make_random_val(&gen_state, static_cast<Treal>(-0.5)); ++ } ++} ++ ++template <typename Tint, typename Treal> ++__global__ static void __launch_bounds__(DATA_GEN_THREADS) ++ generate_interleaved_data_kernel(const Tint whole_length, ++ const size_t idist, ++ const size_t isize, ++ const Tint istride, ++ const Tint ustride, ++ const Treal inv_scale, ++ rocfft_complex<Treal>* data) ++{ ++ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x ++ + blockIdx.y * gridDim.x * DATA_GEN_THREADS; ++ static_assert(sizeof(i) >= sizeof(isize)); ++ if(i < isize) ++ { ++ const auto i_length = get_length(i, whole_length); ++ const auto i_batch = get_batch(i, whole_length); ++ const auto i_base = i_batch * idist; ++ ++ const auto val = static_cast<Treal>(-0.5) ++ + static_cast<Treal>( ++ static_cast<unsigned long long>(compute_index(i_length, ustride, 0))) ++ * inv_scale; ++ ++ const auto idx = compute_index(i_length, istride, i_base); ++ ++ data[idx].x = val; ++ data[idx].y = val; ++ } ++} ++ ++template <typename Tint, typename Treal> ++__global__ static void __launch_bounds__(DATA_GEN_THREADS) ++ generate_random_planar_data_kernel(const Tint whole_length, ++ const Tint zero_length, ++ const size_t idist, ++ const size_t isize, ++ const Tint istride, ++ Treal* real_data, ++ Treal* imag_data) ++{ ++ auto const i = static_cast<size_t>(threadIdx.x) 
+ blockIdx.x * blockDim.x ++ + blockIdx.y * gridDim.x * DATA_GEN_THREADS; ++ static_assert(sizeof(i) >= sizeof(isize)); ++ if(i < isize) ++ { ++ auto i_length = get_length(i, whole_length); ++ auto i_batch = get_batch(i, whole_length); ++ auto i_base = i_batch * idist; ++ ++ auto seed = compute_index(zero_length, istride, i_base); ++ auto idx = compute_index(i_length, istride, i_base); ++ ++ hiprandStatePhilox4_32_10 gen_state; ++ hiprand_init(seed, idx, 0, &gen_state); ++ ++ real_data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5)); ++ imag_data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5)); ++ } ++} ++ ++template <typename Tint, typename Treal> ++__global__ static void __launch_bounds__(DATA_GEN_THREADS) ++ generate_planar_data_kernel(const Tint whole_length, ++ const size_t idist, ++ const size_t isize, ++ const Tint istride, ++ const Tint ustride, ++ const Treal inv_scale, ++ Treal* real_data, ++ Treal* imag_data) ++{ ++ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x ++ + blockIdx.y * gridDim.x * DATA_GEN_THREADS; ++ static_assert(sizeof(i) >= sizeof(isize)); ++ if(i < isize) ++ { ++ const auto i_length = get_length(i, whole_length); ++ const auto i_batch = get_batch(i, whole_length); ++ const auto i_base = i_batch * idist; ++ ++ const auto val = static_cast<Treal>(-0.5) ++ + static_cast<Treal>( ++ static_cast<unsigned long long>(compute_index(i_length, ustride, 0))) ++ * inv_scale; ++ ++ const auto idx = compute_index(i_length, istride, i_base); ++ ++ real_data[idx] = val; ++ imag_data[idx] = val; ++ } ++} ++ ++template <typename Tint, typename Treal> ++__global__ static void __launch_bounds__(DATA_GEN_THREADS) ++ generate_random_real_data_kernel(const Tint whole_length, ++ const Tint zero_length, ++ const size_t idist, ++ const size_t isize, ++ const Tint istride, ++ Treal* data) ++{ ++ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x ++ + blockIdx.y * gridDim.x * 
DATA_GEN_THREADS; ++ static_assert(sizeof(i) >= sizeof(isize)); ++ if(i < isize) ++ { ++ auto i_length = get_length(i, whole_length); ++ auto i_batch = get_batch(i, whole_length); ++ auto i_base = i_batch * idist; ++ ++ auto seed = compute_index(zero_length, istride, i_base); ++ auto idx = compute_index(i_length, istride, i_base); ++ ++ hiprandStatePhilox4_32_10 gen_state; ++ hiprand_init(seed, idx, 0, &gen_state); ++ ++ data[idx] = make_random_val(&gen_state, static_cast<Treal>(-0.5)); ++ } ++} ++ ++template <typename Tint, typename Treal> ++__global__ static void __launch_bounds__(DATA_GEN_THREADS) ++ generate_real_data_kernel(const Tint whole_length, ++ const size_t idist, ++ const size_t isize, ++ const Tint istride, ++ const Tint ustride, ++ const Treal inv_scale, ++ Treal* data) ++{ ++ auto const i = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x ++ + blockIdx.y * gridDim.x * DATA_GEN_THREADS; ++ static_assert(sizeof(i) >= sizeof(isize)); ++ if(i < isize) ++ { ++ const auto i_length = get_length(i, whole_length); ++ const auto i_batch = get_batch(i, whole_length); ++ const auto i_base = i_batch * idist; ++ ++ const auto val = static_cast<Treal>(-0.5) ++ + static_cast<Treal>( ++ static_cast<unsigned long long>(compute_index(i_length, ustride, 0))) ++ * inv_scale; ++ ++ const auto idx = compute_index(i_length, istride, i_base); ++ ++ data[idx] = val; ++ } ++} ++ ++// For complex-to-real transforms, the input data must be Hermitian-symmetric. ++// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier ++// space. For multi-dimensional data, this means that we only need to store a bit more ++// than half of the complex values; the rest are redundant. However, there are still ++// some restrictions: ++// * the origin and Nyquist value(s) must be real-valued ++// * some of the remaining values are still redundant, and you might get different results ++// than you expect if the values don't agree.
++ ++template <typename Tcomplex> ++__global__ static void impose_hermitian_symmetry_interleaved_1D_kernel(Tcomplex* x, ++ const size_t Nx, ++ const size_t xstride, ++ const size_t dist, ++ const size_t batch_total, ++ const bool Nxeven) ++{ ++ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x; ++ static_assert(sizeof(id_batch) == sizeof(size_t)); ++ ++ if(id_batch < batch_total) ++ { ++ id_batch *= dist; ++ ++ set_imag_zero(id_batch, x); ++ ++ if(Nxeven) ++ set_imag_zero(id_batch + (Nx / 2) * xstride, x); ++ } ++} ++ ++template <typename Tfloat> ++__global__ static void impose_hermitian_symmetry_planar_1D_kernel(Tfloat* xreal, ++ Tfloat* ximag, ++ const size_t Nx, ++ const size_t xstride, ++ const size_t dist, ++ const size_t batch_total, ++ const bool Nxeven) ++{ ++ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x; ++ static_assert(sizeof(id_batch) == sizeof(size_t)); ++ ++ if(id_batch < batch_total) ++ { ++ id_batch *= dist; ++ ++ set_imag_zero(id_batch, xreal, ximag); ++ ++ if(Nxeven) ++ set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag); ++ } ++} ++ ++template <typename Tcomplex> ++__global__ static void impose_hermitian_symmetry_interleaved_2D_kernel(Tcomplex* x, ++ const size_t Nx, ++ const size_t Ny, ++ const size_t xstride, ++ const size_t ystride, ++ const size_t dist, ++ const size_t batch_total, ++ const size_t x_total, ++ const bool Nxeven, ++ const bool Nyeven) ++{ ++ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x; ++ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y; ++ static_assert(sizeof(id_batch) == sizeof(size_t)); ++ static_assert(sizeof(id_x) == sizeof(size_t)); ++ ++ if(id_batch < batch_total) ++ { ++ id_batch *= dist; ++ ++ if(id_x == 0) ++ set_imag_zero(id_batch, x); ++ ++ if(id_x == 0 && Nxeven) ++ set_imag_zero(id_batch + (Nx / 2) * xstride, x); ++ ++ if(id_x == 0 && Nyeven) ++ set_imag_zero(id_batch + ystride * (Ny / 2), x); 
++ ++ if(id_x == 0 && Nxeven && Nyeven) ++ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x); ++ ++ if(id_x < x_total) ++ { ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x); ++ ++ if(Nyeven) ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), ++ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), ++ x); ++ } ++ } ++} ++ ++template <typename Tfloat> ++__global__ static void impose_hermitian_symmetry_planar_2D_kernel(Tfloat* xreal, ++ Tfloat* ximag, ++ const size_t Nx, ++ const size_t Ny, ++ const size_t xstride, ++ const size_t ystride, ++ const size_t dist, ++ const size_t batch_total, ++ const size_t x_total, ++ const bool Nxeven, ++ const bool Nyeven) ++{ ++ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x; ++ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y; ++ static_assert(sizeof(id_batch) == sizeof(size_t)); ++ static_assert(sizeof(id_x) == sizeof(size_t)); ++ ++ if(id_batch < batch_total) ++ { ++ id_batch *= dist; ++ ++ if(id_x == 0) ++ set_imag_zero(id_batch, xreal, ximag); ++ ++ if(id_x == 0 && Nxeven) ++ set_imag_zero(id_batch + (Nx / 2) * xstride, xreal, ximag); ++ ++ if(id_x == 0 && Nyeven) ++ set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag); ++ ++ if(id_x == 0 && Nxeven && Nyeven) ++ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag); ++ ++ if(id_x < x_total) ++ { ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)), ++ id_batch + xstride * (id_x + 1), ++ xreal, ++ ximag); ++ ++ if(Nyeven) ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), ++ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), ++ xreal, ++ ximag); ++ } ++ } ++} ++ ++template <typename Tcomplex> ++__global__ static void impose_hermitian_symmetry_interleaved_3D_kernel(Tcomplex* x, ++ const size_t Nx, ++ const size_t Ny, ++ const size_t Nz, ++ const size_t xstride, ++ const size_t ystride, ++ 
const size_t zstride, ++ const size_t dist, ++ const size_t batch_total, ++ const size_t x_total, ++ const size_t y_total, ++ const size_t y_total_half, ++ const bool Nxeven, ++ const bool Nyeven, ++ const bool Nzeven) ++{ ++ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x; ++ const auto id_x = static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y; ++ const auto id_y = static_cast<size_t>(threadIdx.z) + blockIdx.z * blockDim.z; ++ static_assert(sizeof(id_batch) == sizeof(size_t)); ++ static_assert(sizeof(id_x) == sizeof(size_t)); ++ static_assert(sizeof(id_y) == sizeof(size_t)); ++ ++ if(id_batch < batch_total) ++ { ++ auto id_x_y_zero = (id_x == 0 && id_y == 0); ++ ++ id_batch *= dist; ++ ++ if(id_x_y_zero) ++ set_imag_zero(id_batch, x); ++ ++ if(Nxeven && id_x_y_zero) ++ set_imag_zero(id_batch + xstride * (Nx / 2), x); ++ ++ if(Nyeven && id_x_y_zero) ++ set_imag_zero(id_batch + ystride * (Ny / 2), x); ++ ++ if(Nzeven && id_x_y_zero) ++ set_imag_zero(id_batch + zstride * (Nz / 2), x); ++ ++ if(Nxeven && Nyeven && id_x_y_zero) ++ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), x); ++ ++ if(Nxeven && Nzeven && id_x_y_zero) ++ set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), x); ++ ++ if(Nyeven && Nzeven && id_x_y_zero) ++ set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), x); ++ ++ if(Nxeven && Nyeven && Nzeven && id_x_y_zero) ++ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2), ++ x); ++ ++ if(id_x == 0 && id_y < y_total_half) ++ conjugate(id_batch + ystride * (Ny - (id_y + 1)), id_batch + ystride * (id_y + 1), x); ++ ++ if(Nxeven && id_x == 0 && id_y < y_total_half) ++ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)), ++ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1), ++ x); ++ ++ if(id_x < x_total && id_y == 0) ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)), id_batch + xstride * (id_x + 1), x); ++ ++ if(Nyeven && 
id_x < x_total && id_y == 0) ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), ++ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), ++ x); ++ ++ if(id_x < x_total && id_y < y_total) ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)), ++ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1), ++ x); ++ ++ if(Nzeven) ++ { ++ if(id_x < x_total && id_y == 0) ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), ++ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), ++ x); ++ ++ if(Nyeven && id_x < x_total && id_y == 0) // x-axis at yz-Nyquist: offset by both the y and z Nyquist planes ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2) + zstride * (Nz / 2), ++ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2) + zstride * (Nz / 2), ++ x); ++ ++ if(id_x == 0 && id_y < y_total_half) ++ conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2), ++ id_batch + ystride * (id_y + 1) + zstride * (Nz / 2), ++ x); ++ ++ if(Nxeven && id_x == 0 && id_y < y_total_half) ++ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)) ++ + zstride * (Nz / 2), ++ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2), ++ x); ++ ++ if(id_x < x_total && id_y < y_total) ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)) ++ + zstride * (Nz / 2), ++ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1) ++ + zstride * (Nz / 2), ++ x); ++ } ++ } ++} ++ ++template <typename Tfloat> ++__global__ static void impose_hermitian_symmetry_planar_3D_kernel(Tfloat* xreal, ++ Tfloat* ximag, ++ const size_t Nx, ++ const size_t Ny, ++ const size_t Nz, ++ const size_t xstride, ++ const size_t ystride, ++ const size_t zstride, ++ const size_t dist, ++ const size_t batch_total, ++ const size_t x_total, ++ const size_t y_total, ++ const size_t y_total_half, ++ const bool Nxeven, ++ const bool Nyeven, ++ const bool Nzeven) ++{ ++ auto id_batch = static_cast<size_t>(threadIdx.x) + blockIdx.x * blockDim.x; ++ const auto id_x =
static_cast<size_t>(threadIdx.y) + blockIdx.y * blockDim.y; ++ const auto id_y = static_cast<size_t>(threadIdx.z) + blockIdx.z * blockDim.z; ++ static_assert(sizeof(id_batch) == sizeof(size_t)); ++ static_assert(sizeof(id_x) == sizeof(size_t)); ++ static_assert(sizeof(id_y) == sizeof(size_t)); ++ ++ if(id_batch < batch_total) ++ { ++ auto id_x_y_zero = (id_x == 0 && id_y == 0); ++ ++ id_batch *= dist; ++ ++ if(id_x_y_zero) ++ set_imag_zero(id_batch, xreal, ximag); ++ ++ if(Nxeven && id_x_y_zero) ++ set_imag_zero(id_batch + xstride * (Nx / 2), xreal, ximag); ++ ++ if(Nyeven && id_x_y_zero) ++ set_imag_zero(id_batch + ystride * (Ny / 2), xreal, ximag); ++ ++ if(Nzeven && id_x_y_zero) ++ set_imag_zero(id_batch + zstride * (Nz / 2), xreal, ximag); ++ ++ if(Nxeven && Nyeven && id_x_y_zero) ++ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2), xreal, ximag); ++ ++ if(Nxeven && Nzeven && id_x_y_zero) ++ set_imag_zero(id_batch + xstride * (Nx / 2) + zstride * (Nz / 2), xreal, ximag); ++ ++ if(Nyeven && Nzeven && id_x_y_zero) ++ set_imag_zero(id_batch + ystride * (Ny / 2) + zstride * (Nz / 2), xreal, ximag); ++ ++ if(Nxeven && Nyeven && Nzeven && id_x_y_zero) ++ set_imag_zero(id_batch + xstride * (Nx / 2) + ystride * (Ny / 2) + zstride * (Nz / 2), ++ xreal, ++ ximag); ++ ++ if(id_x == 0 && id_y < y_total_half) ++ conjugate(id_batch + ystride * (Ny - (id_y + 1)), ++ id_batch + ystride * (id_y + 1), ++ xreal, ++ ximag); ++ ++ if(Nxeven && id_x == 0 && id_y < y_total_half) ++ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)), ++ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1), ++ xreal, ++ ximag); ++ ++ if(id_x < x_total && id_y == 0) ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)), ++ id_batch + xstride * (id_x + 1), ++ xreal, ++ ximag); ++ ++ if(Nyeven && id_x < x_total && id_y == 0) ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2), ++ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2), ++ xreal, ++ 
ximag); ++ ++ if(id_x < x_total && id_y < y_total) ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)), ++ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1), ++ xreal, ++ ximag); ++ ++ if(Nzeven) ++ { ++ if(id_x < x_total && id_y == 0) ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + zstride * (Nz / 2), ++ id_batch + xstride * (id_x + 1) + zstride * (Nz / 2), ++ xreal, ++ ximag); ++ ++ if(Nyeven && id_x < x_total && id_y == 0) // x-axis at yz-Nyquist: offset by both the y and z Nyquist planes ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny / 2) + zstride * (Nz / 2), ++ id_batch + xstride * (id_x + 1) + ystride * (Ny / 2) + zstride * (Nz / 2), ++ xreal, ++ ximag); ++ ++ if(id_x == 0 && id_y < y_total_half) ++ conjugate(id_batch + ystride * (Ny - (id_y + 1)) + zstride * (Nz / 2), ++ id_batch + ystride * (id_y + 1) + zstride * (Nz / 2), ++ xreal, ++ ximag); ++ ++ if(Nxeven && id_x == 0 && id_y < y_total_half) ++ conjugate(id_batch + xstride * (Nx / 2) + ystride * (Ny - (id_y + 1)) ++ + zstride * (Nz / 2), ++ id_batch + xstride * (Nx / 2) + ystride * (id_y + 1) + zstride * (Nz / 2), ++ xreal, ++ ximag); ++ ++ if(id_x < x_total && id_y < y_total) ++ conjugate(id_batch + xstride * (Nx - (id_x + 1)) + ystride * (Ny - (id_y + 1)) ++ + zstride * (Nz / 2), ++ id_batch + xstride * (id_x + 1) + ystride * (id_y + 1) ++ + zstride * (Nz / 2), ++ xreal, ++ ximag); ++ } ++ } ++} ++ ++// get grid dimensions for data gen kernel ++static dim3 generate_data_gridDim(const size_t isize) ++{ ++ auto blockSize = DATA_GEN_THREADS; ++ // total number of blocks needed in the grid ++ auto numBlocks_setup = DivRoundingUp<size_t>(isize, blockSize); ++ ++ // Total work items per dimension in the grid is counted in ++ // uint32_t. Since each thread initializes one element, very ++ // large amounts of data will overflow this total size if we do ++ // all this work in one grid dimension, causing launch failure. ++ // ++ // CUDA also generally allows for effectively unlimited grid X ++ // dim, but Y and Z are more limited.
++ auto gridDim_y = std::min<unsigned int>(DATA_GEN_GRID_Y_MAX, numBlocks_setup); ++ auto gridDim_x = DivRoundingUp<unsigned int>(numBlocks_setup, DATA_GEN_GRID_Y_MAX); ++ return {gridDim_x, gridDim_y}; ++} ++ ++// get grid dimensions for hermitian symmetrizer kernel ++static dim3 generate_hermitian_gridDim(const std::vector<size_t>& length, ++ const size_t batch, ++ const size_t blockSize) ++{ ++ dim3 gridDim; ++ ++ switch(length.size()) ++ { ++ case 1: ++ gridDim = dim3(DivRoundingUp<size_t>(batch, blockSize)); ++ break; ++ case 2: ++ gridDim = dim3(DivRoundingUp<size_t>(batch, blockSize), ++ DivRoundingUp<size_t>((length[0] + 1) / 2 - 1, blockSize)); ++ break; ++ case 3: ++ gridDim = dim3(DivRoundingUp<size_t>(batch, blockSize), ++ DivRoundingUp<size_t>((length[0] + 1) / 2 - 1, blockSize), ++ DivRoundingUp<size_t>(length[1] - 1, blockSize)); ++ break; ++ default: ++ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); ++ } ++ ++ return gridDim; ++} ++ ++static dim3 generate_blockDim(const std::vector<size_t>& length, const size_t blockSize) ++{ ++ dim3 blockDim; ++ ++ switch(length.size()) ++ { ++ case 1: ++ blockDim = dim3(blockSize); ++ break; ++ case 2: ++ blockDim = dim3(blockSize, blockSize); ++ break; ++ case 3: ++ blockDim = dim3(blockSize, blockSize, blockSize); ++ break; ++ default: ++ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); ++ } ++ ++ return blockDim; ++} ++ ++template <typename Tint, typename Treal> ++static void generate_random_interleaved_data(const Tint& whole_length, ++ const size_t idist, ++ const size_t isize, ++ const Tint& whole_stride, ++ rocfft_complex<Treal>* input_data, ++ const hipDeviceProp_t& deviceProp) ++{ ++ auto input_length = get_input_val(whole_length); ++ auto zero_length = make_zero_length(input_length); ++ auto input_stride = get_input_val(whole_stride); ++ ++ dim3 gridDim = generate_data_gridDim(isize); ++ dim3 blockDim{DATA_GEN_THREADS}; ++ ++ 
launch_limits_check("generate_random_interleaved_data_kernel", gridDim, blockDim, deviceProp); ++ ++ hipLaunchKernelGGL( ++ HIP_KERNEL_NAME(generate_random_interleaved_data_kernel<decltype(input_length), Treal>), ++ gridDim, ++ blockDim, ++ 0, // sharedMemBytes ++ 0, // stream ++ input_length, ++ zero_length, ++ idist, ++ isize, ++ input_stride, ++ input_data); ++ auto err = hipGetLastError(); ++ if(err != hipSuccess) ++ throw std::runtime_error("generate_random_interleaved_data_kernel launch failure: " ++ + std::string(hipGetErrorName(err))); ++} ++ ++template <typename Tint, typename Treal> ++static void generate_interleaved_data(const Tint& whole_length, ++ const size_t idist, ++ const size_t isize, ++ const Tint& whole_stride, ++ const size_t nbatch, ++ rocfft_complex<Treal>* input_data, ++ const hipDeviceProp_t& deviceProp) ++{ ++ const auto input_length = get_input_val(whole_length); ++ const auto input_stride = get_input_val(whole_stride); ++ const auto unit_stride = make_unit_stride(input_length); ++ ++ const auto inv_scale ++ = static_cast<Treal>(1.0) ++ / static_cast<Treal>(static_cast<unsigned long long>(isize) / nbatch - 1); ++ ++ dim3 gridDim = generate_data_gridDim(isize); ++ dim3 blockDim{DATA_GEN_THREADS}; ++ ++ launch_limits_check("generate_interleaved_data_kernel", gridDim, blockDim, deviceProp); ++ ++ hipLaunchKernelGGL( ++ HIP_KERNEL_NAME(generate_interleaved_data_kernel<decltype(input_length), Treal>), ++ gridDim, ++ blockDim, ++ 0, // sharedMemBytes ++ 0, // stream ++ input_length, ++ idist, ++ isize, ++ input_stride, ++ unit_stride, ++ inv_scale, ++ input_data); ++ auto err = hipGetLastError(); ++ if(err != hipSuccess) ++ throw std::runtime_error("generate_interleaved_data_kernel launch failure: " ++ + std::string(hipGetErrorName(err))); ++} ++ ++template <typename Tint, typename Treal> ++static void generate_random_planar_data(const Tint& whole_length, ++ const size_t idist, ++ const size_t isize, ++ const Tint& whole_stride, ++ Treal* 
real_data, ++ Treal* imag_data, ++ const hipDeviceProp_t& deviceProp) ++{ ++ const auto input_length = get_input_val(whole_length); ++ const auto zero_length = make_zero_length(input_length); ++ const auto input_stride = get_input_val(whole_stride); ++ ++ dim3 gridDim = generate_data_gridDim(isize); ++ dim3 blockDim{DATA_GEN_THREADS}; ++ ++ launch_limits_check("generate_random_planar_data_kernel", gridDim, blockDim, deviceProp); ++ ++ hipLaunchKernelGGL( ++ HIP_KERNEL_NAME(generate_random_planar_data_kernel<decltype(input_length), Treal>), ++ gridDim, ++ blockDim, ++ 0, // sharedMemBytes ++ 0, // stream ++ input_length, ++ zero_length, ++ idist, ++ isize, ++ input_stride, ++ real_data, ++ imag_data); ++ auto err = hipGetLastError(); ++ if(err != hipSuccess) ++ throw std::runtime_error("generate_random_planar_data_kernel launch failure: " ++ + std::string(hipGetErrorName(err))); ++} ++ ++template <typename Tint, typename Treal> ++static void generate_planar_data(const Tint& whole_length, ++ const size_t idist, ++ const size_t isize, ++ const Tint& whole_stride, ++ const size_t nbatch, ++ Treal* real_data, ++ Treal* imag_data, ++ const hipDeviceProp_t& deviceProp) ++{ ++ const auto input_length = get_input_val(whole_length); ++ const auto input_stride = get_input_val(whole_stride); ++ const auto unit_stride = make_unit_stride(input_length); ++ ++ const auto inv_scale ++ = static_cast<Treal>(1.0) ++ / static_cast<Treal>(static_cast<unsigned long long>(isize) / nbatch - 1); ++ ++ dim3 gridDim = generate_data_gridDim(isize); ++ dim3 blockDim{DATA_GEN_THREADS}; ++ ++ launch_limits_check("generate_planar_data_kernel", gridDim, blockDim, deviceProp); ++ ++ hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_planar_data_kernel<decltype(input_length), Treal>), ++ gridDim, ++ blockDim, ++ 0, // sharedMemBytes ++ 0, // stream ++ input_length, ++ idist, ++ isize, ++ input_stride, ++ unit_stride, ++ inv_scale, ++ real_data, ++ imag_data); ++ auto err = hipGetLastError(); ++ if(err != 
hipSuccess) ++ throw std::runtime_error("generate_planar_data_kernel launch failure: " ++ + std::string(hipGetErrorName(err))); ++} ++ ++template <typename Tint, typename Treal> ++static void generate_random_real_data(const Tint& whole_length, ++ const size_t idist, ++ const size_t isize, ++ const Tint& whole_stride, ++ Treal* input_data, ++ const hipDeviceProp_t& deviceProp) ++{ ++ const auto input_length = get_input_val(whole_length); ++ const auto zero_length = make_zero_length(input_length); ++ const auto input_stride = get_input_val(whole_stride); ++ ++ dim3 gridDim = generate_data_gridDim(isize); ++ dim3 blockDim{DATA_GEN_THREADS}; ++ ++ launch_limits_check("generate_random_real_data_kernel", gridDim, blockDim, deviceProp); ++ ++ hipLaunchKernelGGL( ++ HIP_KERNEL_NAME(generate_random_real_data_kernel<decltype(input_length), Treal>), ++ gridDim, ++ blockDim, ++ 0, // sharedMemBytes ++ 0, // stream ++ input_length, ++ zero_length, ++ idist, ++ isize, ++ input_stride, ++ input_data); ++ auto err = hipGetLastError(); ++ if(err != hipSuccess) ++ throw std::runtime_error("generate_random_real_data_kernel launch failure: " ++ + std::string(hipGetErrorName(err))); ++} ++ ++template <typename Tint, typename Treal> ++static void generate_real_data(const Tint& whole_length, ++ const size_t idist, ++ const size_t isize, ++ const Tint& whole_stride, ++ const size_t nbatch, ++ Treal* input_data, ++ const hipDeviceProp_t& deviceProp) ++{ ++ const auto input_length = get_input_val(whole_length); ++ const auto input_stride = get_input_val(whole_stride); ++ const auto unit_stride = make_unit_stride(input_length); ++ ++ const auto inv_scale ++ = static_cast<Treal>(1.0) ++ / static_cast<Treal>(static_cast<unsigned long long>(isize) / nbatch - 1); ++ ++ dim3 gridDim = generate_data_gridDim(isize); ++ dim3 blockDim{DATA_GEN_THREADS}; ++ ++ launch_limits_check("generate_real_data_kernel", gridDim, blockDim, deviceProp); ++ ++ 
hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_real_data_kernel<decltype(input_length), Treal>), ++ gridDim, ++ blockDim, ++ 0, // sharedMemBytes ++ 0, // stream ++ input_length, ++ idist, ++ isize, ++ input_stride, ++ unit_stride, ++ inv_scale, ++ input_data); ++ auto err = hipGetLastError(); ++ if(err != hipSuccess) ++ throw std::runtime_error("generate_real_data_kernel launch failure: " ++ + std::string(hipGetErrorName(err))); ++} ++ ++template <typename Tcomplex> ++static void impose_hermitian_symmetry_interleaved(const std::vector<size_t>& length, ++ const std::vector<size_t>& ilength, ++ const std::vector<size_t>& stride, ++ const size_t dist, ++ const size_t batch, ++ Tcomplex* input_data, ++ const hipDeviceProp_t& deviceProp) ++{ ++ auto blockSize = DATA_GEN_THREADS; ++ auto blockDim = generate_blockDim(length, blockSize); ++ auto gridDim = generate_hermitian_gridDim(length, batch, blockSize); ++ ++ switch(length.size()) ++ { ++ case 1: ++ { ++ launch_limits_check( ++ "impose_hermitian_symmetry_interleaved_1D_kernel", gridDim, blockDim, deviceProp); ++ ++ hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1D_kernel<Tcomplex>, ++ gridDim, ++ blockDim, ++ 0, ++ 0, ++ input_data, ++ length[0], ++ stride[0], ++ dist, ++ batch, ++ length[0] % 2 == 0); ++ ++ break; ++ } ++ case 2: ++ { ++ launch_limits_check( ++ "impose_hermitian_symmetry_interleaved_2D_kernel", gridDim, blockDim, deviceProp); ++ ++ hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2D_kernel<Tcomplex>, ++ gridDim, ++ blockDim, ++ 0, ++ 0, ++ input_data, ++ length[0], ++ length[1], ++ stride[0], ++ stride[1], ++ dist, ++ batch, ++ (ilength[0] + 1) / 2 - 1, ++ length[0] % 2 == 0, ++ length[1] % 2 == 0); ++ ++ break; ++ } ++ case 3: ++ { ++ launch_limits_check( ++ "impose_hermitian_symmetry_interleaved_3D_kernel", gridDim, blockDim, deviceProp); ++ ++ hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3D_kernel<Tcomplex>, ++ gridDim, ++ blockDim, ++ 0, ++ 0, ++ input_data, ++ 
length[0], ++ length[1], ++ length[2], ++ stride[0], ++ stride[1], ++ stride[2], ++ dist, ++ batch, ++ (ilength[0] + 1) / 2 - 1, ++ ilength[1] - 1, ++ (ilength[1] + 1) / 2 - 1, ++ length[0] % 2 == 0, ++ length[1] % 2 == 0, ++ length[2] % 2 == 0); ++ break; ++ } ++ default: ++ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); ++ } ++ auto err = hipGetLastError(); ++ if(err != hipSuccess) ++ throw std::runtime_error("impose_hermitian_symmetry_interleaved_kernel launch failure: " ++ + std::string(hipGetErrorName(err))); ++} ++ ++template <typename Tfloat> ++static void impose_hermitian_symmetry_planar(const std::vector<size_t>& length, ++ const std::vector<size_t>& ilength, ++ const std::vector<size_t>& stride, ++ const size_t dist, ++ const size_t batch, ++ Tfloat* input_data_real, ++ Tfloat* input_data_imag, ++ const hipDeviceProp_t& deviceProp) ++{ ++ auto blockSize = DATA_GEN_THREADS; ++ auto blockDim = generate_blockDim(length, blockSize); ++ auto gridDim = generate_hermitian_gridDim(length, batch, blockSize); ++ ++ switch(length.size()) ++ { ++ case 1: ++ { ++ launch_limits_check( ++ "impose_hermitian_symmetry_planar_1D_kernel", gridDim, blockDim, deviceProp); ++ ++ hipLaunchKernelGGL(impose_hermitian_symmetry_planar_1D_kernel<Tfloat>, ++ gridDim, ++ blockDim, ++ 0, ++ 0, ++ input_data_real, ++ input_data_imag, ++ length[0], ++ stride[0], ++ dist, ++ batch, ++ length[0] % 2 == 0); ++ ++ break; ++ } ++ case 2: ++ { ++ launch_limits_check( ++ "impose_hermitian_symmetry_planar_2D_kernel", gridDim, blockDim, deviceProp); ++ ++ hipLaunchKernelGGL(impose_hermitian_symmetry_planar_2D_kernel<Tfloat>, ++ gridDim, ++ blockDim, ++ 0, ++ 0, ++ input_data_real, ++ input_data_imag, ++ length[0], ++ length[1], ++ stride[0], ++ stride[1], ++ dist, ++ batch, ++ (ilength[0] + 1) / 2 - 1, ++ length[0] % 2 == 0, ++ length[1] % 2 == 0); ++ ++ break; ++ } ++ case 3: ++ { ++ launch_limits_check( ++ "impose_hermitian_symmetry_planar_3D_kernel", gridDim, 
blockDim, deviceProp); ++ ++ hipLaunchKernelGGL(impose_hermitian_symmetry_planar_3D_kernel<Tfloat>, ++ gridDim, ++ blockDim, ++ 0, ++ 0, ++ input_data_real, ++ input_data_imag, ++ length[0], ++ length[1], ++ length[2], ++ stride[0], ++ stride[1], ++ stride[2], ++ dist, ++ batch, ++ (ilength[0] + 1) / 2 - 1, ++ ilength[1] - 1, ++ (ilength[1] + 1) / 2 - 1, ++ length[0] % 2 == 0, ++ length[1] % 2 == 0, ++ length[2] % 2 == 0); ++ break; ++ } ++ default: ++ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); ++ } ++ auto err = hipGetLastError(); ++ if(err != hipSuccess) ++ throw std::runtime_error("impose_hermitian_symmetry_planar_kernel launch failure: " ++ + std::string(hipGetErrorName(err))); ++} ++ ++#endif // DATA_GEN_DEVICE_H +diff --git a/shared/data_gen_host.h b/shared/data_gen_host.h +new file mode 100644 +index 0000000..29d3854 +--- /dev/null ++++ b/shared/data_gen_host.h +@@ -0,0 +1,881 @@ ++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. ++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#ifndef DATA_GEN_HOST_H ++#define DATA_GEN_HOST_H ++ ++#include "../shared/hostbuf.h" ++#include "../shared/increment.h" ++#include <complex> ++#include <limits> ++#include <random> ++#include <tuple> ++#include <vector> ++ ++// Specialized computation of index given 1-, 2-, 3- dimension length + stride ++template <typename T1, typename T2> ++size_t compute_index(T1 length, T2 stride, size_t base) ++{ ++ return (length * stride) + base; ++} ++ ++template <typename T1, typename T2> ++size_t ++ compute_index(const std::tuple<T1, T1>& length, const std::tuple<T2, T2>& stride, size_t base) ++{ ++ static_assert(std::is_integral<T1>::value, "Integral required."); ++ static_assert(std::is_integral<T2>::value, "Integral required."); ++ return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride)) ++ + base; ++} ++ ++template <typename T1, typename T2> ++size_t compute_index(const std::tuple<T1, T1, T1>& length, ++ const std::tuple<T2, T2, T2>& stride, ++ size_t base) ++{ ++ static_assert(std::is_integral<T1>::value, "Integral required."); ++ static_assert(std::is_integral<T2>::value, "Integral required."); ++ return (std::get<0>(length) * std::get<0>(stride)) + (std::get<1>(length) * std::get<1>(stride)) ++ + (std::get<2>(length) * std::get<2>(stride)) + base; ++} ++ ++// count the number of total iterations for 1-, 2-, and 3-D dimensions ++template <typename T1> ++size_t count_iters(const T1& i) ++{ ++ return i; ++} ++ ++template <typename T1> ++size_t count_iters(const std::tuple<T1, T1>& i) ++{ ++ return std::get<0>(i) * std::get<1>(i); ++} ++ ++template <typename T1> ++size_t count_iters(const std::tuple<T1, T1, T1>& i) ++{ ++ return std::get<0>(i) *
std::get<1>(i) * std::get<2>(i); ++} ++ ++template <typename T1> ++T1 make_unit_stride(const T1& whole_length) ++{ ++ return static_cast<T1>(1); ++} ++ ++template <typename T1> ++std::tuple<T1, T1> make_unit_stride(const std::tuple<T1, T1>& whole_length) ++{ ++ return std::make_tuple(static_cast<T1>(1), static_cast<T1>(std::get<0>(whole_length))); ++} ++ ++template <typename T1> ++std::tuple<T1, T1, T1> make_unit_stride(const std::tuple<T1, T1, T1>& whole_length) ++{ ++ return std::make_tuple(static_cast<T1>(1), ++ static_cast<T1>(std::get<0>(whole_length)), ++ static_cast<T1>(std::get<0>(whole_length)) ++ * static_cast<T1>(std::get<1>(whole_length))); ++} ++ ++// Work out how many partitions to break our iteration problem into ++template <typename T1> ++static size_t compute_partition_count(T1 length) ++{ ++#ifdef _OPENMP ++ // we seem to get contention from too many threads, which slows ++ // things down. particularly noticeable with mix_3D tests ++ static const size_t MAX_PARTITIONS = 8; ++ size_t iters = count_iters(length); ++ size_t hw_threads = std::min(MAX_PARTITIONS, static_cast<size_t>(omp_get_num_procs())); ++ if(!hw_threads) ++ return 1; ++ ++ // don't bother threading problem sizes that are too small. pick ++ // an arbitrary number of iterations and ensure that each thread ++ // has at least that many iterations to process ++ static const size_t MIN_ITERS_PER_THREAD = 2048; ++ ++ // either use the whole CPU, or use ceil(iters/iters_per_thread); never return 0 partitions ++ return std::max<size_t>(1, std::min(hw_threads, (iters + MIN_ITERS_PER_THREAD - 1) / MIN_ITERS_PER_THREAD)); ++#else ++ return 1; ++#endif ++} ++ ++// Break a scalar length into some number of pieces, returning ++// [(start0, end0), (start1, end1), ...]
++template <typename T1> ++std::vector<std::pair<T1, T1>> partition_base(const T1& length, size_t num_parts) ++{ ++ static_assert(std::is_integral<T1>::value, "Integral required."); ++ ++ // make sure we don't exceed the length ++ num_parts = std::min(length, num_parts); ++ ++ std::vector<std::pair<T1, T1>> ret(num_parts); ++ auto partition_size = length / num_parts; ++ T1 cur_partition = 0; ++ for(size_t i = 0; i < num_parts; ++i, cur_partition += partition_size) ++ { ++ ret[i].first = cur_partition; ++ ret[i].second = cur_partition + partition_size; ++ } ++ // last partition might not divide evenly, fix it up ++ ret.back().second = length; ++ return ret; ++} ++ ++// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths ++template <typename T1> ++std::vector<std::pair<T1, T1>> partition_rowmajor(const T1& length) ++{ ++ return partition_base(length, compute_partition_count(length)); ++} ++ ++// Partition on the leftmost part of the tuple, for row-major indexing ++template <typename T1> ++std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ++ partition_rowmajor(const std::tuple<T1, T1>& length) ++{ ++ auto partitions = partition_base(std::get<0>(length), compute_partition_count(length)); ++ std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ret(partitions.size()); ++ for(size_t i = 0; i < partitions.size(); ++i) ++ { ++ std::get<0>(ret[i].first) = partitions[i].first; ++ std::get<1>(ret[i].first) = 0; ++ std::get<0>(ret[i].second) = partitions[i].second; ++ std::get<1>(ret[i].second) = std::get<1>(length); ++ } ++ return ret; ++} ++template <typename T1> ++std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ++ partition_rowmajor(const std::tuple<T1, T1, T1>& length) ++{ ++ auto partitions = partition_base(std::get<0>(length), compute_partition_count(length)); ++ std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ret(partitions.size()); ++ for(size_t i = 0; i < partitions.size(); ++i) ++ { ++ 
std::get<0>(ret[i].first) = partitions[i].first; ++ std::get<1>(ret[i].first) = 0; ++ std::get<2>(ret[i].first) = 0; ++ std::get<0>(ret[i].second) = partitions[i].second; ++ std::get<1>(ret[i].second) = std::get<1>(length); ++ std::get<2>(ret[i].second) = std::get<2>(length); ++ } ++ return ret; ++} ++ ++// For complex-to-real transforms, the input data must be Hermitian-symmetric. ++// That is, u_k is the complex conjugate of u_{-k}, where k is the wavevector in Fourier ++// space. For multi-dimensional data, this means that we only need to store a bit more ++// than half of the complex values; the rest are redundant. However, there are still ++// some restrictions: ++// * the origin and Nyquist value(s) must be real-valued ++// * some of the remaining values are still redundant, and you might get different results ++// than you expect if the values don't agree. ++// Below are some example kernels which impose Hermitian symmetry on a complex array ++// of the given dimensions. ++ ++template <typename Tfloat, typename Tsize> ++static void impose_hermitian_symmetry_interleaved_1D(std::vector<hostbuf>& vals, ++ const std::vector<Tsize>& length, ++ const std::vector<Tsize>& istride, ++ const Tsize idist, ++ const Tsize nbatch) ++{ ++ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) ++ { ++ auto data = ((std::complex<Tfloat>*)vals[0].data()) + ibatch * idist; ++ ++ data[0].imag(0.0); ++ ++ if(length[0] % 2 == 0) ++ { ++ data[istride[0] * (length[0] / 2)].imag(0.0); ++ } ++ } ++} ++ ++template <typename Tfloat, typename Tsize> ++static void impose_hermitian_symmetry_planar_1D(std::vector<hostbuf>& vals, ++ const std::vector<Tsize>& length, ++ const std::vector<Tsize>& istride, ++ const Tsize idist, ++ const Tsize nbatch) ++{ ++ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) ++ { ++ auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist; ++ ++ data_imag[0] = 0.0; ++ ++ if(length[0] % 2 == 0) ++ { ++ data_imag[istride[0] * (length[0] / 2)] = 0.0; ++ }
++ } ++} ++ ++template <typename Tfloat, typename Tsize> ++static void impose_hermitian_symmetry_interleaved_2D(std::vector<hostbuf>& vals, ++ const std::vector<Tsize>& length, ++ const std::vector<Tsize>& istride, ++ const Tsize idist, ++ const Tsize nbatch) ++{ ++ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) ++ { ++ auto data = ((std::complex<Tfloat>*)vals[0].data()) + ibatch * idist; ++ ++ data[0].imag(0.0); ++ ++ if(length[0] % 2 == 0) ++ { ++ data[istride[0] * (length[0] / 2)].imag(0.0); ++ } ++ ++ if(length[1] % 2 == 0) ++ { ++ data[istride[1] * (length[1] / 2)].imag(0.0); ++ } ++ ++ if(length[0] % 2 == 0 && length[1] % 2 == 0) ++ { ++ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)].imag(0.0); ++ } ++ ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ data[istride[0] * (length[0] - i)] = std::conj(data[istride[0] * i]); ++ } ++ ++ if(length[1] % 2 == 0) ++ { ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ data[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] ++ = std::conj(data[istride[0] * i + istride[1] * (length[1] / 2)]); ++ } ++ } ++ } ++} ++ ++template <typename Tfloat, typename Tsize> ++static void impose_hermitian_symmetry_planar_2D(std::vector<hostbuf>& vals, ++ const std::vector<Tsize>& length, ++ const std::vector<Tsize>& istride, ++ const Tsize idist, ++ const Tsize nbatch) ++{ ++ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) ++ { ++ auto data_real = ((Tfloat*)vals[0].data()) + ibatch * idist; ++ auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist; ++ ++ data_imag[0] = 0.0; ++ ++ if(length[0] % 2 == 0) ++ { ++ data_imag[istride[0] * (length[0] / 2)] = 0.0; ++ } ++ ++ if(length[1] % 2 == 0) ++ { ++ data_imag[istride[1] * (length[1] / 2)] = 0.0; ++ } ++ ++ if(length[0] % 2 == 0 && length[1] % 2 == 0) ++ { ++ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)] = 0.0; ++ } ++ ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ 
data_real[istride[0] * (length[0] - i)] = data_real[istride[0] * i]; ++ data_imag[istride[0] * (length[0] - i)] = -data_imag[istride[0] * i]; ++ } ++ ++ if(length[1] % 2 == 0) ++ { ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] ++ = data_real[istride[0] * i + istride[1] * (length[1] / 2)]; ++ data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] ++ = -data_imag[istride[0] * i + istride[1] * (length[1] / 2)]; ++ } ++ } ++ } ++} ++ ++template <typename Tfloat, typename Tsize> ++static void impose_hermitian_symmetry_interleaved_3D(std::vector<hostbuf>& vals, ++ const std::vector<Tsize>& length, ++ const std::vector<Tsize>& istride, ++ const Tsize idist, ++ const Tsize nbatch) ++{ ++ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) ++ { ++ auto data = ((std::complex<Tfloat>*)vals[0].data()) + ibatch * idist; ++ ++ data[0].imag(0.0); ++ ++ if(length[0] % 2 == 0) ++ { ++ data[istride[0] * (length[0] / 2)].imag(0.0); ++ } ++ ++ if(length[1] % 2 == 0) ++ { ++ data[istride[1] * (length[1] / 2)].imag(0.0); ++ } ++ ++ if(length[2] % 2 == 0) ++ { ++ data[istride[2] * (length[2] / 2)].imag(0.0); ++ } ++ ++ if(length[0] % 2 == 0 && length[1] % 2 == 0) ++ { ++ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)].imag(0.0); ++ } ++ ++ if(length[0] % 2 == 0 && length[2] % 2 == 0) ++ { ++ data[istride[0] * (length[0] / 2) + istride[2] * (length[2] / 2)].imag(0.0); ++ } ++ if(length[1] % 2 == 0 && length[2] % 2 == 0) ++ { ++ data[istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)].imag(0.0); ++ } ++ ++ if(length[0] % 2 == 0 && length[1] % 2 == 0 && length[2] % 2 == 0) ++ { ++ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2) ++ + istride[2] * (length[2] / 2)] ++ .imag(0.0); ++ } ++ ++ // y-axis: ++ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) ++ { ++ data[istride[1] * (length[1] - j)] = std::conj(data[istride[1] * j]); ++ } ++ 
++ if(length[0] % 2 == 0) ++ { ++ // y-axis at x-nyquist ++ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) ++ { ++ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)] ++ = std::conj(data[istride[0] * (length[0] / 2) + istride[1] * j]); ++ } ++ } ++ ++ // x-axis: ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ data[istride[0] * (length[0] - i)] = std::conj(data[istride[0] * i]); ++ } ++ ++ if(length[1] % 2 == 0) ++ { ++ // x-axis at y-nyquist ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ data[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] ++ = std::conj(data[istride[0] * i + istride[1] * (length[1] / 2)]); ++ } ++ } ++ ++ // x-y plane: ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ for(unsigned int j = 1; j < length[1]; ++j) ++ { ++ data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)] ++ = std::conj(data[istride[0] * i + istride[1] * j]); ++ } ++ } ++ ++ if(length[2] % 2 == 0) ++ { ++ // x-axis at z-nyquist ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ data[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] ++ = std::conj(data[istride[0] * i + istride[2] * (length[2] / 2)]); ++ } ++ if(length[1] % 2 == 0) ++ { ++ // x-axis at yz-nyquist ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ data[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] ++ = std::conj(data[istride[0] * i + istride[2] * (length[2] / 2)]); ++ } ++ } ++ ++ // y-axis: at z-nyquist ++ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) ++ { ++ data[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] ++ = std::conj(data[istride[1] * j + istride[2] * (length[2] / 2)]); ++ } ++ ++ if(length[0] % 2 == 0) ++ { ++ // y-axis: at xz-nyquist ++ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) ++ { ++ data[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j) ++ + istride[2] * (length[2] / 2)] ++ = std::conj(data[istride[0] * 
(length[0] / 2) + istride[1] * j ++ + istride[2] * (length[2] / 2)]); ++ } ++ } ++ ++ // x-y plane: at z-nyquist ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ for(unsigned int j = 1; j < length[1]; ++j) ++ { ++ data[istride[0] * (length[0] - i) + istride[1] * (length[1] - j) ++ + istride[2] * (length[2] / 2)] ++ = std::conj( ++ data[istride[0] * i + istride[1] * j + istride[2] * (length[2] / 2)]); ++ } ++ } ++ } ++ } ++} ++ ++template <typename Tfloat, typename Tsize> ++static void impose_hermitian_symmetry_planar_3D(std::vector<hostbuf>& vals, ++ const std::vector<Tsize>& length, ++ const std::vector<Tsize>& istride, ++ const Tsize idist, ++ const Tsize nbatch) ++{ ++ for(unsigned int ibatch = 0; ibatch < nbatch; ++ibatch) ++ { ++ auto data_real = ((Tfloat*)vals[0].data()) + ibatch * idist; ++ auto data_imag = ((Tfloat*)vals[1].data()) + ibatch * idist; ++ ++ data_imag[0] = 0.0; ++ ++ if(length[0] % 2 == 0) ++ { ++ data_imag[istride[0] * (length[0] / 2)] = 0.0; ++ } ++ ++ if(length[1] % 2 == 0) ++ { ++ data_imag[istride[1] * (length[1] / 2)] = 0.0; ++ } ++ ++ if(length[2] % 2 == 0) ++ { ++ data_imag[istride[2] * (length[2] / 2)] = 0.0; ++ } ++ ++ if(length[0] % 2 == 0 && length[1] % 2 == 0) ++ { ++ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2)] = 0.0; ++ } ++ ++ if(length[0] % 2 == 0 && length[2] % 2 == 0) ++ { ++ data_imag[istride[0] * (length[0] / 2) + istride[2] * (length[2] / 2)] = 0.0; ++ } ++ if(length[1] % 2 == 0 && length[2] % 2 == 0) ++ { ++ data_imag[istride[1] * (length[1] / 2) + istride[2] * (length[2] / 2)] = 0.0; ++ } ++ ++ if(length[0] % 2 == 0 && length[1] % 2 == 0 && length[2] % 2 == 0) ++ { ++ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] / 2) ++ + istride[2] * (length[2] / 2)] ++ = 0.0; ++ } ++ ++ // y-axis: ++ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) ++ { ++ data_real[istride[1] * (length[1] - j)] = data_real[istride[1] * j]; ++ data_imag[istride[1] * (length[1] - j)] = 
-data_imag[istride[1] * j]; ++ } ++ ++ if(length[0] % 2 == 0) ++ { ++ // y-axis at x-nyquist ++ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) ++ { ++ data_real[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)] ++ = data_real[istride[0] * (length[0] / 2) + istride[1] * j]; ++ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j)] ++ = -data_imag[istride[0] * (length[0] / 2) + istride[1] * j]; ++ } ++ } ++ ++ // x-axis: ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ data_real[istride[0] * (length[0] - i)] = data_real[istride[0] * i]; ++ data_imag[istride[0] * (length[0] - i)] = -data_imag[istride[0] * i]; ++ } ++ ++ if(length[1] % 2 == 0) ++ { ++ // x-axis at y-nyquist ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] ++ = data_real[istride[0] * i + istride[1] * (length[1] / 2)]; ++ data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] / 2)] ++ = -data_imag[istride[0] * i + istride[1] * (length[1] / 2)]; ++ } ++ } ++ ++ // x-y plane: ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ for(unsigned int j = 1; j < length[1]; ++j) ++ { ++ data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)] ++ = data_real[istride[0] * i + istride[1] * j]; ++ data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j)] ++ = -data_imag[istride[0] * i + istride[1] * j]; ++ } ++ } ++ ++ if(length[2] % 2 == 0) ++ { ++ // x-axis at z-nyquist ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ data_real[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] ++ = data_real[istride[0] * i + istride[2] * (length[2] / 2)]; ++ data_imag[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] ++ = -data_imag[istride[0] * i + istride[2] * (length[2] / 2)]; ++ } ++ if(length[1] % 2 == 0) ++ { ++ // x-axis at yz-nyquist ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ 
data_real[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] ++ = data_real[istride[0] * i + istride[2] * (length[2] / 2)]; ++ data_imag[istride[0] * (length[0] - i) + istride[2] * (length[2] / 2)] ++ = -data_imag[istride[0] * i + istride[2] * (length[2] / 2)]; ++ } ++ } ++ ++ // y-axis: at z-nyquist ++ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) ++ { ++ data_real[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] ++ = data_real[istride[1] * j + istride[2] * (length[2] / 2)]; ++ data_imag[istride[1] * (length[1] - j) + istride[2] * (length[2] / 2)] ++ = -data_imag[istride[1] * j + istride[2] * (length[2] / 2)]; ++ } ++ ++ if(length[0] % 2 == 0) ++ { ++ // y-axis: at xz-nyquist ++ for(unsigned int j = 1; j < (length[1] + 1) / 2; ++j) ++ { ++ data_real[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j) ++ + istride[2] * (length[2] / 2)] ++ = data_real[istride[0] * (length[0] / 2) + istride[1] * j ++ + istride[2] * (length[2] / 2)]; ++ data_imag[istride[0] * (length[0] / 2) + istride[1] * (length[1] - j) ++ + istride[2] * (length[2] / 2)] ++ = -data_imag[istride[0] * (length[0] / 2) + istride[1] * j ++ + istride[2] * (length[2] / 2)]; ++ } ++ } ++ ++ // x-y plane: at z-nyquist ++ for(unsigned int i = 1; i < (length[0] + 1) / 2; ++i) ++ { ++ for(unsigned int j = 1; j < length[1]; ++j) ++ { ++ data_real[istride[0] * (length[0] - i) + istride[1] * (length[1] - j) ++ + istride[2] * (length[2] / 2)] ++ = data_real[istride[0] * i + istride[1] * j + istride[2] * (length[2] / 2)]; ++ data_imag[istride[0] * (length[0] - i) + istride[1] * (length[1] - j) ++ + istride[2] * (length[2] / 2)] ++ = -data_imag[istride[0] * i + istride[1] * j ++ + istride[2] * (length[2] / 2)]; ++ } ++ } ++ } ++ } ++} ++ ++template <typename Tfloat, typename Tint1> ++static void generate_random_interleaved_data(std::vector<hostbuf>& input, ++ const Tint1& whole_length, ++ const Tint1& whole_stride, ++ const size_t idist, ++ const size_t nbatch) ++{ ++ auto idata 
= (std::complex<Tfloat>*)input[0].data(); ++ size_t i_base = 0; ++ auto partitions = partition_rowmajor(whole_length); ++ for(unsigned int b = 0; b < nbatch; b++, i_base += idist) ++ { ++#pragma omp parallel for num_threads(partitions.size()) ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ std::mt19937 gen(compute_index(index, whole_stride, i_base)); ++ do ++ { ++ const auto i = compute_index(index, whole_stride, i_base); ++ const Tfloat x = (Tfloat)gen() / (Tfloat)gen.max(); ++ const Tfloat y = (Tfloat)gen() / (Tfloat)gen.max(); ++ const std::complex<Tfloat> val(x, y); ++ idata[i] = val; ++ } while(increment_rowmajor(index, length)); ++ } ++ } ++} ++ ++template <typename Tfloat, typename Tint1> ++static void generate_interleaved_data(std::vector<hostbuf>& input, ++ const Tint1& whole_length, ++ const Tint1& whole_stride, ++ const size_t idist, ++ const size_t nbatch) ++{ ++ auto idata = (std::complex<Tfloat>*)input[0].data(); ++ size_t i_base = 0; ++ auto partitions = partition_rowmajor(whole_length); ++ auto unit_stride = make_unit_stride(whole_length); ++ ++ const Tfloat inv_scale = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1); ++ ++ for(unsigned int b = 0; b < nbatch; b++, i_base += idist) ++ { ++#pragma omp parallel for num_threads(partitions.size()) ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ do ++ { ++ const auto val_xy ++ = -0.5 + static_cast<Tfloat>(compute_index(index, unit_stride, 0)) * inv_scale; ++ ++ const std::complex<Tfloat> val(val_xy, val_xy); ++ ++ const auto i = compute_index(index, whole_stride, i_base); ++ ++ idata[i] = val; ++ } while(increment_rowmajor(index, length)); ++ } ++ } ++} ++ ++template <typename Tfloat, typename Tint1> ++static void generate_random_planar_data(std::vector<hostbuf>& input, ++ const Tint1& 
whole_length, ++ const Tint1& whole_stride, ++ const size_t idist, ++ const size_t nbatch) ++{ ++ auto ireal = (Tfloat*)input[0].data(); ++ auto iimag = (Tfloat*)input[1].data(); ++ size_t i_base = 0; ++ auto partitions = partition_rowmajor(whole_length); ++ for(unsigned int b = 0; b < nbatch; b++, i_base += idist) ++ { ++#pragma omp parallel for num_threads(partitions.size()) ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ std::mt19937 gen(compute_index(index, whole_stride, i_base)); ++ do ++ { ++ const auto i = compute_index(index, whole_stride, i_base); ++ const std::complex<Tfloat> val((Tfloat)gen() / (Tfloat)gen.max(), ++ (Tfloat)gen() / (Tfloat)gen.max()); ++ ireal[i] = val.real(); ++ iimag[i] = val.imag(); ++ } while(increment_rowmajor(index, length)); ++ } ++ } ++} ++ ++template <typename Tfloat, typename Tint1> ++static void generate_planar_data(std::vector<hostbuf>& input, ++ const Tint1& whole_length, ++ const Tint1& whole_stride, ++ const size_t idist, ++ const size_t nbatch) ++{ ++ ++ auto ireal = (Tfloat*)input[0].data(); ++ auto iimag = (Tfloat*)input[1].data(); ++ size_t i_base = 0; ++ auto partitions = partition_rowmajor(whole_length); ++ auto unit_stride = make_unit_stride(whole_length); ++ ++ const Tfloat inv_scale = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1); ++ ++ for(unsigned int b = 0; b < nbatch; b++, i_base += idist) ++ { ++#pragma omp parallel for num_threads(partitions.size()) ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ do ++ { ++ const auto val_xy ++ = -0.5 + static_cast<Tfloat>(compute_index(index, unit_stride, 0)) * inv_scale; ++ ++ const auto i = compute_index(index, whole_stride, i_base); ++ ++ ireal[i] = val_xy; ++ iimag[i] = val_xy; ++ } while(increment_rowmajor(index, length)); ++ } ++ } ++} ++ ++template 
<typename Tfloat, typename Tint1> ++static void generate_random_real_data(std::vector<hostbuf>& input, ++ const Tint1& whole_length, ++ const Tint1& whole_stride, ++ const size_t idist, ++ const size_t nbatch) ++{ ++ auto idata = (Tfloat*)input[0].data(); ++ size_t i_base = 0; ++ auto partitions = partition_rowmajor(whole_length); ++ for(unsigned int b = 0; b < nbatch; b++, i_base += idist) ++ { ++#pragma omp parallel for num_threads(partitions.size()) ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ std::mt19937 gen(compute_index(index, whole_stride, i_base)); ++ do ++ { ++ const auto i = compute_index(index, whole_stride, i_base); ++ const Tfloat val = (Tfloat)gen() / (Tfloat)gen.max(); ++ idata[i] = val; ++ } while(increment_rowmajor(index, length)); ++ } ++ } ++} ++ ++template <typename Tfloat, typename Tint1> ++static void generate_real_data(std::vector<hostbuf>& input, ++ const Tint1& whole_length, ++ const Tint1& whole_stride, ++ const size_t idist, ++ const size_t nbatch) ++{ ++ ++ auto idata = (Tfloat*)input[0].data(); ++ size_t i_base = 0; ++ auto partitions = partition_rowmajor(whole_length); ++ auto unit_stride = make_unit_stride(whole_length); ++ ++ const Tfloat inv_scale = 1.0 / static_cast<Tfloat>(count_iters(whole_length) - 1); ++ ++ for(unsigned int b = 0; b < nbatch; b++, i_base += idist) ++ { ++#pragma omp parallel for num_threads(partitions.size()) ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ do ++ { ++ const auto i = compute_index(index, whole_stride, i_base); ++ ++ idata[i] ++ = -0.5 + static_cast<Tfloat>(compute_index(index, unit_stride, 0)) * inv_scale; ++ } while(increment_rowmajor(index, length)); ++ } ++ } ++} ++ ++template <typename Tfloat, typename Tsize> ++static void 
impose_hermitian_symmetry_interleaved(std::vector<hostbuf>& vals, ++ const std::vector<Tsize>& length, ++ const std::vector<Tsize>& istride, ++ const Tsize idist, ++ const Tsize nbatch) ++{ ++ switch(length.size()) ++ { ++ case 1: ++ impose_hermitian_symmetry_interleaved_1D<Tfloat>(vals, length, istride, idist, nbatch); ++ break; ++ case 2: ++ impose_hermitian_symmetry_interleaved_2D<Tfloat>(vals, length, istride, idist, nbatch); ++ break; ++ case 3: ++ impose_hermitian_symmetry_interleaved_3D<Tfloat>(vals, length, istride, idist, nbatch); ++ break; ++ default: ++ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); ++ } ++} ++ ++template <typename Tfloat, typename Tsize> ++static void impose_hermitian_symmetry_planar(std::vector<hostbuf>& vals, ++ const std::vector<Tsize>& length, ++ const std::vector<Tsize>& istride, ++ const Tsize idist, ++ const Tsize nbatch) ++{ ++ switch(length.size()) ++ { ++ case 1: ++ impose_hermitian_symmetry_planar_1D<Tfloat>(vals, length, istride, idist, nbatch); ++ break; ++ case 2: ++ impose_hermitian_symmetry_planar_2D<Tfloat>(vals, length, istride, idist, nbatch); ++ break; ++ case 3: ++ impose_hermitian_symmetry_planar_3D<Tfloat>(vals, length, istride, idist, nbatch); ++ break; ++ default: ++ throw std::runtime_error("Invalid dimension for impose_hermitian_symmetry"); ++ } ++} ++ ++#endif // DATA_GEN_HOST_H +diff --git a/shared/device_properties.h b/shared/device_properties.h +new file mode 100644 +index 0000000..6e2e1e1 +--- /dev/null ++++ b/shared/device_properties.h +@@ -0,0 +1,74 @@ ++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. 
++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#ifndef ROCFFT_DEVICE_PROPS_H ++#define ROCFFT_DEVICE_PROPS_H ++ ++#include <cstdint> ++#include <hip/hip_runtime_api.h> ++#include <stdexcept> ++ ++// get device properties ++static hipDeviceProp_t get_curr_device_prop() ++{ ++ hipDeviceProp_t prop; ++ int deviceId = 0; ++ if(hipGetDevice(&deviceId) != hipSuccess) ++ throw std::runtime_error("hipGetDevice failed."); ++ ++ if(hipGetDeviceProperties(&prop, deviceId) != hipSuccess) ++ throw std::runtime_error("hipGetDeviceProperties failed for deviceId " ++ + std::to_string(deviceId)); ++ ++ return prop; ++} ++ ++// check that the given grid/block dims will fit into the limits in ++// the device properties. throws std::runtime_error if the limits ++// are exceeded. 
++static void launch_limits_check(const std::string& kernel_name, ++ const dim3 gridDim, ++ const dim3 blockDim, ++ const hipDeviceProp_t& deviceProp) ++{ ++ // Need lots of casting here because dim3 is unsigned but device ++ // props are signed. Cast direct comparisons to fix signedness ++ // issues. Promote types to 64-bit when multiplying to try to ++ // avoid overflow. ++ ++ // Block limits along each dimension ++ if(blockDim.x > static_cast<uint32_t>(deviceProp.maxThreadsDim[0]) ++ || blockDim.y > static_cast<uint32_t>(deviceProp.maxThreadsDim[1]) ++ || blockDim.z > static_cast<uint32_t>(deviceProp.maxThreadsDim[2])) ++ throw std::runtime_error("max threads per dim exceeded: " + kernel_name); ++ ++ // Total threads for the whole block ++ if(static_cast<uint64_t>(blockDim.x) * blockDim.y * blockDim.z ++ > static_cast<uint64_t>(deviceProp.maxThreadsPerBlock)) ++ throw std::runtime_error("max threads per block exceeded: " + kernel_name); ++ ++ // Grid dimension limits ++ if(gridDim.x > static_cast<uint32_t>(deviceProp.maxGridSize[0]) ++ || gridDim.y > static_cast<uint32_t>(deviceProp.maxGridSize[1]) ++ || gridDim.z > static_cast<uint32_t>(deviceProp.maxGridSize[2])) ++ throw std::runtime_error("max grid size exceeded: " + kernel_name); ++} ++ ++#endif +diff --git a/shared/enum_to_string.h b/shared/enum_to_string.h +new file mode 100644 +index 0000000..1c2fba0 +--- /dev/null ++++ b/shared/enum_to_string.h +@@ -0,0 +1,81 @@ ++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. 
++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#ifndef ENUM_TO_STRING_H ++#define ENUM_TO_STRING_H ++ ++#include "fft_params.h" ++ ++// Return the string of the hipError code. 
++static std::string hipError_to_string(const hipError_t ret) ++{ ++ switch(ret) ++ { ++ case hipSuccess: ++ return "hipSuccess"; ++ case hipErrorInvalidContext: ++ return "hipErrorInvalidContext"; ++ case hipErrorInvalidKernelFile: ++ return "hipErrorInvalidKernelFile"; ++ case hipErrorMemoryAllocation: ++ return "hipErrorMemoryAllocation"; ++ case hipErrorInitializationError: ++ return "hipErrorInitializationError"; ++ case hipErrorLaunchFailure: ++ return "hipErrorLaunchFailure"; ++ case hipErrorLaunchOutOfResources: ++ return "hipErrorLaunchOutOfResources"; ++ case hipErrorInvalidDevice: ++ return "hipErrorInvalidDevice"; ++ case hipErrorInvalidValue: ++ return "hipErrorInvalidValue"; ++ case hipErrorInvalidDevicePointer: ++ return "hipErrorInvalidDevicePointer"; ++ case hipErrorInvalidMemcpyDirection: ++ return "hipErrorInvalidMemcpyDirection"; ++ case hipErrorUnknown: ++ return "hipErrorUnknown"; ++ case hipErrorInvalidResourceHandle: ++ return "hipErrorInvalidResourceHandle"; ++ case hipErrorNotReady: ++ return "hipErrorNotReady"; ++ case hipErrorNoDevice: ++ return "hipErrorNoDevice"; ++ case hipErrorPeerAccessAlreadyEnabled: ++ return "hipErrorPeerAccessAlreadyEnabled"; ++ case hipErrorPeerAccessNotEnabled: ++ return "hipErrorPeerAccessNotEnabled"; ++ case hipErrorRuntimeMemory: ++ return "hipErrorRuntimeMemory"; ++ case hipErrorRuntimeOther: ++ return "hipErrorRuntimeOther"; ++ case hipErrorHostMemoryAlreadyRegistered: ++ return "hipErrorHostMemoryAlreadyRegistered"; ++ case hipErrorHostMemoryNotRegistered: ++ return "hipErrorHostMemoryNotRegistered"; ++ case hipErrorMapBufferObjectFailed: ++ return "hipErrorMapBufferObjectFailed"; ++ case hipErrorTbd: ++ return "hipErrorTbd"; ++ default: ++ throw std::runtime_error("unknown hipError"); ++ } ++} ++#endif +diff --git a/shared/environment.h b/shared/environment.h +new file mode 100644 +index 0000000..7be56a0 +--- /dev/null ++++ b/shared/environment.h +@@ -0,0 +1,97 @@ ++// Copyright (C) 2021 - 2022 Advanced 
Micro Devices, Inc. All rights reserved. ++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++// wrappers around environment variable routines ++ ++#pragma once ++ ++#include <string> ++ ++// Windows provides "getenv" and "_putenv", but those modify the ++// runtime's copy of the environment. The actual environment in the ++// process control block is accessed using GetEnvironmentVariable and ++// SetEnvironmentVariable. 

// _WIN32 is predefined by Windows compilers; WIN32 (no underscore) is
// only defined by some toolchains/build systems, so accept either.
#if defined(_WIN32) || defined(WIN32)
#include <windows.h>
// The explicit "A" entry points are used because these wrappers take
// narrow (char) strings regardless of whether UNICODE is defined.

// Set environment variable `var` to `value` for this process.
static void rocfft_setenv(const char* var, const char* value)
{
    SetEnvironmentVariableA(var, value);
}
// Remove environment variable `var` from this process.
static void rocfft_unsetenv(const char* var)
{
    SetEnvironmentVariableA(var, nullptr);
}
// Return the value of `var`, or an empty string if it has no value.
static std::string rocfft_getenv(const char* var)
{
    DWORD       size = GetEnvironmentVariableA(var, nullptr, 0);
    std::string ret;
    if(size)
    {
        ret.resize(size);
        GetEnvironmentVariableA(var, ret.data(), size);
        // GetEnvironmentVariable counts the terminating null, so remove it
        while(!ret.empty() && ret.back() == 0)
            ret.pop_back();
    }
    return ret;
}
// Return true if `var` exists in the environment, even with an empty
// value. rocfft_getenv alone cannot tell those two cases apart.
static bool rocfft_env_is_set(const char* var)
{
    SetLastError(ERROR_SUCCESS);
    if(GetEnvironmentVariableA(var, nullptr, 0) != 0)
        return true;
    return GetLastError() != ERROR_ENVVAR_NOT_FOUND;
}

#else

#include <stdlib.h>

// Set environment variable `var` to `value`, overwriting any existing value.
static void rocfft_setenv(const char* var, const char* value)
{
    setenv(var, value, 1);
}
// Remove environment variable `var` from this process.
static void rocfft_unsetenv(const char* var)
{
    unsetenv(var);
}
// Return the value of `var`, or an empty string if it is not set.
static std::string rocfft_getenv(const char* var)
{
    auto value = getenv(var);
    return value ? value : "";
}
// Return true if `var` exists in the environment, even with an empty
// value. rocfft_getenv alone cannot tell those two cases apart.
static bool rocfft_env_is_set(const char* var)
{
    return getenv(var) != nullptr;
}
#endif

// RAII object to set an environment variable and restore it to its
// previous state on destruction.
//
// "Previous state" includes whether the variable existed at all: a
// variable that was set to an empty string beforehand is restored to
// an empty string, and only a variable that did not exist is unset.
struct EnvironmentSetTemp
{
    // Remember the current state of `_var`, then set it to `val`.
    EnvironmentSetTemp(const char* _var, const char* val)
        : var(_var)
    {
        had_oldvalue = rocfft_env_is_set(_var);
        if(had_oldvalue)
            oldvalue = rocfft_getenv(_var);
        rocfft_setenv(_var, val);
    }
    // Put the variable back the way the constructor found it.
    ~EnvironmentSetTemp()
    {
        if(had_oldvalue)
            rocfft_setenv(var.c_str(), oldvalue.c_str());
        else
            rocfft_unsetenv(var.c_str());
    }
    std::string var; // name of the variable being overridden
    std::string oldvalue; // value before the override (may be empty)
    bool        had_oldvalue = false; // whether the variable existed before the override
};
// diff --git a/shared/fft_params.h b/shared/fft_params.h
// new file mode 100644
// index 0000000..bf428ef
// --- /dev/null
// +++ b/shared/fft_params.h
// @@ -0,0 +1,3274 @@
// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. 
++ ++#ifndef FFT_PARAMS_H ++#define FFT_PARAMS_H ++ ++#include <algorithm> ++#include <hip/hip_runtime.h> ++#include <iostream> ++#include <mutex> ++#include <numeric> ++#include <sstream> ++#ifdef _OPENMP ++#include <omp.h> ++#endif ++#include <random> ++#include <tuple> ++#include <unordered_set> ++#include <vector> ++ ++#include "../shared/arithmetic.h" ++#include "../shared/array_validator.h" ++#include "../shared/data_gen_device.h" ++#include "../shared/data_gen_host.h" ++#include "../shared/device_properties.h" ++#include "../shared/printbuffer.h" ++#include "../shared/ptrdiff.h" ++ ++enum fft_status ++{ ++ fft_status_success, ++ fft_status_failure, ++ fft_status_invalid_arg_value, ++ fft_status_invalid_dimensions, ++ fft_status_invalid_array_type, ++ fft_status_invalid_strides, ++ fft_status_invalid_distance, ++ fft_status_invalid_offset, ++ fft_status_invalid_work_buffer, ++}; ++ ++enum fft_transform_type ++{ ++ fft_transform_type_complex_forward, ++ fft_transform_type_complex_inverse, ++ fft_transform_type_real_forward, ++ fft_transform_type_real_inverse, ++}; ++ ++enum fft_precision ++{ ++ fft_precision_half, ++ fft_precision_single, ++ fft_precision_double, ++}; ++ ++static std::istream& operator>>(std::istream& str, fft_precision& precision) ++{ ++ std::string word; ++ str >> word; ++ ++ if(word == "half") ++ precision = fft_precision_half; ++ else if(word == "single") ++ precision = fft_precision_single; ++ else if(word == "double") ++ precision = fft_precision_double; ++ else ++ throw std::runtime_error("Invalid precision specified"); ++ return str; ++} ++ ++// fft_input_generator: linearly spaced sequence in [-0.5,0.5] ++// fft_input_random_generator: pseudo-random sequence in [-0.5,0.5] ++enum fft_input_generator ++{ ++ fft_input_random_generator_device, ++ fft_input_random_generator_host, ++ fft_input_generator_device, ++ fft_input_generator_host, ++}; ++ ++static std::istream& operator>>(std::istream& str, fft_input_generator& gen) ++{ ++ 
std::string word; ++ str >> word; ++ ++ if(word == "0") ++ gen = fft_input_random_generator_device; ++ else if(word == "1") ++ gen = fft_input_random_generator_host; ++ else if(word == "2") ++ gen = fft_input_generator_device; ++ else if(word == "3") ++ gen = fft_input_generator_host; ++ else ++ throw std::runtime_error("Invalid input generator specified"); ++ return str; ++} ++ ++enum fft_array_type ++{ ++ fft_array_type_complex_interleaved, ++ fft_array_type_complex_planar, ++ fft_array_type_real, ++ fft_array_type_hermitian_interleaved, ++ fft_array_type_hermitian_planar, ++ fft_array_type_unset, ++}; ++ ++enum fft_result_placement ++{ ++ fft_placement_inplace, ++ fft_placement_notinplace, ++}; ++ ++// Determine the size of the data type given the precision and type. ++template <typename Tsize> ++inline Tsize var_size(const fft_precision precision, const fft_array_type type) ++{ ++ size_t var_size = 0; ++ switch(precision) ++ { ++ case fft_precision_half: ++ var_size = sizeof(_Float16); ++ break; ++ case fft_precision_single: ++ var_size = sizeof(float); ++ break; ++ case fft_precision_double: ++ var_size = sizeof(double); ++ break; ++ } ++ switch(type) ++ { ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_hermitian_interleaved: ++ var_size *= 2; ++ break; ++ default: ++ break; ++ } ++ return var_size; ++} ++// Given an array type and transform length, strides, etc, load random floats in [0,1] ++// into the input array of floats/doubles or complex floats/doubles gpu buffers. 
++template <typename Tfloat, typename Tint1> ++inline void set_input(std::vector<gpubuf>& input, ++ const fft_input_generator igen, ++ const fft_array_type itype, ++ const std::vector<size_t>& length, ++ const std::vector<size_t>& ilength, ++ const std::vector<size_t>& istride, ++ const Tint1& whole_length, ++ const Tint1& whole_stride, ++ const size_t idist, ++ const size_t nbatch, ++ const hipDeviceProp_t& deviceProp) ++{ ++ auto isize = count_iters(whole_length) * nbatch; ++ ++ switch(itype) ++ { ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_hermitian_interleaved: ++ { ++ auto ibuffer = (rocfft_complex<Tfloat>*)input[0].data(); ++ ++ if(igen == fft_input_generator_device) ++ generate_interleaved_data( ++ whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp); ++ else if(igen == fft_input_random_generator_device) ++ generate_random_interleaved_data( ++ whole_length, idist, isize, whole_stride, ibuffer, deviceProp); ++ ++ if(itype == fft_array_type_hermitian_interleaved) ++ { ++ auto ibuffer_2 = (rocfft_complex<Tfloat>*)input[0].data(); ++ impose_hermitian_symmetry_interleaved( ++ length, ilength, istride, idist, nbatch, ibuffer_2, deviceProp); ++ } ++ ++ break; ++ } ++ case fft_array_type_complex_planar: ++ case fft_array_type_hermitian_planar: ++ { ++ auto ibuffer_real = (Tfloat*)input[0].data(); ++ auto ibuffer_imag = (Tfloat*)input[1].data(); ++ ++ if(igen == fft_input_generator_device) ++ generate_planar_data(whole_length, ++ idist, ++ isize, ++ whole_stride, ++ nbatch, ++ ibuffer_real, ++ ibuffer_imag, ++ deviceProp); ++ else if(igen == fft_input_random_generator_device) ++ generate_random_planar_data( ++ whole_length, idist, isize, whole_stride, ibuffer_real, ibuffer_imag, deviceProp); ++ ++ if(itype == fft_array_type_hermitian_planar) ++ impose_hermitian_symmetry_planar( ++ length, ilength, istride, idist, nbatch, ibuffer_real, ibuffer_imag, deviceProp); ++ ++ break; ++ } ++ case fft_array_type_real: ++ { ++ auto 
ibuffer = (Tfloat*)input[0].data(); ++ ++ if(igen == fft_input_generator_device) ++ generate_real_data( ++ whole_length, idist, isize, whole_stride, nbatch, ibuffer, deviceProp); ++ else if(igen == fft_input_random_generator_device) ++ generate_random_real_data( ++ whole_length, idist, isize, whole_stride, ibuffer, deviceProp); ++ ++ break; ++ } ++ default: ++ throw std::runtime_error("Input layout format not yet supported"); ++ } ++} ++ ++template <typename Tfloat, typename Tint1> ++inline void set_input(std::vector<hostbuf>& input, ++ const fft_input_generator igen, ++ const fft_array_type itype, ++ const std::vector<size_t>& length, ++ const std::vector<size_t>& ilength, ++ const std::vector<size_t>& istride, ++ const Tint1& whole_length, ++ const Tint1& whole_stride, ++ const size_t idist, ++ const size_t nbatch, ++ const hipDeviceProp_t& deviceProp) ++{ ++ switch(itype) ++ { ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_hermitian_interleaved: ++ { ++ if(igen == fft_input_generator_host) ++ generate_interleaved_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch); ++ else if(igen == fft_input_random_generator_host) ++ generate_random_interleaved_data<Tfloat>( ++ input, whole_length, whole_stride, idist, nbatch); ++ ++ if(itype == fft_array_type_hermitian_interleaved) ++ impose_hermitian_symmetry_interleaved<Tfloat>(input, length, istride, idist, nbatch); ++ ++ break; ++ } ++ case fft_array_type_complex_planar: ++ case fft_array_type_hermitian_planar: ++ { ++ if(igen == fft_input_generator_host) ++ generate_planar_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch); ++ else if(igen == fft_input_random_generator_host) ++ generate_random_planar_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch); ++ ++ if(itype == fft_array_type_hermitian_planar) ++ impose_hermitian_symmetry_planar<Tfloat>(input, length, istride, idist, nbatch); ++ ++ break; ++ } ++ case fft_array_type_real: ++ { ++ if(igen == 
fft_input_generator_host) ++ generate_real_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch); ++ else if(igen == fft_input_random_generator_host) ++ generate_random_real_data<Tfloat>(input, whole_length, whole_stride, idist, nbatch); ++ ++ break; ++ } ++ default: ++ throw std::runtime_error("Input layout format not yet supported"); ++ } ++} ++ ++// unroll set_input for dimension 1, 2, 3 ++template <typename Tbuff, typename Tfloat> ++inline void set_input(std::vector<Tbuff>& input, ++ const fft_input_generator igen, ++ const fft_array_type itype, ++ const std::vector<size_t>& length, ++ const std::vector<size_t>& ilength, ++ const std::vector<size_t>& istride, ++ const size_t idist, ++ const size_t nbatch, ++ const hipDeviceProp_t& deviceProp) ++{ ++ switch(length.size()) ++ { ++ case 1: ++ set_input<Tfloat>(input, ++ igen, ++ itype, ++ length, ++ ilength, ++ istride, ++ ilength[0], ++ istride[0], ++ idist, ++ nbatch, ++ deviceProp); ++ break; ++ case 2: ++ set_input<Tfloat>(input, ++ igen, ++ itype, ++ length, ++ ilength, ++ istride, ++ std::make_tuple(ilength[0], ilength[1]), ++ std::make_tuple(istride[0], istride[1]), ++ idist, ++ nbatch, ++ deviceProp); ++ break; ++ case 3: ++ set_input<Tfloat>(input, ++ igen, ++ itype, ++ length, ++ ilength, ++ istride, ++ std::make_tuple(ilength[0], ilength[1], ilength[2]), ++ std::make_tuple(istride[0], istride[1], istride[2]), ++ idist, ++ nbatch, ++ deviceProp); ++ break; ++ default: ++ abort(); ++ } ++} ++ ++// Container class for test parameters. ++class fft_params ++{ ++public: ++ // All parameters are row-major. 
++ std::vector<size_t> length; ++ std::vector<size_t> istride; ++ std::vector<size_t> ostride; ++ size_t nbatch = 1; ++ fft_precision precision = fft_precision_single; ++ fft_input_generator igen = fft_input_random_generator_device; ++ fft_transform_type transform_type = fft_transform_type_complex_forward; ++ fft_result_placement placement = fft_placement_inplace; ++ size_t idist = 0; ++ size_t odist = 0; ++ fft_array_type itype = fft_array_type_unset; ++ fft_array_type otype = fft_array_type_unset; ++ std::vector<size_t> ioffset = {0, 0}; ++ std::vector<size_t> ooffset = {0, 0}; ++ ++ std::vector<size_t> isize; ++ std::vector<size_t> osize; ++ ++ size_t workbuffersize = 0; ++ ++ struct fft_brick ++ { ++ // all vectors here are row-major, with same length as FFT ++ // dimension + 1 (for batch dimension) ++ ++ // inclusive lower bound of brick ++ std::vector<size_t> lower; ++ // exclusive upper bound of brick ++ std::vector<size_t> upper; ++ // stride of brick in memory ++ std::vector<size_t> stride; ++ ++ // compute the length of this brick ++ std::vector<size_t> length() const ++ { ++ std::vector<size_t> ret; ++ for(size_t i = 0; i < lower.size(); ++i) ++ ret.push_back(upper[i] - lower[i]); ++ return ret; ++ } ++ ++ // compute offset of lower bound in a field with the given ++ // stride + dist (batch stride is separate) ++ size_t lower_field_offset(std::vector<size_t> stride, size_t dist) const ++ { ++ // brick strides include batch, so adjust our input accordingly ++ stride.insert(stride.begin(), dist); ++ ++ return std::inner_product(lower.begin(), lower.end(), stride.begin(), 0); ++ } ++ ++ // location of the brick ++ int device = 0; ++ }; ++ ++ struct fft_field ++ { ++ std::vector<fft_brick> bricks; ++ }; ++ // optional brick decomposition of inputs/outputs ++ std::vector<fft_field> ifields; ++ std::vector<fft_field> ofields; ++ ++ // run testing load/store callbacks ++ bool run_callbacks = false; ++ static constexpr double load_cb_scalar = 0.457813941; ++ 
static constexpr double store_cb_scalar = 0.391504938; ++ ++ // Check that data outside of output strides is not overwritten. ++ // This is only set explicitly on some tests where there's space ++ // between dimensions, but the dimensions are still in-order. ++ // We're not trying to generically find holes in arbitrary data ++ // layouts. ++ // ++ // NOTE: this flag is not included in tokens, since it doesn't ++ // affect how the FFT library behaves. ++ bool check_output_strides = false; ++ ++ // scaling factor - we do a pointwise multiplication of outputs by ++ // this factor ++ double scale_factor = 1.0; ++ ++ fft_params(){}; ++ virtual ~fft_params(){}; ++ ++ // Given an array type, return the name as a string. ++ static std::string array_type_name(const fft_array_type type, bool verbose = true) ++ { ++ switch(type) ++ { ++ case fft_array_type_complex_interleaved: ++ return verbose ? "fft_array_type_complex_interleaved" : "CI"; ++ case fft_array_type_complex_planar: ++ return verbose ? "fft_array_type_complex_planar" : "CP"; ++ case fft_array_type_real: ++ return verbose ? "fft_array_type_real" : "R"; ++ case fft_array_type_hermitian_interleaved: ++ return verbose ? "fft_array_type_hermitian_interleaved" : "HI"; ++ case fft_array_type_hermitian_planar: ++ return verbose ? "fft_array_type_hermitian_planar" : "HP"; ++ case fft_array_type_unset: ++ return verbose ? "fft_array_type_unset" : "UN"; ++ } ++ return ""; ++ } ++ ++ std::string transform_type_name() const ++ { ++ switch(transform_type) ++ { ++ case fft_transform_type_complex_forward: ++ return "fft_transform_type_complex_forward"; ++ case fft_transform_type_complex_inverse: ++ return "fft_transform_type_complex_inverse"; ++ case fft_transform_type_real_forward: ++ return "fft_transform_type_real_forward"; ++ case fft_transform_type_real_inverse: ++ return "fft_transform_type_real_inverse"; ++ default: ++ throw std::runtime_error("Invalid transform type"); ++ } ++ } ++ ++ // Convert to string for output. 
++ std::string str(const std::string& separator = ", ") const ++ { ++ // top-level stride/dist are not used when fields are specified. ++ const bool have_ifields = !ifields.empty(); ++ const bool have_ofields = !ofields.empty(); ++ ++ std::stringstream ss; ++ auto print_size_vec = [&](const char* description, const std::vector<size_t>& vec) { ++ ss << description << ":"; ++ for(auto i : vec) ++ ss << " " << i; ++ ss << separator; ++ }; ++ auto print_fields = [&](const char* description, const std::vector<fft_field>& fields) { ++ for(unsigned int fidx = 0; fidx < fields.size(); ++fidx) ++ { ++ const auto& f = fields[fidx]; ++ ss << description << " " << fidx << ":" << separator; ++ for(unsigned int bidx = 0; bidx < f.bricks.size(); ++bidx) ++ { ++ const auto& b = f.bricks[bidx]; ++ ss << " brick " << bidx << ":" << separator; ++ print_size_vec(" lower", b.lower); ++ print_size_vec(" upper", b.upper); ++ print_size_vec(" stride", b.stride); ++ ss << " device: " << b.device << separator; ++ } ++ } ++ }; ++ ++ print_size_vec("length", length); ++ if(have_ifields) ++ { ++ print_fields("ifield", ifields); ++ } ++ else ++ { ++ print_size_vec("istride", istride); ++ ss << "idist: " << idist << separator; ++ } ++ ++ if(have_ofields) ++ { ++ print_fields("ofield", ofields); ++ } ++ else ++ { ++ print_size_vec("ostride", ostride); ++ ss << "odist: " << odist << separator; ++ } ++ ++ ss << "batch: " << nbatch << separator; ++ print_size_vec("isize", isize); ++ print_size_vec("osize", osize); ++ ++ print_size_vec("ioffset", ioffset); ++ print_size_vec("ooffset", ooffset); ++ ++ if(placement == fft_placement_inplace) ++ ss << "in-place"; ++ else ++ ss << "out-of-place"; ++ ss << separator; ++ ss << "transform_type: " << transform_type_name() << separator; ++ ss << array_type_name(itype) << " -> " << array_type_name(otype) << separator; ++ switch(precision) ++ { ++ case fft_precision_half: ++ ss << "half-precision"; ++ break; ++ case fft_precision_single: ++ ss << 
"single-precision"; ++ break; ++ case fft_precision_double: ++ ss << "double-precision"; ++ break; ++ } ++ ss << separator; ++ ++ print_size_vec("ilength", ilength()); ++ print_size_vec("olength", olength()); ++ ++ print_size_vec("ibuffer_size", ibuffer_sizes()); ++ print_size_vec("obuffer_size", obuffer_sizes()); ++ ++ if(scale_factor != 1.0) ++ ss << "scale factor: " << scale_factor << separator; ++ ++ return ss.str(); ++ } ++ ++ // Produce a stringified token of the test fft params. ++ std::string token() const ++ { ++ std::string ret; ++ ++ switch(transform_type) ++ { ++ case fft_transform_type_complex_forward: ++ ret += "complex_forward_"; ++ break; ++ case fft_transform_type_complex_inverse: ++ ret += "complex_inverse_"; ++ break; ++ case fft_transform_type_real_forward: ++ ret += "real_forward_"; ++ break; ++ case fft_transform_type_real_inverse: ++ ret += "real_inverse_"; ++ break; ++ } ++ ++ auto append_size_vec = [&ret](const std::vector<size_t>& vec) { ++ for(auto s : vec) ++ { ++ ret += "_"; ++ ret += std::to_string(s); ++ } ++ }; ++ ++ ret += "len"; ++ append_size_vec(length); ++ ++ switch(precision) ++ { ++ case fft_precision_half: ++ ret += "_half_"; ++ break; ++ case fft_precision_single: ++ ret += "_single_"; ++ break; ++ case fft_precision_double: ++ ret += "_double_"; ++ break; ++ } ++ ++ switch(placement) ++ { ++ case fft_placement_inplace: ++ ret += "ip_"; ++ break; ++ case fft_placement_notinplace: ++ ret += "op_"; ++ break; ++ } ++ ++ ret += "batch_"; ++ ret += std::to_string(nbatch); ++ ++ auto append_array_type = [&ret](fft_array_type type) { ++ switch(type) ++ { ++ case fft_array_type_complex_interleaved: ++ ret += "CI"; ++ break; ++ case fft_array_type_complex_planar: ++ ret += "CP"; ++ break; ++ case fft_array_type_real: ++ ret += "R"; ++ break; ++ case fft_array_type_hermitian_interleaved: ++ ret += "HI"; ++ break; ++ case fft_array_type_hermitian_planar: ++ ret += "HP"; ++ break; ++ default: ++ ret += "UN"; ++ break; ++ } ++ }; ++ ++ 
auto append_brick_info = [&ret, &append_size_vec](const fft_brick& b) { ++ ret += "_brick"; ++ ++ ret += "_lower"; ++ append_size_vec(b.lower); ++ ret += "_upper"; ++ append_size_vec(b.upper); ++ ret += "_stride"; ++ append_size_vec(b.stride); ++ ret += "_dev_"; ++ ret += std::to_string(b.device); ++ }; ++ ++ const bool have_ifields = !ifields.empty(); ++ const bool have_ofields = !ofields.empty(); ++ ++ if(have_ifields) ++ { ++ for(const auto& f : ifields) ++ { ++ ret += "_ifield"; ++ for(const auto& b : f.bricks) ++ append_brick_info(b); ++ } ++ } ++ else ++ { ++ ret += "_istride"; ++ append_size_vec(istride); ++ ret += "_"; ++ append_array_type(itype); ++ } ++ ++ if(have_ofields) ++ { ++ for(const auto& f : ofields) ++ { ++ ret += "_ofield"; ++ for(const auto& b : f.bricks) ++ append_brick_info(b); ++ } ++ } ++ else ++ { ++ ret += "_ostride"; ++ append_size_vec(ostride); ++ ret += "_"; ++ append_array_type(otype); ++ } ++ ++ if(!have_ifields) ++ { ++ ret += "_idist_"; ++ ret += std::to_string(idist); ++ } ++ if(!have_ofields) ++ { ++ ret += "_odist_"; ++ ret += std::to_string(odist); ++ } ++ ++ if(!have_ifields) ++ { ++ ret += "_ioffset"; ++ append_size_vec(ioffset); ++ } ++ ++ if(!have_ofields) ++ { ++ ret += "_ooffset"; ++ append_size_vec(ooffset); ++ } ++ ++ if(run_callbacks) ++ ret += "_CB"; ++ ++ if(scale_factor != 1.0) ++ ret += "_scale"; ++ ++ return ret; ++ } ++ ++ // Set all params from a stringified token. 
++ void from_token(std::string token) ++ { ++ std::vector<std::string> vals; ++ ++ std::string delimiter = "_"; ++ { ++ size_t pos = 0; ++ while((pos = token.find(delimiter)) != std::string::npos) ++ { ++ auto val = token.substr(0, pos); ++ vals.push_back(val); ++ token.erase(0, pos + delimiter.length()); ++ } ++ vals.push_back(token); ++ } ++ ++ auto size_parser ++ = [](const std::vector<std::string>& vals, const std::string token, size_t& pos) { ++ if(vals[pos++] != token) ++ throw std::runtime_error("Unable to parse token"); ++ return std::stoull(vals[pos++]); ++ }; ++ ++ auto vector_parser ++ = [](const std::vector<std::string>& vals, const std::string token, size_t& pos) { ++ if(vals[pos++] != token) ++ throw std::runtime_error("Unable to parse token"); ++ std::vector<size_t> vec; ++ ++ while(pos < vals.size()) ++ { ++ if(std::all_of(vals[pos].begin(), vals[pos].end(), ::isdigit)) ++ { ++ vec.push_back(std::stoull(vals[pos++])); ++ } ++ else ++ { ++ break; ++ } ++ } ++ return vec; ++ }; ++ ++ auto type_parser = [](const std::string& val) { ++ if(val == "CI") ++ return fft_array_type_complex_interleaved; ++ else if(val == "CP") ++ return fft_array_type_complex_planar; ++ else if(val == "R") ++ return fft_array_type_real; ++ else if(val == "HI") ++ return fft_array_type_hermitian_interleaved; ++ else if(val == "HP") ++ return fft_array_type_hermitian_planar; ++ return fft_array_type_unset; ++ }; ++ ++ auto field_parser = [&vector_parser, &size_parser](const std::vector<std::string>& vals, ++ size_t& pos, ++ std::vector<fft_field>& output) { ++ // skip over ifield/ofield word ++ pos++; ++ fft_field& f = output.emplace_back(); ++ while(pos < vals.size() && vals[pos] == "brick") ++ { ++ fft_brick& b = f.bricks.emplace_back(); ++ pos++; ++ b.lower = vector_parser(vals, "lower", pos); ++ b.upper = vector_parser(vals, "upper", pos); ++ b.stride = vector_parser(vals, "stride", pos); ++ b.device = size_parser(vals, "dev", pos); ++ } ++ }; ++ ++ size_t pos = 0; ++ ++ 
bool complex = vals[pos++] == "complex"; ++ bool forward = vals[pos++] == "forward"; ++ ++ if(complex && forward) ++ transform_type = fft_transform_type_complex_forward; ++ if(complex && !forward) ++ transform_type = fft_transform_type_complex_inverse; ++ if(!complex && forward) ++ transform_type = fft_transform_type_real_forward; ++ if(!complex && !forward) ++ transform_type = fft_transform_type_real_inverse; ++ ++ length = vector_parser(vals, "len", pos); ++ ++ if(vals[pos] == "half") ++ precision = fft_precision_half; ++ else if(vals[pos] == "single") ++ precision = fft_precision_single; ++ else if(vals[pos] == "double") ++ precision = fft_precision_double; ++ pos++; ++ ++ placement = (vals[pos++] == "ip") ? fft_placement_inplace : fft_placement_notinplace; ++ ++ nbatch = size_parser(vals, "batch", pos); ++ ++ // strides, bricks etc are mixed in from here, so just keep ++ // looking at the next token to decide what to do ++ while(pos < vals.size()) ++ { ++ const auto& next_token = vals[pos]; ++ if(next_token == "istride") ++ { ++ istride = vector_parser(vals, "istride", pos); ++ itype = type_parser(vals[pos]); ++ pos++; ++ } ++ else if(next_token == "ostride") ++ { ++ ostride = vector_parser(vals, "ostride", pos); ++ otype = type_parser(vals[pos]); ++ pos++; ++ } ++ else if(next_token == "idist") ++ idist = size_parser(vals, "idist", pos); ++ else if(next_token == "odist") ++ odist = size_parser(vals, "odist", pos); ++ else if(next_token == "ioffset") ++ ioffset = vector_parser(vals, "ioffset", pos); ++ else if(next_token == "ooffset") ++ ooffset = vector_parser(vals, "ooffset", pos); ++ else if(next_token == "ifield") ++ field_parser(vals, pos, ifields); ++ else if(next_token == "ofield") ++ field_parser(vals, pos, ofields); ++ else ++ break; ++ } ++ ++ if(pos < vals.size() && vals[pos] == "CB") ++ { ++ run_callbacks = true; ++ ++pos; ++ } ++ ++ if(pos < vals.size() && vals[pos] == "scale") ++ { ++ // just pick some factor that's not zero or one ++ scale_factor 
= 0.1239; ++ ++pos; ++ } ++ } ++ ++ // Stream output operator (for gtest, etc). ++ friend std::ostream& operator<<(std::ostream& stream, const fft_params& params) ++ { ++ stream << params.str(); ++ return stream; ++ } ++ ++ // Dimension of the transform. ++ size_t dim() const ++ { ++ return length.size(); ++ } ++ ++ virtual std::vector<size_t> ilength() const ++ { ++ auto ilength = length; ++ if(transform_type == fft_transform_type_real_inverse) ++ ilength[dim() - 1] = ilength[dim() - 1] / 2 + 1; ++ return ilength; ++ } ++ ++ virtual std::vector<size_t> olength() const ++ { ++ auto olength = length; ++ if(transform_type == fft_transform_type_real_forward) ++ olength[dim() - 1] = olength[dim() - 1] / 2 + 1; ++ return olength; ++ } ++ ++ static size_t nbuffer(const fft_array_type type) ++ { ++ switch(type) ++ { ++ case fft_array_type_real: ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_hermitian_interleaved: ++ return 1; ++ case fft_array_type_complex_planar: ++ case fft_array_type_hermitian_planar: ++ return 2; ++ case fft_array_type_unset: ++ return 0; ++ } ++ return 0; ++ } ++ ++ // Number of input buffers ++ size_t nibuffer() const ++ { ++ return nbuffer(itype); ++ } ++ ++ // Number of output buffers ++ size_t nobuffer() const ++ { ++ return nbuffer(otype); ++ } ++ ++ void set_iotypes() ++ { ++ if(itype == fft_array_type_unset) ++ { ++ switch(transform_type) ++ { ++ case fft_transform_type_complex_forward: ++ case fft_transform_type_complex_inverse: ++ itype = fft_array_type_complex_interleaved; ++ break; ++ case fft_transform_type_real_forward: ++ itype = fft_array_type_real; ++ break; ++ case fft_transform_type_real_inverse: ++ itype = fft_array_type_hermitian_interleaved; ++ break; ++ default: ++ throw std::runtime_error("Invalid transform type"); ++ } ++ } ++ if(otype == fft_array_type_unset) ++ { ++ switch(transform_type) ++ { ++ case fft_transform_type_complex_forward: ++ case fft_transform_type_complex_inverse: ++ otype = 
fft_array_type_complex_interleaved; ++ break; ++ case fft_transform_type_real_forward: ++ otype = fft_array_type_hermitian_interleaved; ++ break; ++ case fft_transform_type_real_inverse: ++ otype = fft_array_type_real; ++ break; ++ default: ++ throw std::runtime_error("Invalid transform type"); ++ } ++ } ++ } ++ ++ // Check that the input and output types are consistent. ++ bool check_iotypes() const ++ { ++ switch(itype) ++ { ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_complex_planar: ++ case fft_array_type_hermitian_interleaved: ++ case fft_array_type_hermitian_planar: ++ case fft_array_type_real: ++ break; ++ default: ++ throw std::runtime_error("Invalid Input array type format"); ++ } ++ ++ switch(otype) ++ { ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_complex_planar: ++ case fft_array_type_hermitian_interleaved: ++ case fft_array_type_hermitian_planar: ++ case fft_array_type_real: ++ break; ++ default: ++ throw std::runtime_error("Invalid Input array type format"); ++ } ++ ++ // Check that format choices are supported ++ if(transform_type != fft_transform_type_real_forward ++ && transform_type != fft_transform_type_real_inverse) ++ { ++ if(placement == fft_placement_inplace && itype != otype) ++ { ++ throw std::runtime_error( ++ "In-place transforms must have identical input and output types"); ++ } ++ } ++ ++ bool okformat = true; ++ switch(itype) ++ { ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_complex_planar: ++ okformat = (otype == fft_array_type_complex_interleaved ++ || otype == fft_array_type_complex_planar); ++ break; ++ case fft_array_type_hermitian_interleaved: ++ case fft_array_type_hermitian_planar: ++ okformat = otype == fft_array_type_real; ++ break; ++ case fft_array_type_real: ++ okformat = (otype == fft_array_type_hermitian_interleaved ++ || otype == fft_array_type_hermitian_planar); ++ break; ++ default: ++ throw std::runtime_error("Invalid Input array type format"); ++ } ++ 
++ return okformat; ++ } ++ ++ // Given a length vector, set the rest of the strides. ++ // The optional argument stride0 sets the stride for the contiguous dimension. ++ // The optional rcpadding argument sets the stride correctly for in-place ++ // multi-dimensional real/complex transforms. ++ // Format is row-major. ++ template <typename T1> ++ std::vector<T1> compute_stride(const std::vector<T1>& length, ++ const std::vector<size_t>& stride0 = std::vector<size_t>(), ++ const bool rcpadding = false) const ++ { ++ std::vector<T1> stride(dim()); ++ ++ size_t dimoffset = 0; ++ ++ if(stride0.size() == 0) ++ { ++ // Set the contiguous stride: ++ stride[dim() - 1] = 1; ++ dimoffset = 1; ++ } ++ else ++ { ++ // Copy the input values to the end of the stride array: ++ for(size_t i = 0; i < stride0.size(); ++i) ++ { ++ stride[dim() - stride0.size() + i] = stride0[i]; ++ } ++ } ++ ++ if(stride0.size() < dim()) ++ { ++ // Compute any remaining values via recursion. ++ for(size_t i = dim() - dimoffset - stride0.size(); i-- > 0;) ++ { ++ auto lengthip1 = length[i + 1]; ++ if(rcpadding && i == dim() - 2) ++ { ++ lengthip1 = 2 * (lengthip1 / 2 + 1); ++ } ++ stride[i] = stride[i + 1] * lengthip1; ++ } ++ } ++ ++ return stride; ++ } ++ ++ void compute_istride() ++ { ++ istride = compute_stride(ilength(), ++ istride, ++ placement == fft_placement_inplace ++ && transform_type == fft_transform_type_real_forward); ++ } ++ ++ void compute_ostride() ++ { ++ ostride = compute_stride(olength(), ++ ostride, ++ placement == fft_placement_inplace ++ && transform_type == fft_transform_type_real_inverse); ++ } ++ ++ virtual void compute_isize() ++ { ++ auto il = ilength(); ++ size_t val = compute_ptrdiff(il, istride, nbatch, idist); ++ isize.resize(nibuffer()); ++ for(unsigned int i = 0; i < isize.size(); ++i) ++ { ++ isize[i] = val + ioffset[i]; ++ } ++ } ++ ++ virtual void compute_osize() ++ { ++ auto ol = olength(); ++ size_t val = compute_ptrdiff(ol, ostride, nbatch, odist); ++ 
osize.resize(nobuffer()); ++ for(unsigned int i = 0; i < osize.size(); ++i) ++ { ++ osize[i] = val + ooffset[i]; ++ } ++ } ++ ++ std::vector<size_t> ibuffer_sizes() const ++ { ++ std::vector<size_t> ibuffer_sizes; ++ ++ // In-place real-to-complex transforms need to have enough space in the input buffer to ++ // accomadate the output, which is slightly larger. ++ if(placement == fft_placement_inplace && transform_type == fft_transform_type_real_forward) ++ { ++ return obuffer_sizes(); ++ } ++ ++ if(isize.empty()) ++ return ibuffer_sizes; ++ ++ switch(itype) ++ { ++ case fft_array_type_complex_planar: ++ case fft_array_type_hermitian_planar: ++ ibuffer_sizes.resize(2); ++ break; ++ default: ++ ibuffer_sizes.resize(1); ++ } ++ for(unsigned i = 0; i < ibuffer_sizes.size(); i++) ++ { ++ ibuffer_sizes[i] = isize[i] * var_size<size_t>(precision, itype); ++ } ++ return ibuffer_sizes; ++ } ++ ++ virtual std::vector<size_t> obuffer_sizes() const ++ { ++ std::vector<size_t> obuffer_sizes; ++ ++ if(osize.empty()) ++ return obuffer_sizes; ++ ++ switch(otype) ++ { ++ case fft_array_type_complex_planar: ++ case fft_array_type_hermitian_planar: ++ obuffer_sizes.resize(2); ++ break; ++ default: ++ obuffer_sizes.resize(1); ++ } ++ for(unsigned i = 0; i < obuffer_sizes.size(); i++) ++ { ++ obuffer_sizes[i] = osize[i] * var_size<size_t>(precision, otype); ++ } ++ return obuffer_sizes; ++ } ++ ++ // Compute the idist for a given transform based on the placeness, transform type, and data ++ // layout. ++ size_t compute_idist() const ++ { ++ size_t dist = 0; ++ // In-place 1D transforms need extra dist. 
++ if(transform_type == fft_transform_type_real_forward && dim() == 1 ++ && placement == fft_placement_inplace) ++ { ++ dist = 2 * (length[0] / 2 + 1) * istride[0]; ++ return dist; ++ } ++ ++ if(transform_type == fft_transform_type_real_inverse && dim() == 1) ++ { ++ dist = (length[0] / 2 + 1) * istride[0]; ++ return dist; ++ } ++ ++ dist = (transform_type == fft_transform_type_real_inverse) ++ ? (length[dim() - 1] / 2 + 1) * istride[dim() - 1] ++ : length[dim() - 1] * istride[dim() - 1]; ++ for(unsigned int i = 0; i < dim() - 1; ++i) ++ { ++ dist = std::max(length[i] * istride[i], dist); ++ } ++ return dist; ++ } ++ void set_idist() ++ { ++ if(idist != 0) ++ return; ++ idist = compute_idist(); ++ } ++ ++ // Compute the odist for a given transform based on the placeness, transform type, and data ++ // layout. Row-major. ++ size_t compute_odist() const ++ { ++ size_t dist = 0; ++ // In-place 1D transforms need extra dist. ++ if(transform_type == fft_transform_type_real_inverse && dim() == 1 ++ && placement == fft_placement_inplace) ++ { ++ dist = 2 * (length[0] / 2 + 1) * ostride[0]; ++ return dist; ++ } ++ ++ if(transform_type == fft_transform_type_real_forward && dim() == 1) ++ { ++ dist = (length[0] / 2 + 1) * ostride[0]; ++ return dist; ++ } ++ ++ dist = (transform_type == fft_transform_type_real_forward) ++ ? (length[dim() - 1] / 2 + 1) * ostride[dim() - 1] ++ : length[dim() - 1] * ostride[dim() - 1]; ++ for(unsigned int i = 0; i < dim() - 1; ++i) ++ { ++ dist = std::max(length[i] * ostride[i], dist); ++ } ++ return dist; ++ } ++ void set_odist() ++ { ++ if(odist != 0) ++ return; ++ odist = compute_odist(); ++ } ++ ++ // Put the length, stride, batch, and dist into a single length/stride array and pass off to the ++ // validity checker. 
++ bool valid_length_stride_batch_dist(const std::vector<size_t>& l0, ++ const std::vector<size_t>& s0, ++ const size_t n, ++ const size_t dist, ++ const int verbose = 0) const ++ { ++ if(l0.size() != s0.size()) ++ return false; ++ ++ // Length and stride vectors, including bathes: ++ std::vector<size_t> l{}, s{}; ++ for(unsigned int i = 0; i < l0.size(); ++i) ++ { ++ if(l0[i] > 1) ++ { ++ if(s0[i] == 0) ++ return false; ++ l.push_back(l0[i]); ++ s.push_back(s0[i]); ++ } ++ } ++ if(n > 1) ++ { ++ if(dist == 0) ++ return false; ++ l.push_back(n); ++ s.push_back(dist); ++ } ++ ++ return array_valid(l, s, verbose); ++ } ++ ++ // Return true if the given GPU parameters would produce a valid transform. ++ bool valid(const int verbose) const ++ { ++ if(ioffset.size() < nibuffer() || ooffset.size() < nobuffer()) ++ return false; ++ ++ // Check that in-place transforms have the same input and output stride: ++ if(placement == fft_placement_inplace) ++ { ++ const auto stridesize = std::min(istride.size(), ostride.size()); ++ bool samestride = true; ++ for(unsigned int i = 0; i < stridesize; ++i) ++ { ++ if(istride[i] != ostride[i]) ++ samestride = false; ++ } ++ if((transform_type == fft_transform_type_complex_forward ++ || transform_type == fft_transform_type_complex_inverse) ++ && !samestride) ++ { ++ // In-place transforms require identical input and output strides. ++ if(verbose) ++ { ++ std::cout << "istride:"; ++ for(const auto& i : istride) ++ std::cout << " " << i; ++ std::cout << " ostride0:"; ++ for(const auto& i : ostride) ++ std::cout << " " << i; ++ std::cout << " differ; skipped for in-place transforms: skipping test" ++ << std::endl; ++ } ++ return false; ++ } ++ ++ if((transform_type == fft_transform_type_complex_forward ++ || transform_type == fft_transform_type_complex_inverse) ++ && (idist != odist) && nbatch > 1) ++ { ++ // In-place transforms require identical distance, if ++ // batch > 1. 
If batch is 1 then dist is ignored and ++ // the FFT should still work. ++ if(verbose) ++ { ++ std::cout << "idist:" << idist << " odist:" << odist ++ << " differ; skipped for in-place transforms: skipping test" ++ << std::endl; ++ } ++ return false; ++ } ++ ++ if((transform_type == fft_transform_type_real_forward ++ || transform_type == fft_transform_type_real_inverse) ++ && (istride.back() != 1 || ostride.back() != 1)) ++ { ++ // In-place real/complex transforms require unit strides. ++ if(verbose) ++ { ++ std::cout ++ << "istride.back(): " << istride.back() ++ << " ostride.back(): " << ostride.back() ++ << " must be unitary for in-place real/complex transforms: skipping test" ++ << std::endl; ++ } ++ return false; ++ } ++ ++ if((itype == fft_array_type_complex_interleaved ++ && otype == fft_array_type_complex_planar) ++ || (itype == fft_array_type_complex_planar ++ && otype == fft_array_type_complex_interleaved)) ++ { ++ if(verbose) ++ { ++ std::cout << "In-place c2c transforms require identical io types; skipped.\n"; ++ } ++ return false; ++ } ++ ++ // Check offsets ++ switch(transform_type) ++ { ++ case fft_transform_type_complex_forward: ++ case fft_transform_type_complex_inverse: ++ for(unsigned int i = 0; i < nibuffer(); ++i) ++ { ++ if(ioffset[i] != ooffset[i]) ++ return false; ++ } ++ break; ++ case fft_transform_type_real_forward: ++ if(ioffset[0] != 2 * ooffset[0]) ++ return false; ++ break; ++ case fft_transform_type_real_inverse: ++ if(2 * ioffset[0] != ooffset[0]) ++ return false; ++ break; ++ } ++ } ++ ++ if(!check_iotypes()) ++ return false; ++ ++ // we can only check output strides on out-of-place ++ // transforms, since we need to initialize output to a known ++ // pattern ++ if(placement == fft_placement_inplace && check_output_strides) ++ return false; ++ ++ // Check input and output strides ++ if(valid_length_stride_batch_dist(ilength(), istride, nbatch, idist, verbose) != true) ++ { ++ if(verbose) ++ std::cout << "Invalid input data 
format.\n"; ++ return false; ++ } ++ if(!(ilength() == olength() && istride == ostride && idist == odist)) ++ { ++ // Only check if different ++ if(valid_length_stride_batch_dist(olength(), ostride, nbatch, odist, verbose) != true) ++ { ++ if(verbose) ++ std::cout << "Invalid output data format.\n"; ++ return false; ++ } ++ } ++ ++ // The parameters are valid. ++ return true; ++ } ++ ++ // Fill in any missing parameters. ++ void validate() ++ { ++ set_iotypes(); ++ compute_istride(); ++ compute_ostride(); ++ set_idist(); ++ set_odist(); ++ compute_isize(); ++ compute_osize(); ++ ++ validate_fields(); ++ } ++ ++ virtual void validate_fields() const ++ { ++ if(!ifields.empty() || !ofields.empty()) ++ throw std::runtime_error("input/output fields are unsupported"); ++ } ++ ++ // Column-major getters: ++ std::vector<size_t> length_cm() const ++ { ++ auto length_cm = length; ++ std::reverse(std::begin(length_cm), std::end(length_cm)); ++ return length_cm; ++ } ++ std::vector<size_t> ilength_cm() const ++ { ++ auto ilength_cm = ilength(); ++ std::reverse(std::begin(ilength_cm), std::end(ilength_cm)); ++ return ilength_cm; ++ } ++ std::vector<size_t> olength_cm() const ++ { ++ auto olength_cm = olength(); ++ std::reverse(std::begin(olength_cm), std::end(olength_cm)); ++ return olength_cm; ++ } ++ std::vector<size_t> istride_cm() const ++ { ++ auto istride_cm = istride; ++ std::reverse(std::begin(istride_cm), std::end(istride_cm)); ++ return istride_cm; ++ } ++ std::vector<size_t> ostride_cm() const ++ { ++ auto ostride_cm = ostride; ++ std::reverse(std::begin(ostride_cm), std::end(ostride_cm)); ++ return ostride_cm; ++ } ++ bool is_planar() const ++ { ++ if(itype == fft_array_type_complex_planar || itype == fft_array_type_hermitian_planar) ++ return true; ++ if(otype == fft_array_type_complex_planar || otype == fft_array_type_hermitian_planar) ++ return true; ++ return false; ++ } ++ ++ // Given a data type and dimensions, fill the buffer, imposing Hermitian symmetry if 
necessary. ++ template <typename Tbuff> ++ inline void compute_input(std::vector<Tbuff>& input) ++ { ++ auto deviceProp = get_curr_device_prop(); ++ ++ switch(precision) ++ { ++ case fft_precision_half: ++ set_input<Tbuff, _Float16>( ++ input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp); ++ break; ++ case fft_precision_double: ++ set_input<Tbuff, double>( ++ input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp); ++ break; ++ case fft_precision_single: ++ set_input<Tbuff, float>( ++ input, igen, itype, length, ilength(), istride, idist, nbatch, deviceProp); ++ break; ++ } ++ } ++ ++ template <typename Tstream = std::ostream> ++ void print_ibuffer(const std::vector<hostbuf>& buf, Tstream& stream = std::cout) const ++ { ++ switch(itype) ++ { ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_hermitian_interleaved: ++ { ++ switch(precision) ++ { ++ case fft_precision_half: ++ { ++ buffer_printer<rocfft_complex<_Float16>> s; ++ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); ++ break; ++ } ++ case fft_precision_single: ++ { ++ buffer_printer<rocfft_complex<float>> s; ++ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); ++ break; ++ } ++ case fft_precision_double: ++ { ++ buffer_printer<rocfft_complex<double>> s; ++ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); ++ break; ++ } ++ } ++ break; ++ } ++ case fft_array_type_complex_planar: ++ case fft_array_type_hermitian_planar: ++ case fft_array_type_real: ++ { ++ switch(precision) ++ { ++ case fft_precision_half: ++ { ++ buffer_printer<_Float16> s; ++ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); ++ break; ++ } ++ case fft_precision_single: ++ { ++ buffer_printer<float> s; ++ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); ++ break; ++ } ++ case fft_precision_double: ++ { ++ buffer_printer<double> s; ++ s.print_buffer(buf, ilength(), istride, nbatch, idist, ioffset); ++ break; 
++ } ++ } ++ break; ++ } ++ default: ++ throw std::runtime_error("Invalid itype in print_ibuffer"); ++ } ++ } ++ ++ template <typename Tstream = std::ostream> ++ void print_obuffer(const std::vector<hostbuf>& buf, Tstream& stream = std::cout) const ++ { ++ switch(otype) ++ { ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_hermitian_interleaved: ++ { ++ switch(precision) ++ { ++ case fft_precision_half: ++ { ++ buffer_printer<rocfft_complex<_Float16>> s; ++ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); ++ break; ++ } ++ case fft_precision_single: ++ { ++ buffer_printer<rocfft_complex<float>> s; ++ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); ++ break; ++ } ++ case fft_precision_double: ++ buffer_printer<rocfft_complex<double>> s; ++ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); ++ break; ++ } ++ break; ++ } ++ case fft_array_type_complex_planar: ++ case fft_array_type_hermitian_planar: ++ case fft_array_type_real: ++ { ++ switch(precision) ++ { ++ case fft_precision_half: ++ { ++ buffer_printer<_Float16> s; ++ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); ++ break; ++ } ++ case fft_precision_single: ++ { ++ buffer_printer<float> s; ++ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); ++ break; ++ } ++ case fft_precision_double: ++ { ++ buffer_printer<double> s; ++ s.print_buffer(buf, olength(), ostride, nbatch, odist, ooffset); ++ break; ++ } ++ } ++ break; ++ } ++ ++ default: ++ throw std::runtime_error("Invalid itype in print_obuffer"); ++ } ++ } ++ ++ void print_ibuffer_flat(const std::vector<hostbuf>& buf) const ++ { ++ switch(itype) ++ { ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_hermitian_interleaved: ++ { ++ switch(precision) ++ { ++ case fft_precision_half: ++ { ++ buffer_printer<rocfft_complex<_Float16>> s; ++ s.print_buffer_flat(buf, osize, ooffset); ++ break; ++ } ++ case fft_precision_single: ++ { ++ 
buffer_printer<rocfft_complex<float>> s; ++ s.print_buffer_flat(buf, osize, ooffset); ++ break; ++ } ++ case fft_precision_double: ++ buffer_printer<rocfft_complex<double>> s; ++ s.print_buffer_flat(buf, osize, ooffset); ++ break; ++ } ++ break; ++ } ++ case fft_array_type_complex_planar: ++ case fft_array_type_hermitian_planar: ++ case fft_array_type_real: ++ { ++ switch(precision) ++ { ++ case fft_precision_half: ++ { ++ buffer_printer<_Float16> s; ++ s.print_buffer_flat(buf, osize, ooffset); ++ break; ++ } ++ case fft_precision_single: ++ { ++ buffer_printer<float> s; ++ s.print_buffer_flat(buf, osize, ooffset); ++ break; ++ } ++ case fft_precision_double: ++ { ++ buffer_printer<double> s; ++ s.print_buffer_flat(buf, osize, ooffset); ++ break; ++ } ++ } ++ break; ++ default: ++ throw std::runtime_error("Invalid itype in print_ibuffer_flat"); ++ } ++ } ++ } ++ ++ void print_obuffer_flat(const std::vector<hostbuf>& buf) const ++ { ++ switch(otype) ++ { ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_hermitian_interleaved: ++ { ++ switch(precision) ++ { ++ case fft_precision_half: ++ { ++ buffer_printer<rocfft_complex<_Float16>> s; ++ s.print_buffer_flat(buf, osize, ooffset); ++ break; ++ } ++ case fft_precision_single: ++ { ++ buffer_printer<rocfft_complex<float>> s; ++ s.print_buffer_flat(buf, osize, ooffset); ++ break; ++ } ++ case fft_precision_double: ++ buffer_printer<rocfft_complex<double>> s; ++ s.print_buffer_flat(buf, osize, ooffset); ++ break; ++ } ++ break; ++ } ++ case fft_array_type_complex_planar: ++ case fft_array_type_hermitian_planar: ++ case fft_array_type_real: ++ { ++ switch(precision) ++ { ++ case fft_precision_half: ++ { ++ buffer_printer<_Float16> s; ++ s.print_buffer_flat(buf, osize, ooffset); ++ break; ++ } ++ case fft_precision_single: ++ { ++ buffer_printer<float> s; ++ s.print_buffer_flat(buf, osize, ooffset); ++ break; ++ } ++ ++ case fft_precision_double: ++ { ++ buffer_printer<double> s; ++ s.print_buffer_flat(buf, 
osize, ooffset); ++ break; ++ } ++ } ++ break; ++ default: ++ throw std::runtime_error("Invalid itype in print_ibuffer_flat"); ++ } ++ } ++ } ++ ++ virtual fft_status set_callbacks(void* load_cb_host, ++ void* load_cb_data, ++ void* store_cb_host, ++ void* store_cb_data) ++ { ++ return fft_status_success; ++ } ++ ++ virtual fft_status execute(void** in, void** out) ++ { ++ return fft_status_success; ++ }; ++ ++ size_t fft_params_vram_footprint() ++ { ++ return fft_params::vram_footprint(); ++ } ++ ++ virtual size_t vram_footprint() ++ { ++ const auto ibuf_size = ibuffer_sizes(); ++ size_t val = std::accumulate(ibuf_size.begin(), ibuf_size.end(), (size_t)1); ++ if(placement == fft_placement_notinplace) ++ { ++ const auto obuf_size = obuffer_sizes(); ++ val += std::accumulate(obuf_size.begin(), obuf_size.end(), (size_t)1); ++ } ++ return val; ++ } ++ ++ // Specific exception type for work buffer allocation failure. ++ // Tests that hit this can't fit on the GPU and should be skipped. ++ struct work_buffer_alloc_failure : public std::runtime_error ++ { ++ work_buffer_alloc_failure(const std::string& s) ++ : std::runtime_error(s) ++ { ++ } ++ }; ++ ++ virtual fft_status create_plan() ++ { ++ return fft_status_success; ++ } ++ ++ // Change a forward transform to it's inverse ++ void inverse_from_forward(fft_params& params_forward) ++ { ++ switch(params_forward.transform_type) ++ { ++ case fft_transform_type_complex_forward: ++ transform_type = fft_transform_type_complex_inverse; ++ break; ++ case fft_transform_type_real_forward: ++ transform_type = fft_transform_type_real_inverse; ++ break; ++ default: ++ throw std::runtime_error("Transform type not forward."); ++ } ++ ++ length = params_forward.length; ++ istride = params_forward.ostride; ++ ostride = params_forward.istride; ++ nbatch = params_forward.nbatch; ++ precision = params_forward.precision; ++ placement = params_forward.placement; ++ idist = params_forward.odist; ++ odist = params_forward.idist; ++ itype = 
params_forward.otype; ++ otype = params_forward.itype; ++ ioffset = params_forward.ooffset; ++ ooffset = params_forward.ioffset; ++ ++ run_callbacks = params_forward.run_callbacks; ++ ++ check_output_strides = params_forward.check_output_strides; ++ ++ scale_factor = 1 / params_forward.scale_factor; ++ } ++ ++ // prepare for multi-GPU transform. Generated input is in ibuffer. ++ // pibuffer, pobuffer are the pointers that will be passed to the ++ // FFT library's "execute" API. ++ virtual void multi_gpu_prepare(std::vector<gpubuf>& ibuffer, ++ std::vector<void*>& pibuffer, ++ std::vector<void*>& pobuffer) ++ { ++ } ++ ++ // finalize multi-GPU transform. pobuffers are the pointers ++ // provided to the FFT library's "execute" API. obuffer is the ++ // buffer where transform output needs to go for validation ++ virtual void multi_gpu_finalize(std::vector<gpubuf>& obuffer, std::vector<void*>& pobuffer) {} ++ ++ // create bricks in the specified field for the specified number ++ // of devices. The field is split along the highest FFT ++ // dimension, and the length only includes FFT lengths, not batch ++ // dimension. 
++ void distribute_field(int deviceCount, ++ std::vector<fft_field>& fields, ++ const std::vector<size_t>& field_length) ++ { ++ size_t slowLen = field_length.front(); ++ if(slowLen < static_cast<size_t>(deviceCount)) ++ throw std::runtime_error("too many devices to distribute length " ++ + std::to_string(slowLen)); ++ ++ auto& field = fields.emplace_back(); ++ ++ for(int i = 0; i < deviceCount; ++i) ++ { ++ // start at origin ++ std::vector<size_t> field_lower(field_length.size()); ++ std::vector<size_t> field_upper(field_length.size()); ++ ++ // note: slowest FFT dim is index 0 in these coordinates ++ field_lower[0] = slowLen / deviceCount * i; ++ ++ // last brick needs to include the whole slow len ++ if(i == deviceCount - 1) ++ { ++ field_upper[0] = slowLen; ++ } ++ else ++ { ++ field_upper[0] = std::min(slowLen, field_lower[0] + slowLen / deviceCount); ++ } ++ ++ for(unsigned int upperDim = 1; upperDim < field_length.size(); ++upperDim) ++ { ++ field_upper[upperDim] = field_length[upperDim]; ++ } ++ ++ // field coordinates also need to include batch ++ field_lower.insert(field_lower.begin(), 0); ++ field_upper.insert(field_upper.begin(), nbatch); ++ ++ // bricks have contiguous strides ++ size_t brick_dist = 1; ++ std::vector<size_t> brick_stride(field_lower.size()); ++ for(size_t distIdx = 0; distIdx < field_lower.size(); ++distIdx) ++ { ++ // fill strides from fastest to slowest ++ *(brick_stride.rbegin() + distIdx) = brick_dist; ++ brick_dist *= *(field_upper.rbegin() + distIdx) - *(field_lower.rbegin() + distIdx); ++ } ++ field.bricks.push_back( ++ fft_params::fft_brick{field_lower, field_upper, brick_stride, i}); ++ } ++ } ++ ++ void distribute_input(int deviceCount) ++ { ++ distribute_field(deviceCount, ifields, length); ++ } ++ ++ void distribute_output(int deviceCount) ++ { ++ distribute_field(deviceCount, ofields, olength()); ++ } ++}; ++ ++// This is used with the program_options class so that the user can type an integer on the ++// command line and 
we store into an enum varaible ++template <typename _Elem, typename _Traits> ++std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream, ++ fft_array_type& atype) ++{ ++ unsigned tmp; ++ stream >> tmp; ++ atype = fft_array_type(tmp); ++ return stream; ++} ++ ++// similarly for transform type ++template <typename _Elem, typename _Traits> ++std::basic_istream<_Elem, _Traits>& operator>>(std::basic_istream<_Elem, _Traits>& stream, ++ fft_transform_type& ttype) ++{ ++ unsigned tmp; ++ stream >> tmp; ++ ttype = fft_transform_type(tmp); ++ return stream; ++} ++ ++// Returns pairs of startindex, endindex, for 1D, 2D, 3D lengths ++template <typename T1> ++std::vector<std::pair<T1, T1>> partition_colmajor(const T1& length) ++{ ++ return partition_base(length, compute_partition_count(length)); ++} ++ ++// Partition on the rightmost part of the tuple, for col-major indexing ++template <typename T1> ++std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ++ partition_colmajor(const std::tuple<T1, T1>& length) ++{ ++ auto partitions = partition_base(std::get<1>(length), compute_partition_count(length)); ++ std::vector<std::pair<std::tuple<T1, T1>, std::tuple<T1, T1>>> ret(partitions.size()); ++ for(size_t i = 0; i < partitions.size(); ++i) ++ { ++ std::get<1>(ret[i].first) = partitions[i].first; ++ std::get<0>(ret[i].first) = 0; ++ std::get<1>(ret[i].second) = partitions[i].second; ++ std::get<0>(ret[i].second) = std::get<0>(length); ++ } ++ return ret; ++} ++template <typename T1> ++std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ++ partition_colmajor(const std::tuple<T1, T1, T1>& length) ++{ ++ auto partitions = partition_base(std::get<2>(length), compute_partition_count(length)); ++ std::vector<std::pair<std::tuple<T1, T1, T1>, std::tuple<T1, T1, T1>>> ret(partitions.size()); ++ for(size_t i = 0; i < partitions.size(); ++i) ++ { ++ std::get<2>(ret[i].first) = partitions[i].first; ++ std::get<1>(ret[i].first) = 
0; ++ std::get<0>(ret[i].first) = 0; ++ std::get<2>(ret[i].second) = partitions[i].second; ++ std::get<1>(ret[i].second) = std::get<1>(length); ++ std::get<0>(ret[i].second) = std::get<0>(length); ++ } ++ return ret; ++} ++ ++// Copy data of dimensions length with strides istride and length idist between batches to ++// a buffer with strides ostride and length odist between batches. The input and output ++// types are identical. ++template <typename Tval, typename Tint1, typename Tint2, typename Tint3> ++inline void copy_buffers_1to1(const Tval* input, ++ Tval* output, ++ const Tint1& whole_length, ++ const size_t nbatch, ++ const Tint2& istride, ++ const size_t idist, ++ const Tint3& ostride, ++ const size_t odist, ++ const std::vector<size_t>& ioffset, ++ const std::vector<size_t>& ooffset) ++{ ++ const bool idx_equals_odx = istride == ostride && idist == odist; ++ size_t idx_base = 0; ++ size_t odx_base = 0; ++ auto partitions = partition_rowmajor(whole_length); ++ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) ++ { ++#ifdef _OPENMP ++#pragma omp parallel for num_threads(partitions.size()) ++#endif ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ do ++ { ++ const auto idx = compute_index(index, istride, idx_base); ++ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); ++ output[odx + ooffset[0]] = input[idx + ioffset[0]]; ++ } while(increment_rowmajor(index, length)); ++ } ++ } ++} ++ ++// Copy data of dimensions length with strides istride and length idist between batches to ++// a buffer with strides ostride and length odist between batches. The input type is ++// planar and the output type is complex interleaved. 
++template <typename Tval, typename Tint1, typename Tint2, typename Tint3> ++inline void copy_buffers_2to1(const Tval* input0, ++ const Tval* input1, ++ rocfft_complex<Tval>* output, ++ const Tint1& whole_length, ++ const size_t nbatch, ++ const Tint2& istride, ++ const size_t idist, ++ const Tint3& ostride, ++ const size_t odist, ++ const std::vector<size_t>& ioffset, ++ const std::vector<size_t>& ooffset) ++{ ++ const bool idx_equals_odx = istride == ostride && idist == odist; ++ size_t idx_base = 0; ++ size_t odx_base = 0; ++ auto partitions = partition_rowmajor(whole_length); ++ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) ++ { ++#ifdef _OPENMP ++#pragma omp parallel for num_threads(partitions.size()) ++#endif ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ do ++ { ++ const auto idx = compute_index(index, istride, idx_base); ++ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); ++ output[odx + ooffset[0]] ++ = rocfft_complex<Tval>(input0[idx + ioffset[0]], input1[idx + ioffset[1]]); ++ } while(increment_rowmajor(index, length)); ++ } ++ } ++} ++ ++// Copy data of dimensions length with strides istride and length idist between batches to ++// a buffer with strides ostride and length odist between batches. The input type is ++// complex interleaved and the output type is planar. 
++template <typename Tval, typename Tint1, typename Tint2, typename Tint3> ++inline void copy_buffers_1to2(const rocfft_complex<Tval>* input, ++ Tval* output0, ++ Tval* output1, ++ const Tint1& whole_length, ++ const size_t nbatch, ++ const Tint2& istride, ++ const size_t idist, ++ const Tint3& ostride, ++ const size_t odist, ++ const std::vector<size_t>& ioffset, ++ const std::vector<size_t>& ooffset) ++{ ++ const bool idx_equals_odx = istride == ostride && idist == odist; ++ size_t idx_base = 0; ++ size_t odx_base = 0; ++ auto partitions = partition_rowmajor(whole_length); ++ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) ++ { ++#ifdef _OPENMP ++#pragma omp parallel for num_threads(partitions.size()) ++#endif ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ do ++ { ++ const auto idx = compute_index(index, istride, idx_base); ++ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); ++ output0[odx + ooffset[0]] = input[idx + ioffset[0]].real(); ++ output1[odx + ooffset[1]] = input[idx + ioffset[0]].imag(); ++ } while(increment_rowmajor(index, length)); ++ } ++ } ++} ++ ++// Copy data of dimensions length with strides istride and length idist between batches to ++// a buffer with strides ostride and length odist between batches. The input type given ++// by itype, and the output type is given by otype. 
++template <typename Tint1, typename Tint2, typename Tint3> ++inline void copy_buffers(const std::vector<hostbuf>& input, ++ std::vector<hostbuf>& output, ++ const Tint1& length, ++ const size_t nbatch, ++ const fft_precision precision, ++ const fft_array_type itype, ++ const Tint2& istride, ++ const size_t idist, ++ const fft_array_type otype, ++ const Tint3& ostride, ++ const size_t odist, ++ const std::vector<size_t>& ioffset, ++ const std::vector<size_t>& ooffset) ++{ ++ if(itype == otype) ++ { ++ switch(itype) ++ { ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_hermitian_interleaved: ++ switch(precision) ++ { ++ case fft_precision_half: ++ copy_buffers_1to1( ++ reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()), ++ reinterpret_cast<rocfft_complex<_Float16>*>(output[0].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ ioffset, ++ ooffset); ++ break; ++ case fft_precision_single: ++ copy_buffers_1to1(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()), ++ reinterpret_cast<rocfft_complex<float>*>(output[0].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ ioffset, ++ ooffset); ++ break; ++ case fft_precision_double: ++ copy_buffers_1to1(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()), ++ reinterpret_cast<rocfft_complex<double>*>(output[0].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ ioffset, ++ ooffset); ++ break; ++ } ++ break; ++ case fft_array_type_real: ++ case fft_array_type_complex_planar: ++ case fft_array_type_hermitian_planar: ++ for(unsigned int idx = 0; idx < input.size(); ++idx) ++ { ++ switch(precision) ++ { ++ case fft_precision_half: ++ copy_buffers_1to1(reinterpret_cast<const _Float16*>(input[idx].data()), ++ reinterpret_cast<_Float16*>(output[idx].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ ioffset, ++ ooffset); ++ break; ++ case fft_precision_single: ++ 
copy_buffers_1to1(reinterpret_cast<const float*>(input[idx].data()), ++ reinterpret_cast<float*>(output[idx].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ ioffset, ++ ooffset); ++ break; ++ case fft_precision_double: ++ copy_buffers_1to1(reinterpret_cast<const double*>(input[idx].data()), ++ reinterpret_cast<double*>(output[idx].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ ioffset, ++ ooffset); ++ break; ++ } ++ } ++ break; ++ default: ++ throw std::runtime_error("Invalid data type"); ++ } ++ } ++ else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar) ++ || (itype == fft_array_type_hermitian_interleaved ++ && otype == fft_array_type_hermitian_planar)) ++ { ++ // copy 1to2 ++ switch(precision) ++ { ++ case fft_precision_half: ++ copy_buffers_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()), ++ reinterpret_cast<_Float16*>(output[0].data()), ++ reinterpret_cast<_Float16*>(output[1].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ ioffset, ++ ooffset); ++ break; ++ case fft_precision_single: ++ copy_buffers_1to2(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()), ++ reinterpret_cast<float*>(output[0].data()), ++ reinterpret_cast<float*>(output[1].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ ioffset, ++ ooffset); ++ break; ++ case fft_precision_double: ++ copy_buffers_1to2(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()), ++ reinterpret_cast<double*>(output[0].data()), ++ reinterpret_cast<double*>(output[1].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ ioffset, ++ ooffset); ++ break; ++ } ++ } ++ else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved) ++ || (itype == fft_array_type_hermitian_planar ++ && otype == fft_array_type_hermitian_interleaved)) ++ { ++ // copy 2 to 1 ++ 
switch(precision) ++ { ++ case fft_precision_half: ++ copy_buffers_2to1(reinterpret_cast<const _Float16*>(input[0].data()), ++ reinterpret_cast<const _Float16*>(input[1].data()), ++ reinterpret_cast<rocfft_complex<_Float16>*>(output[0].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ ioffset, ++ ooffset); ++ break; ++ case fft_precision_single: ++ copy_buffers_2to1(reinterpret_cast<const float*>(input[0].data()), ++ reinterpret_cast<const float*>(input[1].data()), ++ reinterpret_cast<rocfft_complex<float>*>(output[0].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ ioffset, ++ ooffset); ++ break; ++ case fft_precision_double: ++ copy_buffers_2to1(reinterpret_cast<const double*>(input[0].data()), ++ reinterpret_cast<const double*>(input[1].data()), ++ reinterpret_cast<rocfft_complex<double>*>(output[0].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ ioffset, ++ ooffset); ++ break; ++ } ++ } ++ else ++ { ++ throw std::runtime_error("Invalid input and output types."); ++ } ++} ++ ++// unroll arbitrary-dimension copy_buffers into specializations for 1-, 2-, 3-dimensions ++template <typename Tint1, typename Tint2, typename Tint3> ++inline void copy_buffers(const std::vector<hostbuf>& input, ++ std::vector<hostbuf>& output, ++ const std::vector<Tint1>& length, ++ const size_t nbatch, ++ const fft_precision precision, ++ const fft_array_type itype, ++ const std::vector<Tint2>& istride, ++ const size_t idist, ++ const fft_array_type otype, ++ const std::vector<Tint3>& ostride, ++ const size_t odist, ++ const std::vector<size_t>& ioffset, ++ const std::vector<size_t>& ooffset) ++{ ++ switch(length.size()) ++ { ++ case 1: ++ return copy_buffers(input, ++ output, ++ length[0], ++ nbatch, ++ precision, ++ itype, ++ istride[0], ++ idist, ++ otype, ++ ostride[0], ++ odist, ++ ioffset, ++ ooffset); ++ case 2: ++ return copy_buffers(input, ++ output, ++ std::make_tuple(length[0], length[1]), ++ 
nbatch, ++ precision, ++ itype, ++ std::make_tuple(istride[0], istride[1]), ++ idist, ++ otype, ++ std::make_tuple(ostride[0], ostride[1]), ++ odist, ++ ioffset, ++ ooffset); ++ case 3: ++ return copy_buffers(input, ++ output, ++ std::make_tuple(length[0], length[1], length[2]), ++ nbatch, ++ precision, ++ itype, ++ std::make_tuple(istride[0], istride[1], istride[2]), ++ idist, ++ otype, ++ std::make_tuple(ostride[0], ostride[1], ostride[2]), ++ odist, ++ ioffset, ++ ooffset); ++ default: ++ abort(); ++ } ++} ++ ++// Compute the L-infinity and L-2 distance between two buffers with strides istride and ++// length idist between batches to a buffer with strides ostride and length odist between ++// batches. Both buffers are of complex type. ++ ++struct VectorNorms ++{ ++ double l_2 = 0.0, l_inf = 0.0; ++}; ++ ++template <typename Tcomplex, typename Tint1, typename Tint2, typename Tint3> ++inline VectorNorms distance_1to1_complex(const Tcomplex* input, ++ const Tcomplex* output, ++ const Tint1& whole_length, ++ const size_t nbatch, ++ const Tint2& istride, ++ const size_t idist, ++ const Tint3& ostride, ++ const size_t odist, ++ std::vector<std::pair<size_t, size_t>>* linf_failures, ++ const double linf_cutoff, ++ const std::vector<size_t>& ioffset, ++ const std::vector<size_t>& ooffset, ++ const double output_scalar = 1.0) ++{ ++ double linf = 0.0; ++ double l2 = 0.0; ++ ++ std::mutex linf_failure_lock; ++ std::vector<std::pair<size_t, size_t>> linf_failures_private; ++ ++ const bool idx_equals_odx = istride == ostride && idist == odist; ++ size_t idx_base = 0; ++ size_t odx_base = 0; ++ auto partitions = partition_colmajor(whole_length); ++ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) ++ { ++#ifdef _OPENMP ++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private) ++#endif ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ double cur_linf = 0.0; ++ double 
cur_l2 = 0.0; ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ ++ do ++ { ++ const auto idx = compute_index(index, istride, idx_base); ++ const auto odx = idx_equals_odx ? idx : compute_index(index, ostride, odx_base); ++ const double rdiff ++ = std::abs(static_cast<double>(output[odx + ooffset[0]].real()) * output_scalar ++ - static_cast<double>(input[idx + ioffset[0]].real())); ++ cur_linf = std::max(rdiff, cur_linf); ++ if(cur_linf > linf_cutoff) ++ { ++ std::pair<size_t, size_t> fval(b, idx); ++ if(linf_failures) ++ linf_failures_private.push_back(fval); ++ } ++ cur_l2 += rdiff * rdiff; ++ ++ const double idiff ++ = std::abs(static_cast<double>(output[odx + ooffset[0]].imag()) * output_scalar ++ - static_cast<double>(input[idx + ioffset[0]].imag())); ++ cur_linf = std::max(idiff, cur_linf); ++ if(cur_linf > linf_cutoff) ++ { ++ std::pair<size_t, size_t> fval(b, idx); ++ if(linf_failures) ++ linf_failures_private.push_back(fval); ++ } ++ cur_l2 += idiff * idiff; ++ ++ } while(increment_rowmajor(index, length)); ++ linf = std::max(linf, cur_linf); ++ l2 += cur_l2; ++ ++ if(linf_failures) ++ { ++ linf_failure_lock.lock(); ++ std::copy(linf_failures_private.begin(), ++ linf_failures_private.end(), ++ std::back_inserter(*linf_failures)); ++ linf_failure_lock.unlock(); ++ } ++ } ++ } ++ return {.l_2 = sqrt(l2), .l_inf = linf}; ++} ++ ++// Compute the L-infinity and L-2 distance between two buffers with strides istride and ++// length idist between batches to a buffer with strides ostride and length odist between ++// batches. Both buffers are of real type. 
++template <typename Tfloat, typename Tint1, typename Tint2, typename Tint3> ++inline VectorNorms distance_1to1_real(const Tfloat* input, ++ const Tfloat* output, ++ const Tint1& whole_length, ++ const size_t nbatch, ++ const Tint2& istride, ++ const size_t idist, ++ const Tint3& ostride, ++ const size_t odist, ++ std::vector<std::pair<size_t, size_t>>* linf_failures, ++ const double linf_cutoff, ++ const std::vector<size_t>& ioffset, ++ const std::vector<size_t>& ooffset, ++ const double output_scalar = 1.0) ++{ ++ double linf = 0.0; ++ double l2 = 0.0; ++ ++ std::mutex linf_failure_lock; ++ std::vector<std::pair<size_t, size_t>> linf_failures_private; ++ ++ const bool idx_equals_odx = istride == ostride && idist == odist; ++ size_t idx_base = 0; ++ size_t odx_base = 0; ++ auto partitions = partition_rowmajor(whole_length); ++ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) ++ { ++#ifdef _OPENMP ++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private) ++#endif ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ double cur_linf = 0.0; ++ double cur_l2 = 0.0; ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ do ++ { ++ const auto idx = compute_index(index, istride, idx_base); ++ const auto odx = idx_equals_odx ? 
idx : compute_index(index, ostride, odx_base); ++ const double diff ++ = std::abs(static_cast<double>(output[odx + ooffset[0]]) * output_scalar ++ - static_cast<double>(input[idx + ioffset[0]])); ++ cur_linf = std::max(diff, cur_linf); ++ if(cur_linf > linf_cutoff) ++ { ++ std::pair<size_t, size_t> fval(b, idx); ++ if(linf_failures) ++ linf_failures_private.push_back(fval); ++ } ++ cur_l2 += diff * diff; ++ ++ } while(increment_rowmajor(index, length)); ++ linf = std::max(linf, cur_linf); ++ l2 += cur_l2; ++ ++ if(linf_failures) ++ { ++ linf_failure_lock.lock(); ++ std::copy(linf_failures_private.begin(), ++ linf_failures_private.end(), ++ std::back_inserter(*linf_failures)); ++ linf_failure_lock.unlock(); ++ } ++ } ++ } ++ return {.l_2 = sqrt(l2), .l_inf = linf}; ++} ++ ++// Compute the L-infinity and L-2 distance between two buffers with strides istride and ++// length idist between batches to a buffer with strides ostride and length odist between ++// batches. input is complex-interleaved, output is complex-planar. 
++template <typename Tval, typename Tint1, typename T2, typename T3> ++inline VectorNorms distance_1to2(const rocfft_complex<Tval>* input, ++ const Tval* output0, ++ const Tval* output1, ++ const Tint1& whole_length, ++ const size_t nbatch, ++ const T2& istride, ++ const size_t idist, ++ const T3& ostride, ++ const size_t odist, ++ std::vector<std::pair<size_t, size_t>>* linf_failures, ++ const double linf_cutoff, ++ const std::vector<size_t>& ioffset, ++ const std::vector<size_t>& ooffset, ++ const double output_scalar = 1.0) ++{ ++ double linf = 0.0; ++ double l2 = 0.0; ++ ++ std::mutex linf_failure_lock; ++ std::vector<std::pair<size_t, size_t>> linf_failures_private; ++ ++ const bool idx_equals_odx = istride == ostride && idist == odist; ++ size_t idx_base = 0; ++ size_t odx_base = 0; ++ auto partitions = partition_rowmajor(whole_length); ++ for(size_t b = 0; b < nbatch; b++, idx_base += idist, odx_base += odist) ++ { ++#ifdef _OPENMP ++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) private(linf_failures_private) ++#endif ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ double cur_linf = 0.0; ++ double cur_l2 = 0.0; ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ do ++ { ++ const auto idx = compute_index(index, istride, idx_base); ++ const auto odx = idx_equals_odx ? 
idx : compute_index(index, ostride, odx_base); ++ const double rdiff ++ = std::abs(static_cast<double>(output0[odx + ooffset[0]]) * output_scalar ++ - static_cast<double>(input[idx + ioffset[0]].real())); ++ cur_linf = std::max(rdiff, cur_linf); ++ if(cur_linf > linf_cutoff) ++ { ++ std::pair<size_t, size_t> fval(b, idx); ++ if(linf_failures) ++ linf_failures_private.push_back(fval); ++ } ++ cur_l2 += rdiff * rdiff; ++ ++ const double idiff ++ = std::abs(static_cast<double>(output1[odx + ooffset[1]]) * output_scalar ++ - static_cast<double>(input[idx + ioffset[0]].imag())); ++ cur_linf = std::max(idiff, cur_linf); ++ if(cur_linf > linf_cutoff) ++ { ++ std::pair<size_t, size_t> fval(b, idx); ++ if(linf_failures) ++ linf_failures_private.push_back(fval); ++ } ++ cur_l2 += idiff * idiff; ++ ++ } while(increment_rowmajor(index, length)); ++ linf = std::max(linf, cur_linf); ++ l2 += cur_l2; ++ ++ if(linf_failures) ++ { ++ linf_failure_lock.lock(); ++ std::copy(linf_failures_private.begin(), ++ linf_failures_private.end(), ++ std::back_inserter(*linf_failures)); ++ linf_failure_lock.unlock(); ++ } ++ } ++ } ++ return {.l_2 = sqrt(l2), .l_inf = linf}; ++} ++ ++// Compute the L-inifnity and L-2 distance between two buffers of dimension length and ++// with types given by itype, otype, and precision. 
++template <typename Tint1, typename Tint2, typename Tint3> ++inline VectorNorms distance(const std::vector<hostbuf>& input, ++ const std::vector<hostbuf>& output, ++ const Tint1& length, ++ const size_t nbatch, ++ const fft_precision precision, ++ const fft_array_type itype, ++ const Tint2& istride, ++ const size_t idist, ++ const fft_array_type otype, ++ const Tint3& ostride, ++ const size_t odist, ++ std::vector<std::pair<size_t, size_t>>* linf_failures, ++ const double linf_cutoff, ++ const std::vector<size_t>& ioffset, ++ const std::vector<size_t>& ooffset, ++ const double output_scalar = 1.0) ++{ ++ VectorNorms dist; ++ ++ if(itype == otype) ++ { ++ switch(itype) ++ { ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_hermitian_interleaved: ++ switch(precision) ++ { ++ case fft_precision_half: ++ dist = distance_1to1_complex( ++ reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()), ++ reinterpret_cast<const rocfft_complex<_Float16>*>(output[0].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ break; ++ case fft_precision_single: ++ dist = distance_1to1_complex( ++ reinterpret_cast<const rocfft_complex<float>*>(input[0].data()), ++ reinterpret_cast<const rocfft_complex<float>*>(output[0].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ break; ++ case fft_precision_double: ++ dist = distance_1to1_complex( ++ reinterpret_cast<const rocfft_complex<double>*>(input[0].data()), ++ reinterpret_cast<const rocfft_complex<double>*>(output[0].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ break; ++ } ++ dist.l_2 *= dist.l_2; ++ break; ++ case fft_array_type_real: ++ case fft_array_type_complex_planar: ++ case 
fft_array_type_hermitian_planar: ++ for(unsigned int idx = 0; idx < input.size(); ++idx) ++ { ++ VectorNorms d; ++ switch(precision) ++ { ++ case fft_precision_half: ++ d = distance_1to1_real(reinterpret_cast<const _Float16*>(input[idx].data()), ++ reinterpret_cast<const _Float16*>(output[idx].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ break; ++ case fft_precision_single: ++ d = distance_1to1_real(reinterpret_cast<const float*>(input[idx].data()), ++ reinterpret_cast<const float*>(output[idx].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ break; ++ case fft_precision_double: ++ d = distance_1to1_real(reinterpret_cast<const double*>(input[idx].data()), ++ reinterpret_cast<const double*>(output[idx].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ break; ++ } ++ dist.l_inf = std::max(d.l_inf, dist.l_inf); ++ dist.l_2 += d.l_2 * d.l_2; ++ } ++ break; ++ default: ++ throw std::runtime_error("Invalid input and output types."); ++ } ++ } ++ else if((itype == fft_array_type_complex_interleaved && otype == fft_array_type_complex_planar) ++ || (itype == fft_array_type_hermitian_interleaved ++ && otype == fft_array_type_hermitian_planar)) ++ { ++ switch(precision) ++ { ++ case fft_precision_half: ++ dist = distance_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()), ++ reinterpret_cast<const _Float16*>(output[0].data()), ++ reinterpret_cast<const _Float16*>(output[1].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ break; ++ case fft_precision_single: ++ dist = distance_1to2(reinterpret_cast<const 
rocfft_complex<float>*>(input[0].data()), ++ reinterpret_cast<const float*>(output[0].data()), ++ reinterpret_cast<const float*>(output[1].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ break; ++ case fft_precision_double: ++ dist = distance_1to2(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()), ++ reinterpret_cast<const double*>(output[0].data()), ++ reinterpret_cast<const double*>(output[1].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ ostride, ++ odist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ break; ++ } ++ dist.l_2 *= dist.l_2; ++ } ++ else if((itype == fft_array_type_complex_planar && otype == fft_array_type_complex_interleaved) ++ || (itype == fft_array_type_hermitian_planar ++ && otype == fft_array_type_hermitian_interleaved)) ++ { ++ switch(precision) ++ { ++ case fft_precision_half: ++ dist ++ = distance_1to2(reinterpret_cast<const rocfft_complex<_Float16>*>(output[0].data()), ++ reinterpret_cast<const _Float16*>(input[0].data()), ++ reinterpret_cast<const _Float16*>(input[1].data()), ++ length, ++ nbatch, ++ ostride, ++ odist, ++ istride, ++ idist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ break; ++ case fft_precision_single: ++ dist = distance_1to2(reinterpret_cast<const rocfft_complex<float>*>(output[0].data()), ++ reinterpret_cast<const float*>(input[0].data()), ++ reinterpret_cast<const float*>(input[1].data()), ++ length, ++ nbatch, ++ ostride, ++ odist, ++ istride, ++ idist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ break; ++ case fft_precision_double: ++ dist = distance_1to2(reinterpret_cast<const rocfft_complex<double>*>(output[0].data()), ++ reinterpret_cast<const double*>(input[0].data()), ++ reinterpret_cast<const double*>(input[1].data()), ++ length, ++ nbatch, ++ ostride, ++ odist, ++ istride, ++ 
idist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ break; ++ } ++ dist.l_2 *= dist.l_2; ++ } ++ else ++ { ++ throw std::runtime_error("Invalid input and output types."); ++ } ++ dist.l_2 = sqrt(dist.l_2); ++ return dist; ++} ++ ++// check if the specified length + stride/dist is contiguous ++template <typename Tint1, typename Tint2> ++bool is_contiguous_rowmajor(const std::vector<Tint1>& length, ++ const std::vector<Tint2>& stride, ++ size_t dist) ++{ ++ size_t expected_stride = 1; ++ auto stride_it = stride.rbegin(); ++ auto length_it = length.rbegin(); ++ for(; stride_it != stride.rend() && length_it != length.rend(); ++stride_it, ++length_it) ++ { ++ if(*stride_it != expected_stride) ++ return false; ++ expected_stride *= *length_it; ++ } ++ return expected_stride == dist; ++} ++ ++// Unroll arbitrary-dimension distance into specializations for 1-, 2-, 3-dimensions ++template <typename Tint1, typename Tint2, typename Tint3> ++inline VectorNorms distance(const std::vector<hostbuf>& input, ++ const std::vector<hostbuf>& output, ++ std::vector<Tint1> length, ++ size_t nbatch, ++ const fft_precision precision, ++ const fft_array_type itype, ++ std::vector<Tint2> istride, ++ const size_t idist, ++ const fft_array_type otype, ++ std::vector<Tint3> ostride, ++ const size_t odist, ++ std::vector<std::pair<size_t, size_t>>* linf_failures, ++ const double linf_cutoff, ++ const std::vector<size_t>& ioffset, ++ const std::vector<size_t>& ooffset, ++ const double output_scalar = 1.0) ++{ ++ // If istride and ostride are both contiguous, collapse them down ++ // to one dimension. Index calculation is simpler (and faster) ++ // in the 1D case. 
++ if(is_contiguous_rowmajor(length, istride, idist) ++ && is_contiguous_rowmajor(length, ostride, odist)) ++ { ++ length = {product(length.begin(), length.end()) * nbatch}; ++ istride = {static_cast<Tint2>(1)}; ++ ostride = {static_cast<Tint3>(1)}; ++ nbatch = 1; ++ } ++ ++ switch(length.size()) ++ { ++ case 1: ++ return distance(input, ++ output, ++ length[0], ++ nbatch, ++ precision, ++ itype, ++ istride[0], ++ idist, ++ otype, ++ ostride[0], ++ odist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ case 2: ++ return distance(input, ++ output, ++ std::make_tuple(length[0], length[1]), ++ nbatch, ++ precision, ++ itype, ++ std::make_tuple(istride[0], istride[1]), ++ idist, ++ otype, ++ std::make_tuple(ostride[0], ostride[1]), ++ odist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ case 3: ++ return distance(input, ++ output, ++ std::make_tuple(length[0], length[1], length[2]), ++ nbatch, ++ precision, ++ itype, ++ std::make_tuple(istride[0], istride[1], istride[2]), ++ idist, ++ otype, ++ std::make_tuple(ostride[0], ostride[1], ostride[2]), ++ odist, ++ linf_failures, ++ linf_cutoff, ++ ioffset, ++ ooffset, ++ output_scalar); ++ default: ++ abort(); ++ } ++} ++ ++// Compute the L-infinity and L-2 norm of a buffer with strides istride and ++// length idist. Data is rocfft_complex. 
++template <typename Tcomplex, typename T1, typename T2> ++inline VectorNorms norm_complex(const Tcomplex* input, ++ const T1& whole_length, ++ const size_t nbatch, ++ const T2& istride, ++ const size_t idist, ++ const std::vector<size_t>& offset) ++{ ++ double linf = 0.0; ++ double l2 = 0.0; ++ ++ size_t idx_base = 0; ++ auto partitions = partition_rowmajor(whole_length); ++ for(size_t b = 0; b < nbatch; b++, idx_base += idist) ++ { ++#ifdef _OPENMP ++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) ++#endif ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ double cur_linf = 0.0; ++ double cur_l2 = 0.0; ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ do ++ { ++ const auto idx = compute_index(index, istride, idx_base); ++ ++ const double rval = std::abs(static_cast<double>(input[idx + offset[0]].real())); ++ cur_linf = std::max(rval, cur_linf); ++ cur_l2 += rval * rval; ++ ++ const double ival = std::abs(static_cast<double>(input[idx + offset[0]].imag())); ++ cur_linf = std::max(ival, cur_linf); ++ cur_l2 += ival * ival; ++ ++ } while(increment_rowmajor(index, length)); ++ linf = std::max(linf, cur_linf); ++ l2 += cur_l2; ++ } ++ } ++ return {.l_2 = sqrt(l2), .l_inf = linf}; ++} ++ ++// Compute the L-infinity and L-2 norm of abuffer with strides istride and ++// length idist. Data is real-valued. 
++template <typename Tfloat, typename T1, typename T2> ++inline VectorNorms norm_real(const Tfloat* input, ++ const T1& whole_length, ++ const size_t nbatch, ++ const T2& istride, ++ const size_t idist, ++ const std::vector<size_t>& offset) ++{ ++ double linf = 0.0; ++ double l2 = 0.0; ++ ++ size_t idx_base = 0; ++ auto partitions = partition_rowmajor(whole_length); ++ for(size_t b = 0; b < nbatch; b++, idx_base += idist) ++ { ++#ifdef _OPENMP ++#pragma omp parallel for reduction(max : linf) reduction(+ : l2) num_threads(partitions.size()) ++#endif ++ for(size_t part = 0; part < partitions.size(); ++part) ++ { ++ double cur_linf = 0.0; ++ double cur_l2 = 0.0; ++ auto index = partitions[part].first; ++ const auto length = partitions[part].second; ++ do ++ { ++ const auto idx = compute_index(index, istride, idx_base); ++ const double val = std::abs(static_cast<double>(input[idx + offset[0]])); ++ cur_linf = std::max(val, cur_linf); ++ cur_l2 += val * val; ++ ++ } while(increment_rowmajor(index, length)); ++ linf = std::max(linf, cur_linf); ++ l2 += cur_l2; ++ } ++ } ++ return {.l_2 = sqrt(l2), .l_inf = linf}; ++} ++ ++// Compute the L-infinity and L-2 norm of abuffer with strides istride and ++// length idist. Data format is given by precision and itype. 
++template <typename T1, typename T2> ++inline VectorNorms norm(const std::vector<hostbuf>& input, ++ const T1& length, ++ const size_t nbatch, ++ const fft_precision precision, ++ const fft_array_type itype, ++ const T2& istride, ++ const size_t idist, ++ const std::vector<size_t>& offset) ++{ ++ VectorNorms norm; ++ ++ switch(itype) ++ { ++ case fft_array_type_complex_interleaved: ++ case fft_array_type_hermitian_interleaved: ++ switch(precision) ++ { ++ case fft_precision_half: ++ norm = norm_complex(reinterpret_cast<const rocfft_complex<_Float16>*>(input[0].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ offset); ++ break; ++ case fft_precision_single: ++ norm = norm_complex(reinterpret_cast<const rocfft_complex<float>*>(input[0].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ offset); ++ break; ++ case fft_precision_double: ++ norm = norm_complex(reinterpret_cast<const rocfft_complex<double>*>(input[0].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ offset); ++ break; ++ } ++ norm.l_2 *= norm.l_2; ++ break; ++ case fft_array_type_real: ++ case fft_array_type_complex_planar: ++ case fft_array_type_hermitian_planar: ++ for(unsigned int idx = 0; idx < input.size(); ++idx) ++ { ++ VectorNorms n; ++ switch(precision) ++ { ++ case fft_precision_half: ++ n = norm_real(reinterpret_cast<const _Float16*>(input[idx].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ offset); ++ break; ++ case fft_precision_single: ++ n = norm_real(reinterpret_cast<const float*>(input[idx].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ offset); ++ break; ++ case fft_precision_double: ++ n = norm_real(reinterpret_cast<const double*>(input[idx].data()), ++ length, ++ nbatch, ++ istride, ++ idist, ++ offset); ++ break; ++ } ++ norm.l_inf = std::max(n.l_inf, norm.l_inf); ++ norm.l_2 += n.l_2 * n.l_2; ++ } ++ break; ++ default: ++ throw std::runtime_error("Invalid data type"); ++ } ++ ++ norm.l_2 = sqrt(norm.l_2); ++ return norm; ++} ++ ++// Unroll 
arbitrary-dimension norm into specializations for 1-, 2-, 3-dimensions ++template <typename T1, typename T2> ++inline VectorNorms norm(const std::vector<hostbuf>& input, ++ std::vector<T1> length, ++ size_t nbatch, ++ const fft_precision precision, ++ const fft_array_type type, ++ std::vector<T2> stride, ++ const size_t dist, ++ const std::vector<size_t>& offset) ++{ ++ // If stride is contiguous, collapse it down to one dimension. ++ // Index calculation is simpler (and faster) in the 1D case. ++ if(is_contiguous_rowmajor(length, stride, dist)) ++ { ++ length = {product(length.begin(), length.end()) * nbatch}; ++ stride = {static_cast<T2>(1)}; ++ nbatch = 1; ++ } ++ ++ switch(length.size()) ++ { ++ case 1: ++ return norm(input, length[0], nbatch, precision, type, stride[0], dist, offset); ++ case 2: ++ return norm(input, ++ std::make_tuple(length[0], length[1]), ++ nbatch, ++ precision, ++ type, ++ std::make_tuple(stride[0], stride[1]), ++ dist, ++ offset); ++ case 3: ++ return norm(input, ++ std::make_tuple(length[0], length[1], length[2]), ++ nbatch, ++ precision, ++ type, ++ std::make_tuple(stride[0], stride[1], stride[2]), ++ dist, ++ offset); ++ default: ++ abort(); ++ } ++} ++ ++// Given a data type and precision, the distance between batches, and ++// the batch size, allocate the required host buffer(s). ++static std::vector<hostbuf> allocate_host_buffer(const fft_precision precision, ++ const fft_array_type type, ++ const std::vector<size_t>& size) ++{ ++ std::vector<hostbuf> buffers(size.size()); ++ for(unsigned int i = 0; i < size.size(); ++i) ++ { ++ buffers[i].alloc(size[i] * var_size<size_t>(precision, type)); ++ } ++ return buffers; ++} ++ ++// Check if the required buffers fit in the device vram. 
++inline bool vram_fits_problem(const size_t prob_size, const size_t vram_avail, int deviceId = 0) ++{ ++ // We keep a small margin of error for fitting the problem into vram: ++ const size_t extra = 1 << 27; ++ ++ return vram_avail > prob_size + extra; ++} ++ ++// Computes the twiddle table VRAM footprint for r2c/c2r transforms. ++// This function will return 0 for the other transform types, since ++// the VRAM footprint in rocFFT is negligible for the other cases. ++inline size_t twiddle_table_vram_footprint(const fft_params& params) ++{ ++ size_t vram_footprint = 0; ++ ++ // Add vram footprint from real/complex even twiddle buffer size. ++ if(params.transform_type == fft_transform_type_real_forward ++ || params.transform_type == fft_transform_type_real_inverse) ++ { ++ const auto realdim = params.length.back(); ++ if(realdim % 2 == 0) ++ { ++ const auto complex_size = params.precision == fft_precision_single ? 8 : 16; ++ // even length twiddle size is 1/4 of the real size, but ++ // in complex elements ++ vram_footprint += realdim * complex_size / 4; ++ } ++ } ++ ++ return vram_footprint; ++} ++ ++#endif +diff --git a/shared/fftw_transform.h b/shared/fftw_transform.h +new file mode 100644 +index 0000000..873a373 +--- /dev/null ++++ b/shared/fftw_transform.h +@@ -0,0 +1,493 @@ ++// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. ++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. 
++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#pragma once ++#ifndef FFTWTRANSFORM_H ++#define FFTWTRANSFORM_H ++ ++#include "hostbuf.h" ++#include "rocfft_complex.h" ++#include "test_params.h" ++#include <fftw3.h> ++#include <vector> ++ ++// Function to return maximum error for float and double types. ++// ++// Following Schatzman (1996; Accuracy of the Discrete Fourier ++// Transform and the Fast Fourier Transform), the shape of relative ++// l_2 error vs length should look like ++// ++// epsilon * sqrt(log2(length)). ++// ++// The magic epsilon constants below were chosen so that we get a ++// reasonable upper bound for (all of) our tests. ++// ++// For rocFFT, prime lengths result in the highest error. As such, ++// the epsilons below are perhaps too loose for pow2 lengths; but they ++// are appropriate for prime lengths. ++template <typename Tfloat> ++inline double type_epsilon(); ++template <> ++inline double type_epsilon<_Float16>() ++{ ++ return half_epsilon; ++} ++template <> ++inline double type_epsilon<float>() ++{ ++ return single_epsilon; ++} ++template <> ++inline double type_epsilon<double>() ++{ ++ return double_epsilon; ++} ++ ++// C++ traits to translate float->fftwf_complex and ++// double->fftw_complex. 
++// The correct FFTW complex type can be accessed via, for example, ++// using complex_t = typename fftw_complex_trait<Tfloat>::complex_t; ++template <typename Tfloat> ++struct fftw_trait; ++template <> ++struct fftw_trait<_Float16> ++{ ++ // fftw does not support half precision, so use single precision and convert ++ using fftw_complex_type = fftwf_complex; ++ using fftw_plan_type = fftwf_plan; ++}; ++template <> ++struct fftw_trait<float> ++{ ++ using fftw_complex_type = fftwf_complex; ++ using fftw_plan_type = fftwf_plan; ++}; ++template <> ++struct fftw_trait<double> ++{ ++ using fftw_complex_type = fftw_complex; ++ using fftw_plan_type = fftw_plan; ++}; ++ ++// Copies the half-precision input buffer to a single-precision ++// buffer. Note that the input buffer is already sized like it's a ++// single-precision buffer (but only half of it is filled), because ++// we allocate a single-precision buffer for FFTW to plan with. ++static hostbuf half_to_single_copy(const hostbuf& in) ++{ ++ auto out = in.copy(); ++ auto in_begin = reinterpret_cast<const _Float16*>(in.data()); ++ std::copy_n(in_begin, in.size() / sizeof(_Float16) / 2, reinterpret_cast<float*>(out.data())); ++ return out; ++} ++ ++// converts a wider precision buffer to a narrower precision, in-place ++template <typename TfloatIn, typename TfloatOut> ++void narrow_precision_inplace(hostbuf& in) ++{ ++ // ensure we're actually shrinking the data ++ static_assert(sizeof(TfloatIn) > sizeof(TfloatOut)); ++ ++ auto readPtr = reinterpret_cast<const TfloatIn*>(in.data()); ++ auto writePtr = reinterpret_cast<TfloatOut*>(in.data()); ++ std::copy_n(readPtr, in.size() / sizeof(TfloatIn), writePtr); ++ in.shrink(in.size() / (sizeof(TfloatIn) / sizeof(TfloatOut))); ++} ++ ++static void single_to_half_inplace(hostbuf& in) ++{ ++ narrow_precision_inplace<float, _Float16>(in); ++} ++ ++// Template wrappers for real-valued FFTW allocators: ++template <typename Tfloat> ++inline Tfloat* fftw_alloc_real_type(size_t n); 
++template <> ++inline float* fftw_alloc_real_type<float>(size_t n) ++{ ++ return fftwf_alloc_real(n); ++} ++template <> ++inline double* fftw_alloc_real_type<double>(size_t n) ++{ ++ return fftw_alloc_real(n); ++} ++ ++// Template wrappers for complex-valued FFTW allocators: ++template <typename Tfloat> ++inline typename fftw_trait<Tfloat>::fftw_complex_type* fftw_alloc_complex_type(size_t n); ++template <> ++inline typename fftw_trait<float>::fftw_complex_type* fftw_alloc_complex_type<float>(size_t n) ++{ ++ return fftwf_alloc_complex(n); ++} ++template <> ++inline typename fftw_trait<double>::fftw_complex_type* fftw_alloc_complex_type<double>(size_t n) ++{ ++ return fftw_alloc_complex(n); ++} ++ ++template <typename fftw_type> ++inline fftw_type* fftw_alloc_type(size_t n); ++template <> ++inline float* fftw_alloc_type<float>(size_t n) ++{ ++ return fftw_alloc_real_type<float>(n); ++} ++template <> ++inline double* fftw_alloc_type<double>(size_t n) ++{ ++ return fftw_alloc_real_type<double>(n); ++} ++template <> ++inline fftwf_complex* fftw_alloc_type<fftwf_complex>(size_t n) ++{ ++ return fftw_alloc_complex_type<float>(n); ++} ++template <> ++inline fftw_complex* fftw_alloc_type<fftw_complex>(size_t n) ++{ ++ return fftw_alloc_complex_type<double>(n); ++} ++template <> ++inline rocfft_complex<float>* fftw_alloc_type<rocfft_complex<float>>(size_t n) ++{ ++ return (rocfft_complex<float>*)fftw_alloc_complex_type<float>(n); ++} ++template <> ++inline rocfft_complex<double>* fftw_alloc_type<rocfft_complex<double>>(size_t n) ++{ ++ return (rocfft_complex<double>*)fftw_alloc_complex_type<double>(n); ++} ++ ++// Template wrappers for FFTW plan executors: ++template <typename Tfloat> ++inline void fftw_execute_type(typename fftw_trait<Tfloat>::fftw_plan_type plan); ++template <> ++inline void fftw_execute_type<float>(typename fftw_trait<float>::fftw_plan_type plan) ++{ ++ return fftwf_execute(plan); ++} ++template <> ++inline void fftw_execute_type<double>(typename 
fftw_trait<double>::fftw_plan_type plan) ++{ ++ return fftw_execute(plan); ++} ++ ++// Template wrappers for FFTW plan destroyers: ++template <typename Tfftw_plan> ++inline void fftw_destroy_plan_type(Tfftw_plan plan); ++template <> ++inline void fftw_destroy_plan_type<fftwf_plan>(fftwf_plan plan) ++{ ++ return fftwf_destroy_plan(plan); ++} ++template <> ++inline void fftw_destroy_plan_type<fftw_plan>(fftw_plan plan) ++{ ++ return fftw_destroy_plan(plan); ++} ++ ++// Template wrappers for FFTW c2c planners: ++template <typename Tfloat> ++inline typename fftw_trait<Tfloat>::fftw_plan_type ++ fftw_plan_guru64_dft(int rank, ++ const fftw_iodim64* dims, ++ int howmany_rank, ++ const fftw_iodim64* howmany_dims, ++ typename fftw_trait<Tfloat>::fftw_complex_type* in, ++ typename fftw_trait<Tfloat>::fftw_complex_type* out, ++ int sign, ++ unsigned flags); ++ ++template <> ++inline typename fftw_trait<_Float16>::fftw_plan_type ++ fftw_plan_guru64_dft<_Float16>(int rank, ++ const fftw_iodim64* dims, ++ int howmany_rank, ++ const fftw_iodim64* howmany_dims, ++ typename fftw_trait<_Float16>::fftw_complex_type* in, ++ typename fftw_trait<_Float16>::fftw_complex_type* out, ++ int sign, ++ unsigned flags) ++{ ++ return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); ++} ++ ++template <> ++inline typename fftw_trait<float>::fftw_plan_type ++ fftw_plan_guru64_dft<float>(int rank, ++ const fftw_iodim64* dims, ++ int howmany_rank, ++ const fftw_iodim64* howmany_dims, ++ typename fftw_trait<float>::fftw_complex_type* in, ++ typename fftw_trait<float>::fftw_complex_type* out, ++ int sign, ++ unsigned flags) ++{ ++ return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); ++} ++ ++template <> ++inline typename fftw_trait<double>::fftw_plan_type ++ fftw_plan_guru64_dft<double>(int rank, ++ const fftw_iodim64* dims, ++ int howmany_rank, ++ const fftw_iodim64* howmany_dims, ++ typename 
fftw_trait<double>::fftw_complex_type* in, ++ typename fftw_trait<double>::fftw_complex_type* out, ++ int sign, ++ unsigned flags) ++{ ++ return fftw_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); ++} ++ ++// Template wrappers for FFTW c2c executors: ++template <typename Tfloat> ++inline void fftw_plan_execute_c2c(typename fftw_trait<Tfloat>::fftw_plan_type plan, ++ std::vector<hostbuf>& in, ++ std::vector<hostbuf>& out); ++ ++template <> ++inline void fftw_plan_execute_c2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan, ++ std::vector<hostbuf>& in, ++ std::vector<hostbuf>& out) ++{ ++ // since FFTW does not natively support half precision, convert ++ // input to single, execute, then convert output back to half ++ auto in_single = half_to_single_copy(in.front()); ++ fftwf_execute_dft(plan, ++ reinterpret_cast<fftwf_complex*>(in_single.data()), ++ reinterpret_cast<fftwf_complex*>(out.front().data())); ++ single_to_half_inplace(out.front()); ++} ++ ++template <> ++inline void fftw_plan_execute_c2c<float>(typename fftw_trait<float>::fftw_plan_type plan, ++ std::vector<hostbuf>& in, ++ std::vector<hostbuf>& out) ++{ ++ fftwf_execute_dft(plan, ++ reinterpret_cast<fftwf_complex*>(in.front().data()), ++ reinterpret_cast<fftwf_complex*>(out.front().data())); ++} ++ ++template <> ++inline void fftw_plan_execute_c2c<double>(typename fftw_trait<double>::fftw_plan_type plan, ++ std::vector<hostbuf>& in, ++ std::vector<hostbuf>& out) ++{ ++ fftw_execute_dft(plan, ++ reinterpret_cast<fftw_complex*>(in.front().data()), ++ reinterpret_cast<fftw_complex*>(out.front().data())); ++} ++ ++// Template wrappers for FFTW r2c planners: ++template <typename Tfloat> ++inline typename fftw_trait<Tfloat>::fftw_plan_type ++ fftw_plan_guru64_r2c(int rank, ++ const fftw_iodim64* dims, ++ int howmany_rank, ++ const fftw_iodim64* howmany_dims, ++ Tfloat* in, ++ typename fftw_trait<Tfloat>::fftw_complex_type* out, ++ unsigned flags); ++template <> 
++inline typename fftw_trait<_Float16>::fftw_plan_type ++ fftw_plan_guru64_r2c<_Float16>(int rank, ++ const fftw_iodim64* dims, ++ int howmany_rank, ++ const fftw_iodim64* howmany_dims, ++ _Float16* in, ++ typename fftw_trait<_Float16>::fftw_complex_type* out, ++ unsigned flags) ++{ ++ return fftwf_plan_guru64_dft_r2c( ++ rank, dims, howmany_rank, howmany_dims, reinterpret_cast<float*>(in), out, flags); ++} ++template <> ++inline typename fftw_trait<float>::fftw_plan_type ++ fftw_plan_guru64_r2c<float>(int rank, ++ const fftw_iodim64* dims, ++ int howmany_rank, ++ const fftw_iodim64* howmany_dims, ++ float* in, ++ typename fftw_trait<float>::fftw_complex_type* out, ++ unsigned flags) ++{ ++ return fftwf_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags); ++} ++template <> ++inline typename fftw_trait<double>::fftw_plan_type ++ fftw_plan_guru64_r2c<double>(int rank, ++ const fftw_iodim64* dims, ++ int howmany_rank, ++ const fftw_iodim64* howmany_dims, ++ double* in, ++ typename fftw_trait<double>::fftw_complex_type* out, ++ unsigned flags) ++{ ++ return fftw_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags); ++} ++ ++// Template wrappers for FFTW r2c executors: ++template <typename Tfloat> ++inline void fftw_plan_execute_r2c(typename fftw_trait<Tfloat>::fftw_plan_type plan, ++ std::vector<hostbuf>& in, ++ std::vector<hostbuf>& out); ++template <> ++inline void fftw_plan_execute_r2c<_Float16>(typename fftw_trait<float>::fftw_plan_type plan, ++ std::vector<hostbuf>& in, ++ std::vector<hostbuf>& out) ++{ ++ // since FFTW does not natively support half precision, convert ++ // input to single, execute, then convert output back to half ++ auto in_single = half_to_single_copy(in.front()); ++ fftwf_execute_dft_r2c(plan, ++ reinterpret_cast<float*>(in_single.data()), ++ reinterpret_cast<fftwf_complex*>(out.front().data())); ++ single_to_half_inplace(out.front()); ++} ++template <> ++inline void 
fftw_plan_execute_r2c<float>(typename fftw_trait<float>::fftw_plan_type plan, ++ std::vector<hostbuf>& in, ++ std::vector<hostbuf>& out) ++{ ++ fftwf_execute_dft_r2c(plan, ++ reinterpret_cast<float*>(in.front().data()), ++ reinterpret_cast<fftwf_complex*>(out.front().data())); ++} ++template <> ++inline void fftw_plan_execute_r2c<double>(typename fftw_trait<double>::fftw_plan_type plan, ++ std::vector<hostbuf>& in, ++ std::vector<hostbuf>& out) ++{ ++ fftw_execute_dft_r2c(plan, ++ reinterpret_cast<double*>(in.front().data()), ++ reinterpret_cast<fftw_complex*>(out.front().data())); ++} ++ ++// Template wrappers for FFTW c2r planners: ++template <typename Tfloat> ++inline typename fftw_trait<Tfloat>::fftw_plan_type ++ fftw_plan_guru64_c2r(int rank, ++ const fftw_iodim64* dims, ++ int howmany_rank, ++ const fftw_iodim64* howmany_dims, ++ typename fftw_trait<Tfloat>::fftw_complex_type* in, ++ Tfloat* out, ++ unsigned flags); ++template <> ++inline typename fftw_trait<_Float16>::fftw_plan_type ++ fftw_plan_guru64_c2r<_Float16>(int rank, ++ const fftw_iodim64* dims, ++ int howmany_rank, ++ const fftw_iodim64* howmany_dims, ++ typename fftw_trait<_Float16>::fftw_complex_type* in, ++ _Float16* out, ++ unsigned flags) ++{ ++ return fftwf_plan_guru64_dft_c2r( ++ rank, dims, howmany_rank, howmany_dims, in, reinterpret_cast<float*>(out), flags); ++} ++template <> ++inline typename fftw_trait<float>::fftw_plan_type ++ fftw_plan_guru64_c2r<float>(int rank, ++ const fftw_iodim64* dims, ++ int howmany_rank, ++ const fftw_iodim64* howmany_dims, ++ typename fftw_trait<float>::fftw_complex_type* in, ++ float* out, ++ unsigned flags) ++{ ++ return fftwf_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags); ++} ++template <> ++inline typename fftw_trait<double>::fftw_plan_type ++ fftw_plan_guru64_c2r<double>(int rank, ++ const fftw_iodim64* dims, ++ int howmany_rank, ++ const fftw_iodim64* howmany_dims, ++ typename fftw_trait<double>::fftw_complex_type* in, ++ 
double* out, ++ unsigned flags) ++{ ++ return fftw_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags); ++} ++ ++// Template wrappers for FFTW c2r executors: ++template <typename Tfloat> ++inline void fftw_plan_execute_c2r(typename fftw_trait<Tfloat>::fftw_plan_type plan, ++ std::vector<hostbuf>& in, ++ std::vector<hostbuf>& out); ++template <> ++inline void fftw_plan_execute_c2r<_Float16>(typename fftw_trait<float>::fftw_plan_type plan, ++ std::vector<hostbuf>& in, ++ std::vector<hostbuf>& out) ++{ ++ // since FFTW does not natively support half precision, convert ++ // input to single, execute, then convert output back to half ++ auto in_single = half_to_single_copy(in.front()); ++ fftwf_execute_dft_c2r(plan, ++ reinterpret_cast<fftwf_complex*>(in_single.data()), ++ reinterpret_cast<float*>(out.front().data())); ++ single_to_half_inplace(out.front()); ++} ++template <> ++inline void fftw_plan_execute_c2r<float>(typename fftw_trait<float>::fftw_plan_type plan, ++ std::vector<hostbuf>& in, ++ std::vector<hostbuf>& out) ++{ ++ fftwf_execute_dft_c2r(plan, ++ reinterpret_cast<fftwf_complex*>(in.front().data()), ++ reinterpret_cast<float*>(out.front().data())); ++} ++template <> ++inline void fftw_plan_execute_c2r<double>(typename fftw_trait<double>::fftw_plan_type plan, ++ std::vector<hostbuf>& in, ++ std::vector<hostbuf>& out) ++{ ++ fftw_execute_dft_c2r(plan, ++ reinterpret_cast<fftw_complex*>(in.front().data()), ++ reinterpret_cast<double*>(out.front().data())); ++} ++ ++#ifdef FFTW_HAVE_SPRINT_PLAN ++// Template wrappers for FFTW print plan: ++template <typename Tfloat> ++inline char* fftw_sprint_plan(const typename fftw_trait<Tfloat>::fftw_plan_type plan); ++template <> ++inline char* fftw_sprint_plan<_Float16>(const typename fftw_trait<_Float16>::fftw_plan_type plan) ++{ ++ return fftwf_sprint_plan(plan); ++} ++template <> ++inline char* fftw_sprint_plan<float>(const typename fftw_trait<float>::fftw_plan_type plan) ++{ ++ return 
fftwf_sprint_plan(plan); ++} ++template <> ++inline char* fftw_sprint_plan<double>(const typename fftw_trait<double>::fftw_plan_type plan) ++{ ++ return fftw_sprint_plan(plan); ++} ++#endif ++ ++#endif +diff --git a/shared/gpubuf.h b/shared/gpubuf.h +new file mode 100644 +index 0000000..993fa95 +--- /dev/null ++++ b/shared/gpubuf.h +@@ -0,0 +1,134 @@ ++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. ++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#ifndef ROCFFT_GPUBUF_H ++#define ROCFFT_GPUBUF_H ++ ++#include "rocfft_hip.h" ++#include <cstdlib> ++ ++// Simple RAII class for GPU buffers. 
T is the type of pointer that ++// data() returns ++template <class T = void> ++class gpubuf_t ++{ ++public: ++ gpubuf_t() {} ++ // buffers are movable but not copyable ++ gpubuf_t(gpubuf_t&& other) ++ { ++ std::swap(buf, other.buf); ++ std::swap(bsize, other.bsize); ++ std::swap(device, other.device); ++ } ++ gpubuf_t& operator=(gpubuf_t&& other) ++ { ++ std::swap(buf, other.buf); ++ std::swap(bsize, other.bsize); ++ std::swap(device, other.device); ++ return *this; ++ } ++ gpubuf_t(const gpubuf_t&) = delete; ++ gpubuf_t& operator=(const gpubuf_t&) = delete; ++ ++ ~gpubuf_t() ++ { ++ free(); ++ } ++ ++ static bool use_alloc_managed() ++ { ++ return std::getenv("ROCFFT_MALLOC_MANAGED"); ++ } ++ ++ hipError_t alloc(const size_t size) ++ { ++ // remember the device that was current as of alloc, so we can ++ // free on the correct device ++ auto ret = hipGetDevice(&device); ++ if(ret != hipSuccess) ++ return ret; ++ ++ bsize = size; ++ static bool alloc_managed = use_alloc_managed(); ++ free(); ++ ret = alloc_managed ? 
hipMallocManaged(&buf, bsize) : hipMalloc(&buf, bsize); ++ if(ret != hipSuccess) ++ { ++ buf = nullptr; ++ bsize = 0; ++ } ++ return ret; ++ } ++ ++ size_t size() const ++ { ++ return bsize; ++ } ++ ++ void free() ++ { ++ if(buf != nullptr) ++ { ++ // free on the device we allocated on ++ rocfft_scoped_device dev(device); ++ (void)hipFree(buf); ++ buf = nullptr; ++ bsize = 0; ++ } ++ } ++ ++ // return a pointer to the allocated memory, offset by the ++ // specified number of bytes ++ T* data_offset(size_t offset_bytes = 0) const ++ { ++ void* ptr = static_cast<char*>(buf) + offset_bytes; ++ return static_cast<T*>(ptr); ++ } ++ ++ T* data() const ++ { ++ return static_cast<T*>(buf); ++ } ++ ++ // equality/bool tests ++ bool operator==(std::nullptr_t n) const ++ { ++ return buf == n; ++ } ++ bool operator!=(std::nullptr_t n) const ++ { ++ return buf != n; ++ } ++ operator bool() const ++ { ++ return buf; ++ } ++ ++private: ++ // The GPU buffer ++ void* buf = nullptr; ++ size_t bsize = 0; ++ int device = 0; ++}; ++ ++// default gpubuf that gives out void* pointers ++typedef gpubuf_t<> gpubuf; ++#endif +diff --git a/shared/hip_object_wrapper.h b/shared/hip_object_wrapper.h +new file mode 100644 +index 0000000..54083ab +--- /dev/null ++++ b/shared/hip_object_wrapper.h +@@ -0,0 +1,86 @@ ++/****************************************************************************** ++* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. 
++* ++* Permission is hereby granted, free of charge, to any person obtaining a copy ++* of this software and associated documentation files (the "Software"), to deal ++* in the Software without restriction, including without limitation the rights ++* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++* copies of the Software, and to permit persons to whom the Software is ++* furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice shall be included in ++* all copies or substantial portions of the Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++* THE SOFTWARE. 
++*******************************************************************************/ ++ ++#ifndef ROCFFT_HIP_OBJ_WRAPPER_H ++#define ROCFFT_HIP_OBJ_WRAPPER_H ++ ++#include "rocfft_hip.h" ++ ++// RAII wrapper around HIP objects ++template <typename T, auto TCreate, auto TDestroy> ++struct hip_object_wrapper_t ++{ ++ hip_object_wrapper_t() ++ : obj(nullptr) ++ { ++ } ++ ++ void alloc() ++ { ++ if(obj == nullptr && TCreate(&obj) != hipSuccess) ++ throw std::runtime_error("hip create failure"); ++ } ++ ++ void free() ++ { ++ if(obj) ++ { ++ (void)TDestroy(obj); ++ obj = nullptr; ++ } ++ } ++ ++ operator const T&() const ++ { ++ return obj; ++ } ++ operator T&() ++ { ++ return obj; ++ } ++ ++ operator bool() const ++ { ++ return obj != nullptr; ++ } ++ ++ ~hip_object_wrapper_t() ++ { ++ free(); ++ } ++ ++ hip_object_wrapper_t(const hip_object_wrapper_t&) = delete; ++ hip_object_wrapper_t& operator=(const hip_object_wrapper_t&) = delete; ++ hip_object_wrapper_t(hip_object_wrapper_t&& other) ++ : obj(other.obj) ++ { ++ other.obj = nullptr; ++ } ++ ++private: ++ T obj; ++}; ++ ++typedef hip_object_wrapper_t<hipStream_t, hipStreamCreate, hipStreamDestroy> hipStream_wrapper_t; ++typedef hip_object_wrapper_t<hipEvent_t, hipEventCreate, hipEventDestroy> hipEvent_wrapper_t; ++ ++#endif // ROCFFT_HIP_OBJ_WRAPPER_H +diff --git a/shared/hostbuf.h b/shared/hostbuf.h +new file mode 100644 +index 0000000..0a96c7d +--- /dev/null ++++ b/shared/hostbuf.h +@@ -0,0 +1,158 @@ ++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#ifndef ROCFFT_HOSTBUF_H ++#define ROCFFT_HOSTBUF_H ++ ++#include "arithmetic.h" ++#include <cstdlib> ++#include <cstring> ++ ++#ifndef WIN32 ++#include <stdlib.h> ++#include <sys/mman.h> ++#endif ++ ++// Simple RAII class for host buffers. 
// T is the type of pointer that data() returns.
template <class T = void>
class hostbuf_t
{
public:
    hostbuf_t() {}

    // buffers are movable but not copyable.  Moving is implemented as a
    // swap, so the moved-from object takes over whatever the target held
    // and releases it in its own destructor.
    hostbuf_t(hostbuf_t&& other) noexcept
    {
        std::swap(buf, other.buf);
        std::swap(bsize, other.bsize);
    }
    hostbuf_t& operator=(hostbuf_t&& other) noexcept
    {
        std::swap(buf, other.buf);
        std::swap(bsize, other.bsize);
        return *this;
    }
    hostbuf_t(const hostbuf_t&) = delete;
    hostbuf_t& operator=(const hostbuf_t&) = delete;

    ~hostbuf_t()
    {
        free();
    }

    // Allocate `size` bytes, releasing any buffer this object already
    // owns.  If the allocation fails the object is left empty
    // (data() == nullptr, size() == 0).
    void alloc(size_t size)
    {
        // Release the old buffer before recording the new size: free()
        // resets bsize to 0, so assigning bsize ahead of this call (as the
        // code used to) made size() report 0 after every re-allocation.
        free();
        const size_t requested = size;

        // we're aligning to multiples of 64 bytes, so round the
        // allocation size up to the nearest 64 to keep ASAN happy
        if(size % 64)
        {
            size += 64 - size % 64;
        }

        // FFTW requires aligned allocations to use faster SIMD instructions.
        // If enabling hugepages, align to 2 MiB. Otherwise, aligning to
        // 64 bytes is enough for AVX instructions up to AVX512.
#ifdef WIN32
        buf = _aligned_malloc(size, 64);
#else
        // On Linux, ask for hugepages to reduce TLB pressure and
        // improve performance.  Allocations need to be aligned to
        // the hugepage size, and rounded up to the next whole
        // hugepage.
        static const size_t TWO_MiB = 2 * 1024 * 1024;
        if(size >= TWO_MiB)
        {
            // smallest multiple of TWO_MiB that is >= size
            const size_t rounded_size = (size + TWO_MiB - 1) / TWO_MiB * TWO_MiB;
            buf                       = aligned_alloc(TWO_MiB, rounded_size);
            // the hint is only meaningful for a real mapping
            if(buf != nullptr)
                madvise(buf, rounded_size, MADV_HUGEPAGE);
        }
        else
            buf = aligned_alloc(64, size);
#endif
        // only report a size for memory we actually have
        bsize = (buf != nullptr) ? requested : 0;
    }

    // number of usable bytes (the requested size, not the padded one)
    size_t size() const
    {
        return bsize;
    }

    // release the buffer (no-op when nothing is allocated)
    void free()
    {
        if(buf != nullptr)
        {
#ifdef WIN32
            _aligned_free(buf);
#else
            std::free(buf);
#endif
            buf   = nullptr;
            bsize = 0;
        }
    }

    T* data() const
    {
        return static_cast<T*>(buf);
    }

    // Copy method: returns a new buffer holding the same size() bytes
    hostbuf_t copy() const
    {
        hostbuf_t duplicate;
        duplicate.alloc(bsize);
        // nothing to transfer for an empty source, and nowhere to put it
        // if the allocation failed; memcpy must not see a null pointer
        if(bsize != 0 && duplicate.buf != nullptr)
            memcpy(duplicate.buf, buf, bsize);
        return duplicate;
    }

    // shrink the buffer to fit the new size.
    // Throws std::runtime_error if new_size exceeds the current size.
    void shrink(size_t new_size)
    {
        if(new_size > bsize)
            throw std::runtime_error("can't shrink hostbuf to larger size");
        // just pretend the buffer is now that size
        bsize = new_size;
    }

    // equality/bool tests
    bool operator==(std::nullptr_t n) const
    {
        return buf == n;
    }
    bool operator!=(std::nullptr_t n) const
    {
        return buf != n;
    }
    operator bool() const
    {
        return buf;
    }

private:
    // The host buffer
    void*  buf   = nullptr;
    size_t bsize = 0;
};

// default hostbuf that gives out void* pointers
typedef hostbuf_t<> hostbuf;

// ----------------------------------------------------------------------
// Remainder of the collapsed source line, preserved verbatim (end of the
// hostbuf.h include guard opened before this block, then the patch
// header and first licence line of the next file):
//   ++#endif
//   +diff --git a/shared/increment.h b/shared/increment.h
//   +new file mode 100644
//   +index 0000000..90bba1d
//   +--- /dev/null
//   ++++ b/shared/increment.h
//   +@@ -0,0 +1,100 @@
// ----------------------------------------------------------------------
// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef ROCFFT_INCREMENT_H
#define ROCFFT_INCREMENT_H

#include <algorithm>
#include <cstddef>
#include <tuple>
#include <type_traits> // std::is_integral, used by the static_asserts below
#include <vector>

// Helper functions to iterate over a buffer in row-major order.
// Indexes may be given as either a tuple or vector of sizes.  They
// return true if the index was successfully incremented to move to
// the next element in the buffer.

// Advance a single index within [0, length).  Returns true if the index
// moved forward; otherwise resets it to 0 and returns false.
template <typename T1, typename T2>
static bool increment_base(T1& index, const T2& length)
{
    static_assert(std::is_integral<T1>::value, "Integral required.");
    static_assert(std::is_integral<T2>::value, "Integral required.");
    // The `length > 0` guard matters for unsigned T2: `length - 1` would
    // wrap around for a zero length and the index would never roll over.
    if(length > 0 && index < length - 1)
    {
        ++index;
        return true;
    }
    index = 0;
    return false;
}

// Increment the index (row-major) for looping over 1, 2, and 3 dimensions length.
template <typename T1, typename T2>
static bool increment_rowmajor(T1& index, const T2& length)
{
    static_assert(std::is_integral<T1>::value, "Integral required.");
    static_assert(std::is_integral<T2>::value, "Integral required.");
    return increment_base(index, length);
}

template <typename T1, typename T2>
static bool increment_rowmajor(std::tuple<T1, T1>& index, const std::tuple<T2, T2>& length)
{
    // the last tuple element varies fastest
    if(increment_base(std::get<1>(index), std::get<1>(length)))
        // we incremented ok, nothing further to do
        return true;
    // otherwise, we rolled over
    return increment_base(std::get<0>(index), std::get<0>(length));
}

template <typename T1, typename T2>
static bool increment_rowmajor(std::tuple<T1, T1, T1>& index, const std::tuple<T2, T2, T2>& length)
{
    if(increment_base(std::get<2>(index), std::get<2>(length)))
        // we incremented ok, nothing further to do
        return true;
    if(increment_base(std::get<1>(index), std::get<1>(length)))
        // we incremented ok, nothing further to do
        return true;
    // otherwise, we rolled over
    return increment_base(std::get<0>(index), std::get<0>(length));
}

// Increment row-major index over arbitrary dimension length
template <typename T1, typename T2>
bool increment_rowmajor(std::vector<T1>& index, const std::vector<T2>& length)
{
    // walk from the last (fastest-varying) dimension towards the first
    for(size_t idim = length.size(); idim-- > 0;)
    {
        if(index[idim] < length[idim])
        {
            if((++index[idim]) == length[idim])
            {
                // rolled over: reset and carry into the next dimension
                index[idim] = 0;
                continue;
            }
            // we know we were able to increment something and didn't hit the end
            return true;
        }
    }
    // End the loop when we get back to the start.  The predicate takes the
    // element type itself, so wide index values are not truncated to int.
    return !std::all_of(index.begin(), index.end(), [](const T1& i) { return i == 0; });
}

#endif

// ----------------------------------------------------------------------
// Remainder of the collapsed source line, preserved verbatim (patch
// header and start of the licence line of the next file):
//   +diff --git a/shared/precision_type.h b/shared/precision_type.h
//   +new file mode 100644
//   +index 0000000..526fc9a
//   +--- /dev/null
//   ++++ b/shared/precision_type.h
//   +@@ -0,0 +1,70 @@
// ----------------------------------------------------------------------
// Copyright (C) 2023 Advanced Micro Devices, Inc.
All rights reserved. ++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. 
++ ++#ifndef ROCFFT_PRECISION_TYPE_H ++#define ROCFFT_PRECISION_TYPE_H ++ ++#include "array_predicate.h" ++#include "rocfft/rocfft.h" ++ ++static size_t real_type_size(rocfft_precision precision) ++{ ++ switch(precision) ++ { ++ case rocfft_precision_half: ++ return 2; ++ case rocfft_precision_single: ++ return 4; ++ case rocfft_precision_double: ++ return 8; ++ } ++} ++ ++static size_t complex_type_size(rocfft_precision precision) ++{ ++ return real_type_size(precision) * 2; ++} ++ ++static const char* precision_name(rocfft_precision precision) ++{ ++ switch(precision) ++ { ++ case rocfft_precision_half: ++ return "half"; ++ case rocfft_precision_single: ++ return "single"; ++ case rocfft_precision_double: ++ return "double"; ++ } ++} ++ ++static size_t element_size(rocfft_precision precision, rocfft_array_type array_type) ++{ ++ return array_type_is_complex(array_type) ? complex_type_size(precision) ++ : real_type_size(precision); ++} ++ ++// offset a pointer by a number of elements, given the elements' ++// precision and type (complex or not) ++static void* ptr_offset(void* p, size_t elems, rocfft_precision precision, rocfft_array_type type) ++{ ++ return static_cast<char*>(p) + elems * element_size(precision, type); ++} ++#endif +diff --git a/shared/printbuffer.h b/shared/printbuffer.h +new file mode 100644 +index 0000000..5ae0b64 +--- /dev/null ++++ b/shared/printbuffer.h +@@ -0,0 +1,108 @@ ++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#ifndef PRINTBUFFER_H ++#define PRINTBUFFER_H ++ ++#include "hostbuf.h" ++#include "increment.h" ++#include <algorithm> ++#include <vector> ++ ++// Output a formatted general-dimensional array with given length and stride in batches ++// separated by dist. 
++template <typename Toutput, typename T1, typename T2, typename Tsize, typename Tstream> ++inline void printbuffer(const Toutput* output, ++ const std::vector<T1>& length, ++ const std::vector<T2>& stride, ++ const Tsize nbatch, ++ const Tsize dist, ++ const size_t offset, ++ Tstream& stream) ++{ ++ auto i_base = 0; ++ for(unsigned int b = 0; b < nbatch; b++, i_base += dist) ++ { ++ std::vector<size_t> index(length.size()); ++ std::fill(index.begin(), index.end(), 0); ++ do ++ { ++ const int i ++ = std::inner_product(index.begin(), index.end(), stride.begin(), i_base + offset); ++ stream << output[i] << " "; ++ for(int li = index.size(); li-- > 0;) ++ { ++ if(index[li] == (length[li] - 1)) ++ { ++ stream << "\n"; ++ } ++ else ++ { ++ break; ++ } ++ } ++ } while(increment_rowmajor(index, length)); ++ stream << std::endl; ++ } ++} ++ ++template <typename Telem> ++class buffer_printer ++{ ++ // The scalar versions might be part of a planar format. ++public: ++ template <typename Tint1, typename Tint2, typename Tsize, typename Tstream = std::ostream> ++ static void print_buffer(const std::vector<hostbuf>& buf, ++ const std::vector<Tint1>& length, ++ const std::vector<Tint2>& stride, ++ const Tsize nbatch, ++ const Tsize dist, ++ const std::vector<size_t>& offset, ++ Tstream& stream = std::cout) ++ { ++ for(const auto& vec : buf) ++ { ++ printbuffer(reinterpret_cast<const Telem*>(vec.data()), ++ length, ++ stride, ++ nbatch, ++ dist, ++ offset[0], ++ stream); ++ } ++ }; ++ template <typename Tstream = std::ostream> ++ static void print_buffer_flat(const std::vector<hostbuf>& buf, ++ const std::vector<size_t>& size, ++ const std::vector<size_t>& offset, ++ Tstream& stream = std::cout) ++ { ++ for(const auto& vec : buf) ++ { ++ auto data = reinterpret_cast<const Telem*>(vec.data()); ++ stream << "idx " << 0; ++ for(size_t i = 0; i < size[0]; ++i) ++ stream << " " << data[i]; ++ stream << std::endl; ++ } ++ }; ++}; ++ ++#endif +diff --git a/shared/ptrdiff.h 
b/shared/ptrdiff.h +new file mode 100644 +index 0000000..3bd15de +--- /dev/null ++++ b/shared/ptrdiff.h +@@ -0,0 +1,40 @@ ++// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. ++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#pragma once ++ ++// Compute the farthest point from the original pointer. 
// Returns 1 + sum((length[i] - 1) * stride[i]) + (nbatch - 1) * dist, i.e.
// one more than the largest element offset reached by the strided layout.
// Returns 0 when nothing is addressed: no dimensions, a zero-length
// dimension, or nbatch == 0 (the unsigned `- 1` terms used to wrap around
// in the last two cases).
// `stride` is read for every entry of `length`, so it must be at least as long.
static size_t compute_ptrdiff(const std::vector<size_t>& length,
                              const std::vector<size_t>& stride,
                              const size_t               nbatch,
                              const size_t               dist)
{
    if(length.empty() || nbatch == 0)
        return 0;

    size_t val = 1;
    for(size_t i = 0; i < length.size(); ++i)
    {
        if(length[i] == 0)
            return 0;
        val += (length[i] - 1) * stride[i];
    }
    val += (nbatch - 1) * dist;
    return val;
}

// ----------------------------------------------------------------------
// Remainder of the collapsed source line, preserved verbatim (patch
// header and licence text of the next file):
//   +diff --git a/shared/rocfft_accuracy_test.h b/shared/rocfft_accuracy_test.h
//   +new file mode 100644
//   +index 0000000..4ce3059
//   +--- /dev/null
//   ++++ b/shared/rocfft_accuracy_test.h
//   +@@ -0,0 +1,29 @@
// ----------------------------------------------------------------------
// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
++ ++#ifndef ROCFFT_ACCURACY_TEST ++#define ROCFFT_ACCURACY_TEST ++ ++#include "accuracy_test.h" ++#include "rocfft_params.h" ++ ++void fft_vs_reference(rocfft_params& params, bool round_trip = false); ++ ++#endif +diff --git a/shared/rocfft_against_fftw.h b/shared/rocfft_against_fftw.h +new file mode 100644 +index 0000000..d03754c +--- /dev/null ++++ b/shared/rocfft_against_fftw.h +@@ -0,0 +1,231 @@ ++// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. ++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#pragma once ++#ifndef ROCFFT_AGAINST_FFTW ++#define ROCFFT_AGAINST_FFTW ++ ++#include <gtest/gtest.h> ++#include <math.h> ++#include <stdexcept> ++#include <vector> ++ ++#include "fftw_transform.h" ++ ++// Return the precision enum for rocFFT based upon the type. 
++template <typename Tfloat> ++inline fft_precision precision_selector(); ++template <> ++inline fft_precision precision_selector<float>() ++{ ++ return fft_precision_single; ++} ++template <> ++inline fft_precision precision_selector<double>() ++{ ++ return fft_precision_double; ++} ++ ++extern bool use_fftw_wisdom; ++ ++// construct and return an FFTW plan with the specified type, ++// precision, and dimensions. cpu_out is required if we're using ++// wisdom, which runs actual FFTs to work out the best plan. ++template <typename Tfloat> ++static typename fftw_trait<Tfloat>::fftw_plan_type ++ fftw_plan_with_precision(const std::vector<fftw_iodim64>& dims, ++ const std::vector<fftw_iodim64>& howmany_dims, ++ const fft_transform_type transformType, ++ const size_t isize, ++ void* cpu_in, ++ void* cpu_out) ++{ ++ using fftw_complex_type = typename fftw_trait<Tfloat>::fftw_complex_type; ++ ++ // NB: Using FFTW_MEASURE implies that the input buffer's data ++ // may be destroyed during plan creation. But if we're wanting ++ // to run FFTW in the first place, we must have just created an ++ // uninitialized input buffer anyway. ++ ++ switch(transformType) ++ { ++ case fft_transform_type_complex_forward: ++ return fftw_plan_guru64_dft<Tfloat>(dims.size(), ++ dims.data(), ++ howmany_dims.size(), ++ howmany_dims.data(), ++ reinterpret_cast<fftw_complex_type*>(cpu_in), ++ reinterpret_cast<fftw_complex_type*>(cpu_out), ++ -1, ++ use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); ++ case fft_transform_type_complex_inverse: ++ return fftw_plan_guru64_dft<Tfloat>(dims.size(), ++ dims.data(), ++ howmany_dims.size(), ++ howmany_dims.data(), ++ reinterpret_cast<fftw_complex_type*>(cpu_in), ++ reinterpret_cast<fftw_complex_type*>(cpu_out), ++ 1, ++ use_fftw_wisdom ? 
FFTW_MEASURE : FFTW_ESTIMATE); ++ case fft_transform_type_real_forward: ++ return fftw_plan_guru64_r2c<Tfloat>(dims.size(), ++ dims.data(), ++ howmany_dims.size(), ++ howmany_dims.data(), ++ reinterpret_cast<Tfloat*>(cpu_in), ++ reinterpret_cast<fftw_complex_type*>(cpu_out), ++ use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); ++ case fft_transform_type_real_inverse: ++ return fftw_plan_guru64_c2r<Tfloat>(dims.size(), ++ dims.data(), ++ howmany_dims.size(), ++ howmany_dims.data(), ++ reinterpret_cast<fftw_complex_type*>(cpu_in), ++ reinterpret_cast<Tfloat*>(cpu_out), ++ use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); ++ default: ++ throw std::runtime_error("Invalid transform type"); ++ } ++} ++ ++// construct an FFTW plan, given rocFFT parameters. output is ++// required if planning with wisdom. ++template <typename Tfloat> ++static typename fftw_trait<Tfloat>::fftw_plan_type ++ fftw_plan_via_rocfft(const std::vector<size_t>& length, ++ const std::vector<size_t>& istride, ++ const std::vector<size_t>& ostride, ++ const size_t nbatch, ++ const size_t idist, ++ const size_t odist, ++ const fft_transform_type transformType, ++ std::vector<hostbuf>& input, ++ std::vector<hostbuf>& output) ++{ ++ // Dimension configuration: ++ std::vector<fftw_iodim64> dims(length.size()); ++ for(unsigned int idx = 0; idx < length.size(); ++idx) ++ { ++ dims[idx].n = length[idx]; ++ dims[idx].is = istride[idx]; ++ dims[idx].os = ostride[idx]; ++ } ++ ++ // Batch configuration: ++ std::vector<fftw_iodim64> howmany_dims(1); ++ howmany_dims[0].n = nbatch; ++ howmany_dims[0].is = idist; ++ howmany_dims[0].os = odist; ++ ++ return fftw_plan_with_precision<Tfloat>(dims, ++ howmany_dims, ++ transformType, ++ idist * nbatch, ++ input.front().data(), ++ output.empty() ? 
nullptr : output.front().data()); ++} ++ ++template <typename Tfloat> ++void fftw_run(fft_transform_type transformType, ++ typename fftw_trait<Tfloat>::fftw_plan_type cpu_plan, ++ std::vector<hostbuf>& cpu_in, ++ std::vector<hostbuf>& cpu_out) ++{ ++ switch(transformType) ++ { ++ case fft_transform_type_complex_forward: ++ { ++ fftw_plan_execute_c2c<Tfloat>(cpu_plan, cpu_in, cpu_out); ++ break; ++ } ++ case fft_transform_type_complex_inverse: ++ { ++ fftw_plan_execute_c2c<Tfloat>(cpu_plan, cpu_in, cpu_out); ++ break; ++ } ++ case fft_transform_type_real_forward: ++ { ++ fftw_plan_execute_r2c<Tfloat>(cpu_plan, cpu_in, cpu_out); ++ break; ++ } ++ case fft_transform_type_real_inverse: ++ { ++ fftw_plan_execute_c2r<Tfloat>(cpu_plan, cpu_in, cpu_out); ++ break; ++ } ++ } ++} ++ ++// Given a transform type, return the contiguous input type. ++inline fft_array_type contiguous_itype(const fft_transform_type transformType) ++{ ++ switch(transformType) ++ { ++ case fft_transform_type_complex_forward: ++ case fft_transform_type_complex_inverse: ++ return fft_array_type_complex_interleaved; ++ case fft_transform_type_real_forward: ++ return fft_array_type_real; ++ case fft_transform_type_real_inverse: ++ return fft_array_type_hermitian_interleaved; ++ default: ++ throw std::runtime_error("Invalid transform type"); ++ } ++ return fft_array_type_complex_interleaved; ++} ++ ++// Given a transform type, return the contiguous output type. 
++inline fft_array_type contiguous_otype(const fft_transform_type transformType) ++{ ++ switch(transformType) ++ { ++ case fft_transform_type_complex_forward: ++ case fft_transform_type_complex_inverse: ++ return fft_array_type_complex_interleaved; ++ case fft_transform_type_real_forward: ++ return fft_array_type_hermitian_interleaved; ++ case fft_transform_type_real_inverse: ++ return fft_array_type_real; ++ default: ++ throw std::runtime_error("Invalid transform type"); ++ } ++ return fft_array_type_complex_interleaved; ++} ++ ++// Given a precision, return the acceptable tolerance. ++inline double type_epsilon(const fft_precision precision) ++{ ++ switch(precision) ++ { ++ case fft_precision_half: ++ return type_epsilon<_Float16>(); ++ break; ++ case fft_precision_single: ++ return type_epsilon<float>(); ++ break; ++ case fft_precision_double: ++ return type_epsilon<double>(); ++ break; ++ default: ++ throw std::runtime_error("Invalid precision"); ++ } ++} ++ ++#endif +diff --git a/shared/rocfft_complex.h b/shared/rocfft_complex.h +new file mode 100644 +index 0000000..efa0290 +--- /dev/null ++++ b/shared/rocfft_complex.h +@@ -0,0 +1,346 @@ ++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. ++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. 
++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#ifndef ROCFFT_COMPLEX_H ++#define ROCFFT_COMPLEX_H ++ ++#include <hip/hip_fp16.h> ++#if !defined(__HIPCC_RTC__) ++#include <iostream> ++#endif ++#include <math.h> ++#include <type_traits> ++ ++#ifdef __HIP_PLATFORM_NVIDIA__ ++typedef __half _Float16; ++#endif ++ ++template <typename Treal> ++struct rocfft_complex ++{ ++ ++ Treal x; // Real part ++ Treal y; // Imaginary part ++ ++ // Constructors ++ // Do not initialize the members x or y by default, to ensure that it can ++ // be used in __shared__ and that it is a trivial class compatible with C. 
++ __device__ __host__ rocfft_complex() = default; ++ __device__ __host__ rocfft_complex(const rocfft_complex&) = default; ++ __device__ __host__ rocfft_complex(rocfft_complex&&) = default; ++ __device__ __host__ rocfft_complex& operator=(const rocfft_complex& rhs) & = default; ++ __device__ __host__ rocfft_complex& operator=(rocfft_complex&& rhs) & = default; ++ __device__ __host__ ~rocfft_complex() = default; ++ ++ // Constructor from real and imaginary parts ++ __device__ __host__ constexpr rocfft_complex(Treal real, Treal imag) ++ : x{real} ++ , y{imag} ++ { ++ } ++ ++ // Conversion from different precision ++ template <typename U> ++ __device__ __host__ explicit constexpr rocfft_complex(const rocfft_complex<U>& z) ++ : x(z.x) ++ , y(z.y) ++ { ++ } ++ ++ // Accessors ++ __device__ __host__ constexpr Treal real() const ++ { ++ return x; ++ } ++ ++ __device__ __host__ constexpr Treal imag() const ++ { ++ return y; ++ } ++ ++ // Unary operations ++ __forceinline__ __device__ __host__ rocfft_complex operator-() const ++ { ++ return {-x, -y}; ++ } ++ ++ __forceinline__ __device__ __host__ rocfft_complex operator+() const ++ { ++ return *this; ++ } ++ ++ __device__ __host__ Treal asum(const rocfft_complex& z) ++ { ++ return abs(z.x) + abs(z.y); ++ } ++ ++ // Internal real functions ++ static __forceinline__ __device__ __host__ Treal abs(Treal x) ++ { ++ return x < 0 ? 
-x : x; ++ } ++ ++ static __forceinline__ __device__ __host__ float sqrt(float x) ++ { ++ return ::sqrtf(x); ++ } ++ ++ static __forceinline__ __device__ __host__ double sqrt(double x) ++ { ++ return ::sqrt(x); ++ } ++ ++ // Addition operators ++ __device__ __host__ auto& operator+=(const rocfft_complex& rhs) ++ { ++ return *this = {x + rhs.x, y + rhs.y}; ++ } ++ ++ __device__ __host__ auto operator+(const rocfft_complex& rhs) const ++ { ++ auto lhs = *this; ++ return lhs += rhs; ++ } ++ ++ // Subtraction operators ++ __device__ __host__ auto& operator-=(const rocfft_complex& rhs) ++ { ++ return *this = {x - rhs.x, y - rhs.y}; ++ } ++ ++ __device__ __host__ auto operator-(const rocfft_complex& rhs) const ++ { ++ auto lhs = *this; ++ return lhs -= rhs; ++ } ++ ++ // Multiplication operators ++ __device__ __host__ auto& operator*=(const rocfft_complex& rhs) ++ { ++ return *this = {x * rhs.x - y * rhs.y, y * rhs.x + x * rhs.y}; ++ } ++ ++ __device__ __host__ auto operator*(const rocfft_complex& rhs) const ++ { ++ auto lhs = *this; ++ return lhs *= rhs; ++ } ++ ++ // Division operators ++ __device__ __host__ auto& operator/=(const rocfft_complex& rhs) ++ { ++ // Form of Robert L. 
Smith's Algorithm 116 ++ if(abs(rhs.x) > abs(rhs.y)) ++ { ++ Treal ratio = rhs.y / rhs.x; ++ Treal scale = 1 / (rhs.x + rhs.y * ratio); ++ *this = {(x + y * ratio) * scale, (y - x * ratio) * scale}; ++ } ++ else ++ { ++ Treal ratio = rhs.x / rhs.y; ++ Treal scale = 1 / (rhs.x * ratio + rhs.y); ++ *this = {(y + x * ratio) * scale, (y * ratio - x) * scale}; ++ } ++ return *this; ++ } ++ ++ __device__ __host__ auto operator/(const rocfft_complex& rhs) const ++ { ++ auto lhs = *this; ++ return lhs /= rhs; ++ } ++ ++ // Comparison operators ++ __device__ __host__ constexpr bool operator==(const rocfft_complex& rhs) const ++ { ++ return x == rhs.x && y == rhs.y; ++ } ++ ++ __device__ __host__ constexpr bool operator!=(const rocfft_complex& rhs) const ++ { ++ return !(*this == rhs); ++ } ++ ++ // Operators for complex-real computations ++ template <typename U> ++ __device__ __host__ auto& operator+=(const U& rhs) ++ { ++ return (x += Treal(rhs)), *this; ++ } ++ ++ template <typename U> ++ __device__ __host__ auto& operator-=(const U& rhs) ++ { ++ return (x -= Treal(rhs)), *this; ++ } ++ ++ __device__ __host__ auto operator+(const Treal& rhs) ++ { ++ auto lhs = *this; ++ return lhs += rhs; ++ } ++ ++ __device__ __host__ auto operator-(const Treal& rhs) ++ { ++ auto lhs = *this; ++ return lhs -= rhs; ++ } ++ ++ template <typename U> ++ __device__ __host__ auto& operator*=(const U& rhs) ++ { ++ return (x *= Treal(rhs)), (y *= Treal(rhs)), *this; ++ } ++ ++ template <typename U> ++ __device__ __host__ auto operator*(const U& rhs) const ++ { ++ auto lhs = *this; ++ return lhs *= Treal(rhs); ++ } ++ ++ template <typename U> ++ __device__ __host__ auto& operator/=(const U& rhs) ++ { ++ return (x /= Treal(rhs)), (y /= Treal(rhs)), *this; ++ } ++ ++ template <typename U> ++ __device__ __host__ auto operator/(const U& rhs) const ++ { ++ auto lhs = *this; ++ return lhs /= Treal(rhs); ++ } ++ ++ template <typename U> ++ __device__ __host__ constexpr bool operator==(const U& rhs) 
const ++ { ++ return x == Treal(rhs) && y == 0; ++ } ++ ++ template <typename U> ++ __device__ __host__ constexpr bool operator!=(const U& rhs) const ++ { ++ return !(*this == rhs); ++ } ++}; ++ ++// Stream operators ++#if !defined(__HIPCC_RTC__) ++static std::ostream& operator<<(std::ostream& stream, const _Float16& f) ++{ ++ return stream << static_cast<double>(f); ++} ++ ++template <typename Treal> ++std::ostream& operator<<(std::ostream& out, const rocfft_complex<Treal>& z) ++{ ++ return out << '(' << static_cast<double>(z.x) << ',' << static_cast<double>(z.y) << ')'; ++} ++#endif ++ ++// Operators for real-complex computations ++template <typename U, typename Treal> ++__device__ __host__ rocfft_complex<Treal> operator+(const U& lhs, const rocfft_complex<Treal>& rhs) ++{ ++ return {Treal(lhs) + rhs.x, rhs.y}; ++} ++ ++template <typename U, typename Treal> ++__device__ __host__ rocfft_complex<Treal> operator-(const U& lhs, const rocfft_complex<Treal>& rhs) ++{ ++ return {Treal(lhs) - rhs.x, -rhs.y}; ++} ++ ++template <typename U, typename Treal> ++__device__ __host__ rocfft_complex<Treal> operator*(const U& lhs, const rocfft_complex<Treal>& rhs) ++{ ++ return {Treal(lhs) * rhs.x, Treal(lhs) * rhs.y}; ++} ++ ++template <typename U, typename Treal> ++__device__ __host__ rocfft_complex<Treal> operator/(const U& lhs, const rocfft_complex<Treal>& rhs) ++{ ++ // Form of Robert L. 
Smith's Algorithm 116 ++ if(rocfft_complex<Treal>::abs(rhs.x) > rocfft_complex<Treal>::abs(rhs.y)) ++ { ++ Treal ratio = rhs.y / rhs.x; ++ Treal scale = Treal(lhs) / (rhs.x + rhs.y * ratio); ++ return {scale, -scale * ratio}; ++ } ++ else ++ { ++ Treal ratio = rhs.x / rhs.y; ++ Treal scale = Treal(lhs) / (rhs.x * ratio + rhs.y); ++ return {ratio * scale, -scale}; ++ } ++} ++ ++template <typename U, typename Treal> ++__device__ __host__ constexpr bool operator==(const U& lhs, const rocfft_complex<Treal>& rhs) ++{ ++ return Treal(lhs) == rhs.x && 0 == rhs.y; ++} ++ ++template <typename U, typename Treal> ++__device__ __host__ constexpr bool operator!=(const U& lhs, const rocfft_complex<Treal>& rhs) ++{ ++ return !(lhs == rhs); ++} ++ ++// Extending std namespace to handle rocfft_complex datatype ++namespace std ++{ ++ template <typename Treal> ++ __device__ __host__ constexpr Treal real(const rocfft_complex<Treal>& z) ++ { ++ return z.x; ++ } ++ ++ template <typename Treal> ++ __device__ __host__ constexpr Treal imag(const rocfft_complex<Treal>& z) ++ { ++ return z.y; ++ } ++ ++ template <typename Treal> ++ __device__ __host__ constexpr rocfft_complex<Treal> conj(const rocfft_complex<Treal>& z) ++ { ++ return {z.x, -z.y}; ++ } ++ ++ template <typename Treal> ++ __device__ __host__ inline Treal norm(const rocfft_complex<Treal>& z) ++ { ++ return (z.x * z.x) + (z.y * z.y); ++ } ++ ++ template <typename Treal> ++ __device__ __host__ inline Treal abs(const rocfft_complex<Treal>& z) ++ { ++ Treal tr = rocfft_complex<Treal>::abs(z.x), ti = rocfft_complex<Treal>::abs(z.y); ++ return tr > ti ? (ti /= tr, tr * rocfft_complex<Treal>::sqrt(ti * ti + 1)) ++ : ti ? 
(tr /= ti, ti * rocfft_complex<Treal>::sqrt(tr * tr + 1)) ++ : 0; ++ } ++} ++ ++#endif // ROCFFT_COMPLEX_H +diff --git a/shared/rocfft_hip.h b/shared/rocfft_hip.h +new file mode 100644 +index 0000000..e086cab +--- /dev/null ++++ b/shared/rocfft_hip.h +@@ -0,0 +1,52 @@ ++// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. ++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. 
++ ++#ifndef __ROCFFT_HIP_H__ ++#define __ROCFFT_HIP_H__ ++ ++#include <hip/hip_runtime_api.h> ++#include <stdexcept> ++ ++class rocfft_scoped_device ++{ ++public: ++ rocfft_scoped_device(int device) ++ { ++ if(hipGetDevice(&orig_device) != hipSuccess) ++ throw std::runtime_error("hipGetDevice failure"); ++ ++ if(hipSetDevice(device) != hipSuccess) ++ throw std::runtime_error("hipSetDevice failure"); ++ } ++ ~rocfft_scoped_device() ++ { ++ (void)hipSetDevice(orig_device); ++ } ++ ++ // not copyable or movable ++ rocfft_scoped_device(const rocfft_scoped_device&) = delete; ++ rocfft_scoped_device(rocfft_scoped_device&&) = delete; ++ rocfft_scoped_device& operator=(const rocfft_scoped_device&) = delete; ++ ++private: ++ int orig_device; ++}; ++ ++#endif // __ROCFFT_HIP_H__ +diff --git a/shared/rocfft_params.h b/shared/rocfft_params.h +new file mode 100644 +index 0000000..bf9b728 +--- /dev/null ++++ b/shared/rocfft_params.h +@@ -0,0 +1,585 @@ ++// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. ++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#ifndef ROCFFT_PARAMS_H ++#define ROCFFT_PARAMS_H ++ ++#include "../shared/fft_params.h" ++#include "../shared/gpubuf.h" ++#include "rocfft/rocfft.h" ++ ++// Return the string of the rocfft_status code ++static std::string rocfft_status_to_string(const rocfft_status ret) ++{ ++ switch(ret) ++ { ++ case rocfft_status_success: ++ return "rocfft_status_success"; ++ case rocfft_status_failure: ++ return "rocfft_status_failure"; ++ case rocfft_status_invalid_arg_value: ++ return "rocfft_status_invalid_arg_value"; ++ case rocfft_status_invalid_dimensions: ++ return "rocfft_status_invalid_dimensions"; ++ case rocfft_status_invalid_array_type: ++ return "rocfft_status_invalid_array_type"; ++ case rocfft_status_invalid_strides: ++ return "rocfft_status_invalid_strides"; ++ case rocfft_status_invalid_distance: ++ return "rocfft_status_invalid_distance"; ++ case rocfft_status_invalid_offset: ++ return "rocfft_status_invalid_offset"; ++ case rocfft_status_invalid_work_buffer: ++ return "rocfft_status_invalid_work_buffer"; ++ default: ++ throw std::runtime_error("unknown rocfft_status"); ++ } ++} ++ ++inline fft_status fft_status_from_rocfftparams(const rocfft_status val) ++{ ++ switch(val) ++ { ++ case rocfft_status_success: ++ return fft_status_success; ++ case rocfft_status_failure: ++ return fft_status_failure; ++ case rocfft_status_invalid_arg_value: ++ return fft_status_invalid_arg_value; ++ case rocfft_status_invalid_dimensions: ++ return fft_status_invalid_dimensions; ++ case rocfft_status_invalid_array_type: ++ return fft_status_invalid_array_type; ++ case rocfft_status_invalid_strides: ++ return fft_status_invalid_strides; ++ case rocfft_status_invalid_distance: ++ return 
fft_status_invalid_distance; ++ case rocfft_status_invalid_offset: ++ return fft_status_invalid_offset; ++ case rocfft_status_invalid_work_buffer: ++ return fft_status_invalid_work_buffer; ++ default: ++ throw std::runtime_error("Invalid status"); ++ } ++} ++ ++inline rocfft_precision rocfft_precision_from_fftparams(const fft_precision val) ++{ ++ switch(val) ++ { ++ case fft_precision_single: ++ return rocfft_precision_single; ++ case fft_precision_double: ++ return rocfft_precision_double; ++ case fft_precision_half: ++ return rocfft_precision_half; ++ default: ++ throw std::runtime_error("Invalid precision"); ++ } ++} ++ ++inline rocfft_array_type rocfft_array_type_from_fftparams(const fft_array_type val) ++{ ++ switch(val) ++ { ++ case fft_array_type_complex_interleaved: ++ return rocfft_array_type_complex_interleaved; ++ case fft_array_type_complex_planar: ++ return rocfft_array_type_complex_planar; ++ case fft_array_type_real: ++ return rocfft_array_type_real; ++ case fft_array_type_hermitian_interleaved: ++ return rocfft_array_type_hermitian_interleaved; ++ case fft_array_type_hermitian_planar: ++ return rocfft_array_type_hermitian_planar; ++ case fft_array_type_unset: ++ return rocfft_array_type_unset; ++ } ++ return rocfft_array_type_unset; ++} ++ ++inline rocfft_transform_type rocfft_transform_type_from_fftparams(const fft_transform_type val) ++{ ++ switch(val) ++ { ++ case fft_transform_type_complex_forward: ++ return rocfft_transform_type_complex_forward; ++ case fft_transform_type_complex_inverse: ++ return rocfft_transform_type_complex_inverse; ++ case fft_transform_type_real_forward: ++ return rocfft_transform_type_real_forward; ++ case fft_transform_type_real_inverse: ++ return rocfft_transform_type_real_inverse; ++ default: ++ throw std::runtime_error("Invalid transform type"); ++ } ++} ++ ++inline rocfft_result_placement ++ rocfft_result_placement_from_fftparams(const fft_result_placement val) ++{ ++ switch(val) ++ { ++ case fft_placement_inplace: 
++ return rocfft_placement_inplace; ++ case fft_placement_notinplace: ++ return rocfft_placement_notinplace; ++ default: ++ throw std::runtime_error("Invalid result placement"); ++ } ++} ++ ++class rocfft_params : public fft_params ++{ ++public: ++ rocfft_plan plan = nullptr; ++ rocfft_execution_info info = nullptr; ++ rocfft_plan_description desc = nullptr; ++ gpubuf_t<void> wbuffer; ++ ++ explicit rocfft_params(){}; ++ ++ explicit rocfft_params(const fft_params& p) ++ : fft_params(p){}; ++ ++ rocfft_params(const rocfft_params&) = delete; ++ rocfft_params& operator=(const rocfft_params&) = delete; ++ ++ ~rocfft_params() ++ { ++ free(); ++ }; ++ ++ void free() ++ { ++ if(plan != nullptr) ++ { ++ rocfft_plan_destroy(plan); ++ plan = nullptr; ++ } ++ if(info != nullptr) ++ { ++ rocfft_execution_info_destroy(info); ++ info = nullptr; ++ } ++ if(desc != nullptr) ++ { ++ rocfft_plan_description_destroy(desc); ++ desc = nullptr; ++ } ++ wbuffer.free(); ++ } ++ ++ void validate_fields() const override ++ { ++ // row-major lengths including batch (i.e. 
batch is at the front) ++ std::vector<size_t> length_with_batch{nbatch}; ++ std::copy(length.begin(), length.end(), std::back_inserter(length_with_batch)); ++ ++ auto validate_field = [&](const fft_field& f) { ++ for(const auto& b : f.bricks) ++ { ++ // bricks must have same dim as FFT, including batch ++ if(b.lower.size() != length.size() + 1 || b.upper.size() != length.size() + 1 ++ || b.stride.size() != length.size() + 1) ++ throw std::runtime_error( ++ "brick dimension does not match FFT + batch dimension"); ++ ++ // ensure lower < upper, and that both fit in the FFT + batch dims ++ if(!std::lexicographical_compare( ++ b.lower.begin(), b.lower.end(), b.upper.begin(), b.upper.end())) ++ throw std::runtime_error("brick lower index is not less than upper index"); ++ ++ if(!std::lexicographical_compare(b.lower.begin(), ++ b.lower.end(), ++ length_with_batch.begin(), ++ length_with_batch.end())) ++ throw std::runtime_error( ++ "brick lower index is not less than FFT + batch length"); ++ ++ if(!std::lexicographical_compare(b.upper.begin(), ++ b.upper.end(), ++ length_with_batch.begin(), ++ length_with_batch.end()) ++ && b.upper != length_with_batch) ++ throw std::runtime_error("brick upper index is not <= FFT + batch length"); ++ } ++ }; ++ ++ for(const auto& ifield : ifields) ++ validate_field(ifield); ++ for(const auto& ofield : ofields) ++ validate_field(ofield); ++ } ++ ++ rocfft_precision get_rocfft_precision() ++ { ++ return rocfft_precision_from_fftparams(precision); ++ } ++ ++ size_t vram_footprint() override ++ { ++ size_t val = fft_params::vram_footprint(); ++ if(setup_structs() != fft_status_success) ++ { ++ throw std::runtime_error("Struct setup failed"); ++ } ++ val += workbuffersize; ++ ++ return val; ++ } ++ ++ // Convert the generic fft_field structure to a rocfft_field ++ // structure that can be passed to rocFFT. In particular, we need ++ // to convert from row-major to column-major. 
++ static rocfft_field fft_field_to_rocfft_field(const fft_field& f) ++ { ++ rocfft_field rfield = nullptr; ++ if(f.bricks.empty()) ++ return rfield; ++ ++ if(rocfft_field_create(&rfield) != rocfft_status_success) ++ throw std::runtime_error("rocfft_field_create failed"); ++ for(const auto& b : f.bricks) ++ { ++ // rocFFT wants column-major bricks and fft_params stores ++ // row-major ++ std::vector<size_t> lower_cm; ++ std::copy(b.lower.rbegin(), b.lower.rend(), std::back_inserter(lower_cm)); ++ std::vector<size_t> upper_cm; ++ std::copy(b.upper.rbegin(), b.upper.rend(), std::back_inserter(upper_cm)); ++ std::vector<size_t> stride_cm; ++ std::copy(b.stride.rbegin(), b.stride.rend(), std::back_inserter(stride_cm)); ++ ++ rocfft_brick rbrick = nullptr; ++ if(rocfft_brick_create(&rbrick, ++ lower_cm.data(), // field_lower ++ upper_cm.data(), // field_upper ++ stride_cm.data(), // brick_stride ++ lower_cm.size(), // dim ++ b.device) // deviceID ++ != rocfft_status_success) ++ throw std::runtime_error("rocfft_brick_create failed"); ++ ++ if(rocfft_field_add_brick(rfield, rbrick) != rocfft_status_success) ++ throw std::runtime_error("rocfft_field_add_brick failed"); ++ ++ rocfft_brick_destroy(rbrick); ++ } ++ return rfield; ++ } ++ ++ fft_status setup_structs() ++ { ++ rocfft_status fft_status = rocfft_status_success; ++ if(desc == nullptr) ++ { ++ rocfft_plan_description_create(&desc); ++ if(fft_status != rocfft_status_success) ++ return fft_status_from_rocfftparams(fft_status); ++ ++ fft_status ++ = rocfft_plan_description_set_data_layout(desc, ++ rocfft_array_type_from_fftparams(itype), ++ rocfft_array_type_from_fftparams(otype), ++ ioffset.data(), ++ ooffset.data(), ++ istride_cm().size(), ++ istride_cm().data(), ++ idist, ++ ostride_cm().size(), ++ ostride_cm().data(), ++ odist); ++ if(fft_status != rocfft_status_success) ++ { ++ throw std::runtime_error("rocfft_plan_description_set_data_layout failed"); ++ } ++ ++ if(scale_factor != 1.0) ++ { ++ fft_status = 
rocfft_plan_description_set_scale_factor(desc, scale_factor); ++ if(fft_status != rocfft_status_success) ++ { ++ throw std::runtime_error("rocfft_plan_description_set_scale_factor failed"); ++ } ++ } ++ ++ for(const auto& ifield : ifields) ++ { ++ rocfft_field infield = fft_field_to_rocfft_field(ifield); ++ if(rocfft_plan_description_add_infield(desc, infield) != rocfft_status_success) ++ throw std::runtime_error("rocfft_description_add_infield failed"); ++ rocfft_field_destroy(infield); ++ } ++ ++ for(const auto& ofield : ofields) ++ { ++ rocfft_field outfield = fft_field_to_rocfft_field(ofield); ++ if(rocfft_plan_description_add_outfield(desc, outfield) != rocfft_status_success) ++ throw std::runtime_error("rocfft_description_add_outfield failed"); ++ rocfft_field_destroy(outfield); ++ } ++ } ++ ++ if(plan == nullptr) ++ { ++ fft_status = rocfft_plan_create(&plan, ++ rocfft_result_placement_from_fftparams(placement), ++ rocfft_transform_type_from_fftparams(transform_type), ++ get_rocfft_precision(), ++ length_cm().size(), ++ length_cm().data(), ++ nbatch, ++ desc); ++ if(fft_status != rocfft_status_success) ++ { ++ throw std::runtime_error("rocfft_plan_create failed"); ++ } ++ } ++ ++ if(info == nullptr) ++ { ++ fft_status = rocfft_execution_info_create(&info); ++ if(fft_status != rocfft_status_success) ++ { ++ throw std::runtime_error("rocfft_execution_info_create failed"); ++ } ++ } ++ ++ fft_status = rocfft_plan_get_work_buffer_size(plan, &workbuffersize); ++ if(fft_status != rocfft_status_success) ++ { ++ throw std::runtime_error("rocfft_plan_get_work_buffer_size failed"); ++ } ++ ++ return fft_status_from_rocfftparams(fft_status); ++ } ++ ++ fft_status create_plan() override ++ { ++ fft_status ret = setup_structs(); ++ if(ret != fft_status_success) ++ { ++ return ret; ++ } ++ if(workbuffersize > 0) ++ { ++ hipError_t hip_status = hipSuccess; ++ hip_status = wbuffer.alloc(workbuffersize); ++ if(hip_status != hipSuccess) ++ { ++ std::ostringstream oss; ++ oss 
<< "work buffer allocation failed (" << workbuffersize << " requested)"; ++ size_t mem_free = 0; ++ size_t mem_total = 0; ++ hip_status = hipMemGetInfo(&mem_free, &mem_total); ++ if(hip_status == hipSuccess) ++ { ++ oss << "free vram: " << mem_free << " total vram: " << mem_total; ++ } ++ else ++ { ++ oss << "hipMemGetInfo also failed"; ++ } ++ throw work_buffer_alloc_failure(oss.str()); ++ } ++ ++ auto rocret ++ = rocfft_execution_info_set_work_buffer(info, wbuffer.data(), workbuffersize); ++ if(rocret != rocfft_status_success) ++ { ++ throw std::runtime_error("rocfft_execution_info_set_work_buffer failed"); ++ } ++ } ++ ++ return ret; ++ } ++ ++ fft_status set_callbacks(void* load_cb_host, ++ void* load_cb_data, ++ void* store_cb_host, ++ void* store_cb_data) override ++ { ++ if(run_callbacks) ++ { ++ auto roc_status ++ = rocfft_execution_info_set_load_callback(info, &load_cb_host, &load_cb_data, 0); ++ if(roc_status != rocfft_status_success) ++ return fft_status_from_rocfftparams(roc_status); ++ ++ roc_status ++ = rocfft_execution_info_set_store_callback(info, &store_cb_host, &store_cb_data, 0); ++ if(roc_status != rocfft_status_success) ++ return fft_status_from_rocfftparams(roc_status); ++ } ++ return fft_status_success; ++ } ++ ++ fft_status execute(void** in, void** out) override ++ { ++ auto ret = rocfft_execute(plan, in, out, info); ++ return fft_status_from_rocfftparams(ret); ++ } ++ ++ // scatter data to multiple GPUs and adjust I/O buffers to match ++ void multi_gpu_prepare(std::vector<gpubuf>& ibuffer, ++ std::vector<void*>& pibuffer, ++ std::vector<void*>& pobuffer) override ++ { ++ auto alloc_fields = [&](const fft_params::fft_field& field, ++ fft_array_type array_type, ++ std::vector<void*>& pbuffer, ++ bool copy_input) { ++ if(field.bricks.empty()) ++ return; ++ ++ // we have a field defined, clear the list of buffers as ++ // we'll be allocating new ones for each brick ++ pbuffer.clear(); ++ ++ for(const auto& b : field.bricks) ++ { ++ // get 
brick's length - note that this includes batch ++ // dimension ++ const auto brick_len = b.length(); ++ const auto brick_stride = b.stride; ++ ++ const size_t brick_size_elems = product(brick_len.begin(), brick_len.end()); ++ const size_t elem_size_bytes = var_size<size_t>(precision, array_type); ++ const size_t brick_size_bytes = brick_size_elems * elem_size_bytes; ++ ++ // set device for the alloc, but we want to return to the ++ // default device as the source of a following memcpy ++ { ++ rocfft_scoped_device dev(b.device); ++ multi_gpu_data.emplace_back(); ++ if(multi_gpu_data.back().alloc(brick_size_bytes) != hipSuccess) ++ throw std::runtime_error("device allocation failure"); ++ pbuffer.push_back(multi_gpu_data.back().data()); ++ } ++ ++ if(copy_input) ++ { ++ // For now, assume we're only splitting on highest FFT ++ // dimension, lower-dimensional FFT data is all ++ // contiguous, and batches are contiguous in each brick. ++ // ++ // That means we can express this as a 2D memcpy. 
++ const size_t unbatched_elems_per_brick ++ = product(brick_len.begin() + 1, brick_len.end()); ++ const size_t unbatched_elems_per_fft = product(length.begin(), length.end()); ++ ++ // get this brick's starting offset in the field ++ const size_t brick_offset ++ = b.lower_field_offset(istride, idist) * elem_size_bytes; ++ ++ // copy from original input - note that we're ++ // assuming interleaved data so ibuffer has only one ++ // gpubuf ++ if(hipMemcpy2D(pbuffer.back(), ++ unbatched_elems_per_brick * elem_size_bytes, ++ ibuffer.front().data_offset(brick_offset), ++ unbatched_elems_per_fft * elem_size_bytes, ++ unbatched_elems_per_brick * elem_size_bytes, ++ brick_len.front(), ++ hipMemcpyHostToDevice) ++ != hipSuccess) ++ throw std::runtime_error("hipMemcpy failure"); ++ } ++ } ++ ++ // if we copied the input to all the other devices, and ++ // this is an out-of-place transform, we no longer ++ // need the original input ++ if(copy_input && placement == fft_placement_notinplace) ++ ibuffer.clear(); ++ }; ++ ++ // assume one input, one output field for simple cases ++ if(!ifields.empty()) ++ alloc_fields(ifields.front(), itype, pibuffer, true); ++ if(!ofields.empty()) ++ { ++ if(!ifields.empty() && placement == fft_placement_inplace) ++ pobuffer = pibuffer; ++ else ++ alloc_fields(ofields.front(), otype, pobuffer, false); ++ } ++ } ++ ++ // when preparing for multi-GPU transform, we need to allocate data ++ // on each GPU. This vector remembers all of those allocations. 
++ std::vector<gpubuf> multi_gpu_data; ++ ++ // gather data after multi-GPU FFT for verification ++ void multi_gpu_finalize(std::vector<gpubuf>& obuffer, std::vector<void*>& pobuffer) override ++ { ++ if(ofields.empty()) ++ return; ++ ++ for(size_t i = 0; i < ofields.front().bricks.size(); ++i) ++ { ++ const auto& b = ofields.front().bricks[i]; ++ const auto& brick_ptr = pobuffer[i]; ++ ++ const auto brick_len = b.length(); ++ ++ const size_t elem_size_bytes = var_size<size_t>(precision, otype); ++ ++ // get this brick's starting offset in the field ++ const size_t brick_offset = b.lower_field_offset(ostride, odist) * elem_size_bytes; ++ ++ // switch device to where we're copying from ++ rocfft_scoped_device dev(b.device); ++ ++ // For now, assume we're only splitting on highest FFT ++ // dimension, lower-dimensional FFT data is all ++ // contiguous, and batches are contiguous in each brick. ++ // ++ // That means we can express this as a 2D memcpy. ++ const size_t unbatched_elems_per_brick ++ = product(brick_len.begin() + 1, brick_len.end()); ++ const auto output_length = olength(); ++ const size_t unbatched_elems_per_fft ++ = product(output_length.begin(), output_length.end()); ++ ++ // copy to original output buffer - note that ++ // we're assuming interleaved data so obuffer ++ // has only one gpubuf ++ if(hipMemcpy2D(obuffer.front().data_offset(brick_offset), ++ unbatched_elems_per_fft * elem_size_bytes, ++ brick_ptr, ++ unbatched_elems_per_brick * elem_size_bytes, ++ unbatched_elems_per_brick * elem_size_bytes, ++ brick_len.front(), ++ hipMemcpyDeviceToDevice) ++ != hipSuccess) ++ throw std::runtime_error("hipMemcpy failure"); ++ ++ // device-to-device transfers don't synchronize with the ++ // host, add explicit sync ++ (void)hipDeviceSynchronize(); ++ } ++ pobuffer.clear(); ++ pobuffer.push_back(obuffer.front().data()); ++ } ++}; ++ ++#endif +diff --git a/shared/test_params.h b/shared/test_params.h +new file mode 100644 +index 0000000..8d8f6f7 +--- 
/dev/null ++++ b/shared/test_params.h +@@ -0,0 +1,51 @@ ++// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. ++// ++// Permission is hereby granted, free of charge, to any person obtaining a copy ++// of this software and associated documentation files (the "Software"), to deal ++// in the Software without restriction, including without limitation the rights ++// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++// copies of the Software, and to permit persons to whom the Software is ++// furnished to do so, subject to the following conditions: ++// ++// The above copyright notice and this permission notice shall be included in ++// all copies or substantial portions of the Software. ++// ++// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. 
#pragma once
#ifndef TESTCONSTANTS_H
#define TESTCONSTANTS_H

// size_t is used by the declarations below; include the header that
// declares it instead of relying on a transitive include.
#include <cstddef>
#include <stdexcept>

// Declarations only: every variable below is defined in some other
// translation unit.
// NOTE(review): the grouping suggests test-client settings and tolerances -
// confirm meaning and units against the defining source file.

extern int    verbose;
extern size_t ramgb;
extern size_t vramgb;

extern size_t n_random_tests;

extern size_t random_seed;
extern double planar_prob;
extern double callback_prob;

extern double half_epsilon;
extern double single_epsilon;
extern double double_epsilon;
extern bool   skip_runtime_fails;

extern double max_linf_eps_double;
extern double max_l2_eps_double;
extern double max_linf_eps_single;
extern double max_l2_eps_single;
extern double max_linf_eps_half;
extern double max_l2_eps_half;

extern int n_hip_failures;

#endif

// Patch metadata and license text that share this line of the enclosing
// diff, kept verbatim:
// +diff --git a/shared/work_queue.h b/shared/work_queue.h
// +new file mode 100644
// +index 0000000..e13fc41
// +--- /dev/null
// ++++ b/shared/work_queue.h
// +@@ -0,0 +1,49 @@
// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE ++// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++// THE SOFTWARE. ++ ++#pragma once ++ ++#include <condition_variable> ++#include <mutex> ++#include <queue> ++template <typename _WorkItem> ++struct WorkQueue ++{ ++ void push(_WorkItem&& i) ++ { ++ std::unique_lock<std::mutex> lock(queueMutex); ++ items.emplace(std::move(i)); ++ emptyWait.notify_all(); ++ } ++ _WorkItem pop() ++ { ++ std::unique_lock<std::mutex> lock(queueMutex); ++ while(items.empty()) ++ emptyWait.wait(lock); ++ _WorkItem item(items.front()); ++ items.pop(); ++ return item; ++ } ++ ++private: ++ std::queue<_WorkItem> items; ++ std::mutex queueMutex; ++ std::condition_variable emptyWait; ++}; diff --git a/var/spack/repos/builtin/packages/hipfft/package.py b/var/spack/repos/builtin/packages/hipfft/package.py index 818a9c4935..f5749749ac 100644 --- a/var/spack/repos/builtin/packages/hipfft/package.py +++ b/var/spack/repos/builtin/packages/hipfft/package.py @@ -14,9 +14,9 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage): It sits between the application and the backend FFT library, marshalling inputs into the backend and results back to the application.""" - homepage = "https://github.com/ROCmSoftwarePlatform/hipFFT" - git = "https://github.com/ROCmSoftwarePlatform/hipFFT.git" - url = "https://github.com/ROCmSoftwarePlatform/hipfft/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/hipFFT" + git = "https://github.com/ROCm/hipFFT.git" + url = "https://github.com/ROCm/hipfft/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("renjithravindrankannath", "srekolam") @@ -24,6 +24,7 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage): license("MIT") version("master", branch="master") + version("6.0.0", 
sha256="44f328b7862c066459089dfe62833cb7d626c6ceb71c57d8c7d6bba45dad491e") version("5.7.1", sha256="33452576649df479f084076c47d0b30f6f1da34864094bce767dd9bf609f04aa") version("5.7.0", sha256="daa5dc44580145e85ff8ffa7eb40a3d1ef41f3217549c01281715ff696a31588") version("5.6.1", sha256="d2ae36b8eacd39b865e8a7972b8eb86bcea2de4ac90711bba7e29b39b01eaa74") @@ -125,6 +126,7 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) depends_on("rocfft@" + ver, when="+rocm @" + ver) @@ -133,6 +135,8 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage): depends_on( "rocfft amdgpu_target={0}".format(tgt), when="+rocm amdgpu_target={0}".format(tgt) ) + # https://github.com/ROCm/rocFFT/pull/85) + patch("001-remove-submodule-and-sync-shared-files-from-rocFFT.patch", when="@6.0.0") def cmake_args(self): args = [self.define("BUILD_CLIENTS_SAMPLES", "OFF")] diff --git a/var/spack/repos/builtin/packages/hipfort/package.py b/var/spack/repos/builtin/packages/hipfort/package.py index be1819bf50..8e8ea5a0a6 100644 --- a/var/spack/repos/builtin/packages/hipfort/package.py +++ b/var/spack/repos/builtin/packages/hipfort/package.py @@ -9,14 +9,15 @@ from spack.package import * class Hipfort(CMakePackage): """Radeon Open Compute Parallel Primitives Library""" - homepage = "https://github.com/ROCmSoftwarePlatform/hipfort" - git = "https://github.com/ROCmSoftwarePlatform/hipfort.git" - url = "https://github.com/ROCmSoftwarePlatform/hipfort/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/hipfort" + git = "https://github.com/ROCm/hipfort.git" + url = "https://github.com/ROCm/hipfort/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] license("MIT") maintainers("cgmb", "srekolam", "renjithravindrankannath") + version("6.0.0", sha256="151cf11648885db799aade0d00a7882589e7195643b02beaa251f1b2a43aceed") version("5.7.1", 
sha256="859fac509e195f3ab97c555b5f63afea325a61aae0f281cb19a970a1b533dead") version("5.7.0", sha256="57b04d59f61683a1b141d6d831d10c9fdecea483991ec02d14c14e441e935c05") version("5.6.1", sha256="a55345cc9ccaf0cd69d306b8eb9ec2a02c220a57e9c396443cc7273aa3377adc") @@ -127,6 +128,7 @@ class Hipfort(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hip@" + ver, type="build", when="@" + ver) diff --git a/var/spack/repos/builtin/packages/hipify-clang/package.py b/var/spack/repos/builtin/packages/hipify-clang/package.py index ab15e479d4..b1c5f2a7fb 100644 --- a/var/spack/repos/builtin/packages/hipify-clang/package.py +++ b/var/spack/repos/builtin/packages/hipify-clang/package.py @@ -10,9 +10,9 @@ class HipifyClang(CMakePackage): """hipify-clang is a clang-based tool for translation CUDA sources into HIP sources""" - homepage = "https://github.com/ROCm-Developer-Tools/HIPIFY" - git = "https://github.com/ROCm-Developer-Tools/HIPIFY.git" - url = "https://github.com/ROCm-Developer-Tools/HIPIFY/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/HIPIFY" + git = "https://github.com/ROCm/HIPIFY.git" + url = "https://github.com/ROCm/HIPIFY/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") @@ -20,6 +20,7 @@ class HipifyClang(CMakePackage): license("MIT") version("master", branch="master") + version("6.0.0", sha256="91bed2b72a6684a04e078e50b12b36b93f64ff96523283f4e5d9a33c11e6b967") version("5.7.1", sha256="43121e62233dab010ab686d6805bc2d3163f0dc5e89cc503d50c4bcd59eeb394") version("5.7.0", sha256="10e4386727e102fba166f012147120a6ec776e8d95fbcac3af93e243205d80a6") version("5.6.1", sha256="ec3a4f276556f9fd924ea3c89be11b6c6ddf999cdd4387f669e38e41ee0042e8") @@ -143,11 +144,12 @@ class HipifyClang(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", "master", ]: depends_on("llvm-amdgpu@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", 
"5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) def setup_run_environment(self, env): diff --git a/var/spack/repos/builtin/packages/hiprand/package.py b/var/spack/repos/builtin/packages/hiprand/package.py index 0d8666f884..acc3629762 100644 --- a/var/spack/repos/builtin/packages/hiprand/package.py +++ b/var/spack/repos/builtin/packages/hiprand/package.py @@ -12,9 +12,9 @@ class Hiprand(CMakePackage, CudaPackage, ROCmPackage): """The hipRAND project provides an interface for generating pseudo-random and quasi-random numbers with either cuRAND or rocRAND backends.""" - homepage = "https://github.com/ROCmSoftwarePlatform/hipRAND" - git = "https://github.com/ROCmSoftwarePlatform/hipRAND.git" - url = "https://github.com/ROCmSoftwarePlatform/hipRAND/archive/rocm-5.7.1.tar.gz" + homepage = "https://github.com/ROCm/hipRAND" + git = "https://github.com/ROCm/hipRAND.git" + url = "https://github.com/ROCm/hipRAND/archive/rocm-5.7.1.tar.gz" tags = ["rocm"] maintainers("cgmb", "srekolam", "renjithravindrankannath") @@ -24,6 +24,7 @@ class Hiprand(CMakePackage, CudaPackage, ROCmPackage): version("develop", branch="develop") version("master", branch="master") + version("6.0.0", sha256="7e06c98f9da7c0b20b55b2106cf3a48b9ef6577a79549a455667ae97bd15b61d") version("5.7.1", sha256="81a9f5f0960dce125ce1ab1c7eb58bb07c8756346f9e46a1cc65aa61d5a114f8") version("5.7.0", sha256="4dee76719839503b02ce7d38e1c61bbdb2da18da7f63a7ef7012c84c71aa0a9d") version("5.6.1", sha256="a73d5578bc7f8dff0b8960e4bff97bc4fc28f508a19ed6acd1cfd4d3e76b47ee") @@ -88,6 +89,7 @@ class Hiprand(CMakePackage, CudaPackage, ROCmPackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", "master", "develop", ]: diff --git a/var/spack/repos/builtin/packages/hipsolver/package.py b/var/spack/repos/builtin/packages/hipsolver/package.py index f39755d03d..81c956334c 100644 --- a/var/spack/repos/builtin/packages/hipsolver/package.py +++ b/var/spack/repos/builtin/packages/hipsolver/package.py @@ -16,9 +16,9 @@ 
class Hipsolver(CMakePackage, CudaPackage, ROCmPackage): regardless of the chosen backend. Currently, hipSOLVER supports rocSOLVER and cuSOLVER as backends.""" - homepage = "https://github.com/ROCmSoftwarePlatform/hipSOLVER" - git = "https://github.com/ROCmSoftwarePlatform/hipSOLVER.git" - url = "https://github.com/ROCmSoftwarePlatform/hipSOLVER/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/hipSOLVER" + git = "https://github.com/ROCm/hipSOLVER.git" + url = "https://github.com/ROCm/hipSOLVER/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("cgmb", "srekolam", "renjithravindrankannath") @@ -28,6 +28,7 @@ class Hipsolver(CMakePackage, CudaPackage, ROCmPackage): version("develop", branch="develop") version("master", branch="master") + version("6.0.0", sha256="385849db02189d5e62096457e52ae899ae5c1ae7d409dc1da61f904d8861b48c") version("5.7.1", sha256="5592e965c0dc5722931302289643d1ece370220af2c7afc58af97b3395295658") version("5.7.0", sha256="0e35795bfbcb57ed8e8437471209fb7d230babcc31d9a4a0b3640c3ee639f4a7") version("5.6.1", sha256="2e546bc7771f7bf0aa7892b69cded725941573e8b70614759c3d03c21eb78dde") @@ -115,6 +116,7 @@ class Hipsolver(CMakePackage, CudaPackage, ROCmPackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", "master", "develop", ]: diff --git a/var/spack/repos/builtin/packages/hipsparse/package.py b/var/spack/repos/builtin/packages/hipsparse/package.py index a195356fa4..0473a3ea3d 100644 --- a/var/spack/repos/builtin/packages/hipsparse/package.py +++ b/var/spack/repos/builtin/packages/hipsparse/package.py @@ -12,9 +12,9 @@ class Hipsparse(CMakePackage, CudaPackage, ROCmPackage): """hipSPARSE is a SPARSE marshalling library, with multiple supported backends""" - homepage = "https://github.com/ROCmSoftwarePlatform/hipSPARSE" - git = "https://github.com/ROCmSoftwarePlatform/hipSPARSE.git" - url = "https://github.com/ROCmSoftwarePlatform/hipSPARSE/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/hipSPARSE" + git = 
"https://github.com/ROCm/hipSPARSE.git" + url = "https://github.com/ROCm/hipSPARSE/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie") @@ -22,6 +22,7 @@ class Hipsparse(CMakePackage, CudaPackage, ROCmPackage): license("MIT") + version("6.0.0", sha256="718a5f03b6a579c0542a60d00f5688bec53a181b429b7ee8ce3c8b6c4a78d754") version("5.7.1", sha256="16c3818260611226c3576d8d55ad8f51e0890d2473503edf2c9313250ae65ca7") version("5.7.0", sha256="729b749b5340034639873a99e6091963374f6f0456c8f36d076c96f03fe43888") version("5.6.1", sha256="d636d0c5d1e38cc0c09b1e95380199ec82bd465b94bd6661f0c8d9374d9b565d") @@ -160,6 +161,7 @@ class Hipsparse(CMakePackage, CudaPackage, ROCmPackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) depends_on("rocsparse@" + ver, when="+rocm @" + ver) diff --git a/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py b/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py index d0a153a595..6b64a0129b 100644 --- a/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py +++ b/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py @@ -15,15 +15,16 @@ class HsaRocrDev(CMakePackage): HSA ROCm kernel agents.AMD Heterogeneous System Architecture HSA - Linux HSA Runtime for Boltzmann (ROCm) platforms.""" - homepage = "https://github.com/RadeonOpenCompute/ROCR-Runtime" - git = "https://github.com/RadeonOpenCompute/ROCR-Runtime.git" - url = "https://github.com/RadeonOpenCompute/ROCR-Runtime/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/ROCR-Runtime" + git = "https://github.com/ROCm/ROCR-Runtime.git" + url = "https://github.com/ROCm/ROCR-Runtime/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath", "haampie") libraries = ["libhsa-runtime64"] version("master", branch="master") + version("6.0.0", sha256="99e8fa1af52d0bf382f28468e1a345af1ff3452c35914a6a7b5eeaf69fc568db") 
version("5.7.1", sha256="655e9bfef4b0b6ad3f9b89c934dc0a8377273bb0bccbda6c399ac5d5d2c1c04c") version("5.7.0", sha256="2c56ec5c78a36f2b847afd4632cb25dbf6ecc58661eb2ae038c2552342e6ce23") version("5.6.1", sha256="4de9a57c2092edf9398d671c8a2c60626eb7daf358caf710da70d9c105490221") @@ -154,6 +155,7 @@ class HsaRocrDev(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", "master", ]: depends_on("hsakmt-roct@" + ver, when="@" + ver) @@ -163,7 +165,7 @@ class HsaRocrDev(CMakePackage): "rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver) ) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) # Both 3.5.0 and 3.7.0 force INSTALL_RPATH in different ways @@ -210,4 +212,7 @@ class HsaRocrDev(CMakePackage): if self.spec.satisfies("@5.6:"): args.append("-DCMAKE_INSTALL_LIBDIR=lib") + if self.spec.satisfies("@6.0:"): + args.append(self.define("ROCM_PATCH_VERSION", "60000")) + return args diff --git a/var/spack/repos/builtin/packages/hsakmt-roct/package.py b/var/spack/repos/builtin/packages/hsakmt-roct/package.py index e087ea6519..89be71a9ea 100644 --- a/var/spack/repos/builtin/packages/hsakmt-roct/package.py +++ b/var/spack/repos/builtin/packages/hsakmt-roct/package.py @@ -14,14 +14,15 @@ class HsakmtRoct(CMakePackage): Thunk Interface is a user-mode API interfaces used to interact with the ROCk driver.""" - homepage = "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface" - git = "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface.git" - url = "https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/ROCT-Thunk-Interface" + git = "https://github.com/ROCm/ROCT-Thunk-Interface.git" + url = "https://github.com/ROCm/ROCT-Thunk-Interface/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") 
version("master", branch="master") + version("6.0.0", sha256="9f4e80bd0a714ce45326941b906a62298c62025eff186dc6c48282ce84c787c7") version("5.7.1", sha256="38bc3732886a52ca9cd477ec6fcde3ab17a0ba5dc8e2f7ac34c4de597bd00e8b") version("5.7.0", sha256="52293e40c4ba0c653d796e2f6109f5fb4c79f5fb82310ecbfd9a5432acf9da43") version("5.6.1", sha256="d60b355bfd21a08e0e36270fd56f98d052c3c6edca47da887fa32bf32759c29b") @@ -119,11 +120,11 @@ class HsakmtRoct(CMakePackage): for ver in ["5.3.0", "5.4.0", "5.4.3"]: depends_on("llvm-amdgpu@" + ver, type="test", when="@" + ver) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) depends_on("llvm-amdgpu@" + ver, type="test", when="@" + ver) - # See https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/issues/72 + # See https://github.com/ROCm/ROCT-Thunk-Interface/issues/72 # and https://github.com/spack/spack/issues/28398 patch("0001-Remove-compiler-support-libraries-and-libudev-as-req.patch", when="@4.5.0:5.2") patch("0002-Remove-compiler-support-libraries-and-libudev-as-req-5.3.patch", when="@5.3.0:5.4") diff --git a/var/spack/repos/builtin/packages/legion/package.py b/var/spack/repos/builtin/packages/legion/package.py index 2840d577de..7cc446cded 100644 --- a/var/spack/repos/builtin/packages/legion/package.py +++ b/var/spack/repos/builtin/packages/legion/package.py @@ -74,6 +74,7 @@ class Legion(CMakePackage, ROCmPackage): # https://github.com/spack/spack/issues/37232#issuecomment-1553376552 patch("hip-offload-arch.patch", when="@23.03.0 +rocm") + patch("update-hip-path-legion-23.06.0.patch", when="@23.06.0 ^hip@6.0.0 +rocm") def patch(self): if "network=gasnet conduit=ofi-slingshot11 ^cray-mpich+wrappers" in self.spec: @@ -349,6 +350,10 @@ class Legion(CMakePackage, ROCmPackage): options.append(from_variant("Legion_HIP_ARCH", "amdgpu_target")) options.append(from_variant("Legion_HIJACK_HIP", 
"hip_hijack")) options.append(self.define("HIP_PATH", "{0}/hip".format(spec["hip"].prefix))) + if "^hip@:5.7" in spec: + options.append(self.define("HIP_PATH", "{0}/hip".format(spec["hip"].prefix))) + elif "^hip@6.0:" in spec: + options.append(self.define("HIP_PATH", "{0}".format(spec["hip"].prefix))) if "+fortran" in spec: # default is off. diff --git a/var/spack/repos/builtin/packages/legion/update-hip-path-legion-23.06.0.patch b/var/spack/repos/builtin/packages/legion/update-hip-path-legion-23.06.0.patch new file mode 100644 index 0000000000..9f7f6a7a86 --- /dev/null +++ b/var/spack/repos/builtin/packages/legion/update-hip-path-legion-23.06.0.patch @@ -0,0 +1,13 @@ +diff --git a/cmake/FindHIP.cmake b/cmake/FindHIP.cmake +index f86edd2..24492ad 100644 +--- a/cmake/FindHIP.cmake ++++ b/cmake/FindHIP.cmake +@@ -22,7 +22,7 @@ if(NOT DEFINED HIP_PATH) + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to where HIP has been installed") + endif() + endif() +-include(${HIP_PATH}/cmake/FindHIP.cmake) ++include(${HIP_PATH}/lib/cmake/hip/FindHIP.cmake) + + if(NOT HIP_INCLUDE_DIRS) + list(APPEND HIP_INCLUDE_DIRS diff --git a/var/spack/repos/builtin/packages/llvm-amdgpu/package.py b/var/spack/repos/builtin/packages/llvm-amdgpu/package.py index f8cddebf84..99a2e67488 100644 --- a/var/spack/repos/builtin/packages/llvm-amdgpu/package.py +++ b/var/spack/repos/builtin/packages/llvm-amdgpu/package.py @@ -14,9 +14,9 @@ class LlvmAmdgpu(CMakePackage): """Toolkit for the construction of highly optimized compilers, optimizers, and run-time environments.""" - homepage = "https://github.com/RadeonOpenCompute/llvm-project" - git = "https://github.com/RadeonOpenCompute/llvm-project.git" - url = "https://github.com/RadeonOpenCompute/llvm-project/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/llvm-project" + git = "https://github.com/ROCm/llvm-project.git" + url = "https://github.com/ROCm/llvm-project/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] executables = [r"amdclang", 
r"amdclang\+\+", r"amdflang", r"clang.*", r"flang.*", "llvm-.*"] generator("ninja") @@ -26,6 +26,7 @@ class LlvmAmdgpu(CMakePackage): license("Apache-2.0") version("master", branch="amd-stg-open") + version("6.0.0", sha256="c673708d413d60ca8606ee75c77e9871b6953c59029c987b92f2f6e85f683626") version("5.7.1", sha256="6b54c422e45ad19c9bf5ab090ec21753e7f7d854ca78132c30eb146657b168eb") version("5.7.0", sha256="4abdf00b297a77c5886cedb37e63acda2ba11cb9f4c0a64e133b05800aadfcf0") version("5.6.1", sha256="045e43c0c4a3f4f2f1db9fb603a4f1ea3d56e128147e19ba17909eb57d7f08e5") @@ -167,12 +168,12 @@ class LlvmAmdgpu(CMakePackage): # as per 5.2.0 llvm code. It used to be llvm/bin/../lib/libdevice. # Below patch is to look in the old path. patch("adjust-openmp-bitcode-directory-for-llvm-link.patch", when="@5.2.0:5.6") - patch("0001-update-HIP_PATH-deduction-for-5.7.0.patch", when="@5.7.0:5.7") + patch("0001-update-HIP_PATH-deduction-for-5.7.0.patch", when="@5.7.0:6.0") # Below patch is to set the flag -mcode-object-version=none until # the below fix is available in device-libs release code. - # https://github.com/RadeonOpenCompute/ROCm-Device-Libs/commit/f0356159dbdc93ea9e545f9b61a7842f9c881fdf - patch("patch-llvm-5.5.0.patch", when="@5.5: +rocm-device-libs") + # https://github.com/ROCm/ROCm-Device-Libs/commit/f0356159dbdc93ea9e545f9b61a7842f9c881fdf + patch("patch-llvm-5.5.0.patch", when="@5.5:5.7 +rocm-device-libs") # i1 muls can sometimes happen after SCEV. # They resulted in ISel failures because we were missing the patterns for them. 
@@ -188,6 +189,7 @@ class LlvmAmdgpu(CMakePackage): # Add device libs sources so they can be an external LLVM project for d_version, d_shasum in [ + ("6.0.0", "198df4550d4560537ba60ac7af9bde31d59779c8ec5d6309627f77a43ab6ef6f"), ("5.7.1", "703de8403c0bd0d80f37c970a698f10f148daf144d34f982e4484d04f7c7bbef"), ("5.7.0", "0f8780b9098573f1c456bdc84358de924dcf00604330770a383983e1775bf61e"), ("5.6.1", "f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c"), @@ -221,7 +223,7 @@ class LlvmAmdgpu(CMakePackage): resource( name="rocm-device-libs", placement="rocm-device-libs", - url="https://github.com/RadeonOpenCompute/ROCm-Device-Libs/archive/rocm-{0}.tar.gz".format( + url="https://github.com/ROCm/ROCm-Device-Libs/archive/rocm-{0}.tar.gz".format( d_version ), sha256=d_shasum, @@ -231,11 +233,12 @@ class LlvmAmdgpu(CMakePackage): resource( name="rocm-device-libs", placement="rocm-device-libs", - git="https://github.com/RadeonOpenCompute/ROCm-Device-Libs.git", + git="https://github.com/ROCm/ROCm-Device-Libs.git", branch="amd-stg-open", when="@master +rocm-device-libs", ) for d_version, d_shasum in [ + ("6.0.0", "99e8fa1af52d0bf382f28468e1a345af1ff3452c35914a6a7b5eeaf69fc568db"), ("5.7.1", "655e9bfef4b0b6ad3f9b89c934dc0a8377273bb0bccbda6c399ac5d5d2c1c04c"), ("5.7.0", "2c56ec5c78a36f2b847afd4632cb25dbf6ecc58661eb2ae038c2552342e6ce23"), ("5.6.1", "4de9a57c2092edf9398d671c8a2c60626eb7daf358caf710da70d9c105490221"), @@ -244,19 +247,20 @@ class LlvmAmdgpu(CMakePackage): resource( name="hsa-runtime", placement="hsa-runtime", - url=f"https://github.com/RadeonOpenCompute/ROCR-Runtime/archive/rocm-{d_version}.tar.gz", + url=f"https://github.com/ROCm/ROCR-Runtime/archive/rocm-{d_version}.tar.gz", sha256=d_shasum, when="@{0}".format(d_version), ) resource( name="hsa-runtime", placement="hsa-runtime", - git="https://github.com/RadeonOpenCompute/ROCR-Runtime.git", + git="https://github.com/ROCm/ROCR-Runtime.git", branch="master", when="@master", ) for d_version, d_shasum in [ + 
("6.0.0", "04353d27a512642a5e5339532a39d0aabe44e0964985de37b150a2550385800a"), ("5.7.1", "3b9433b4a0527167c3e9dfc37a3c54e0550744b8d4a8e1be298c8d4bcedfee7c"), ("5.7.0", "e234bcb93d602377cfaaacb59aeac5796edcd842a618162867b7e670c3a2c42c"), ("5.6.1", "0a85d84619f98be26ca7a32c71f94ed3c4e9866133789eabb451be64ce739300"), @@ -265,14 +269,14 @@ class LlvmAmdgpu(CMakePackage): resource( name="comgr", placement="comgr", - url=f"https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/archive/rocm-{d_version}.tar.gz", + url=f"https://github.com/ROCm/ROCm-CompilerSupport/archive/rocm-{d_version}.tar.gz", sha256=d_shasum, when="@{0}".format(d_version), ) resource( name="comgr", placement="comgr", - git="https://github.com/RadeonOpenCompute/ROCm-CompilerSupport.git", + git="https://github.com/ROCm/ROCm-CompilerSupport.git", branch="amd-stg-open", when="@master", ) diff --git a/var/spack/repos/builtin/packages/magma/0001-fix-magma-build-error-with-rocm-6.0.0.patch b/var/spack/repos/builtin/packages/magma/0001-fix-magma-build-error-with-rocm-6.0.0.patch new file mode 100644 index 0000000000..accc271419 --- /dev/null +++ b/var/spack/repos/builtin/packages/magma/0001-fix-magma-build-error-with-rocm-6.0.0.patch @@ -0,0 +1,99 @@ +From 4f7d9ff22996ba3000ee344a0f84f73c27257f47 Mon Sep 17 00:00:00 2001 +From: sreenivasa murthy kolam <sreenivasamurthy.kolam@amd.com> +Date: Wed, 17 Jan 2024 11:44:32 +0000 +Subject: [PATCH] Fix Build Failure with rocm-6.0.0 . 
Add extra parameter for + hipblasZtrmm(),hipblasCtrmm()etc + +--- + interface_hip/blas_c_v2.cpp | 3 ++- + interface_hip/blas_d_v2.cpp | 3 ++- + interface_hip/blas_s_v2.cpp | 3 ++- + interface_hip/blas_z_v2.cpp | 3 ++- + interface_hip/interface.cpp | 5 ++--- + 5 files changed, 10 insertions(+), 7 deletions(-) + +diff --git a/interface_hip/blas_c_v2.cpp b/interface_hip/blas_c_v2.cpp +index 6147857..a406faf 100644 +--- a/interface_hip/blas_c_v2.cpp ++++ b/interface_hip/blas_c_v2.cpp +@@ -1858,7 +1858,8 @@ magma_ctrmm( + hipblas_diag_const( diag ), + int(m), int(n), + (hipblasComplex*)&alpha, (const hipblasComplex*)dA, int(ldda), +- (hipblasComplex*)dB, int(lddb) ); ++ (hipblasComplex*)dB, int(lddb), ++ (hipblasComplex*)dB, int(lddb) ); /* C same as B; less efficient */ + #else + hipblasCtrmm( + queue->hipblas_handle(), +diff --git a/interface_hip/blas_d_v2.cpp b/interface_hip/blas_d_v2.cpp +index 340f0b2..8c1ecd4 100644 +--- a/interface_hip/blas_d_v2.cpp ++++ b/interface_hip/blas_d_v2.cpp +@@ -1858,7 +1858,8 @@ magma_dtrmm( + hipblas_diag_const( diag ), + int(m), int(n), + (double*)&alpha, (const double*)dA, int(ldda), +- (double*)dB, int(lddb) ); ++ (double*)dB, int(lddb), ++ (double*)dB, int(lddb) ); /* C same as B; less efficient */ + #else + hipblasDtrmm( + queue->hipblas_handle(), +diff --git a/interface_hip/blas_s_v2.cpp b/interface_hip/blas_s_v2.cpp +index 87aeba3..a2cfc02 100644 +--- a/interface_hip/blas_s_v2.cpp ++++ b/interface_hip/blas_s_v2.cpp +@@ -1858,7 +1858,8 @@ magma_strmm( + hipblas_diag_const( diag ), + int(m), int(n), + (float*)&alpha, (const float*)dA, int(ldda), +- (float*)dB, int(lddb) ); ++ (float*)dB, int(lddb), ++ (float*)dB, int(lddb) ); /* C same as B; less efficient */ + #else + hipblasStrmm( + queue->hipblas_handle(), +diff --git a/interface_hip/blas_z_v2.cpp b/interface_hip/blas_z_v2.cpp +index 3c7e87a..eb9e2e6 100644 +--- a/interface_hip/blas_z_v2.cpp ++++ b/interface_hip/blas_z_v2.cpp +@@ -1858,7 +1858,8 @@ magma_ztrmm( + 
hipblas_diag_const( diag ), + int(m), int(n), + (hipblasDoubleComplex*)&alpha, (const hipblasDoubleComplex*)dA, int(ldda), +- (hipblasDoubleComplex*)dB, int(lddb) ); ++ (hipblasDoubleComplex*)dB, int(lddb), ++ (hipblasDoubleComplex*)dB, int(lddb) ); /* C same as B; less efficient */ + #else + hipblasZtrmm( + queue->hipblas_handle(), +diff --git a/interface_hip/interface.cpp b/interface_hip/interface.cpp +index 2b35b34..7c76426 100644 +--- a/interface_hip/interface.cpp ++++ b/interface_hip/interface.cpp +@@ -209,11 +209,10 @@ magma_init() + else { + g_magma_devices[dev].memory = prop.totalGlobalMem; + g_magma_devices[dev].shmem_block = prop.sharedMemPerBlock; +- #ifdef MAGMA_HAVE_CUDA + g_magma_devices[dev].cuda_arch = prop.major*100 + prop.minor*10; ++ #ifdef MAGMA_HAVE_CUDA + g_magma_devices[dev].shmem_multiproc = prop.sharedMemPerMultiprocessor; + #elif defined(MAGMA_HAVE_HIP) +- g_magma_devices[dev].cuda_arch = prop.gcnArch; + g_magma_devices[dev].shmem_multiproc = prop.maxSharedMemoryPerMultiProcessor; + #endif + +@@ -464,7 +463,7 @@ magma_print_environment() + prop.name, + prop.clockRate / 1000., + prop.totalGlobalMem / (1024.*1024.), +- prop.gcnArch ); ++ prop.gcnArchName ); + #endif + } + +-- +2.39.3 + diff --git a/var/spack/repos/builtin/packages/magma/package.py b/var/spack/repos/builtin/packages/magma/package.py index 26ed916e72..585f360a7b 100644 --- a/var/spack/repos/builtin/packages/magma/package.py +++ b/var/spack/repos/builtin/packages/magma/package.py @@ -78,6 +78,7 @@ class Magma(CMakePackage, CudaPackage, ROCmPackage): patch("magma-2.5.0.patch", when="@2.5.0") patch("magma-2.5.0-cmake.patch", when="@2.5.0") patch("cmake-W.patch", when="@2.5.0:%nvhpc") + patch("0001-fix-magma-build-error-with-rocm-6.0.0.patch", when="@2.7.2 ^hip@6.0.0 + rocm") @run_before("cmake") def generate_gpu_config(self): @@ -146,7 +147,7 @@ class Magma(CMakePackage, CudaPackage, ROCmPackage): if "+rocm" in spec: options.append(define("MAGMA_ENABLE_HIP", True)) 
options.append(define("CMAKE_CXX_COMPILER", spec["hip"].hipcc)) - # See https://github.com/ROCmSoftwarePlatform/rocFFT/issues/322 + # See https://github.com/ROCm/rocFFT/issues/322 if spec.satisfies("^cmake@3.21.0:3.21.2"): options.append(define("__skip_rocmclang", True)) else: diff --git a/var/spack/repos/builtin/packages/mfem/mfem-hip.patch b/var/spack/repos/builtin/packages/mfem/mfem-hip.patch new file mode 100644 index 0000000000..565bae348c --- /dev/null +++ b/var/spack/repos/builtin/packages/mfem/mfem-hip.patch @@ -0,0 +1,24 @@ +From 93ab69cac72cc2d13cfd4b7efcc235bdbca2b9f5 Mon Sep 17 00:00:00 2001
+From: Afzal Patel <afzal.patel@amd.com>
+Date: Wed, 17 Jan 2024 11:44:18 -0800
+Subject: [PATCH] Add hip library path to ghv flags so libamdhip64 can be found
+
+---
+ config/makefile | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/config/makefile b/config/makefile
+index 627d117..a453865 100644
+--- a/config/makefile
++++ b/config/makefile
+@@ -38,7 +38,7 @@ all: header config-mk
+ MPI = $(MFEM_USE_MPI:NO=)
+ GHV_CXX ?= $(MFEM_CXX)
+ GHV = get_hypre_version
+-GHV_FLAGS = $(subst @MFEM_DIR@,$(if $(MFEM_DIR),$(MFEM_DIR),..),$(HYPRE_OPT))
++GHV_FLAGS = $(subst @MFEM_DIR@,$(if $(MFEM_DIR),$(MFEM_DIR),..),$(HYPRE_OPT)) $(HIP_LIB)
+ SMX = $(if $(MFEM_USE_PUMI:NO=),MFEM_USE_SIMMETRIX)
+ SMX_PATH = $(PUMI_DIR)/include/gmi_sim.h
+ SMX_FILE = $(subst @MFEM_DIR@,$(if $(MFEM_DIR),$(MFEM_DIR),..),$(SMX_PATH))
+--
+2.25.1
\ No newline at end of file diff --git a/var/spack/repos/builtin/packages/mfem/package.py b/var/spack/repos/builtin/packages/mfem/package.py index 618b397181..ddd7be363d 100644 --- a/var/spack/repos/builtin/packages/mfem/package.py +++ b/var/spack/repos/builtin/packages/mfem/package.py @@ -480,6 +480,7 @@ class Mfem(Package, CudaPackage, ROCmPackage): when="@4.6.0 +gslib+shared+miniapps", sha256="2a31682d876626529e2778a216d403648b83b90997873659a505d982d0e65beb", ) + patch("mfem-hip.patch", when="+rocm ^hip@6.0:") phases = ["configure", "build", "install"] @@ -954,6 +955,7 @@ class Mfem(Package, CudaPackage, ROCmPackage): options += ["HIP_CXX=%s" % spec["hip"].hipcc, "HIP_ARCH=%s" % amdgpu_target] hip_headers = HeaderList([]) hip_libs = LibraryList([]) + hip_libs += find_libraries("libamdhip64", spec["hip"].prefix.lib) # To use a C++ compiler that supports -xhip flag one can use # something like this: # options += [ diff --git a/var/spack/repos/builtin/packages/migraphx/package.py b/var/spack/repos/builtin/packages/migraphx/package.py index 1245a48109..efc4280521 100644 --- a/var/spack/repos/builtin/packages/migraphx/package.py +++ b/var/spack/repos/builtin/packages/migraphx/package.py @@ -11,9 +11,9 @@ from spack.package import * class Migraphx(CMakePackage): """AMD's graph optimization engine.""" - homepage = "https://github.com/ROCmSoftwarePlatform/AMDMIGraphX" - git = "https://github.com/ROCmSoftwarePlatform/AMDMIGraphX.git" - url = "https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/AMDMIGraphX" + git = "https://github.com/ROCm/AMDMIGraphX.git" + url = "https://github.com/ROCm/AMDMIGraphX/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") @@ -21,6 +21,7 @@ class Migraphx(CMakePackage): license("MIT") + version("6.0.0", sha256="7bb3f5011da9b1f3b79707b06118c523c1259215f650c2ffa5622a7e1d88868f") version("5.7.1", 
sha256="3e58c043a5a7d1357ee05725fd6cd41e190b070f1ba57f61300128429902089c") version("5.7.0", sha256="14f13554367d2d6490d66f8b5b739203225e7acce25085559e7c4acf29e2a4d5") version("5.6.1", sha256="b108c33f07572ffd880b20f6de06f1934ab2a1b41ae69095612322ac412fa91c") @@ -108,7 +109,7 @@ class Migraphx(CMakePackage): ) def url_for_version(self, version): - url = "https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/archive/" + url = "https://github.com/ROCm/AMDMIGraphX/archive/" if version <= Version("3.5.0"): url += "{0}.tar.gz".format(version) else: @@ -168,6 +169,7 @@ class Migraphx(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) depends_on("hip@" + ver, when="@" + ver) @@ -175,7 +177,7 @@ class Migraphx(CMakePackage): depends_on("rocblas@" + ver, when="@" + ver) depends_on("miopen-hip@" + ver, when="@" + ver) - for ver in ["5.7.0", "5.7.1"]: + for ver in ["5.7.0", "5.7.1", "6.0.0"]: depends_on("composable-kernel@" + ver, when="@" + ver) @property diff --git a/var/spack/repos/builtin/packages/miopen-hip/package.py b/var/spack/repos/builtin/packages/miopen-hip/package.py index ee3b78a5ff..8bafc28701 100644 --- a/var/spack/repos/builtin/packages/miopen-hip/package.py +++ b/var/spack/repos/builtin/packages/miopen-hip/package.py @@ -12,9 +12,9 @@ from spack.pkg.builtin.boost import Boost class MiopenHip(CMakePackage): """AMD's library for high performance machine learning primitives.""" - homepage = "https://github.com/ROCmSoftwarePlatform/MIOpen" - git = "https://github.com/ROCmSoftwarePlatform/MIOpen.git" - url = "https://github.com/ROCmSoftwarePlatform/MIOpen/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/MIOpen" + git = "https://github.com/ROCm/MIOpen.git" + url = "https://github.com/ROCm/MIOpen/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") @@ -22,6 +22,7 @@ class MiopenHip(CMakePackage): license("MIT") + version("6.0.0", 
sha256="a0718a48353be30ff98118ade511f0c1b454e394d8f934aefe7dd6946562b2e9") version("5.7.1", sha256="912a658fe21ce6f1982b0f2ff251c3f7bb618f2e7e9876d983bcb54e3cd7129e") version("5.7.0", sha256="5cd0b62254469e1c246d5890d2b78f8aedcf42cf8a327eabc1a391b83bcd14e1") version("5.6.1", sha256="ff627d68ed9e52433a3c808b5d3ff179a398b77ce81b00cfea7b2c4da5162c6c") @@ -124,7 +125,7 @@ class MiopenHip(CMakePackage): patch("0001-Add-rocm-path-and-rocm-device-lib-path-flags.patch", when="@3.9.0:5.0.2") patch("miopen-hip-include-nlohmann-include-directory.patch", when="@5.4.0:") patch( - "https://github.com/ROCmSoftwarePlatform/MIOpen/pull/2276/commits/f60aa1ff89f8fb596b4a6a4c70aa7d557803db87.patch?full_index=1", + "https://github.com/ROCm/MIOpen/pull/2276/commits/f60aa1ff89f8fb596b4a6a4c70aa7d557803db87.patch?full_index=1", sha256="c777d9f4cd2bbfec632b38620c0f70bb0cce8da1", when="@5.7:", ) @@ -159,6 +160,7 @@ class MiopenHip(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) depends_on("hip@" + ver, when="@" + ver) @@ -168,7 +170,7 @@ class MiopenHip(CMakePackage): for ver in ["5.1.0", "5.1.3", "5.2.0", "5.2.1", "5.2.3", "5.3.0", "5.3.3"]: depends_on("mlirmiopen@" + ver, when="@" + ver) - for ver in ["5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("nlohmann-json", type="link") depends_on("composable-kernel@" + ver, when="@" + ver) for ver in ["5.4.0", "5.4.3", "5.5.0"]: diff --git a/var/spack/repos/builtin/packages/miopen-opencl/package.py b/var/spack/repos/builtin/packages/miopen-opencl/package.py index ec5eac8a96..5ec89b243d 100644 --- a/var/spack/repos/builtin/packages/miopen-opencl/package.py +++ b/var/spack/repos/builtin/packages/miopen-opencl/package.py @@ -12,9 +12,9 @@ from spack.pkg.builtin.boost import Boost class MiopenOpencl(CMakePackage): """AMD's library for high performance machine learning primitives.""" - homepage = 
"https://github.com/ROCmSoftwarePlatform/MIOpen" - git = "https://github.com/ROCmSoftwarePlatform/MIOpen.git" - url = "https://github.com/ROCmSoftwarePlatform/MIOpen/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/MIOpen" + git = "https://github.com/ROCm/MIOpen.git" + url = "https://github.com/ROCm/MIOpen/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") diff --git a/var/spack/repos/builtin/packages/miopen-tensile/package.py b/var/spack/repos/builtin/packages/miopen-tensile/package.py index 11dece2143..1d64b792d4 100644 --- a/var/spack/repos/builtin/packages/miopen-tensile/package.py +++ b/var/spack/repos/builtin/packages/miopen-tensile/package.py @@ -12,9 +12,9 @@ class MiopenTensile(CMakePackage): """MIOpenTensile provides host-callable interfaces to Tensile library. MIOpenTensile supports one programming model: HIP""" - homepage = "https://github.com/ROCmSoftwarePlatform/MIOpenTensile" - git = "https://github.com/ROCmSoftwarePlatform/MIOpenTensile.git" - url = "https://github.com/ROCmSoftwarePlatform/MIOpentensile/archive/rocm-5.0.0.tar.gz" + homepage = "https://github.com/ROCm/MIOpenTensile" + git = "https://github.com/ROCm/MIOpenTensile.git" + url = "https://github.com/ROCm/MIOpentensile/archive/rocm-5.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam") @@ -72,7 +72,7 @@ class MiopenTensile(CMakePackage): resource( name="Tensile", - git="https://github.com/ROCmSoftwarePlatform/Tensile.git", + git="https://github.com/ROCm/Tensile.git", commit="9cbabb07f81e932b9c98bf5ae48fbd7fcef615cf", when="@4.5.0:", ) diff --git a/var/spack/repos/builtin/packages/miopengemm/package.py b/var/spack/repos/builtin/packages/miopengemm/package.py index 937210ec77..e67185563e 100644 --- a/var/spack/repos/builtin/packages/miopengemm/package.py +++ b/var/spack/repos/builtin/packages/miopengemm/package.py @@ -12,9 +12,9 @@ class Miopengemm(CMakePackage): """An OpenCL general matrix multiplication (GEMM) API and kernel 
generator""" - homepage = "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM" - git = "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM.git" - url = "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/MIOpenGEMM" + git = "https://github.com/ROCm/MIOpenGEMM.git" + url = "https://github.com/ROCm/MIOpenGEMM/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") @@ -22,8 +22,8 @@ class Miopengemm(CMakePackage): def url_for_version(self, version): if version == Version("1.1.6"): - return "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/archive/1.1.6.tar.gz" - url = "https://github.com/ROCmSoftwarePlatform/MIOpenGEMM/archive/rocm-{0}.tar.gz" + return "https://github.com/ROCm/MIOpenGEMM/archive/1.1.6.tar.gz" + url = "https://github.com/ROCm/MIOpenGEMM/archive/rocm-{0}.tar.gz" return url.format(version) license("MIT") diff --git a/var/spack/repos/builtin/packages/mivisionx/package.py b/var/spack/repos/builtin/packages/mivisionx/package.py index 153469f16e..5e2549631f 100644 --- a/var/spack/repos/builtin/packages/mivisionx/package.py +++ b/var/spack/repos/builtin/packages/mivisionx/package.py @@ -13,7 +13,7 @@ class Mivisionx(CMakePackage): homepage = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX" git = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX.git" - url = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/archive/rocm-5.5.0.tar.gz" + url = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/archive/rocm-6.0.0.tar.gz" maintainers("srekolam", "renjithravindrankannath") tags = ["rocm"] @@ -27,6 +27,7 @@ class Mivisionx(CMakePackage): license("MIT") + version("6.0.0", sha256="01324a12f21ea0e29a4d7d7c60498ba9231723569fedcdd90f28ddffb5e0570e") version("5.7.1", sha256="bfc074bc32ebe84c72149ee6abb30b5b6499023d5b98269232de82e35d0505a8") version("5.7.0", 
sha256="07e4ec8a8c06a9a8bb6394a043c9c3e7176acd3b462a16de91ef9518a64df9ba") version("5.6.1", sha256="b2ff95c1488e244f379482631dae4f9ab92d94a513d180e03607aa1e184b5b0a") @@ -369,6 +370,7 @@ class Mivisionx(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("miopen-hip@" + ver, when="@" + ver) for ver in [ @@ -381,11 +383,12 @@ class Mivisionx(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("migraphx@" + ver, when="@" + ver) depends_on("hip@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) depends_on("python@3.5:", type="build") diff --git a/var/spack/repos/builtin/packages/mlirmiopen/package.py b/var/spack/repos/builtin/packages/mlirmiopen/package.py index eeed27450d..7cfe466a83 100644 --- a/var/spack/repos/builtin/packages/mlirmiopen/package.py +++ b/var/spack/repos/builtin/packages/mlirmiopen/package.py @@ -10,9 +10,9 @@ from spack.package import * class Mlirmiopen(CMakePackage): """Multi-Level Intermediate Representation for rocm miopen project.""" - homepage = "https://github.com/ROCmSoftwarePlatform/llvm-project-mlir" - url = "https://github.com/ROCmSoftwarePlatform/llvm-project-mlir/archive/refs/tags/rocm-5.4.0.tar.gz" - git = "https://github.com/ROCmSoftwarePlatform/llvm-project-mlir.git" + homepage = "https://github.com/ROCm/llvm-project-mlir" + url = "https://github.com/ROCm/llvm-project-mlir/archive/refs/tags/rocm-5.4.0.tar.gz" + git = "https://github.com/ROCm/llvm-project-mlir.git" tags = ["rocm"] maintainers("srekolam") diff --git a/var/spack/repos/builtin/packages/petsc/Handle-hipsparse-api-changes-for-rocm-6.0.patch b/var/spack/repos/builtin/packages/petsc/Handle-hipsparse-api-changes-for-rocm-6.0.patch new file mode 100644 index 0000000000..674c083f51 --- /dev/null +++ 
b/var/spack/repos/builtin/packages/petsc/Handle-hipsparse-api-changes-for-rocm-6.0.patch @@ -0,0 +1,70 @@ +From 3c9aaca12a1ae6000ff3cfd0564f7b2ab45396d2 Mon Sep 17 00:00:00 2001 +From: Renjith Ravindran <Renjith.RavindranKannath@amd.com> +Date: Thu, 18 Jan 2024 07:38:25 +0000 +Subject: [PATCH] Handle the hipsparse api changes for rocm 6.0 + +--- + .../impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/src/mat/impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp b/src/mat/impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp +index e6f878f..4bf52cd 100644 +--- a/src/mat/impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp ++++ b/src/mat/impls/aij/seq/seqhipsparse/aijhipsparse.hip.cpp +@@ -1258,7 +1258,7 @@ static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x) + /* Solve L*y = b */ + PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); + PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); +- #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061) // i.e., 5.6.0 ++ #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830)) // i.e., 5.6.0 + PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ + fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()! 
+ #else +@@ -1267,7 +1267,7 @@ static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x) + #endif + /* Solve U*x = y */ + PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray)); +- #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061) // i.e., 5.6.0 ++ #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830)) // i.e., 5.6.0 + PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */ + fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U)); + #else +@@ -1316,7 +1316,7 @@ static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Ve + /* Solve Ut*y = b */ + PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); + PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); +- #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061) // i.e., 5.6.0 ++ #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830)) // i.e., 5.6.0 + PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */ + fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut)); + #else +@@ -1325,7 +1325,7 @@ static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Ve + #endif + /* Solve Lt*x = y */ + PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray)); +- #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061) // i.e., 5.6.0 ++ #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830)) // i.e., 5.6.0 + PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ + fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); + #else +@@ -1559,7 +1559,7 @@ static PetscErrorCode 
MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x) + /* Solve L*y = b */ + PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); + PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); +- #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061) // i.e., 5.6.0 ++ #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830)) // i.e., 5.6.0 + PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ + fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); + #else +@@ -1568,7 +1568,7 @@ static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x) + #endif + /* Solve Lt*x = y */ + PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray)); +- #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061) // i.e., 5.6.0 ++ #if (PETSC_PKG_HIP_VERSION_EQ(5, 6, 31061)||PETSC_PKG_HIP_VERSION_EQ(6, 0, 32830)) // i.e., 5.6.0 + PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ + fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); + #else +-- +2.39.3 + diff --git a/var/spack/repos/builtin/packages/petsc/package.py b/var/spack/repos/builtin/packages/petsc/package.py index 5a4c011002..67a872ea8b 100644 --- a/var/spack/repos/builtin/packages/petsc/package.py +++ b/var/spack/repos/builtin/packages/petsc/package.py @@ -21,7 +21,7 @@ class Petsc(Package, CudaPackage, ROCmPackage): tags = ["e4s"] version("main", branch="main") - + version("3.20.3", sha256="75a94fb44df0512f51ad093fa784e56b61f51b7ead5956fbe49185c203f8c245") version("3.20.2", sha256="2a2d08b5f0e3d0198dae2c42ce1fd036f25c153ef2bb4a2d320ca141ac7cd30b") version("3.20.1", sha256="3d54f13000c9c8ceb13ca4f24f93d838319019d29e6de5244551a3ec22704f32") version("3.20.0", 
sha256="c152ccb12cb2353369d27a65470d4044a0c67e0b69814368249976f5bb232bd4") @@ -172,6 +172,9 @@ class Petsc(Package, CudaPackage, ROCmPackage): ) patch("hip-5.6.0-for-3.18.diff", when="@3.18:3.19 ^hipsparse@5.6.0") patch("hip-5.7-plus-for-3.18.diff", when="@3.18:3.19 ^hipsparse@5.7:") + patch( + "Handle-hipsparse-api-changes-for-rocm-6.0.patch", when="@3.20.2:3.20.3 ^hipsparse@6.0" + ) # 3.8.0 has a build issue with MKL - so list this conflict explicitly conflicts("^intel-mkl", when="@3.8.0") diff --git a/var/spack/repos/builtin/packages/raja/package.py b/var/spack/repos/builtin/packages/raja/package.py index fb67631779..9bb463412f 100644 --- a/var/spack/repos/builtin/packages/raja/package.py +++ b/var/spack/repos/builtin/packages/raja/package.py @@ -114,6 +114,14 @@ class Raja(CachedCMakePackage, CudaPackage, ROCmPackage): when="@:0.13.0 ^blt@0.4:", ) + # Backward compatibility is stopped from ROCm 6.0 + # Future relase will have the change from PR https://github.com/LLNL/RAJA/pull/1568 + patch( + "https://github.com/LLNL/RAJA/commit/406eb8dee05a41eb32c421c375688a4863b60642.patch?full_index=1", + sha256="d9ce5ef038555cbccb330a9016b7be77e56ae0660583cba955dab9d0297a4b07", + when="^hip@6.0.0", + ) + variant("openmp", default=True, description="Build OpenMP backend") variant("shared", default=True, description="Build Shared Libs") variant("plugins", default=False, description="Enable runtime plugins") diff --git a/var/spack/repos/builtin/packages/rccl-tests/package.py b/var/spack/repos/builtin/packages/rccl-tests/package.py index 18131077e4..a27bebac07 100644 --- a/var/spack/repos/builtin/packages/rccl-tests/package.py +++ b/var/spack/repos/builtin/packages/rccl-tests/package.py @@ -10,9 +10,9 @@ class RcclTests(MakefilePackage): """These tests check both the performance and the correctness of RCCL operations. 
They can be compiled against RCCL.""" - homepage = "https://github.com/ROCmSoftwarePlatform/rccl-tests" - git = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git" - url = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git" + homepage = "https://github.com/ROCm/rccl-tests" + git = "https://github.com/ROCm/rccl-tests.git" + url = "https://github.com/ROCm/rccl-tests.git" tags = ["rocm"] maintainers("bvanessen") diff --git a/var/spack/repos/builtin/packages/rccl/0004-Set-rocm-core-path-for-version-file.patch b/var/spack/repos/builtin/packages/rccl/0004-Set-rocm-core-path-for-version-file.patch new file mode 100644 index 0000000000..fd03def3ee --- /dev/null +++ b/var/spack/repos/builtin/packages/rccl/0004-Set-rocm-core-path-for-version-file.patch @@ -0,0 +1,13 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 5384287..ea6fd4b 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -121,7 +121,7 @@ message(STATUS "hipcc version: ${hipcc_version_string}") + + ## Check for ROCm version + execute_process( +- COMMAND bash "-c" "cat ${ROCM_PATH}/.info/version" ++ COMMAND bash "-c" "cat $ENV{ROCMCORE_PATH}/.info/version" + OUTPUT_VARIABLE rocm_version_string + ) + string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" rocm_version_matches ${rocm_version_string}) diff --git a/var/spack/repos/builtin/packages/rccl/package.py b/var/spack/repos/builtin/packages/rccl/package.py index 9b388d1a27..52519c0194 100644 --- a/var/spack/repos/builtin/packages/rccl/package.py +++ b/var/spack/repos/builtin/packages/rccl/package.py @@ -14,13 +14,14 @@ class Rccl(CMakePackage): implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter.""" - homepage = "https://github.com/ROCmSoftwarePlatform/rccl" - git = "https://github.com/ROCmSoftwarePlatform/rccl.git" - url = "https://github.com/ROCmSoftwarePlatform/rccl/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rccl" + git = "https://github.com/ROCm/rccl.git" + url = 
"https://github.com/ROCm/rccl/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") libraries = ["librccl"] + version("6.0.0", sha256="0496d5a5f2e48c92cd390ab318df31a53cf7ec590988c2574c9f3d99c38b0fa7") version("5.7.1", sha256="fb4c1f0084196d1226ce8a726d0f012d3890b54508a06ca87bbda619be8b90b1") version("5.7.0", sha256="4c2825a3e4323ef3c2f8855ef445c1a81cf1992fb37e3e8a07a50db354aa3954") version("5.6.1", sha256="27ec6b86a1a329684d808f728c1fce134517ac8e6e7047689f95dbf8386c077e") @@ -119,6 +120,7 @@ class Rccl(CMakePackage): patch("0001-Fix-numactl-path-issue.patch", when="@3.7.0:4.3.2") patch("0002-Fix-numactl-rocm-smi-path-issue.patch", when="@4.5.0:5.2.1") patch("0003-Fix-numactl-rocm-smi-path-issue.patch", when="@5.2.3:5.6") + patch("0004-Set-rocm-core-path-for-version-file.patch", when="@6.0:") depends_on("cmake@3.5:", type="build") for ver in [ @@ -151,6 +153,7 @@ class Rccl(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) depends_on("hip@" + ver, when="@" + ver) @@ -186,6 +189,7 @@ class Rccl(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("numactl@2:", when="@" + ver) for ver in [ @@ -208,12 +212,14 @@ class Rccl(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocm-smi-lib@" + ver, when="@" + ver) depends_on("chrpath", when="@5.3.0:") - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) + depends_on("googletest@1.11.0:", when="@5.3:") @classmethod @@ -229,6 +235,7 @@ class Rccl(CMakePackage): def setup_build_environment(self, env): env.set("CXX", self.spec["hip"].hipcc) + env.set("ROCMCORE_PATH", self.spec["rocm-core"].prefix) def cmake_args(self): args = [] diff --git a/var/spack/repos/builtin/packages/rdc/package.py b/var/spack/repos/builtin/packages/rdc/package.py index 
fbcb130fb2..f4466bc991 100644 --- a/var/spack/repos/builtin/packages/rdc/package.py +++ b/var/spack/repos/builtin/packages/rdc/package.py @@ -12,8 +12,8 @@ from spack.package import * class Rdc(CMakePackage): """ROCm Data Center Tool""" - homepage = "https://github.com/RadeonOpenCompute/rdc" - url = "https://github.com/RadeonOpenCompute/rdc/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rdc" + url = "https://github.com/ROCm/rdc/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") @@ -21,13 +21,14 @@ class Rdc(CMakePackage): def url_for_version(self, version): if version == Version("3.9.0"): - return "https://github.com/RadeonOpenCompute/rdc/archive/rdc_so_ver-0.3.tar.gz" + return "https://github.com/ROCm/rdc/archive/rdc_so_ver-0.3.tar.gz" - url = "https://github.com/RadeonOpenCompute/rdc/archive/rocm-{0}.tar.gz" + url = "https://github.com/ROCm/rdc/archive/rocm-{0}.tar.gz" return url.format(version) license("MIT") + version("6.0.0", sha256="5e3847a919d5f7efe99d8d76c96e78401659eccd1fb234b1b8cb4304096d6e89") version("5.7.1", sha256="5251eb3085f2019246b332e9552dfae1572cf64ddf58306b81cbe7108019ffee") version("5.7.0", sha256="924e94f14f6390d7a6ff7863fb4e2085c1ff5f9c12b8bd46471eb31f001c4f14") version("5.6.1", sha256="9e9f57cebbc5ae386a405957ed2c17344cdb42db5e1a71285f2c9bc09eea6519") @@ -140,6 +141,7 @@ class Rdc(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocm-smi-lib@" + ver, type=("build", "link"), when="@" + ver) @@ -161,10 +163,11 @@ class Rdc(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hsa-rocr-dev@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) def patch(self): diff --git a/var/spack/repos/builtin/packages/rocalution/package.py b/var/spack/repos/builtin/packages/rocalution/package.py index 
103fcd7373..d04530e77b 100644 --- a/var/spack/repos/builtin/packages/rocalution/package.py +++ b/var/spack/repos/builtin/packages/rocalution/package.py @@ -17,9 +17,9 @@ class Rocalution(CMakePackage): generic and flexible design that allows seamless integration with other scientific software packages.""" - homepage = "https://github.com/ROCmSoftwarePlatform/rocALUTION" - git = "https://github.com/ROCmSoftwarePlatform/rocALUTION.git" - url = "https://github.com/ROCmSoftwarePlatform/rocALUTION/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocALUTION" + git = "https://github.com/ROCm/rocALUTION.git" + url = "https://github.com/ROCm/rocALUTION/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("cgmb", "srekolam", "renjithravindrankannath") @@ -27,6 +27,7 @@ class Rocalution(CMakePackage): license("MIT") + version("6.0.0", sha256="cabf37691b8db00c82bda49c7dcfaefd9b9067b7d097afa43b7a5f86c45bff99") version("5.7.1", sha256="b95afa1285759843c5fea1ad6e1c1edf283922e0d448db03a3e1f42b6942bc24") version("5.7.0", sha256="48232a0d1250debce89e39a233bd0b5d52324a2454c078b99c9d44965cbbc0e9") version("5.6.1", sha256="7197b3617a0c91e90adaa32003c04d247a5f585d216e77493d20984ba215addb") @@ -165,6 +166,7 @@ class Rocalution(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("rocprim@" + ver, when="@" + ver) diff --git a/var/spack/repos/builtin/packages/rocblas/package.py b/var/spack/repos/builtin/packages/rocblas/package.py index 1012b89a17..854d897e5d 100644 --- a/var/spack/repos/builtin/packages/rocblas/package.py +++ b/var/spack/repos/builtin/packages/rocblas/package.py @@ -11,9 +11,9 @@ from spack.package import * class Rocblas(CMakePackage): """Radeon Open Compute BLAS library""" - homepage = "https://github.com/ROCmSoftwarePlatform/rocBLAS/" - git = "https://github.com/ROCmSoftwarePlatform/rocBLAS.git" - url = "https://github.com/ROCmSoftwarePlatform/rocBLAS/archive/rocm-5.5.0.tar.gz" + homepage = 
"https://github.com/ROCm/rocBLAS/" + git = "https://github.com/ROCm/rocBLAS.git" + url = "https://github.com/ROCm/rocBLAS/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie") @@ -23,6 +23,7 @@ class Rocblas(CMakePackage): version("develop", branch="develop") version("master", branch="master") + version("6.0.0", sha256="befa4a75f1de0ea37f2358d4c2de5406d7bce671ca9936e2294b64d3b3bafb60") version("5.7.1", sha256="2984a5ed0ea5a05d40996ee3fddecb24399cbe8ea3e4921fc254e54d8f52fe4f") version("5.7.0", sha256="024edd98de9687ee5394badc4dd4c543eef4eb3f71c96ff64100705d851e1744") version("5.6.1", sha256="73896ebd445162a69af97f9fd462684609b4e0cf617eab450cd4558b4a23941e") @@ -131,8 +132,8 @@ class Rocblas(CMakePackage): conflicts("amdgpu_target=gfx1012", when="@:4.2.1") conflicts("amdgpu_target=gfx1030", when="@:4.2.1") # https://reviews.llvm.org/D124866 - # https://github.com/ROCm-Developer-Tools/HIP/issues/2678 - # https://github.com/ROCm-Developer-Tools/hipamd/blob/rocm-5.2.x/include/hip/amd_detail/host_defines.h#L50 + # https://github.com/ROCm/HIP/issues/2678 + # https://github.com/ROCm/hipamd/blob/rocm-5.2.x/include/hip/amd_detail/host_defines.h#L50 conflicts("%gcc@12", when="@5.2") depends_on("cmake@3.16.8:", type="build", when="@4.2.0:") @@ -182,6 +183,7 @@ class Rocblas(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("llvm-amdgpu@" + ver, type="build", when="@" + ver) @@ -232,10 +234,11 @@ class Rocblas(CMakePackage): ("@5.6.1", "7d0a9d040c3bbae893df7ecef6a19d9cd1c304aa"), ("@5.7.0", "97e0cfc2c8cb87a1e38901d99c39090dc4181652"), ("@5.7.1", "97e0cfc2c8cb87a1e38901d99c39090dc4181652"), + ("@6.0.0", "17df881bde80fc20f997dfb290f4bb4b0e05a7e9"), ]: resource( name="Tensile", - git="https://github.com/ROCmSoftwarePlatform/Tensile.git", + git="https://github.com/ROCm/Tensile.git", commit=t_commit, when="{} +tensile".format(t_version), ) @@ -243,12 +246,12 @@ class 
Rocblas(CMakePackage): for ver in ["master", "develop"]: resource( name="Tensile", - git="https://github.com/ROCmSoftwarePlatform/Tensile.git", + git="https://github.com/ROCm/Tensile.git", branch=ver, when="@{} +tensile".format(ver), ) - # Status: https://github.com/ROCmSoftwarePlatform/Tensile/commit/a488f7dadba34f84b9658ba92ce9ec5a0615a087 + # Status: https://github.com/ROCm/Tensile/commit/a488f7dadba34f84b9658ba92ce9ec5a0615a087 # Not yet landed in 3.7.0, nor 3.8.0. patch("0001-Fix-compilation-error-with-StringRef-to-basic-string.patch", when="@:3.8") patch("0002-Fix-rocblas-clients-blas.patch", when="@4.2.0:4.3.1") @@ -256,7 +259,7 @@ class Rocblas(CMakePackage): # Finding Python package and set command python as python3 patch("0004-Find-python.patch", when="@5.2.0:5.4") patch("0006-Guard-use-of-OpenMP-to-make-it-optional-5.4.patch", when="@5.4") - patch("0007-add-rocm-openmp-extras-include-dir.patch", when="@5.6:") + patch("0007-add-rocm-openmp-extras-include-dir.patch", when="@5.6:5.7") def setup_build_environment(self, env): env.set("CXX", self.spec["hip"].hipcc) @@ -309,14 +312,14 @@ class Rocblas(CMakePackage): # Restrict the number of jobs Tensile can spawn. # If we don't specify otherwise, Tensile creates a job per available core, # and that consumes a lot of system memory. 
- # https://github.com/ROCmSoftwarePlatform/Tensile/blob/93e10678a0ced7843d9332b80bc17ebf9a166e8e/Tensile/Parallel.py#L38 + # https://github.com/ROCm/Tensile/blob/93e10678a0ced7843d9332b80bc17ebf9a166e8e/Tensile/Parallel.py#L38 args.append(self.define("Tensile_CPU_THREADS", min(16, make_jobs))) - # See https://github.com/ROCmSoftwarePlatform/rocBLAS/commit/c1895ba4bb3f4f5947f3818ebd155cf71a27b634 + # See https://github.com/ROCm/rocBLAS/commit/c1895ba4bb3f4f5947f3818ebd155cf71a27b634 if "auto" not in self.spec.variants["amdgpu_target"]: args.append(self.define_from_variant(arch_define_name, "amdgpu_target")) - # See https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1196 + # See https://github.com/ROCm/rocBLAS/issues/1196 if self.spec.satisfies("^cmake@3.21.0:3.21.2"): args.append(self.define("__skip_rocmclang", "ON")) diff --git a/var/spack/repos/builtin/packages/rocfft/package.py b/var/spack/repos/builtin/packages/rocfft/package.py index 229dd4bdb0..815bb03132 100644 --- a/var/spack/repos/builtin/packages/rocfft/package.py +++ b/var/spack/repos/builtin/packages/rocfft/package.py @@ -11,16 +11,16 @@ from spack.package import * class Rocfft(CMakePackage): """Radeon Open Compute FFT library""" - homepage = "https://github.com/ROCmSoftwarePlatform/rocFFT/" - git = "https://github.com/ROCmSoftwarePlatform/rocFFT.git" - url = "https://github.com/ROCmSoftwarePlatform/rocfft/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocFFT/" + git = "https://github.com/ROCm/rocFFT.git" + url = "https://github.com/ROCm/rocfft/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie") libraries = ["librocfft"] license("MIT") - + version("6.0.0", sha256="fb8ba56572702e77e4383d922cd1fee4ad3fa5f63a5ebdb3d9c354439a446992") version("5.7.1", sha256="202f11f60dc8738e29bbd1b397d419e032794f8bffb7f48f2b31f09cc5f08bc2") version("5.7.0", sha256="3c4a1537a6ec76dc9b622644fe3890647306bf9f28f61c5d2028259c31bb964f") 
version("5.6.1", sha256="a65861e453587c3e6393da75b0b1976508c61f968aecda77fbec920fea48489e") @@ -167,6 +167,7 @@ class Rocfft(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) @@ -178,6 +179,14 @@ class Rocfft(CMakePackage): # Patch to add install prefix header location for sqlite for 5.4 patch("0004-fix-missing-sqlite-include-paths.patch", when="@5.4.0:5.5") + # Set LD_LIBRARY_PATH for executing the binaries from build directoryfix missing type + # https://github.com/ROCm/rocFFT/pull/449) + patch( + "https://github.com/ROCm/rocFFT/commit/0ec78f1daac2d7fa1415f4deff0d129252c1c9de.patch?full_index=1", + sha256="bac7873185ac60f2aaa50e278f0b8d52b4d79d586bf7f52db1da33559569ba54", + when="@6.0.0", + ) + def setup_build_environment(self, env): env.set("CXX", self.spec["hip"].hipcc) @@ -214,7 +223,7 @@ class Rocfft(CMakePackage): self.define_from_variant("AMDGPU_TARGETS_SRAM_ECC", "amdgpu_target_sram_ecc") ) - # See https://github.com/ROCmSoftwarePlatform/rocFFT/issues/322 + # See https://github.com/ROCm/rocFFT/issues/322 if self.spec.satisfies("^cmake@3.21.0:3.21.2"): args.append(self.define("__skip_rocmclang", "ON")) diff --git a/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py b/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py index 27806866a4..ffb8f927f0 100644 --- a/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py +++ b/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py @@ -10,14 +10,15 @@ from spack.package import * class RocmBandwidthTest(CMakePackage): """Test to measure PciE bandwidth on ROCm platforms""" - homepage = "https://github.com/RadeonOpenCompute/rocm_bandwidth_test" - git = "https://github.com/RadeonOpenCompute/rocm_bandwidth_test.git" - url = "https://github.com/RadeonOpenCompute/rocm_bandwidth_test/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocm_bandwidth_test" + 
git = "https://github.com/ROCm/rocm_bandwidth_test.git" + url = "https://github.com/ROCm/rocm_bandwidth_test/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") version("master", branch="master") + version("6.0.0", sha256="9023401bd6a896059545b8e6263c6730afd89d7d45c0f5866261c300415532a6") version("5.7.1", sha256="7426ef1e317b8293e4d6389673cfa8c63efb3f7d061e2f50a6f0b1b706e2a2a7") version("5.7.0", sha256="fa95c28488ab4bb6d920b9f3c316554ca340f44c87ec2efb4cf8fa488e63ddd9") version("5.6.1", sha256="849af715d08dfd89e7aa5e4453b624151db1cafaa567ab5fa36a77948b90bf0d") @@ -136,12 +137,13 @@ class RocmBandwidthTest(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", "master", ]: depends_on("hsa-rocr-dev@" + ver, when="@" + ver) depends_on("hsakmt-roct@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) build_targets = ["package"] diff --git a/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py b/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py index 6961c15b80..aeca0c39a2 100644 --- a/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py +++ b/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py @@ -9,15 +9,16 @@ from spack.package import * class RocmClangOcl(CMakePackage): """OpenCL compilation with clang compiler""" - homepage = "https://github.com/RadeonOpenCompute/clang-ocl" - git = "https://github.com/RadeonOpenCompute/clang-ocl.git" - url = "https://github.com/RadeonOpenCompute/clang-ocl/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/clang-ocl" + git = "https://github.com/ROCm/clang-ocl.git" + url = "https://github.com/ROCm/clang-ocl/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] license("MIT") maintainers("srekolam", "renjithravindrankannath") version("master", branch="master") + version("6.0.0", 
sha256="74b5a64c32f3c57e7e4de638fffabbf448ecdb3dd8e65678b7ba0633352b4ca3") version("5.7.1", sha256="32e4430d009cbbf5404ca9cbbb549b36897fa1826bc2285372e293cfe7531bf8") version("5.7.0", sha256="c9ca80bfee674e740039256a846107373f1cf6554dc28398599976d8646a0392") version("5.6.1", sha256="c41deb1b564d939fc897b2bbdb13570b2234fa4c052a39783f5ad2dd1052f901") @@ -136,6 +137,7 @@ class RocmClangOcl(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", "master", ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) @@ -145,7 +147,7 @@ class RocmClangOcl(CMakePackage): depends_on( "rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver) ) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) test_src_dir = "test" diff --git a/var/spack/repos/builtin/packages/rocm-cmake/package.py b/var/spack/repos/builtin/packages/rocm-cmake/package.py index c14999a989..a5cbb03c5b 100644 --- a/var/spack/repos/builtin/packages/rocm-cmake/package.py +++ b/var/spack/repos/builtin/packages/rocm-cmake/package.py @@ -11,9 +11,9 @@ class RocmCmake(CMakePackage): """rocm-cmake provides CMake modules for common build tasks in the ROCm software stack""" - homepage = "https://github.com/RadeonOpenCompute/rocm-cmake" - git = "https://github.com/RadeonOpenCompute/rocm-cmake.git" - url = "https://github.com/RadeonOpenCompute/rocm-cmake/archive/rocm-5.6.0.tar.gz" + homepage = "https://github.com/ROCm/rocm-cmake" + git = "https://github.com/ROCm/rocm-cmake.git" + url = "https://github.com/ROCm/rocm-cmake/archive/rocm-5.6.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") @@ -21,6 +21,7 @@ class RocmCmake(CMakePackage): license("MIT") version("master", branch="master") + version("6.0.0", sha256="82bd97ba23d1883ef38bb667e92f7367fedc50d6c11c82f54cced4ab04b0412d") version("5.7.1", 
sha256="4a4c6aa09576ccb834f869bdcb49e98cc0f0bac3678b802358065d1179a9d6f1") version("5.7.0", sha256="93b98144201a1143eeca32744a9927d063f4685189f132ba52a6f3bba158a86b") version("5.6.1", sha256="98bf5fe2e6e12f55d122807d0060f1bb19c80d63d2c2f6fee579c40bfd244fa6") @@ -110,7 +111,7 @@ class RocmCmake(CMakePackage): depends_on("cmake@3:", type="build") depends_on("cmake@3.6:", type="build", when="@4.1.0:") - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) test_src_dir = "test" diff --git a/var/spack/repos/builtin/packages/rocm-core/package.py b/var/spack/repos/builtin/packages/rocm-core/package.py index 9d6bca46b6..54c1a526dd 100644 --- a/var/spack/repos/builtin/packages/rocm-core/package.py +++ b/var/spack/repos/builtin/packages/rocm-core/package.py @@ -12,8 +12,8 @@ class RocmCore(CMakePackage): It also provides the Lmod modules files for the ROCm release. 
getROCmVersion function provides the ROCm version.""" - homepage = "https://github.com/RadeonOpenCompute/rocm-core" - url = "https://github.com/RadeonOpenCompute/rocm-core/archive/refs/tags/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocm-core" + url = "https://github.com/ROCm/rocm-core/archive/refs/tags/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") @@ -21,6 +21,7 @@ class RocmCore(CMakePackage): license("MIT") + version("6.0.0", sha256="d950ee4b63336f34579b6e1dda2d05966b7afa9c84bcdc13874991d1147dc788") version("5.7.1", sha256="fc4915019ddfd126e8ef6a15006bce3aa7bd5fd11dc8eb04ce2ee6bdf9c6ae7f") version("5.7.0", sha256="722689bfec46c35f5428a41c5aacfc31efec2294fc3b0112861c562f8a71ac93") version("5.6.1", sha256="eeef75e16e05380ccbc8df17a02dc141a66dddaadb444a97f7278f78067c498c") diff --git a/var/spack/repos/builtin/packages/rocm-dbgapi/package.py b/var/spack/repos/builtin/packages/rocm-dbgapi/package.py index 92b4ec72a9..d068de3456 100644 --- a/var/spack/repos/builtin/packages/rocm-dbgapi/package.py +++ b/var/spack/repos/builtin/packages/rocm-dbgapi/package.py @@ -14,9 +14,9 @@ class RocmDbgapi(CMakePackage): control of the execution and inspection of execution state of AMD's commercially available GPU architectures.""" - homepage = "https://github.com/ROCm-Developer-Tools/ROCdbgapi" - git = "https://github.com/ROCm-Developer-Tools/ROCdbgapi.git" - url = "https://github.com/ROCm-Developer-Tools/ROCdbgapi/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/ROCdbgapi" + git = "https://github.com/ROCm/ROCdbgapi.git" + url = "https://github.com/ROCm/ROCdbgapi/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") @@ -25,6 +25,7 @@ class RocmDbgapi(CMakePackage): license("MIT") version("master", branch="amd-master") + version("6.0.0", sha256="4e823eba255e46b93aff05fd5938ef2a51693ffd74debebffc1aabfce613805c") version("5.7.1", 
sha256="0ee9c2f083868849f2ea0cec7010e0270c27e7679ccbbadd12072cc0ef6c8a6f") version("5.7.0", sha256="285ddded8e7f1981d8861ffc1cd7770b78129e4955da08ad55a4779945699716") version("5.6.1", sha256="c7241bf94bdb97a4cf1befbf25b8c35720797710da6f6b5b9d6a4094c1bc9c8b") @@ -144,12 +145,13 @@ class RocmDbgapi(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", "master", ]: depends_on("hsa-rocr-dev@" + ver, type="build", when="@" + ver) depends_on("comgr@" + ver, type=("build", "link"), when="@" + ver) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) @classmethod diff --git a/var/spack/repos/builtin/packages/rocm-debug-agent/package.py b/var/spack/repos/builtin/packages/rocm-debug-agent/package.py index 5f95ebf8e4..a397fb6f56 100644 --- a/var/spack/repos/builtin/packages/rocm-debug-agent/package.py +++ b/var/spack/repos/builtin/packages/rocm-debug-agent/package.py @@ -11,13 +11,14 @@ from spack.package import * class RocmDebugAgent(CMakePackage): """Radeon Open Compute (ROCm) debug agent""" - homepage = "https://github.com/ROCm-Developer-Tools/rocr_debug_agent" - git = "https://github.com/ROCm-Developer-Tools/rocr_debug_agent.git" - url = "https://github.com/ROCm-Developer-Tools/rocr_debug_agent/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocr_debug_agent" + git = "https://github.com/ROCm/rocr_debug_agent.git" + url = "https://github.com/ROCm/rocr_debug_agent/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") libraries = ["librocm-debug-agent"] + version("6.0.0", sha256="705be2c2bd0f5c7d1e286eb9b94045b2bd017ff323f07bca9aa7c81f2d168524") version("5.7.1", sha256="3b8d2835935da98f41e7cfc5b808c596ac06dd705b9a07bb70283e002f8dea6a") version("5.7.0", sha256="d9344ed02e82a01140f2162e901e6a519e5fee6b498e2f49417730ee2660c5c1") version("5.6.1", 
sha256="d3b1d5d757489ed3cc66d351cec56b7b850aaa7ecf6a55b0350b89c3dee3153a") @@ -105,7 +106,7 @@ class RocmDebugAgent(CMakePackage): ) def url_for_version(self, version): - url = "https://github.com/ROCm-Developer-Tools/rocr_debug_agent/archive/" + url = "https://github.com/ROCm/rocr_debug_agent/archive/" if version <= Version("3.7.0"): url += "roc-{0}.tar.gz".format(version) else: @@ -146,6 +147,7 @@ class RocmDebugAgent(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hsa-rocr-dev@" + ver, when="@" + ver) depends_on("hsakmt-roct@" + ver, when="@" + ver) @@ -179,14 +181,15 @@ class RocmDebugAgent(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocm-dbgapi@" + ver, when="@" + ver) depends_on("hip@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) - # https://github.com/ROCm-Developer-Tools/rocr_debug_agent/pull/4 + # https://github.com/ROCm/rocr_debug_agent/pull/4 patch("0001-Drop-overly-strict-Werror-flag.patch", when="@3.7.0:") patch("0002-add-hip-architecture.patch", when="@3.9.0:") diff --git a/var/spack/repos/builtin/packages/rocm-device-libs/package.py b/var/spack/repos/builtin/packages/rocm-device-libs/package.py index b83682d120..6ba87f4dab 100644 --- a/var/spack/repos/builtin/packages/rocm-device-libs/package.py +++ b/var/spack/repos/builtin/packages/rocm-device-libs/package.py @@ -10,14 +10,15 @@ from spack.package import * class RocmDeviceLibs(CMakePackage): """set of AMD specific device-side language runtime libraries""" - homepage = "https://github.com/RadeonOpenCompute/ROCm-Device-Libs" - git = "https://github.com/RadeonOpenCompute/ROCm-Device-Libs.git" - url = "https://github.com/RadeonOpenCompute/ROCm-Device-Libs/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/ROCm-Device-Libs" + git = "https://github.com/ROCm/ROCm-Device-Libs.git" 
+ url = "https://github.com/ROCm/ROCm-Device-Libs/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath", "haampie") version("master", branch="amd-stg-open") + version("6.0.0", sha256="198df4550d4560537ba60ac7af9bde31d59779c8ec5d6309627f77a43ab6ef6f") version("5.7.1", sha256="703de8403c0bd0d80f37c970a698f10f148daf144d34f982e4484d04f7c7bbef") version("5.7.0", sha256="0f8780b9098573f1c456bdc84358de924dcf00604330770a383983e1775bf61e") version("5.6.1", sha256="f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c") @@ -146,11 +147,12 @@ class RocmDeviceLibs(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", "master", ]: depends_on("llvm-amdgpu@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) def cmake_args(self): diff --git a/var/spack/repos/builtin/packages/rocm-gdb/package.py b/var/spack/repos/builtin/packages/rocm-gdb/package.py index 8c29704b29..5a7c06d8eb 100644 --- a/var/spack/repos/builtin/packages/rocm-gdb/package.py +++ b/var/spack/repos/builtin/packages/rocm-gdb/package.py @@ -11,13 +11,14 @@ class RocmGdb(AutotoolsPackage): """This is ROCmgdb, the ROCm source-level debugger for Linux, based on GDB, the GNU source-level debugger.""" - homepage = "https://github.com/ROCm-Developer-Tools/ROCgdb/" - url = "https://github.com/ROCm-Developer-Tools/ROCgdb/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/ROCgdb" + url = "https://github.com/ROCm/ROCgdb/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] license("LGPL-2.0-or-later") maintainers("srekolam", "renjithravindrankannath") + version("6.0.0", sha256="0db4ab32ca729e69688cdb238df274ce5cf58b5cb2538584662cca4358708c2b") version("5.7.1", sha256="5cd150b5796aea9d77efd43b89d30a34fa4125338179eb87c6053abcac9f3c62") version("5.7.0", 
sha256="94fba57b2f17b593de61f7593b404fabc00b054d38567be57d12cf7654b7969a") version("5.6.1", sha256="d2b40d4c5aa41a6ce2a84307627b30d16a458672e03e13f9d27c12f2dc3f21d6") @@ -145,11 +146,12 @@ class RocmGdb(AutotoolsPackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocm-dbgapi@" + ver, type="link", when="@" + ver) depends_on("comgr@" + ver, type="link", when="@" + ver) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) build_directory = "spack-build" @@ -160,7 +162,7 @@ class RocmGdb(AutotoolsPackage): # Distributor options "--program-prefix=roc", "--enable-64-bit-bfd", - "--with-bugurl=https://github.com/ROCm-Developer-Tools/ROCgdb/issues", + "--with-bugurl=https://github.com/ROCm/ROCgdb/issues", "--with-pkgversion=-ROCm", "--enable-targets=x86_64-linux-gnu,amdgcn-amd-amdhsa", "--disable-ld", diff --git a/var/spack/repos/builtin/packages/rocm-opencl/package.py b/var/spack/repos/builtin/packages/rocm-opencl/package.py index 9435c1a8ec..8aa0b0a391 100644 --- a/var/spack/repos/builtin/packages/rocm-opencl/package.py +++ b/var/spack/repos/builtin/packages/rocm-opencl/package.py @@ -12,8 +12,8 @@ from spack.package import * class RocmOpencl(CMakePackage): """OpenCL: Open Computing Language on ROCclr""" - homepage = "https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime" - git = "https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git" + homepage = "https://github.com/ROCm/ROCm-OpenCL-Runtime" + git = "https://github.com/ROCm/ROCm-OpenCL-Runtime.git" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") @@ -146,9 +146,7 @@ class RocmOpencl(CMakePackage): ]: resource( name="rocclr", - url="https://github.com/ROCm-Developer-Tools/ROCclr/archive/rocm-{0}.tar.gz".format( - d_version - ), + url="https://github.com/ROCm/ROCclr/archive/rocm-{0}.tar.gz".format(d_version), sha256=d_shasum, expand=True, 
destination="", diff --git a/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py b/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py index 836698b92b..d23a487914 100644 --- a/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py +++ b/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py @@ -8,8 +8,8 @@ import re from spack.package import * -tools_url = "https://github.com/ROCm-Developer-Tools" -compute_url = "https://github.com/RadeonOpenCompute" +tools_url = "https://github.com/ROCm" +compute_url = "https://github.com/ROCm" # Arrays of hashes are in order of the versions array below # For example array[0] = 3.9.0, array[1] = 3.10.0, etc. @@ -41,6 +41,7 @@ aomp = [ "6c051bf7625f682ba3d2ea80b46a38ca2cbcd20f5d89ae3433602d3e7ef0403a", "4f34fa02db410808c5e629f30f8804210b42c4ff7d31aa80606deaed43054c3c", "ed7bbf92230b6535a353ed032a39a9f16e9987397798100392fc25e40c8a1a4e", + "1b2c0934ef16e17b2377944fae8c9b3db6dc64b7e43932ddfe2eeefdf6821410", ] devlib = [ @@ -70,6 +71,7 @@ devlib = [ "f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c", "0f8780b9098573f1c456bdc84358de924dcf00604330770a383983e1775bf61e", "703de8403c0bd0d80f37c970a698f10f148daf144d34f982e4484d04f7c7bbef", + "198df4550d4560537ba60ac7af9bde31d59779c8ec5d6309627f77a43ab6ef6f", ] llvm = [ @@ -99,6 +101,7 @@ llvm = [ "045e43c0c4a3f4f2f1db9fb603a4f1ea3d56e128147e19ba17909eb57d7f08e5", "4abdf00b297a77c5886cedb37e63acda2ba11cb9f4c0a64e133b05800aadfcf0", "6b54c422e45ad19c9bf5ab090ec21753e7f7d854ca78132c30eb146657b168eb", + "c673708d413d60ca8606ee75c77e9871b6953c59029c987b92f2f6e85f683626", ] flang = [ @@ -128,6 +131,7 @@ flang = [ "5ebcbca2e03bd0686e677f44ea551e97bd9395c6b119f832fa784818733aa652", "cc4f1973b1b8e7bcc4f09e3381bae4e1a2e51ea4e2598fc1b520ccb8bf24d28c", "8fd618d81af092416b267c4d00c801731f7a00c0f8d4aedb795e52a4ec1bf183", + "fcb319ddb2aa3004a6ae60370ab4425f529336b1cee50f29200e697e61b53586", ] extras = [ @@ -157,6 +161,7 @@ extras = [ 
"437e2017cfe2ab73b15ada0fc1ea88f794f0b108cc5410f457268ae7e4e8985a", "be59433dd85d4b8f0eaff87e0cc424a814152c67f3a682d1343c4bd61dd49a0f", "8060c6879708faf5f7d417b19a479dec9b7b9583a1b885f12d247faf831f7f0b", + "f37e1107e4da5b083e794244f3d0c9fd073ccb6fd6015e635349d8f0d679c4b8", ] versions = [ @@ -186,6 +191,7 @@ versions = [ "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ] versions_dict = dict() # type: Dict[str,Dict[str,str]] components = ["aomp", "devlib", "llvm", "flang", "extras"] @@ -203,12 +209,13 @@ class RocmOpenmpExtras(Package): """OpenMP support for ROCm LLVM.""" homepage = tools_url + "/aomp" - url = tools_url + "/aomp/archive/rocm-5.5.0.tar.gz" + url = tools_url + "/aomp/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] license("Apache-2.0") maintainers("srekolam", "renjithravindrankannath", "estewart08") + version("6.0.0", sha256=versions_dict["6.0.0"]["aomp"]) version("5.7.1", sha256=versions_dict["5.7.1"]["aomp"]) version("5.7.0", sha256=versions_dict["5.7.0"]["aomp"]) version("5.6.1", sha256=versions_dict["5.6.1"]["aomp"]) @@ -243,8 +250,8 @@ class RocmOpenmpExtras(Package): depends_on("awk", type="build") depends_on("elfutils", type=("build", "link")) depends_on("libffi", type=("build", "link")) - depends_on("libdrm", when="@5.7") - depends_on("numactl", when="@5.7") + depends_on("libdrm", when="@5.7:6.0") + depends_on("numactl", when="@5.7:6.0") for ver in [ "3.9.0", @@ -273,13 +280,14 @@ class RocmOpenmpExtras(Package): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hsakmt-roct@" + ver, when="@" + ver) depends_on("comgr@" + ver, when="@" + ver) depends_on("hsa-rocr-dev@" + ver, when="@" + ver) depends_on("llvm-amdgpu@{0} ~openmp".format(ver), when="@" + ver) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) # tag changed to 'rocm-' in 4.0.0 @@ -327,7 +335,7 @@ class RocmOpenmpExtras(Package): placement="llvm-project", 
when="@" + ver, ) - patch("0001-Linking-hsakmt-libdrm-and-numactl-libraries.patch", when="@5.7") + patch("0001-Linking-hsakmt-libdrm-and-numactl-libraries.patch", when="@5.7:6.0") def setup_run_environment(self, env): devlibs_prefix = self.spec["llvm-amdgpu"].prefix @@ -497,7 +505,7 @@ class RocmOpenmpExtras(Package): devlibs_src = "{0}/rocm-openmp-extras/rocm-device-libs".format(src) hsa_prefix = self.spec["hsa-rocr-dev"].prefix hsakmt_prefix = self.spec["hsakmt-roct"].prefix - if self.spec.satisfies("@5.7"): + if self.spec.satisfies("@5.7:6.0"): libdrm_prefix = self.spec["libdrm"].prefix numactl_prefix = self.spec["numactl"].prefix comgr_prefix = self.spec["comgr"].prefix @@ -576,7 +584,7 @@ class RocmOpenmpExtras(Package): "-DCMAKE_CXX_FLAGS=-isystem{0} -I{1}".format(elfutils_inc, ffi_inc), "-DNEW_BC_PATH=1", ] - if self.spec.satisfies("@5.7"): + if self.spec.satisfies("@5.7:6.0"): openmp_common_args += [ "-DLIBDRM_LIB={0}/lib".format(libdrm_prefix), "-DHSAKMT_INC_PATH={0}/include".format(hsakmt_prefix), diff --git a/var/spack/repos/builtin/packages/rocm-smi-lib/package.py b/var/spack/repos/builtin/packages/rocm-smi-lib/package.py index fdd2bf216c..23af4a7653 100644 --- a/var/spack/repos/builtin/packages/rocm-smi-lib/package.py +++ b/var/spack/repos/builtin/packages/rocm-smi-lib/package.py @@ -16,15 +16,16 @@ class RocmSmiLib(CMakePackage): """It is a C library for Linux that provides a user space interface for applications to monitor and control GPU applications.""" - homepage = "https://github.com/RadeonOpenCompute/rocm_smi_lib" - git = "https://github.com/RadeonOpenCompute/rocm_smi_lib.git" - url = "https://github.com/RadeonOpenCompute/rocm_smi_lib/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocm_smi_lib" + git = "https://github.com/ROCm/rocm_smi_lib.git" + url = "https://github.com/ROCm/rocm_smi_lib/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") libraries = ["librocm_smi64"] 
version("master", branch="master") + version("6.0.0", sha256="0053b42402fd007e5ca9b3186c70f2c6f1b3026558f328722adadc2838c51309") version("5.7.1", sha256="4d79cb0482b2f801cc7824172743e3dd2b44b9f6784d1ca2e5067f2fbb4ef803") version("5.7.0", sha256="a399db3d9fc113ce2dd1ab5608a1cf9129ec4b6a2a79ab7922b1d9f43c454640") version("5.6.1", sha256="9e94f9a941202c3d7ce917fd1cd78c4e0f06f48d6c929f3aa916378ccef1e02c") @@ -116,7 +117,7 @@ class RocmSmiLib(CMakePackage): depends_on("cmake@3:", type="build") depends_on("python@3:", type=("build", "run"), when="@3.9.0:") - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) patch("disable_pdf_generation_with_doxygen_and_latex.patch", when="@4.5.2:5.6") diff --git a/var/spack/repos/builtin/packages/rocm-smi/package.py b/var/spack/repos/builtin/packages/rocm-smi/package.py index 0cc265c849..4e927b1f01 100644 --- a/var/spack/repos/builtin/packages/rocm-smi/package.py +++ b/var/spack/repos/builtin/packages/rocm-smi/package.py @@ -14,11 +14,11 @@ class RocmSmi(MakefilePackage): management of your ROCm enabled system Note: After ROCm 3.9, this project moved to - https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/master/python_smi_tools + https://github.com/ROCm/rocm_smi_lib/tree/master/python_smi_tools The spack package is called: rocm-smi-lib""" - homepage = "https://github.com/RadeonOpenCompute/ROC-smi" - url = "https://github.com/RadeonOpenCompute/ROC-smi/archive/rocm-4.1.0.tar.gz" + homepage = "https://github.com/ROCm/ROC-smi" + url = "https://github.com/ROCm/ROC-smi/archive/rocm-4.1.0.tar.gz" maintainers("srekolam", "renjithravindrankannath") tags = ["rocm"] diff --git a/var/spack/repos/builtin/packages/rocm-tensile/package.py b/var/spack/repos/builtin/packages/rocm-tensile/package.py index c92e4b34d6..8b869452cc 100644 --- a/var/spack/repos/builtin/packages/rocm-tensile/package.py +++ 
b/var/spack/repos/builtin/packages/rocm-tensile/package.py @@ -11,14 +11,15 @@ from spack.pkg.builtin.boost import Boost class RocmTensile(CMakePackage): """Radeon Open Compute Tensile library""" - homepage = "https://github.com/ROCmSoftwarePlatform/Tensile/" - git = "https://github.com/ROCmSoftwarePlatform/Tensile.git" - url = "https://github.com/ROCmSoftwarePlatform/Tensile/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/Tensile/" + git = "https://github.com/ROCm/Tensile.git" + url = "https://github.com/ROCm/Tensile/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] license("MIT") maintainers("srekolam", "renjithravindrankannath", "haampie") + version("6.0.0", sha256="5d90add62d1439b7daf0527316e950e454e5d8beefb4f723865fe9ab26c7aa42") version("5.7.1", sha256="9211a51b23c22b7a79e4e494e8ff3c31e90bf21adb8cce260acc57891fb2c917") version("5.7.0", sha256="fe2ae067c1c579f33d7a1e26da3fe6b4ed44befa08f9dfce2ceae586f184b816") version("5.6.1", sha256="3e78c933563fade8781a1dca2079bff135af2f5d2c6eb0147797d2c1f24d006c") @@ -166,6 +167,7 @@ class RocmTensile(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocm-cmake@" + ver, type="build", when="@" + ver) depends_on("hip@" + ver, when="@" + ver) @@ -187,6 +189,7 @@ class RocmTensile(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocm-openmp-extras@" + ver, when="@" + ver) @@ -218,11 +221,12 @@ class RocmTensile(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocm-smi-lib@" + ver, type="build", when="@" + ver) root_cmakelists_dir = "Tensile/Source" - # Status: https://github.com/ROCmSoftwarePlatform/Tensile/commit/a488f7dadba34f84b9658ba92ce9ec5a0615a087 + # Status: https://github.com/ROCm/Tensile/commit/a488f7dadba34f84b9658ba92ce9ec5a0615a087 # Not yet landed in 3.7.0, nor 3.8.0. 
patch("0001-fix-compile-error.patch", when="@3.7.0:3.8.0") patch("0002-require-openmp-when-tensile-use-openmp-is-on.patch", when="@3.9.0:4.0.0") diff --git a/var/spack/repos/builtin/packages/rocm-validation-suite/009-replacing-rocm-path-with-package-path.patch b/var/spack/repos/builtin/packages/rocm-validation-suite/009-replacing-rocm-path-with-package-path.patch new file mode 100644 index 0000000000..ae21de8c82 --- /dev/null +++ b/var/spack/repos/builtin/packages/rocm-validation-suite/009-replacing-rocm-path-with-package-path.patch @@ -0,0 +1,636 @@ +From 7bb26280b6da667573a581780f97856985b44e4e Mon Sep 17 00:00:00 2001 +From: Renjith Ravindran <Renjith.RavindranKannath@amd.com> +Date: Fri, 12 Jan 2024 09:31:21 +0000 +Subject: [PATCH] Updating cmake with include and library path for spack + +--- + CMakeLists.txt | 21 +++++++++++---------- + babel.so/CMakeLists.txt | 18 +++++++++--------- + cmake_modules/tests_unit.cmake | 3 ++- + edp.so/CMakeLists.txt | 6 +++--- + gm.so/CMakeLists.txt | 6 +++--- + gpup.so/CMakeLists.txt | 8 ++++---- + gst.so/CMakeLists.txt | 10 +++++----- + iet.so/CMakeLists.txt | 6 +++--- + mem.so/CMakeLists.txt | 6 +++--- + pbqt.so/CMakeLists.txt | 6 +++--- + pebb.so/CMakeLists.txt | 4 ++-- + peqt.so/CMakeLists.txt | 6 +++--- + perf.so/CMakeLists.txt | 8 ++++---- + pesm.so/CMakeLists.txt | 8 ++++---- + rcqt.so/CMakeLists.txt | 6 +++--- + rvs/CMakeLists.txt | 15 ++++++++------- + rvs/tests.cmake | 6 ++++-- + rvslib/CMakeLists.txt | 2 +- + smqt.so/CMakeLists.txt | 6 +++--- + testif.so/CMakeLists.txt | 20 ++++++++++---------- + 20 files changed, 88 insertions(+), 83 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index b25eca4..eeee55d 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -70,13 +70,14 @@ endif(rocblas_FOUND) + # variables since we will pass them as cmake params appropriately, and + # all find_packages relevant to this build will be in ROCM path hence appending it to CMAKE_PREFIX_PATH + set(ROCM_PATH "/opt/rocm" 
CACHE PATH "ROCM install path") +-set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "CMAKE installation directory") +-set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Prefix used in built packages") ++set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") ++set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) ++set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") + list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}") +-set(ROCR_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Runtime" FORCE) +-set(ROCR_LIB_DIR "${ROCM_PATH}/lib" CACHE PATH "Contains library files exported by ROC Runtime" FORCE) +-set(HIP_INC_DIR "${ROCM_PATH}" CACHE PATH "Contains header files exported by ROC Runtime") +-set(ROCT_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Trunk" FORCE) ++set(ROCR_INC_DIR "${HSA_PATH}/include" CACHE PATH "Contains header files exported by ROC Runtime") ++set(ROCR_LIB_DIR "${HSA_PATH}/lib" CACHE PATH "Contains library files exported by ROC Runtime") ++set(HIP_INC_DIR "${HIP_PATH}" CACHE PATH "Contains header files exported by ROC Runtime") ++set(ROCT_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Trunk") + + add_definitions(-DROCM_PATH="${ROCM_PATH}") + add_definitions(-DRVS_LIB_PATH="${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/rvs") +@@ -420,8 +421,8 @@ if (RVS_ROCBLAS EQUAL 1) + set(ROCBLAS_INC_DIR "${CMAKE_BINARY_DIR}/rvs_rblas-src/build/release/rocblas-install") + set(ROCBLAS_LIB_DIR "${CMAKE_BINARY_DIR}/rvs_rblas-src/build/release/rocblas-install/lib/") + else() +- set(ROCBLAS_INC_DIR "${ROCM_PATH}/include") +- set(ROCBLAS_LIB_DIR "${ROCM_PATH}/lib") ++ set(ROCBLAS_INC_DIR "${ROCBLAS_DIR}/include") ++ set(ROCBLAS_LIB_DIR "${ROCBLAS_DIR}/lib") + endif() + + if (RVS_ROCMSMI EQUAL 1) +@@ -436,8 +437,8 @@ else() + set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib") + else() + message( STATUS "ROCBLAS REORG Enabled Version: 
${RVS_ROCBLAS_VERSION_FLAT}" ) +- set(ROCM_SMI_INC_DIR "${ROCM_PATH}/include") +- set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/lib") ++ set(ROCM_SMI_INC_DIR "${ROCM_SMI_DIR}/include") ++ set(ROCM_SMI_LIB_DIR "${ROCM_SMI_DIR}/lib") + endif() + endif() + set(ROCM_SMI_LIB "rocm_smi64" CACHE STRING "rocm_smi library name") +diff --git a/babel.so/CMakeLists.txt b/babel.so/CMakeLists.txt +index f163dae..fa85b38 100644 +--- a/babel.so/CMakeLists.txt ++++ b/babel.so/CMakeLists.txt +@@ -107,13 +107,13 @@ set(HIP_HCC_LIB "amdhip64") + add_compile_options(-DRVS_ROCBLAS_VERSION_FLAT=${RVS_ROCBLAS_VERSION_FLAT}) + + # Determine Roc Runtime header files are accessible +-if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime.h) +- message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR}) ++if(NOT EXISTS ${HIP_PATH}/include/hip/hip_runtime.h) ++ message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH}) + RETURN() + endif() + +-if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime_api.h) +- message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR}) ++if(NOT EXISTS ${HIP_PATH}/include/hip/hip_runtime_api.h) ++ message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH}) + RETURN() + endif() + +@@ -133,16 +133,16 @@ if(DEFINED RVS_ROCMSMI) + endif() + + +-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") +- message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR}) ++if(NOT EXISTS "${HIP_PATH}/lib/lib${HIP_HCC_LIB}.so") ++ message("ERROR: ROC Runtime libraries can't be found under specified path. Please set HIP_PATH path. 
Current value is : " ${HIP_PATH}) + RETURN() + endif() + + ## define include directories +-include_directories(./ ../ ${ROCR_INC_DIR} ${HIP_INC_DIR}) ++include_directories(./ ../ ${HIP_PATH}) + + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${HIP_PATH}/lib/ ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so) + +@@ -154,7 +154,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES}) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB}) ++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB} ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +diff --git a/cmake_modules/tests_unit.cmake b/cmake_modules/tests_unit.cmake +index e0e9f88..7321e0a 100644 +--- a/cmake_modules/tests_unit.cmake ++++ b/cmake_modules/tests_unit.cmake +@@ -27,7 +27,7 @@ + ## define additional unit testing include directories + include_directories(${UT_INC}) + ## define additional unit testing lib directories +-link_directories(${UT_LIB} ${RVS_LIB_DIR}) ++link_directories(${UT_LIB} ${RVS_LIB_DIR} ${ROCM_SMI_LIB_DIR}) + + file(GLOB TESTSOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} test/test*.cpp ) + #message ( "TESTSOURCES: ${TESTSOURCES}" ) +@@ -45,6 +45,7 @@ FOREACH(SINGLE_TEST ${TESTSOURCES}) + ) + target_link_libraries(${TEST_NAME} + ${UT_LINK_LIBS} rvslibut rvslib gtest_main gtest pthread pci ++ ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so + ) + 
target_compile_definitions(${TEST_NAME} PUBLIC RVS_UNIT_TEST) + if(DEFINED tcd.${TEST_NAME}) +diff --git a/edp.so/CMakeLists.txt b/edp.so/CMakeLists.txt +index 7dd34ea..7978abe 100644 +--- a/edp.so/CMakeLists.txt ++++ b/edp.so/CMakeLists.txt +@@ -134,11 +134,11 @@ if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") + endif() + + ## define include directories +-include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR}) ++include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR} ${YAML_CPP_INCLUDE_DIRS}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR}) ++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR}) + ## additional libraries +-set (PROJECT_LINK_LIBS rvslib libpthread.so libpciaccess.so libpci.so libm.so) ++set (PROJECT_LINK_LIBS rvslib libpthread.so libpciaccess.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + ## define source files + set (SOURCES src/rvs_module.cpp src/action.cpp src/edp_worker.cpp ) +diff --git a/gm.so/CMakeLists.txt b/gm.so/CMakeLists.txt +index d3caa84..73b83ce 100644 +--- a/gm.so/CMakeLists.txt ++++ b/gm.so/CMakeLists.txt +@@ -118,11 +118,11 @@ if(DEFINED RVS_ROCMSMI) + endif() + + ## define include directories +-include_directories(./ ../ ${ROCM_SMI_INC_DIR}) ++include_directories(./ ../ ${ROCM_SMI_INC_DIR} ${YAML_CPP_INCLUDE_DIRS}) + # Add directories to look for library files to link + link_directories(${RVS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH}) + ## additional libraries +-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so) ++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so librocm_smi64.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + ## define source files + set(SOURCES src/rvs_module.cpp src/action.cpp src/worker.cpp) +@@ 
-133,7 +133,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES}) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${ROCM_SMI_LIB}) ++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS}) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +diff --git a/gpup.so/CMakeLists.txt b/gpup.so/CMakeLists.txt +index 43d337a..a234feb 100644 +--- a/gpup.so/CMakeLists.txt ++++ b/gpup.so/CMakeLists.txt +@@ -109,11 +109,11 @@ else() + endif() + + ## define include directories +-include_directories(./ ../ include ../include) ++include_directories(./ ../ include ../include ${YAML_CPP_INCLUDE_DIRS}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries +-set (PROJECT_LINK_LIBS rvslib libpci.so libm.so) ++set (PROJECT_LINK_LIBS rvslib libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + ## define source files + set(SOURCES src/rvs_module.cpp src/action.cpp) +@@ -124,7 +124,7 @@ set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + +-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ) ++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS}) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +diff --git a/gst.so/CMakeLists.txt b/gst.so/CMakeLists.txt +index fd346ce..cb8c4b6 100644 +--- a/gst.so/CMakeLists.txt ++++ b/gst.so/CMakeLists.txt +@@ -137,17 +137,17 @@ if(DEFINED RVS_ROCMSMI) + endif() + + +-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") ++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so") + message("ERROR: 
ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR}) + RETURN() + endif() + + ## define include directories +-include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR}) ++include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR} ${YAML_CPP_INCLUDE_DIRS}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib/ ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries +-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so) ++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + ## define source files + set(SOURCES src/rvs_module.cpp src/action.cpp src/gst_worker.cpp) +@@ -157,7 +157,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES}) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB}) ++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB} ) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +diff --git a/iet.so/CMakeLists.txt b/iet.so/CMakeLists.txt +index a85ca98..252e565 100644 +--- a/iet.so/CMakeLists.txt ++++ b/iet.so/CMakeLists.txt +@@ -140,7 +140,7 @@ if(DEFINED RVS_ROCMSMI) + endif() + endif() + +-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") ++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so") + message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. 
Current value is : " ${ROCR_LIB_DIR}) + RETURN() + endif() +@@ -159,7 +159,7 @@ include_directories(./ ../ ${ROCM_SMI_INC_DIR} ${ROCBLAS_INC_DIR} ${ROCR_INC_DIR + # Add directories to look for library files to link + link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH}) + ## additional libraries +-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so) ++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so librocm_smi64.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + set(SOURCES src/rvs_module.cpp src/action.cpp src/iet_worker.cpp ) + +@@ -168,7 +168,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES}) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB} ${ROCM_SMI_LIB}) ++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_INC_DIR}/lib/ ${HIP_HCC_LIB} ${ROCBLAS_LIB}) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +diff --git a/mem.so/CMakeLists.txt b/mem.so/CMakeLists.txt +index 5133337..2462bbc 100644 +--- a/mem.so/CMakeLists.txt ++++ b/mem.so/CMakeLists.txt +@@ -134,7 +134,7 @@ if(DEFINED RVS_ROCMSMI) + endif() + + +-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") ++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so") + message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. 
Current value is : " ${ROCR_LIB_DIR}) + RETURN() + endif() +@@ -143,9 +143,9 @@ endif() + include_directories(./ ../ ${ROCR_INC_DIR} ${HIP_INC_DIR}) + + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries +-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so) ++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + ## define source files + set(SOURCES src/rvs_module.cpp src/action.cpp src/rvs_memtest.cpp src/rvs_memworker.cpp) +diff --git a/pbqt.so/CMakeLists.txt b/pbqt.so/CMakeLists.txt +index 5ae675a..892b6ac 100644 +--- a/pbqt.so/CMakeLists.txt ++++ b/pbqt.so/CMakeLists.txt +@@ -136,11 +136,11 @@ if(NOT EXISTS ${ROCR_LIB_DIR}/${CORE_RUNTIME_LIBRARY}.so) + endif() + + ## define include directories +-include_directories(./ ../ pci ${ROCR_INC_DIR}) ++include_directories(./ ../ pci ${ROCR_INC_DIR} ${YAML_CPP_INCLUDE_DIRS}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HSAKMT_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries +-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so) ++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + ## define source files + set(SOURCES src/rvs_module.cpp src/action.cpp src/action_run.cpp +diff --git a/pebb.so/CMakeLists.txt b/pebb.so/CMakeLists.txt +index c4e2964..7a6b368 100644 +--- a/pebb.so/CMakeLists.txt ++++ b/pebb.so/CMakeLists.txt +@@ -139,9 +139,9 @@ endif() + ## define 
include directories + include_directories(./ ../ pci ${ROCR_INC_DIR}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HSAKMT_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR} ) + ## additional libraries +-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so) ++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + ## define source files + set(SOURCES src/rvs_module.cpp src/action.cpp src/action_run.cpp +diff --git a/peqt.so/CMakeLists.txt b/peqt.so/CMakeLists.txt +index ead507d..567358b 100644 +--- a/peqt.so/CMakeLists.txt ++++ b/peqt.so/CMakeLists.txt +@@ -107,9 +107,9 @@ else() + endif() + + ## define include directories +-include_directories(./ ../) ++include_directories(./ ../ ${HSA_PATH}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${HSA_PATH}/lib/ ${HSAKMT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR} ${YAML_CPP_INCLUDE_DIRS}) + ## additional libraries + set (PROJECT_LINK_LIBS rvslib libpci.so libm.so) + +@@ -121,7 +121,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES}) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ) ++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +diff --git a/perf.so/CMakeLists.txt b/perf.so/CMakeLists.txt +index 518dac9..02d2245 100644 +--- a/perf.so/CMakeLists.txt ++++ b/perf.so/CMakeLists.txt +@@ -137,7 +137,7 @@ 
if(DEFINED RVS_ROCMSMI) + endif() + + +-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") ++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so") + message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR}) + RETURN() + endif() +@@ -145,9 +145,9 @@ endif() + ## define include directories + include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries +-set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so) ++set (PROJECT_LINK_LIBS rvslib libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + ## define source files + set(SOURCES src/rvs_module.cpp src/action.cpp src/perf_worker.cpp) +@@ -157,7 +157,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES}) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB}) ++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS}) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +diff --git a/pesm.so/CMakeLists.txt b/pesm.so/CMakeLists.txt +index 1f27f34..20a8bed 100644 +--- a/pesm.so/CMakeLists.txt ++++ b/pesm.so/CMakeLists.txt +@@ -107,11 +107,11 @@ else() + endif() + + ## define include directories +-include_directories(./ ../ pci) ++include_directories(./ ../ pci ${YAML_CPP_INCLUDE_DIRS}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) 
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries +-set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so) ++set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so ${PROJECT_LINK_LIBS} ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + ## define source files + set(SOURCES src/rvs_module.cpp src/action.cpp src/worker.cpp) +@@ -121,7 +121,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES}) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} ) ++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS}) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +diff --git a/rcqt.so/CMakeLists.txt b/rcqt.so/CMakeLists.txt +index c0099ab..8d92982 100644 +--- a/rcqt.so/CMakeLists.txt ++++ b/rcqt.so/CMakeLists.txt +@@ -108,11 +108,11 @@ else() + endif() + + ## define include directories +-include_directories(./ ../) ++include_directories(./ ../ ${YAML_CPP_INCLUDE_DIRS}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ASAN_LIB_PATH} ${HSAKMT_LIB_DIR} ${ROCM_SMI_LIB_DIR}) + ## additional libraries +-set (PROJECT_LINK_LIBS rvslib) ++set (PROJECT_LINK_LIBS rvslib ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + ## define source files + set(SOURCES +diff --git a/rvs/CMakeLists.txt b/rvs/CMakeLists.txt +index 527d474..76a5efd 100644 +--- a/rvs/CMakeLists.txt ++++ b/rvs/CMakeLists.txt +@@ -113,21 +113,22 @@ else() + endif() + + ## define include directories +-include_directories(./ ../ ${YAML_CPP_INCLUDE_DIRS}) ++include_directories(./ ../ ${YAML_INC_DIR}) + ## define lib 
directories +-link_directories(${CMAKE_CURRENT_BINARY_DIR} ${RVS_LIB_DIR} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${CMAKE_CURRENT_BINARY_DIR} ${RVS_LIB_DIR} ${RVS_LIB_DIR}/.. ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR} ${YAML_CPP_LIBRARIES} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR} ) + + ## additional libraries +-set(ROCBLAS_LIB "rocblas") +-set(ROC_THUNK_NAME "hsakmt") +-set(CORE_RUNTIME_NAME "hsa-runtime") ++set(ROCBLAS_LIB "${ROCBLAS_LIB_DIR}/librocblas.so") ++set(ROC_THUNK_NAME "${HSAKMT_LIB_DIR}/libhsakmt.a") ++set(CORE_RUNTIME_NAME "${HSA_PATH}/lib/libhsa-runtime64.so") ++set(YAML_CPP_LIB "${YAML_INC_DIR}/../lib64/libyaml-cpp.a") + set(CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64") +-set(PROJECT_LINK_LIBS libdl.so libpthread.so libpci.so ${YAML_CPP_LIBRARIES}) ++set(PROJECT_LINK_LIBS libdl.so libpthread.so libpci.so) + + ## define target + add_executable(${RVS_TARGET} src/rvs.cpp) + target_link_libraries(${RVS_TARGET} rvslib +- ${ROCBLAS_LIB} ${ROCM_SMI_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ${PROJECT_LINK_LIBS}) ++ ${ROCBLAS_LIB} ${ROCM_SMI_LIB} ${ROC_THUNK_NAME} ${PROJECT_LINK_LIBS} ${CORE_RUNTIME_NAME} ${YAML_CPP_LIB}) + add_dependencies(${RVS_TARGET} rvslib) + + install(TARGETS ${RVS_TARGET} +diff --git a/rvs/tests.cmake b/rvs/tests.cmake +index 38ae3fb..0d62675 100644 +--- a/rvs/tests.cmake ++++ b/rvs/tests.cmake +@@ -41,7 +41,8 @@ link_directories(${RVS_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ROCT_LI + ## define target for "test-to-fail" + add_executable(${RVS_TARGET}fail src/rvs.cpp) + target_link_libraries(${RVS_TARGET}fail rvslib rvslibut ${PROJECT_LINK_LIBS} +- ${ROCM_SMI_LIB} ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET}) ++ ${ROCM_SMI_LIB} ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ++ ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + target_compile_definitions(${RVS_TARGET}fail PRIVATE RVS_INVERT_RETURN_STATUS) + 
set_target_properties(${RVS_TARGET}fail PROPERTIES +@@ -187,7 +188,7 @@ add_test(NAME unit.ttf.rvs.config.noconfig + ) + + ## define include directories +-include_directories(${UT_INC}) ++include_directories(${UT_INC} ${YAML_INC_DIR}) + ## define lib directories + link_directories(${UT_LIB} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ROCT_LIB_DIR}) + ## additional libraries for unit tests +@@ -211,6 +212,7 @@ FOREACH(SINGLE_TEST ${TESTSOURCES}) + ${PROJECT_TEST_LINK_LIBS} + rvslib rvslibut gtest_main gtest pthread + ${ROCM_SMI_LIB} ${ROCBLAS_LIB} ${ROC_THUNK_NAME} ${CORE_RUNTIME_TARGET} ++ ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so + ) + add_dependencies(${TEST_NAME} rvs_gtest_target) + +diff --git a/rvslib/CMakeLists.txt b/rvslib/CMakeLists.txt +index 8d29590..d52aee3 100644 +--- a/rvslib/CMakeLists.txt ++++ b/rvslib/CMakeLists.txt +@@ -116,7 +116,7 @@ endif() + + ## define include directories + include_directories(./ ../ ../rvs +- ${ROCM_SMI_INC_DIR} ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR}) ++ ${ROCM_SMI_INC_DIR} ${HIP_PATH} ${ROCBLAS_INC_DIR} ${YAML_INC_DIR}) + + link_directories(${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + +diff --git a/smqt.so/CMakeLists.txt b/smqt.so/CMakeLists.txt +index 042586f..0133c00 100644 +--- a/smqt.so/CMakeLists.txt ++++ b/smqt.so/CMakeLists.txt +@@ -106,11 +106,11 @@ else() + endif() + + ## define include directories +-include_directories(./ ../ pci) ++include_directories(./ ../ pci ${YAML_CPP_INCLUDE_DIRS}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries +-set (PROJECT_LINK_LIBS rvslib libpci.so libm.so) ++set (PROJECT_LINK_LIBS rvslib libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + ## define source files + set(SOURCES src/rvs_module.cpp 
src/action.cpp) +diff --git a/testif.so/CMakeLists.txt b/testif.so/CMakeLists.txt +index 4cba0f9..34b491e 100644 +--- a/testif.so/CMakeLists.txt ++++ b/testif.so/CMakeLists.txt +@@ -108,11 +108,11 @@ endif() + + + ## define include directories +-include_directories(./ ../ pci) ++include_directories(./ ../ pci ${YAML_CPP_INCLUDE_DIRS}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries +-set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so) ++set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so ${ROCBLAS_LIB_DIR}/librocblas.so ${HSAKMT_LIB_DIR}/libhsakmt.a ${HSA_PATH}/lib/libhsa-runtime64.so) + + ## define source files + ## set(SOURCES src/rvs_module.cpp src/action.cpp src/worker.cpp) +@@ -124,7 +124,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if_methods.cpp) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} ) ++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS}) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +@@ -145,7 +145,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if0.cpp) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} ) ++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS}) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +@@ -166,7 +166,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if0_methods.cpp) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX 
.so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} ) ++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS}) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +@@ -187,7 +187,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if1.cpp) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} ) ++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS}) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +@@ -208,7 +208,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_no_if1_methods.cpp) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} ) ++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS}) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +@@ -229,7 +229,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_fail_init.cpp) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS} ) ++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS}) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +@@ -250,7 +250,7 @@ add_library( ${RVS_TARGET} SHARED src/rvs_module_fail_create_action.cpp) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} rvslib 
${PROJECT_LINK_LIBS} ) ++target_link_libraries(${RVS_TARGET} rvslib ${PROJECT_LINK_LIBS}) + add_dependencies(${RVS_TARGET} rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +-- +2.39.3 + diff --git a/var/spack/repos/builtin/packages/rocm-validation-suite/package.py b/var/spack/repos/builtin/packages/rocm-validation-suite/package.py index 52e267f580..03b1c0d45e 100644 --- a/var/spack/repos/builtin/packages/rocm-validation-suite/package.py +++ b/var/spack/repos/builtin/packages/rocm-validation-suite/package.py @@ -15,13 +15,14 @@ class RocmValidationSuite(CMakePackage): computing environment, enabled using the ROCm software stack on a compatible platform.""" - homepage = "https://github.com/ROCm-Developer-Tools/ROCmValidationSuite" - url = "https://github.com/ROCm-Developer-Tools/ROCmValidationSuite/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/ROCmValidationSuite" + url = "https://github.com/ROCm/ROCmValidationSuite/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] license("MIT") maintainers("srekolam", "renjithravindrankannath") + version("6.0.0", sha256="a84e36b5e50e70ba033fb6bc6fa99da2e32bf7eaef2098df3164365a77a8f14c") version("5.7.1", sha256="202f2b6e014bbbeec40af5d3ec630c042f09a61087a77bd70715d81044ea4d65") version("5.7.0", sha256="f049b7786a220e9b6dfe099f17727dd0d9e41be9e680fe8309eae400cc5536ea") version("5.6.1", sha256="d5e4100e2d07311dfa101563c15d026a8130442cdee8af9ef861832cd7866c0d") @@ -122,9 +123,8 @@ class RocmValidationSuite(CMakePackage): "007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.6.patch", when="@5.6", ) - patch( - "008-correcting-library-and-include-path-WITHOUT-RVS-BUILD-TESTS.patch", when="@5.7.0:5.7" - ) + patch("008-correcting-library-and-include-path-WITHOUT-RVS-BUILD-TESTS.patch", when="@5.7") + patch("009-replacing-rocm-path-with-package-path.patch", when="@6.0") depends_on("cmake@3.5:", type="build") depends_on("zlib-api", type="link") depends_on("yaml-cpp~shared") @@ -165,6 +165,7 @@ 
class RocmValidationSuite(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("rocminfo@" + ver, when="@" + ver) diff --git a/var/spack/repos/builtin/packages/rocminfo/package.py b/var/spack/repos/builtin/packages/rocminfo/package.py index 3d70c7024b..a71259914a 100644 --- a/var/spack/repos/builtin/packages/rocminfo/package.py +++ b/var/spack/repos/builtin/packages/rocminfo/package.py @@ -10,14 +10,15 @@ from spack.package import * class Rocminfo(CMakePackage): """Radeon Open Compute (ROCm) Runtime rocminfo tool""" - homepage = "https://github.com/RadeonOpenCompute/rocminfo" - git = "https://github.com/RadeonOpenCompute/rocminfo.git" - url = "https://github.com/RadeonOpenCompute/rocminfo/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocminfo" + git = "https://github.com/ROCm/rocminfo.git" + url = "https://github.com/ROCm/rocminfo/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath", "haampie") version("master", branch="master") + version("6.0.0", sha256="bc29f1798644b6dea73895353dffada9db7366d0058274e587ebd3291a4d3844") version("5.7.1", sha256="642dc2ec4254b3c30c43064e6690861486db820b25f4906ec78bdb47e68dcd0b") version("5.7.0", sha256="a5a3c19513bf26f17f163a03ba5288c5c761619ef55f0cb9e15472771748b93e") version("5.6.1", sha256="780b186ac7410a503eca1060f4bbc35db1b7b4d1d714d15c7534cd26d8af7b54") @@ -136,12 +137,13 @@ class Rocminfo(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", "master", ]: depends_on("hsakmt-roct@" + ver, when="@" + ver) depends_on("hsa-rocr-dev@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) def cmake_args(self): diff --git a/var/spack/repos/builtin/packages/rocmlir/package.py b/var/spack/repos/builtin/packages/rocmlir/package.py index e7be5107d6..0c57ef3b4f 100644 
--- a/var/spack/repos/builtin/packages/rocmlir/package.py +++ b/var/spack/repos/builtin/packages/rocmlir/package.py @@ -12,9 +12,9 @@ class Rocmlir(CMakePackage): targetting AMD hardware. This generator is mainly used from MIOpen and MIGraphX, but it can be used on a standalone basis.""" - homepage = "https://github.com/ROCmSoftwarePlatform/rocMLIR" - git = "https://github.com/ROCmSoftwarePlatform/rocMLIR.git" - url = "https://github.com/ROCmSoftwarePlatform/rocMLIR/archive/refs/tags/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocMLIR" + git = "https://github.com/ROCm/rocMLIR.git" + url = "https://github.com/ROCm/rocMLIR/archive/refs/tags/rocm-6.0.0.tar.gz" maintainers("srekolam") version("5.5.1", commit="8c29325e7e68e3248e863172bf0e7f97055d45ee") diff --git a/var/spack/repos/builtin/packages/rocprim/package.py b/var/spack/repos/builtin/packages/rocprim/package.py index a6fd4806c1..fc0e594d15 100644 --- a/var/spack/repos/builtin/packages/rocprim/package.py +++ b/var/spack/repos/builtin/packages/rocprim/package.py @@ -9,14 +9,15 @@ from spack.package import * class Rocprim(CMakePackage): """Radeon Open Compute Parallel Primitives Library""" - homepage = "https://github.com/ROCmSoftwarePlatform/rocPRIM" - git = "https://github.com/ROCmSoftwarePlatform/rocPRIM.git" - url = "https://github.com/ROCmSoftwarePlatform/rocPRIM/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocPRIM" + git = "https://github.com/ROCm/rocPRIM.git" + url = "https://github.com/ROCm/rocPRIM/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] license("MIT") maintainers("cgmb", "srekolam", "renjithravindrankannath") + version("6.0.0", sha256="51f26c9f891a64c8db8df51d75d86d404d682092fd9d243e966ac6b2a6de381a") version("5.7.1", sha256="15d820a0f61aed60efbba88b6efe6942878b02d912f523f9cf8f33a4583d6cd7") version("5.7.0", sha256="a1bf94bbad13a0410b49476771270606d8a9d257188ee3ec3a37eee80540fe9b") version("5.6.1", 
sha256="e9ec1b0039c07cf3096653a04224fe5fe755afc6ba000f6838b3a8bc84df27de") @@ -147,6 +148,7 @@ class Rocprim(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("comgr@" + ver, when="@" + ver) diff --git a/var/spack/repos/builtin/packages/rocprofiler-dev/package.py b/var/spack/repos/builtin/packages/rocprofiler-dev/package.py index d87dd3ad2d..b9375fd7ac 100644 --- a/var/spack/repos/builtin/packages/rocprofiler-dev/package.py +++ b/var/spack/repos/builtin/packages/rocprofiler-dev/package.py @@ -11,9 +11,9 @@ from spack.package import * class RocprofilerDev(CMakePackage): """ROCPROFILER library for AMD HSA runtime API extension support""" - homepage = "https://github.com/ROCm-Developer-Tools/rocprofiler" - git = "https://github.com/ROCm-Developer-Tools/rocprofiler.git" - url = "https://github.com/ROCm-Developer-Tools/rocprofiler/archive/refs/tags/rocm-5.4.3.tar.gz" + homepage = "https://github.com/ROCm/rocprofiler" + git = "https://github.com/ROCm/rocprofiler.git" + url = "https://github.com/ROCm/rocprofiler/archive/refs/tags/rocm-5.4.3.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") @@ -133,7 +133,7 @@ class RocprofilerDev(CMakePackage): depends_on("roctracer-dev-api@" + ver, when="@" + ver) depends_on("numactl", type="link", when="@4.3.1") - # See https://github.com/ROCm-Developer-Tools/rocprofiler/pull/50 + # See https://github.com/ROCm/rocprofiler/pull/50 patch("fix-includes.patch") patch("0001-Continue-build-in-absence-of-aql-profile-lib.patch", when="@5.3:") diff --git a/var/spack/repos/builtin/packages/rocrand/package.py b/var/spack/repos/builtin/packages/rocrand/package.py index 775f1eee69..d83857f346 100644 --- a/var/spack/repos/builtin/packages/rocrand/package.py +++ b/var/spack/repos/builtin/packages/rocrand/package.py @@ -14,9 +14,9 @@ class Rocrand(CMakePackage): """The rocRAND project provides functions that generate pseudo-random and quasi-random numbers.""" - homepage 
= "https://github.com/ROCmSoftwarePlatform/rocRAND" - git = "https://github.com/ROCmSoftwarePlatform/rocRAND.git" - url = "https://github.com/ROCmSoftwarePlatform/rocRAND/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocRAND" + git = "https://github.com/ROCm/rocRAND.git" + url = "https://github.com/ROCm/rocRAND/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("cgmb", "srekolam", "renjithravindrankannath") @@ -26,6 +26,7 @@ class Rocrand(CMakePackage): version("develop", branch="develop") version("master", branch="master") + version("6.0.0", sha256="cee93231c088be524bb2cb0e6093ec47e62e61a55153486bebbc2ca5b3d49360") version("5.7.1", sha256="885cd905bbd23d02ba8f3f87d5c0b79bc44bd020ea9af190f3959cf5aa33d07d") version("5.7.0", sha256="d6053d986821e5cbc6cfec0778476efb1411ef943f11e7a8b973b1814a259dcf") version("5.6.1", sha256="6bf71e687ffa0fcc1b00e3567dd43da4147a82390f1b2db5e6f1f594dee6066d") @@ -149,7 +150,7 @@ class Rocrand(CMakePackage): ]: resource( name="hipRAND", - git="https://github.com/ROCmSoftwarePlatform/hipRAND.git", + git="https://github.com/ROCm/hipRAND.git", commit=d_commit, destination="", placement="hiprand", @@ -157,7 +158,7 @@ class Rocrand(CMakePackage): ) resource( name="hipRAND", - git="https://github.com/ROCmSoftwarePlatform/hipRAND.git", + git="https://github.com/ROCm/hipRAND.git", branch="master", destination="", placement="hiprand", @@ -165,7 +166,7 @@ class Rocrand(CMakePackage): ) resource( name="hipRAND", - git="https://github.com/ROCmSoftwarePlatform/hipRAND.git", + git="https://github.com/ROCm/hipRAND.git", branch="develop", destination="", placement="hiprand", @@ -202,6 +203,7 @@ class Rocrand(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) diff --git a/var/spack/repos/builtin/packages/rocsolver/package.py b/var/spack/repos/builtin/packages/rocsolver/package.py index ea85a69965..576675a371 100644 --- 
a/var/spack/repos/builtin/packages/rocsolver/package.py +++ b/var/spack/repos/builtin/packages/rocsolver/package.py @@ -13,9 +13,9 @@ class Rocsolver(CMakePackage): """rocSOLVER is a work-in-progress implementation of a subset of LAPACK functionality on the ROCm platform.""" - homepage = "https://github.com/ROCmSoftwarePlatform/rocSOLVER" - git = "https://github.com/ROCmSoftwarePlatform/rocSOLVER.git" - url = "https://github.com/ROCmSoftwarePlatform/rocSOLVER/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocSOLVER" + git = "https://github.com/ROCm/rocSOLVER.git" + url = "https://github.com/ROCm/rocSOLVER/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie") @@ -41,6 +41,7 @@ class Rocsolver(CMakePackage): version("develop", branch="develop") version("master", branch="master") + version("6.0.0", sha256="5fcaba96f3efafc2ecc3f4ec104095d96545c16e1b9f95410bd571cb0fc643ae") version("5.7.1", sha256="83e0c137b8690dbeb2e85d9e25415d96bd06979f09f2b10b2aff8e4c9f833fa4") version("5.7.0", sha256="bb16d360f14b34fe6e8a6b8ddc6e631672a5ffccbdcb25f0ce319edddd7f9682") version("5.6.1", sha256="6a8f366218aee599a0e56755030f94ee690b34f30e6d602748632226c5dc21bb") @@ -136,7 +137,7 @@ class Rocsolver(CMakePackage): depends_on("netlib-lapack@3.7.1:", type="test") patch("link-clients-blas.patch", when="@4.3.0:4.3.2") - # Backport https://github.com/ROCmSoftwarePlatform/rocSOLVER/commit/2bbfb8976f6e4d667499c77e41a6433850063e88 + # Backport https://github.com/ROCm/rocSOLVER/commit/2bbfb8976f6e4d667499c77e41a6433850063e88 patch("fmt-8.1-compatibility.patch", when="@4.5.0:5.1.3") # Maximize compatibility with other libraries that are using fmt. 
patch("fmt-9-compatibility.patch", when="@5.2.0:5.5") @@ -180,10 +181,11 @@ class Rocsolver(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("rocblas@" + ver, when="@" + ver) - for ver in ["5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocsparse@5.2:", when="@5.6:") for tgt in itertools.chain(["auto"], amdgpu_targets): diff --git a/var/spack/repos/builtin/packages/rocsparse/package.py b/var/spack/repos/builtin/packages/rocsparse/package.py index 98c02e8807..211afb0d36 100644 --- a/var/spack/repos/builtin/packages/rocsparse/package.py +++ b/var/spack/repos/builtin/packages/rocsparse/package.py @@ -15,9 +15,9 @@ class Rocsparse(CMakePackage): and toolchains. rocSPARSE is created using the HIP programming language and optimized for AMD's latest discrete GPUs.""" - homepage = "https://github.com/ROCmSoftwarePlatform/rocSPARSE" - git = "https://github.com/ROCmSoftwarePlatform/rocSPARSE.git" - url = "https://github.com/ROCmSoftwarePlatform/rocSPARSE/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocSPARSE" + git = "https://github.com/ROCm/rocSPARSE.git" + url = "https://github.com/ROCm/rocSPARSE/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("cgmb", "srekolam", "renjithravindrankannath") @@ -34,7 +34,7 @@ class Rocsparse(CMakePackage): variant("test", default=False, description="Build rocsparse-test client") license("MIT") - + version("6.0.0", sha256="bdc618677ec78830c6af315d61194d6ab8532345b8daeeb115aca96f274d4ca4") version("5.7.1", sha256="4c09b182b371124675d4057246021b5ed45e2833fdbf265b37a9b06b668baf0a") version("5.7.0", sha256="a42f0eb531b015b719e2bdcdff0cfb214e9894f73107966260f26931f982ecbc") version("5.6.1", sha256="6a50a64354507f1374e1a86aa7f5c07d1aaa96ac193ac292c279153087bb5d54") @@ -153,6 +153,7 @@ class Rocsparse(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hip@" + ver, when="@" + ver) 
depends_on("rocprim@" + ver, when="@" + ver) diff --git a/var/spack/repos/builtin/packages/rocthrust/package.py b/var/spack/repos/builtin/packages/rocthrust/package.py index c5e8dd1acc..01da0551b1 100644 --- a/var/spack/repos/builtin/packages/rocthrust/package.py +++ b/var/spack/repos/builtin/packages/rocthrust/package.py @@ -12,12 +12,13 @@ class Rocthrust(CMakePackage): HIP/ROCm platform, which uses the rocPRIM library. The HIP ported library works on HIP/ROCm platforms""" - homepage = "https://github.com/ROCmSoftwarePlatform/rocThrust" - git = "https://github.com/ROCmSoftwarePlatform/rocThrust.git" - url = "https://github.com/ROCmSoftwarePlatform/rocThrust/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocThrust" + git = "https://github.com/ROCm/rocThrust.git" + url = "https://github.com/ROCm/rocThrust/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("cgmb", "srekolam", "renjithravindrankannath") + version("6.0.0", sha256="a3fdafe4b6124118e07f23a3b0270d91740da324f61aaa3e8c034da08d9312b1") version("5.7.1", sha256="b7cb9ea6c42b2c6b610c34d2c438443e0f99245bd391aff18591949bf1cd53ee") version("5.7.0", sha256="64e10f071acfc5b8e3c168b9178289cf1afc7b168bf1962793fc256b25074d3a") version("5.6.1", sha256="63df61d5ab46d4cfda6066d748274bacecc77151692e372e6f7df5e91852bdc2") @@ -149,6 +150,7 @@ class Rocthrust(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("rocprim@" + ver, when="@" + ver) diff --git a/var/spack/repos/builtin/packages/roctracer-dev-api/package.py b/var/spack/repos/builtin/packages/roctracer-dev-api/package.py index a944ff3970..e93c202ccf 100644 --- a/var/spack/repos/builtin/packages/roctracer-dev-api/package.py +++ b/var/spack/repos/builtin/packages/roctracer-dev-api/package.py @@ -11,14 +11,15 @@ class RoctracerDevApi(Package): package, mainly to avoid circular dependencies in the ROCm ecosystem. 
For the ROC-tracer library, please check out roctracer-dev.""" - homepage = "https://github.com/ROCm-Developer-Tools/roctracer" - git = "https://github.com/ROCm-Developer-Tools/roctracer.git" - url = "https://github.com/ROCm-Developer-Tools/roctracer/archive/refs/tags/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/roctracer" + git = "https://github.com/ROCm/roctracer.git" + url = "https://github.com/ROCm/roctracer/archive/refs/tags/rocm-6.0.0.tar.gz" tags = ["rocm"] license("MIT") maintainers("srekolam", "renjithravindrankannath") + version("6.0.0", sha256="941166a0363c5689bfec118d54e986c43fb1ec8cbf18d95721d9a824bd52c0f8") version("5.7.1", sha256="ec0453adac7e62b142eb0df1e1e2506863aac4c3f2ce9d117c3184c08c0c6b48") version("5.7.0", sha256="40bb757920488466e29df90bb80a975cc340bf7f8771fb1d754dfbb6b688d78e") version("5.6.1", sha256="007c498be25b067ad9a7631a2b0892f9129150ee9714e471a921225875d45e69") diff --git a/var/spack/repos/builtin/packages/roctracer-dev/package.py b/var/spack/repos/builtin/packages/roctracer-dev/package.py index aa15dca00e..3c5f81e643 100644 --- a/var/spack/repos/builtin/packages/roctracer-dev/package.py +++ b/var/spack/repos/builtin/packages/roctracer-dev/package.py @@ -13,16 +13,16 @@ class RoctracerDev(CMakePackage, ROCmPackage): The goal of the implementation is to provide a generic independent from specific runtime profiler to trace API and asyncronous activity.""" - homepage = "https://github.com/ROCm-Developer-Tools/roctracer" - git = "https://github.com/ROCm-Developer-Tools/roctracer.git" - url = "https://github.com/ROCm-Developer-Tools/roctracer/archive/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/roctracer" + git = "https://github.com/ROCm/roctracer.git" + url = "https://github.com/ROCm/roctracer/archive/rocm-6.0.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") libraries = ["libroctracer64"] license("MIT") - + version("6.0.0", 
sha256="941166a0363c5689bfec118d54e986c43fb1ec8cbf18d95721d9a824bd52c0f8") version("5.7.1", sha256="ec0453adac7e62b142eb0df1e1e2506863aac4c3f2ce9d117c3184c08c0c6b48") version("5.7.0", sha256="40bb757920488466e29df90bb80a975cc340bf7f8771fb1d754dfbb6b688d78e") version("5.6.1", sha256="007c498be25b067ad9a7631a2b0892f9129150ee9714e471a921225875d45e69") @@ -83,6 +83,7 @@ class RoctracerDev(CMakePackage, ROCmPackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("hsakmt-roct@" + ver, when="@" + ver) depends_on("hsa-rocr-dev@" + ver, when="@" + ver) @@ -105,7 +106,7 @@ class RoctracerDev(CMakePackage, ROCmPackage): ]: depends_on("rocprofiler-dev@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-core@" + ver, when="@" + ver) patch("0001-include-rocprofiler-dev-path.patch", when="@5.3:5.4") diff --git a/var/spack/repos/builtin/packages/rocwmma/package.py b/var/spack/repos/builtin/packages/rocwmma/package.py index 8d5a9fdbea..ee5418b1c8 100644 --- a/var/spack/repos/builtin/packages/rocwmma/package.py +++ b/var/spack/repos/builtin/packages/rocwmma/package.py @@ -19,14 +19,15 @@ class Rocwmma(CMakePackage): generation of kernel assembly, and does not incur additional overhead costs of linking to external runtime libraries or having to launch separate kernels.""" - homepage = "https://github.com/ROCmSoftwarePlatform/rocWMMA" - git = "https://github.com/ROCmSoftwarePlatform/rocWMMA.git" - url = "https://github.com/ROCmSoftwarePlatform/rocWMMA/archive/refs/tags/rocm-5.5.0.tar.gz" + homepage = "https://github.com/ROCm/rocWMMA" + git = "https://github.com/ROCm/rocWMMA.git" + url = "https://github.com/ROCm/rocWMMA/archive/refs/tags/rocm-6.0.0.tar.gz" tags = ["rocm"] license("MIT") maintainers("srekolam", "renjithravindrankannath") + version("6.0.0", sha256="f9e97e7c6c552d43ef8c7348e4402bead2cd978d0f81a9657d6a0f6c83a6139b") version("5.7.1", 
sha256="a998a1385e6ad7062707ddb9ff82bef727ca48c39a10b4d861667024e3ffd2a3") version("5.7.0", sha256="a8f1b090e9e504a149a924c80cfb6aca817359b43833a6512ba32e178245526f") version("5.6.1", sha256="41a5159ee1ad5fc411fe6220f37bd754e26d3883c24c0f2378f50ef628bc1b8f") @@ -78,6 +79,7 @@ class Rocwmma(CMakePackage): "5.6.1", "5.7.0", "5.7.1", + "6.0.0", ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) depends_on("llvm-amdgpu@" + ver, type="build", when="@" + ver) @@ -85,7 +87,7 @@ class Rocwmma(CMakePackage): depends_on("rocblas@" + ver, type="build", when="@" + ver) depends_on("rocm-openmp-extras@" + ver, type="build", when="@" + ver) - for ver in ["5.6.0", "5.6.1", "5.7.0", "5.7.1"]: + for ver in ["5.6.0", "5.6.1", "5.7.0", "5.7.1", "6.0.0"]: depends_on("rocm-smi-lib@" + ver, when="@" + ver) for tgt in itertools.chain(["auto"], amdgpu_targets): diff --git a/var/spack/repos/builtin/packages/rpp/0003-include-half-through-spack-package.patch b/var/spack/repos/builtin/packages/rpp/0003-include-half-through-spack-package.patch new file mode 100644 index 0000000000..2e7e08c2ac --- /dev/null +++ b/var/spack/repos/builtin/packages/rpp/0003-include-half-through-spack-package.patch @@ -0,0 +1,61 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 137896e..ca82e98 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -129,6 +129,9 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
+
+ # OpenMP
+ find_package(OpenMP REQUIRED)
++find_path(HALF_INCLUDE_DIR half.hpp)
++message(STATUS "HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")
++
+ if(APPLE)
+ if(CMAKE_C_COMPILER_ID MATCHES "Clang")
+ set(OpenMP_C "${CMAKE_C_COMPILER}")
+@@ -278,6 +281,7 @@ target_include_directories(${PROJECT_NAME}
+ PUBLIC
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ ${ROCM_PATH}/include
++ ${HALF_INCLUDE_DIR}
+ PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/include/cpu
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/include/common
+diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt
+index 2a64d77..80c5686 100644
+--- a/src/modules/CMakeLists.txt
++++ b/src/modules/CMakeLists.txt
+@@ -81,6 +81,8 @@ if("${TIME_INFO}" STREQUAL "1")
+ endif()
+
+ # Backend specific settings
++find_path(HALF_INCLUDE_DIR half.hpp)
++message(STATUS "HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")
+
+ if( "${BACKEND}" STREQUAL "HIP")
+ # Add HIP kernels
+@@ -99,7 +101,7 @@ if( "${BACKEND}" STREQUAL "HIP")
+ # Add HIP specific includes
+ set(ROCM_INC ${ROCM_PATH}/include/)
+ list(APPEND HIP_LOCAL_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/src/include/hip/ ${CMAKE_SOURCE_DIR}/src/include/common/)
+- set(INCLUDE_LIST ${ROCM_INC} ${HIP_LOCAL_INCLUDE_DIRS} ${INCLUDE_LIST})
++ set(INCLUDE_LIST ${ROCM_INC} ${HIP_LOCAL_INCLUDE_DIRS} ${INCLUDE_LIST} ${HALF_INCLUDE_DIR})
+ elseif( "${BACKEND}" STREQUAL "OCL")
+ # Add OpenCL kernels
+ file(GLOB MOD_CL_CPP "cl/*.cpp" )
+@@ -114,7 +116,7 @@ elseif( "${BACKEND}" STREQUAL "OCL")
+ # Add OpenCL specific includes
+ set(ROCM_INC ${ROCM_PATH}/include/)
+ list(APPEND OCL_LOCAL_INCLUDE_LIST ${CMAKE_SOURCE_DIR}/src/include/cl/ ${CMAKE_SOURCE_DIR}/src/include/common/)
+- set(INCLUDE_LIST ${ROCM_INC} ${OCL_LOCAL_INCLUDE_LIST} ${INCLUDE_LIST})
++ set(INCLUDE_LIST ${ROCM_INC} ${OCL_LOCAL_INCLUDE_LIST} ${INCLUDE_LIST} ${HALF_INCLUDE_DIR})
+ elseif( "${BACKEND}" STREQUAL "CPU")
+ # Add CPU specific includes
+ set(INCLUDE_LIST ${CMAKE_SOURCE_DIR}/src/include/common/)
+@@ -136,6 +138,7 @@ target_include_directories( ${PROJECT_NAME}
+ PUBLIC
+ ${CMAKE_SOURCE_DIR}/include
+ ${ROCM_INC}
++ ${HALF_INCLUDE_DIR}
+ PRIVATE
+ ${CMAKE_SOURCE_DIR}/src/include/cpu
+ ${CMAKE_SOURCE_DIR}/src/include/common
\ No newline at end of file diff --git a/var/spack/repos/builtin/packages/rpp/package.py b/var/spack/repos/builtin/packages/rpp/package.py index 116fa90328..7049b342cd 100644 --- a/var/spack/repos/builtin/packages/rpp/package.py +++ b/var/spack/repos/builtin/packages/rpp/package.py @@ -29,6 +29,7 @@ class Rpp(CMakePackage): license("MIT") + version("6.0.0", sha256="3626a648bc773520f5cd5ca15f494de6e74b422baf32491750ce0737c3367f15") version("5.7.1", sha256="36fff5f1c52d969c3e2e0c75b879471f731770f193c9644aa6ab993fb8fa4bbf") version("5.7.0", sha256="1c612cde3c3d3840ae75ee5c1ee59bd8d61b1fdbf84421ae535cda863470fc06") version("1.2.0", sha256="660a11e1bd8706967835597b26daa874fd1507459bfebe22818149444bec540c") @@ -54,8 +55,9 @@ class Rpp(CMakePackage): description="add utilities folder which contains rpp unit tests", ) - patch("0001-include-half-openmp-through-spack-package.patch") + patch("0001-include-half-openmp-through-spack-package.patch", when="@:5.7") patch("0002-declare-handle-in-header.patch") + patch("0003-include-half-through-spack-package.patch", when="@6.0:") # adds half.hpp include directory and modifies how the libjpegturbo # library is linked for the rpp unit test @@ -118,7 +120,11 @@ class Rpp(CMakePackage): conflicts("+opencl+hip") with when("+hip"): - depends_on("hip@5:") + with when("@5.7:"): + for ver in ["5.7.0", "5.7.1", "6.0.0"]: + depends_on("hip@" + ver, when="@" + ver) + with when("@:1.2"): + depends_on("hip@5:") with when("~hip"): depends_on("rocm-opencl@5:") diff --git a/var/spack/repos/builtin/packages/sundials/Change-HIP_PLATFORM-from-HCC-to-AMD-and-NVCC-to-NVIDIA.patch b/var/spack/repos/builtin/packages/sundials/Change-HIP_PLATFORM-from-HCC-to-AMD-and-NVCC-to-NVIDIA.patch new file mode 100644 index 0000000000..4dd9dc7a47 --- /dev/null +++ b/var/spack/repos/builtin/packages/sundials/Change-HIP_PLATFORM-from-HCC-to-AMD-and-NVCC-to-NVIDIA.patch @@ -0,0 +1,28 @@ +From d4afbed86fc4f9925e55367267b3796a522ba5d5 Mon Sep 17 00:00:00 2001 +From: 
Renjith Ravindran <Renjith.RavindranKannath@amd.com> +Date: Sun, 14 Jan 2024 10:20:21 +0000 +Subject: [PATCH] Change HIP_PLATFORM from HCC to AMD and NVCC to NVIDIA + +--- + include/sundials/sundials_hip_policies.hpp | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/include/sundials/sundials_hip_policies.hpp b/include/sundials/sundials_hip_policies.hpp +index d759bbc..f6dfe41 100644 +--- a/include/sundials/sundials_hip_policies.hpp ++++ b/include/sundials/sundials_hip_policies.hpp +@@ -30,9 +30,9 @@ namespace sundials + namespace hip + { + +-#if defined(__HIP_PLATFORM_HCC__) ++#if defined(__HIP_PLATFORM_AMD__) + constexpr const sunindextype WARP_SIZE = 64; +-#elif defined(__HIP_PLATFORM_NVCC__) ++#elif defined(__HIP_PLATFORM_NVIDIA__) + constexpr const sunindextype WARP_SIZE = 32; + #endif + constexpr const sunindextype MAX_BLOCK_SIZE = 1024; +-- +2.39.3 + diff --git a/var/spack/repos/builtin/packages/sundials/package.py b/var/spack/repos/builtin/packages/sundials/package.py index 48f5ec65d7..3a906e6c2c 100644 --- a/var/spack/repos/builtin/packages/sundials/package.py +++ b/var/spack/repos/builtin/packages/sundials/package.py @@ -285,6 +285,10 @@ class Sundials(CMakePackage, CudaPackage, ROCmPackage): # https://github.com/spack/spack/issues/29526 patch("nvector-pic.patch", when="@6.1.0:6.2.0 +rocm") + # Backward compatibility is stopped from ROCm 6.0 + # Need to follow the changes similar to PR https://github.com/LLNL/RAJA/pull/1568 + patch("Change-HIP_PLATFORM-from-HCC-to-AMD-and-NVCC-to-NVIDIA.patch", when="^hip@6.0.0 +rocm") + # remove OpenMP header file and function from hypre vector test code patch("test_nvector_parhyp.patch", when="@2.7.0:3.0.0") patch("FindPackageMultipass.cmake.patch", when="@5.0.0") diff --git a/var/spack/repos/builtin/packages/trilinos/0001-use-the-gcnArchName-inplace-of-gcnArch-as-gcnArch-is.patch b/var/spack/repos/builtin/packages/trilinos/0001-use-the-gcnArchName-inplace-of-gcnArch-as-gcnArch-is.patch new file 
mode 100644 index 0000000000..ea2b8b98a4 --- /dev/null +++ b/var/spack/repos/builtin/packages/trilinos/0001-use-the-gcnArchName-inplace-of-gcnArch-as-gcnArch-is.patch @@ -0,0 +1,26 @@ +From e7fa7ea37423d3d17d77334ac849c5df00feb20e Mon Sep 17 00:00:00 2001 +From: sreenivasa murthy kolam <sreenivasamurthy.kolam@amd.com> +Date: Tue, 16 Jan 2024 10:09:34 +0000 +Subject: [PATCH] use the gcnArchName inplace of gcnArch as gcnArch is + deprecated from rocm-6.0.0 + +--- + packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +index 7840ad9..882d143 100644 +--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp ++++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +@@ -86,7 +86,7 @@ void HIPInternal::print_configuration(std::ostream &s) const { + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i)); + + s << "Kokkos::HIP[ " << i << " ] " +- << "gcnArch " << hipProp.gcnArch << ", Total Global Memory: " ++ << "gcnArchName " << hipProp.gcnArchName << ", Total Global Memory: " + << ::Kokkos::Impl::human_memory_size(hipProp.totalGlobalMem) + << ", Shared Memory per Block: " + << ::Kokkos::Impl::human_memory_size(hipProp.sharedMemPerBlock); +-- +2.39.3 + diff --git a/var/spack/repos/builtin/packages/trilinos/package.py b/var/spack/repos/builtin/packages/trilinos/package.py index d1de74f11c..e015bb7f4e 100644 --- a/var/spack/repos/builtin/packages/trilinos/package.py +++ b/var/spack/repos/builtin/packages/trilinos/package.py @@ -489,6 +489,11 @@ class Trilinos(CMakePackage, CudaPackage, ROCmPackage): # workaround an NVCC bug with c++14 (https://github.com/trilinos/Trilinos/issues/6954) # avoid calling deprecated functions with CUDA-11 patch("fix_cxx14_cuda11.patch", when="@13.0.0:13.0.1 cxxstd=14 ^cuda@11:") + patch( + "0001-use-the-gcnArchName-inplace-of-gcnArch-as-gcnArch-is.patch", + 
when="@15.0.0 ^hip@6.0.0 +rocm", + ) + # Allow building with +teko gotype=long patch( "https://github.com/trilinos/Trilinos/commit/b17f20a0b91e0b9fc5b1b0af3c8a34e2a4874f3f.patch?full_index=1", |