summary refs log tree commit diff
diff options
context:
space:
mode:
author renjithravindrankannath <94420380+renjithravindrankannath@users.noreply.github.com> 2023-10-02 11:36:51 -0700
committer GitHub <noreply@github.com> 2023-10-02 11:36:51 -0700
commit 615312fceeae6402b6aa01cd42fde8511a5fb284 (patch)
tree 4fbd0f03c107a21f746406e47e0d595f73ae9353
parent 453625014da8a471a34cc10b0c18ddd0e141f130 (diff)
download spack-615312fceeae6402b6aa01cd42fde8511a5fb284.tar.gz
spack-615312fceeae6402b6aa01cd42fde8511a5fb284.tar.bz2
spack-615312fceeae6402b6aa01cd42fde8511a5fb284.tar.xz
spack-615312fceeae6402b6aa01cd42fde8511a5fb284.zip
Rocm 5.6.0 & 5.6.1 release updates (#39673)
* 5.6.0 updates
* ROCm 5.6.0 updates
* Style and audit corrections for 5.6
* Patching smi path for tests.
* Style correction
* 5.6.1 updates
* Updated hip tests for CI build failure. Updated hiprand with the release tag. Addressed the review comment on rocsolver.
* Adding rocm-smi path for 5.6
* Adding the patch file
* Setting the library directory uniformly
* gl depends on mesa, but it should not be the llvm variant
* Fix for issue 39520 by setting CMAKE_INSTALL_LIBDIR=lib
* i1 muls can sometimes happen after SCEV. They resulted in ISel failures because we were missing the patterns for them.
* 5.6.0 & 5.6.1 updates for migraphx, miopen-hip, mivisionx
* Revert "5.6.0 & 5.6.1 updates for migraphx, miopen-hip, mivisionx". This reverts commit f54c9c6c67a4e5a54859f59d6550eb8e542d6c26.
* Revert operator mixup fix
* Splitting compiler-rt-linkage-for-host and operator mixup patch
* Adding missing patch for reverting operator mixup
* 5.6 update for composable-kernel, migraphx, miopen-hip and mivisionx
* Updating rvs, rdc and rccl for 5.6.1. Adding comment for llvm patch
-rw-r--r-- var/spack/repos/builtin/packages/comgr/package.py 6
-rw-r--r-- var/spack/repos/builtin/packages/composable-kernel/package.py 4
-rw-r--r-- var/spack/repos/builtin/packages/hip-rocclr/package.py 6
-rw-r--r-- var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.5.6.0.patch 75
-rw-r--r-- var/spack/repos/builtin/packages/hip/0015-reverting-operator-mixup-fix-for-slate.patch 107
-rw-r--r-- var/spack/repos/builtin/packages/hip/package.py 134
-rw-r--r-- var/spack/repos/builtin/packages/hipblas/package.py 4
-rw-r--r-- var/spack/repos/builtin/packages/hipcub/package.py 5
-rw-r--r-- var/spack/repos/builtin/packages/hipfft/package.py 5
-rw-r--r-- var/spack/repos/builtin/packages/hipfort/package.py 5
-rw-r--r-- var/spack/repos/builtin/packages/hipify-clang/0002-install-hipify-clang-in-bin-dir-and-llvm-clangs-head.patch 13
-rw-r--r-- var/spack/repos/builtin/packages/hipify-clang/package.py 9
-rw-r--r-- var/spack/repos/builtin/packages/hiprand/package.py 4
-rw-r--r-- var/spack/repos/builtin/packages/hipsolver/package.py 5
-rw-r--r-- var/spack/repos/builtin/packages/hipsparse/package.py 4
-rw-r--r-- var/spack/repos/builtin/packages/hsa-rocr-dev/package.py 8
-rw-r--r-- var/spack/repos/builtin/packages/hsakmt-roct/package.py 4
-rw-r--r-- var/spack/repos/builtin/packages/llvm-amdgpu/001-Add-i1-mul-patterns.patch 2842
-rw-r--r-- var/spack/repos/builtin/packages/llvm-amdgpu/package.py 12
-rw-r--r-- var/spack/repos/builtin/packages/llvm/package.py 2
-rw-r--r-- var/spack/repos/builtin/packages/migraphx/0005-Adding-half-include-directory-path-migraphx.patch 48
-rw-r--r-- var/spack/repos/builtin/packages/migraphx/package.py 20
-rw-r--r-- var/spack/repos/builtin/packages/miopen-hip/package.py 17
-rw-r--r-- var/spack/repos/builtin/packages/mivisionx/package.py 11
-rw-r--r-- var/spack/repos/builtin/packages/rccl/package.py 11
-rw-r--r-- var/spack/repos/builtin/packages/rdc/package.py 8
-rw-r--r-- var/spack/repos/builtin/packages/rocalution/package.py 5
-rw-r--r-- var/spack/repos/builtin/packages/rocblas/package.py 8
-rw-r--r-- var/spack/repos/builtin/packages/rocfft/package.py 4
-rw-r--r-- var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py 6
-rw-r--r-- var/spack/repos/builtin/packages/rocm-clang-ocl/package.py 6
-rw-r--r-- var/spack/repos/builtin/packages/rocm-cmake/package.py 6
-rw-r--r-- var/spack/repos/builtin/packages/rocm-core/package.py 2
-rw-r--r-- var/spack/repos/builtin/packages/rocm-dbgapi/package.py 6
-rw-r--r-- var/spack/repos/builtin/packages/rocm-debug-agent/package.py 8
-rw-r--r-- var/spack/repos/builtin/packages/rocm-device-libs/package.py 6
-rw-r--r-- var/spack/repos/builtin/packages/rocm-gdb/package.py 6
-rw-r--r-- var/spack/repos/builtin/packages/rocm-opencl/package.py 8
-rw-r--r-- var/spack/repos/builtin/packages/rocm-openmp-extras/package.py 18
-rw-r--r-- var/spack/repos/builtin/packages/rocm-smi-lib/package.py 6
-rw-r--r-- var/spack/repos/builtin/packages/rocm-tensile/package.py 8
-rw-r--r-- var/spack/repos/builtin/packages/rocm-validation-suite/007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.6.patch 532
-rw-r--r-- var/spack/repos/builtin/packages/rocm-validation-suite/package.py 10
-rw-r--r-- var/spack/repos/builtin/packages/rocminfo/package.py 6
-rw-r--r-- var/spack/repos/builtin/packages/rocprim/package.py 4
-rw-r--r-- var/spack/repos/builtin/packages/rocrand/package.py 4
-rw-r--r-- var/spack/repos/builtin/packages/rocsolver/package.py 8
-rw-r--r-- var/spack/repos/builtin/packages/rocsparse/package.py 5
-rw-r--r-- var/spack/repos/builtin/packages/rocthrust/package.py 4
-rw-r--r-- var/spack/repos/builtin/packages/roctracer-dev-api/package.py 2
-rw-r--r-- var/spack/repos/builtin/packages/roctracer-dev/package.py 6
-rw-r--r-- var/spack/repos/builtin/packages/rocwmma/0001-add-rocm-smi-lib-path-for-building-tests.patch 31
-rw-r--r-- var/spack/repos/builtin/packages/rocwmma/package.py 24
53 files changed, 4047 insertions, 61 deletions
diff --git a/var/spack/repos/builtin/packages/comgr/package.py b/var/spack/repos/builtin/packages/comgr/package.py
index 93a5fcf740..a17bcc7e94 100644
--- a/var/spack/repos/builtin/packages/comgr/package.py
+++ b/var/spack/repos/builtin/packages/comgr/package.py
@@ -21,6 +21,8 @@ class Comgr(CMakePackage):
libraries = ["libamd_comgr"]
version("master", branch="amd-stg-open")
+ version("5.6.1", sha256="0a85d84619f98be26ca7a32c71f94ed3c4e9866133789eabb451be64ce739300")
+ version("5.6.0", sha256="9396a7238b547ee68146c669b10b9d5de8f1d76527c649133c75d8076a185a72")
version("5.5.1", sha256="0fbb15fe5a95c2e141ccd360bc413e1feda283334781540a6e5095ab27fd8019")
version("5.5.0", sha256="97dfff03226ce0902b9d5d1c8c7bebb7a15978a81b6e9c750bf2d2473890bd42")
version("5.4.3", sha256="8af18035550977fe0aa9cca8dfacbe65fe292e971de5a0e160710bafda05a81f")
@@ -142,6 +144,8 @@ class Comgr(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
"master",
]:
# llvm libs are linked statically, so this *could* be a build dep
@@ -153,7 +157,7 @@ class Comgr(CMakePackage):
"rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver)
)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
root_cmakelists_dir = join_path("lib", "comgr")
diff --git a/var/spack/repos/builtin/packages/composable-kernel/package.py b/var/spack/repos/builtin/packages/composable-kernel/package.py
index 57bfc6a17c..efa05197f9 100644
--- a/var/spack/repos/builtin/packages/composable-kernel/package.py
+++ b/var/spack/repos/builtin/packages/composable-kernel/package.py
@@ -17,6 +17,8 @@ class ComposableKernel(CMakePackage):
maintainers("srekolam", "afzpatel")
version("master", branch="develop")
+ version("5.6.1", commit="f5ec04f091fa5c48c67d7bacec36a414d0be06a5")
+ version("5.6.0", commit="f0fd02634c2f8f8c70f5a0ab2a8c84db5e36eeca")
version("5.5.1", commit="ac9e01e2cc3721be24619807adc444e1f59a9d25")
version("5.5.0", commit="8b76b832420a3d69708401de6607a033163edcce")
version("5.4.3", commit="bb3d9546f186e39cefedc3e7f01d88924ba20168")
@@ -40,7 +42,7 @@ class ComposableKernel(CMakePackage):
depends_on("pkgconfig", type="build")
depends_on("cmake@3.16:", type="build")
- for ver in ["master", "5.5.1", "5.5.0", "5.4.3", "5.4.0"]:
+ for ver in ["master", "5.6.1", "5.6.0", "5.5.1", "5.5.0", "5.4.3", "5.4.0"]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("llvm-amdgpu@" + ver, when="@" + ver)
depends_on("rocm-cmake@" + ver, when="@" + ver, type="build")
diff --git a/var/spack/repos/builtin/packages/hip-rocclr/package.py b/var/spack/repos/builtin/packages/hip-rocclr/package.py
index 3c2239d179..e6a4b3bbdf 100644
--- a/var/spack/repos/builtin/packages/hip-rocclr/package.py
+++ b/var/spack/repos/builtin/packages/hip-rocclr/package.py
@@ -27,6 +27,8 @@ class HipRocclr(CMakePackage):
return url.format(version)
version("master", branch="main")
+ version("5.6.1", sha256="cc9a99c7e4de3d9360c0a471b27d626e84a39c9e60e0aff1e8e1500d82391819")
+ version("5.6.0", sha256="864f87323e793e60b16905284fba381a7182b960dd4a37fb67420c174442c03c")
version("5.5.1", sha256="1375fc7723cfaa0ae22a78682186d4804188b0a54990bfd9c0b8eb421b85e37e")
version("5.5.0", sha256="efbae9a1ef2ab3de5ca44091e9bb78522e76759c43524c1349114f9596cc61d1")
version("5.4.3", sha256="71d9668619ab57ec8a4564d11860438c5aad5bd161a3e58fbc49555fbd59182d")
@@ -140,6 +142,8 @@ class HipRocclr(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
"master",
]:
depends_on("hsakmt-roct@" + ver, when="@" + ver)
@@ -162,6 +166,8 @@ class HipRocclr(CMakePackage):
# Add opencl sources thru the below
for d_version, d_shasum in [
+ ("5.6.1", "ec26049f7d93c95050c27ba65472736665ec7a40f25920a868616b2970f6b845"),
+ ("5.6.0", "52ab260d00d279c2a86c353901ffd88ee61b934ad89e9eb480f210656705f04e"),
("5.5.1", "a8a62a7c6fc5398406d2203b8cb75621a24944688e545d917033d87de2724498"),
("5.5.0", "0df9fa0b8aa0c8e6711d34eec0fdf1ed356adcd9625bc8f1ce9b3e72090f3e4f"),
("5.4.3", "b0f8339c844a2e62773bd85cd1e7c5ecddfe71d7c8e8d604e1a1d60900c30873"),
diff --git a/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.5.6.0.patch b/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.5.6.0.patch
new file mode 100644
index 0000000000..dfca3691f1
--- /dev/null
+++ b/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.5.6.0.patch
@@ -0,0 +1,75 @@
+From cd4283eab943a3018237035afea61f1b5e0042cd Mon Sep 17 00:00:00 2001
+From: Renjith Ravindran <Renjith.RavindranKannath@amd.com>
+Date: Wed, 27 Sep 2023 06:38:18 +0000
+Subject: [PATCH] Remove-compiler-rt-linkage-for-host
+
+---
+ clr/hipamd/CMakeLists.txt | 6 ++++--
+ clr/hipamd/hip-config.cmake.in | 1 -
+ hipcc/bin/hipcc.pl | 11 ++++++++---
+ 3 files changed, 12 insertions(+), 6 deletions(-)
+
+diff --git a/clr/hipamd/CMakeLists.txt b/clr/hipamd/CMakeLists.txt
+index c14a9ad..ca49f7f 100755
+--- a/clr/hipamd/CMakeLists.txt
++++ b/clr/hipamd/CMakeLists.txt
+@@ -400,8 +400,10 @@ if (NOT ${HIPCC_BIN_DIR} STREQUAL "")
+ install(PROGRAMS ${HIPCC_BIN_DIR}/hipcc.pl DESTINATION bin)
+ install(PROGRAMS ${HIPCC_BIN_DIR}/hipconfig.pl DESTINATION bin)
+ install(PROGRAMS ${HIPCC_BIN_DIR}/hipvars.pm DESTINATION bin)
+- install(PROGRAMS ${HIPCC_BIN_DIR}/hipcc.bat DESTINATION bin)
+- install(PROGRAMS ${HIPCC_BIN_DIR}/hipconfig.bat DESTINATION bin)
++ if(WIN32)
++ install(PROGRAMS ${HIPCC_BIN_DIR}/hipcc.bat DESTINATION bin)
++ install(PROGRAMS ${HIPCC_BIN_DIR}/hipconfig.bat DESTINATION bin)
++ endif()
+ endif()
+
+ #############################
+diff --git a/clr/hipamd/hip-config.cmake.in b/clr/hipamd/hip-config.cmake.in
+index 537a599..7d10273 100755
+--- a/clr/hipamd/hip-config.cmake.in
++++ b/clr/hipamd/hip-config.cmake.in
+@@ -245,7 +245,6 @@ if(HIP_COMPILER STREQUAL "clang")
+ # Add support for __fp16 and _Float16, explicitly link with compiler-rt
+ if( "${CLANGRT_BUILTINS_FETCH_EXIT_CODE}" STREQUAL "0" )
+ # CLANG_RT Builtins found Successfully Set interface link libraries property
+- set_property(TARGET hip::host APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}")
+ set_property(TARGET hip::device APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}")
+ else()
+ message(STATUS "clangrt builtins lib not found: ${CLANGRT_BUILTINS_FETCH_EXIT_CODE}")
+diff --git a/hipcc/bin/hipcc.pl b/hipcc/bin/hipcc.pl
+index 56dcda2..c7ae60b 100755
+--- a/hipcc/bin/hipcc.pl
++++ b/hipcc/bin/hipcc.pl
+@@ -155,11 +155,15 @@ if ($HIP_PLATFORM eq "amd") {
+ if($isWindows) {
+ $execExtension = ".exe";
+ }
+- $HIPCC="$HIP_CLANG_PATH/clang++" . $execExtension;
++ # llvm_path is set inside the hip recipe
++ $LLVM_PATH= $ENV{'LLVM_PATH'};
++ $HIPCC="${LLVM_PATH}/bin/clang++" . $execExtension;
++
+
+ # If $HIPCC clang++ is not compiled, use clang instead
+ if ( ! -e $HIPCC ) {
+- $HIPCC="$HIP_CLANG_PATH/clang" . $execExtension;
++ $LLVM_PATH= $ENV{'LLVM_PATH'};
++ $HIPCC="${LLVM_PATH}/bin/clang" . $execExtension;
+ $HIPLDFLAGS = "--driver-mode=g++";
+ }
+ # to avoid using dk linker or MSVC linker
+@@ -483,7 +487,8 @@ if($HIP_PLATFORM eq "amd"){
+ $targetsStr = $ENV{HCC_AMDGPU_TARGET};
+ } elsif (not $isWindows) {
+ # Else try using rocm_agent_enumerator
+- $ROCM_AGENT_ENUM = "${ROCM_PATH}/bin/rocm_agent_enumerator";
++ $ROCMINFO_PATH = $ENV{'ROCMINFO_PATH'} // $ROCMINFO_PATH;
++ $ROCM_AGENT_ENUM = "${ROCMINFO_PATH}/bin/rocm_agent_enumerator";
+ $targetsStr = `${ROCM_AGENT_ENUM} -t GPU`;
+ $targetsStr =~ s/\n/,/g;
+ }
+--
+2.31.1
+
diff --git a/var/spack/repos/builtin/packages/hip/0015-reverting-operator-mixup-fix-for-slate.patch b/var/spack/repos/builtin/packages/hip/0015-reverting-operator-mixup-fix-for-slate.patch
new file mode 100644
index 0000000000..36bfadfe94
--- /dev/null
+++ b/var/spack/repos/builtin/packages/hip/0015-reverting-operator-mixup-fix-for-slate.patch
@@ -0,0 +1,107 @@
+From 1d7f7eb9a52af2b83d3cb06bb4fe0f31eb47ce7f Mon Sep 17 00:00:00 2001
+From: Renjith Ravindran <Renjith.RavindranKannath@amd.com>
+Date: Wed, 27 Sep 2023 07:07:01 +0000
+Subject: [PATCH] Reverting operator mixup fix for slate
+
+---
+ .../include/hip/amd_detail/amd_hip_complex.h | 17 ++++------
+ .../hip/amd_detail/amd_hip_vector_types.h | 31 +++++++++++--------
+ 2 files changed, 24 insertions(+), 24 deletions(-)
+
+diff --git a/clr/hipamd/include/hip/amd_detail/amd_hip_complex.h b/clr/hipamd/include/hip/amd_detail/amd_hip_complex.h
+index 9d9dfd5..eba6eb5 100644
+--- a/clr/hipamd/include/hip/amd_detail/amd_hip_complex.h
++++ b/clr/hipamd/include/hip/amd_detail/amd_hip_complex.h
+@@ -106,20 +106,15 @@ THE SOFTWARE.
+ return lhs; \
+ }
+
+-#define COMPLEX_MUL_PREOP_OVERLOAD(type) \
+- __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \
+- type temp{lhs}; \
+- lhs.x = rhs.x * temp.x - rhs.y * temp.y; \
+- lhs.y = rhs.y * temp.x + rhs.x * temp.y; \
+- return lhs; \
++#define COMPLEX_MUL_PREOP_OVERLOAD(type) \
++ __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \
++ lhs = lhs * rhs; \
++ return lhs; \
+ }
+
+ #define COMPLEX_DIV_PREOP_OVERLOAD(type) \
+- __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \
+- type temp; \
+- temp.x = (lhs.x*rhs.x + lhs.y * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \
+- temp.y = (lhs.y * rhs.x - lhs.x * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \
+- lhs = temp; \
++ __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \
++ lhs = lhs / rhs; \
+ return lhs; \
+ }
+
+diff --git a/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h b/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h
+index 8215fb0..dfd3b39 100644
+--- a/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h
++++ b/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h
+@@ -544,13 +544,6 @@ template <typename __T> struct is_scalar : public integral_constant<bool, __is_s
+ data *= x.data;
+ return *this;
+ }
+-
+- friend __HOST_DEVICE__ inline constexpr HIP_vector_type operator*(
+- HIP_vector_type x, const HIP_vector_type& y) noexcept
+- {
+- return HIP_vector_type{ x } *= y;
+- }
+-
+ template<
+ typename U,
+ typename std::enable_if<
+@@ -561,12 +554,6 @@ template <typename __T> struct is_scalar : public integral_constant<bool, __is_s
+ return *this *= HIP_vector_type{x};
+ }
+
+- friend __HOST_DEVICE__ inline constexpr HIP_vector_type operator/(
+- HIP_vector_type x, const HIP_vector_type& y) noexcept
+- {
+- return HIP_vector_type{ x } /= y;
+- }
+-
+ __HOST_DEVICE__
+ HIP_vector_type& operator/=(const HIP_vector_type& x) noexcept
+ {
+@@ -722,6 +709,15 @@ template <typename __T> struct is_scalar : public integral_constant<bool, __is_s
+ return HIP_vector_type<T, n>{x} -= y;
+ }
+
++ template<typename T, unsigned int n>
++ __HOST_DEVICE__
++ inline
++ constexpr
++ HIP_vector_type<T, n> operator*(
++ const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
++ {
++ return HIP_vector_type<T, n>{x} *= y;
++ }
+ template<typename T, unsigned int n, typename U>
+ __HOST_DEVICE__
+ inline
+@@ -741,6 +737,15 @@ template <typename __T> struct is_scalar : public integral_constant<bool, __is_s
+ return HIP_vector_type<T, n>{x} *= y;
+ }
+
++ template<typename T, unsigned int n>
++ __HOST_DEVICE__
++ inline
++ constexpr
++ HIP_vector_type<T, n> operator/(
++ const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
++ {
++ return HIP_vector_type<T, n>{x} /= y;
++ }
+ template<typename T, unsigned int n, typename U>
+ __HOST_DEVICE__
+ inline
+--
+2.31.1
+
diff --git a/var/spack/repos/builtin/packages/hip/package.py b/var/spack/repos/builtin/packages/hip/package.py
index 2bf04a1983..5e1d6744cb 100644
--- a/var/spack/repos/builtin/packages/hip/package.py
+++ b/var/spack/repos/builtin/packages/hip/package.py
@@ -25,6 +25,8 @@ class Hip(CMakePackage):
libraries = ["libamdhip64"]
version("master", branch="master")
+ version("5.6.1", sha256="4b3c4dfcf8595da0e1b8c3e8067b1ccebeaac337762ff098db14375fa8dd4487")
+ version("5.6.0", sha256="a8237768c1ae70029d972376f8d279f4de18a1e6106fff6215d1e16847bc375e")
version("5.5.1", sha256="1f5f6bb72d8d64335ccc8242ef2e2ea8efeb380cce2997f475b1ee77528d9fb4")
version("5.5.0", sha256="5b0d0253e62f85cc21d043513f7c11c64e4a4ec416159668f0b160d732d09a3c")
version("5.4.3", sha256="23e51d3af517cd63019f8d199e46b84d5a18251d148e727f3985e8d99ccb0e58")
@@ -162,6 +164,8 @@ class Hip(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hsakmt-roct@" + ver, when="@" + ver)
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
@@ -170,10 +174,10 @@ class Hip(CMakePackage):
depends_on("rocminfo@" + ver, when="@" + ver)
depends_on("roctracer-dev-api@" + ver, when="@" + ver)
- for ver in ["5.4.0", "5.4.3", "5.5.0", "5.5.1"]:
+ for ver in ["5.4.0", "5.4.3", "5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("hipify-clang", when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
# hipcc likes to add `-lnuma` by default :(
# ref https://github.com/ROCm-Developer-Tools/HIP/pull/2202
@@ -269,6 +273,55 @@ class Hip(CMakePackage):
placement="rocclr",
when="@{0}".format(d_version),
)
+ # Add hip-clr sources thru the below
+ for d_version, d_shasum in [
+ ("5.6.1", "0b88af1e99643899d11b1c8cf8a3c46601051b328a5e0ffbd44ee88b7eb0db33"),
+ ("5.6.0", "8dcd99110737a294f67a805639cf372890c8ca16c7603caaa793e71e84478fe4"),
+ ]:
+ resource(
+ name="clr",
+ url="https://github.com/ROCm-Developer-Tools/clr/archive/refs/tags/rocm-{0}.tar.gz".format(
+ d_version
+ ),
+ sha256=d_shasum,
+ expand=True,
+ destination="",
+ placement="clr",
+ when="@{0}".format(d_version),
+ )
+
+ # Add hipcc sources thru the below
+ for d_version, d_shasum in [
+ ("5.6.1", "5800fac92b841ef6f52acda78d9bf86f83970bec0fb848a6265d239bdb7eb51a"),
+ ("5.6.0", "fdb7fdc9e4648376120330f034ee8353038d34c8a015f9eb0c208c56eeddd097"),
+ ]:
+ resource(
+ name="hipcc",
+ url="https://github.com/ROCm-Developer-Tools/HIPCC/archive/refs/tags/rocm-{0}.tar.gz".format(
+ d_version
+ ),
+ sha256=d_shasum,
+ expand=True,
+ destination="",
+ placement="hipcc",
+ when="@{0}".format(d_version),
+ )
+ # Add hiptests sources thru the below
+ for d_version, d_shasum in [
+ ("5.6.1", "5b3002ddfafda162329e4d9e6ac1200eeb48ff08e666b342aa8aeca30750f48b"),
+ ("5.6.0", "8cf4509bf9c0747dab8ed8fec1365a9156792034b517207a0b2d63270429fd2e"),
+ ]:
+ resource(
+ name="hip-tests",
+ url="https://github.com/ROCm-Developer-Tools/hip-tests/archive/refs/tags/rocm-{0}.tar.gz".format(
+ d_version
+ ),
+ sha256=d_shasum,
+ expand=True,
+ destination="",
+ placement="hip-tests",
+ when="@{0}".format(d_version),
+ )
# Note: the ROCm ecosystem expects `lib/` and `bin/` folders with symlinks
# in the parent directory of the package, which is incompatible with spack.
# In hipcc the ROCM_PATH variable is used to point to the parent directory
@@ -331,10 +384,11 @@ class Hip(CMakePackage):
patch("0005-Disable-tests-4.1.0.patch", when="@4.1.0:4.3.2")
patch("Add_missing_open_cl_header_file_for_4.3.0.patch", when="@4.3.0:4.3.2")
- patch("0014-hip-test-file-reorg-5.4.0.patch", when="@5.4.0:")
- patch("0016-hip-sample-fix-hipMalloc-call.patch", when="@5.4.3:")
+ patch("0014-hip-test-file-reorg-5.4.0.patch", when="@5.4.0:5.5")
+ patch("0016-hip-sample-fix-hipMalloc-call.patch", when="@5.4.3:5.5")
patch("0014-remove-compiler-rt-linkage-for-host.5.5.0.patch", when="@5.5")
-
+ patch("0014-remove-compiler-rt-linkage-for-host.5.6.0.patch", when="@5.6:")
+ patch("0015-reverting-operator-mixup-fix-for-slate.patch", when="@5.6:")
# See https://github.com/ROCm-Developer-Tools/HIP/pull/3206
patch(
"https://github.com/ROCm-Developer-Tools/HIP/commit/50ee82f6bc4aad10908ce09198c9f7ebfb2a3561.patch?full_index=1",
@@ -346,8 +400,10 @@ class Hip(CMakePackage):
def root_cmakelists_dir(self):
if self.spec.satisfies("@:4.3.2"):
return self.stage.source_path
- else:
+ elif self.spec.satisfies("@4.5:5.5"):
return "hipamd"
+ else:
+ return "clr"
def get_paths(self):
if self.spec.external:
@@ -393,6 +449,7 @@ class Hip(CMakePackage):
"llvm-amdgpu": rocm_prefix.llvm,
"hsa-rocr-dev": rocm_prefix.hsa,
"rocminfo": rocm_prefix,
+ "comgr": rocm_prefix,
"rocm-device-libs": rocm_prefix,
}
@@ -405,6 +462,7 @@ class Hip(CMakePackage):
"llvm-amdgpu": self.spec["llvm-amdgpu"].prefix,
"hsa-rocr-dev": self.spec["hsa-rocr-dev"].prefix,
"rocminfo": self.spec["rocminfo"].prefix,
+ "comgr": self.spec["comgr"].prefix,
"rocm-device-libs": self.spec["llvm-amdgpu"].prefix,
}
@@ -476,6 +534,7 @@ class Hip(CMakePackage):
# hiprtcCreateProgram:
# https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/blob/rocm-4.0.0/lib/comgr/src/comgr-env.cpp
env.set("LLVM_PATH", paths["llvm-amdgpu"])
+ env.set("COMGR_PATH", paths["comgr"])
# Finally we have to set --rocm-path=<prefix> ourselves, which is not
# the same as --hip-device-lib-path (set by hipcc). It's used to set
@@ -525,13 +584,20 @@ class Hip(CMakePackage):
"hip-config.cmake.in",
string=True,
)
- if self.spec.satisfies("@5.2: +rocm"):
+ if self.spec.satisfies("@5.2:5.4 +rocm"):
filter_file(
'"${ROCM_PATH}/llvm"',
self.spec["llvm-amdgpu"].prefix,
"hipamd/hip-config.cmake.in",
string=True,
)
+ if self.spec.satisfies("@5.6: +rocm"):
+ filter_file(
+ '"${ROCM_PATH}/llvm"',
+ self.spec["llvm-amdgpu"].prefix,
+ "clr/hipamd/hip-config.cmake.in",
+ string=True,
+ )
perl = self.spec["perl"].command
kwargs = {"ignore_absent": False, "backup": False, "string": False}
@@ -552,13 +618,13 @@ class Hip(CMakePackage):
"roc-obj-ls",
"hipvars.pm",
]
- elif self.spec.satisfies("@4.5.0:"):
+ elif self.spec.satisfies("@4.5.0:5.5"):
files = []
- filter_file(match, substitute, *files, **kwargs)
- # This guy is used during the cmake phase, so we have to fix the
- # shebang already here in case it is too long.
- filter_shebang("hipconfig")
- if self.spec.satisfies("@4.5.0:"):
+ filter_file(match, substitute, *files, **kwargs)
+ # This guy is used during the cmake phase, so we have to fix the
+ # shebang already here in case it is too long.
+ filter_shebang("hipconfig")
+ if self.spec.satisfies("@4.5.0:5.5"):
perl = self.spec["perl"].command
kwargs = {"ignore_absent": False, "backup": False, "string": False}
with working_dir("hipamd/bin"):
@@ -566,6 +632,18 @@ class Hip(CMakePackage):
substitute = "#!{perl}".format(perl=perl)
files = ["roc-obj-extract", "roc-obj-ls"]
filter_file(match, substitute, *files, **kwargs)
+ if self.spec.satisfies("@5.6.0:"):
+ perl = self.spec["perl"].command
+ kwargs = {"ignore_absent": False, "backup": False, "string": False}
+ match = "^#!/usr/bin/perl"
+ substitute = "#!{perl}".format(perl=perl)
+ with working_dir("clr/hipamd/bin"):
+ files = ["roc-obj-extract", "roc-obj-ls"]
+ filter_file(match, substitute, *files, **kwargs)
+ with working_dir("hipcc/bin"):
+ files = []
+ filter_file(match, substitute, *files, **kwargs)
+ filter_shebang("hipconfig")
if "@3.7.0: +rocm" in self.spec:
numactl = self.spec["numactl"].prefix.lib
kwargs = {"ignore_absent": False, "backup": False, "string": False}
@@ -573,7 +651,16 @@ class Hip(CMakePackage):
with working_dir("bin"):
match = " -lnuma"
substitute = " -L{numactl} -lnuma".format(numactl=numactl)
- filter_file(match, substitute, "hipcc", **kwargs)
+ if self.spec.satisfies("@4.5.0:5.5"):
+ filter_file(match, substitute, "hipcc", **kwargs)
+ if "@5.6.0: +rocm" in self.spec:
+ numactl = self.spec["numactl"].prefix.lib
+ kwargs = {"ignore_absent": False, "backup": False, "string": False}
+
+ with working_dir("hipcc/src"):
+ match = " -lnuma"
+ substitute = " -L{numactl} -lnuma".format(numactl=numactl)
+ filter_file(match, substitute, "hipBin_amd.h", **kwargs)
def flag_handler(self, name, flags):
if name == "cxxflags" and self.spec.satisfies("@3.7.0:4.3.2"):
@@ -609,21 +696,30 @@ class Hip(CMakePackage):
if "@4.5.0:" in self.spec:
args.append(self.define("HIP_COMMON_DIR", self.stage.source_path))
args.append(self.define("HIP_CATCH_TEST", "OFF"))
- args.append(self.define("ROCCLR_PATH", self.stage.source_path + "/rocclr"))
- args.append(self.define("AMD_OPENCL_PATH", self.stage.source_path + "/opencl"))
+ if "@4.5.0:5.5" in self.spec:
+ args.append(self.define("ROCCLR_PATH", self.stage.source_path + "rocclr"))
+ args.append(self.define("AMD_OPENCL_PATH", self.stage.source_path + "opencl"))
if "@5.3.0:" in self.spec:
args.append("-DCMAKE_INSTALL_LIBDIR=lib")
-
+ if "@5.6.0:" in self.spec:
+ args.append(self.define("ROCCLR_PATH", self.stage.source_path + "/clr/rocclr"))
+ args.append(self.define("AMD_OPENCL_PATH", self.stage.source_path + "/clr/opencl"))
+ args.append(self.define("HIPCC_BIN_DIR", self.stage.source_path + "/hipcc/bin")),
+ args.append(self.define("CLR_BUILD_HIP", True)),
+ args.append(self.define("CLR_BUILD_OCL", False)),
return args
- test_src_dir = "samples"
-
@run_after("install")
def cache_test_sources(self):
"""Copy the tests source files after the package is installed to an
install test subdirectory for use during `spack test run`."""
if self.spec.satisfies("@:5.1.0"):
return
+ else:
+ if "@:5.5" in self.spec:
+ self.test_src_dir = "samples"
+ else:
+ self.test_src_dir = "hip-tests/samples"
self.cache_extra_test_sources([self.test_src_dir])
def test_samples(self):
diff --git a/var/spack/repos/builtin/packages/hipblas/package.py b/var/spack/repos/builtin/packages/hipblas/package.py
index 08998400ee..b0261bd5db 100644
--- a/var/spack/repos/builtin/packages/hipblas/package.py
+++ b/var/spack/repos/builtin/packages/hipblas/package.py
@@ -22,6 +22,8 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage):
version("develop", branch="develop")
version("master", branch="master")
+ version("5.6.1", sha256="f9da82fbefc68b84081ea0ed0139b91d2a540357fcf505c7f1d57eab01eb327c")
+ version("5.6.0", sha256="9453a31324e10ba528f8f4755d2c270d0ed9baa33e980d8f8383204d8e28a563")
version("5.5.1", sha256="5920c9a9c83cf7e2b42d1f99f5d5091cac7f6c0a040a737e869e57b92d7045a9")
version("5.5.0", sha256="b080c25cb61531228d26badcdca856c46c640035c058bfc1c9f63de65f418cd5")
version("5.4.3", sha256="5acac147aafc15c249c2f24c19459135ed68b506403aa92e602b67cfc10c38b7")
@@ -167,6 +169,8 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
"master",
"develop",
]:
diff --git a/var/spack/repos/builtin/packages/hipcub/package.py b/var/spack/repos/builtin/packages/hipcub/package.py
index 94e6055705..cb878d1823 100644
--- a/var/spack/repos/builtin/packages/hipcub/package.py
+++ b/var/spack/repos/builtin/packages/hipcub/package.py
@@ -15,7 +15,8 @@ class Hipcub(CMakePackage, CudaPackage, ROCmPackage):
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
-
+ version("5.6.1", sha256="4b9479daa40424c9ddbc14ce967aa170680f8ca1ed01a514e6e30ccfa22552ce")
+ version("5.6.0", sha256="5e74ddbf833f39836bf9ec6c6750348c7386a85ca67aaf9bb54d16c9e1959031")
version("5.5.1", sha256="ad83f3f1ed85ead9e3012906957c125a896168be913f6fb6af298228fc571480")
version("5.5.0", sha256="3eec838119326a67eb4cc006c706e328f3a51a01e98bbfb518df8fe4a4707e13")
version("5.4.3", sha256="cf528d9acb4f9b9c3aad439ae76bfc3d02be6e7a74d96099544e5d54e1a23675")
@@ -148,6 +149,8 @@ class Hipcub(CMakePackage, CudaPackage, ROCmPackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("rocprim@" + ver, when="+rocm @" + ver)
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/hipfft/package.py b/var/spack/repos/builtin/packages/hipfft/package.py
index 92e3db29cd..046d908e3e 100644
--- a/var/spack/repos/builtin/packages/hipfft/package.py
+++ b/var/spack/repos/builtin/packages/hipfft/package.py
@@ -22,7 +22,8 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage):
maintainers("renjithravindrankannath", "srekolam")
version("master", branch="master")
-
+ version("5.6.1", sha256="d2ae36b8eacd39b865e8a7972b8eb86bcea2de4ac90711bba7e29b39b01eaa74")
+ version("5.6.0", sha256="c7f425b693caf9371b42226d86392335d993a117d23219b6ba1fd13523cb8261")
version("5.5.1", sha256="3addd15a459752ad657e84c2a7b6b6289600d1d0a5f90d6e0946ba11e8148fc0")
version("5.5.0", sha256="47ec6f7da7346c312b80daaa8f763e86c7bdc33ac8617cfa3344068e5b20dd9e")
version("5.4.3", sha256="ae37f40b6019a11f10646ef193716836f366d269eab3c5cc2ed09af85355b945")
@@ -116,6 +117,8 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
depends_on("rocfft@" + ver, when="+rocm @" + ver)
diff --git a/var/spack/repos/builtin/packages/hipfort/package.py b/var/spack/repos/builtin/packages/hipfort/package.py
index ec1e64ce13..da688d9c1f 100644
--- a/var/spack/repos/builtin/packages/hipfort/package.py
+++ b/var/spack/repos/builtin/packages/hipfort/package.py
@@ -15,7 +15,8 @@ class Hipfort(CMakePackage):
tags = ["rocm"]
maintainers("cgmb", "srekolam", "renjithravindrankannath")
-
+ version("5.6.1", sha256="a55345cc9ccaf0cd69d306b8eb9ec2a02c220a57e9c396443cc7273aa3377adc")
+ version("5.6.0", sha256="03176a099bc81e212ad1bf9d86f35561f8f2d21a2f126732d7620e1ea59888d5")
version("5.5.1", sha256="abc59f7b81cbefbe3555cbf1bf0d80e8aa65901c70799748c40870fe6f3fea60")
version("5.5.0", sha256="cae75ffeac129639cabebfe2f95f254c83d6c0a6cffd98142ea3537a132e42bb")
version("5.4.3", sha256="1954a1cba351d566872ced5549b2ced7ab6332221e2b98dba3c07180dce8f173")
@@ -118,6 +119,8 @@ class Hipfort(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hip@" + ver, type="build", when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/hipify-clang/0002-install-hipify-clang-in-bin-dir-and-llvm-clangs-head.patch b/var/spack/repos/builtin/packages/hipify-clang/0002-install-hipify-clang-in-bin-dir-and-llvm-clangs-head.patch
new file mode 100644
index 0000000000..c2fad6d3f2
--- /dev/null
+++ b/var/spack/repos/builtin/packages/hipify-clang/0002-install-hipify-clang-in-bin-dir-and-llvm-clangs-head.patch
@@ -0,0 +1,13 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 80c8a3f..d2b88c0 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -137,7 +137,7 @@ install(
+ # install all folders under clang/version/ in CMAKE_INSTALL_PREFIX path
+ install(
+ DIRECTORY ${LLVM_DIR}/../../clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}/
+- DESTINATION .
++ DESTINATION ${CMAKE_INSTALL_PREFIX}/include
+ COMPONENT clang-resource-headers
+ FILES_MATCHING
+ PATTERN "*.h"
diff --git a/var/spack/repos/builtin/packages/hipify-clang/package.py b/var/spack/repos/builtin/packages/hipify-clang/package.py
index 65dd8df60f..dd6b99ee71 100644
--- a/var/spack/repos/builtin/packages/hipify-clang/package.py
+++ b/var/spack/repos/builtin/packages/hipify-clang/package.py
@@ -18,6 +18,8 @@ class HipifyClang(CMakePackage):
maintainers("srekolam", "renjithravindrankannath")
version("master", branch="master")
+ version("5.6.1", sha256="ec3a4f276556f9fd924ea3c89be11b6c6ddf999cdd4387f669e38e41ee0042e8")
+ version("5.6.0", sha256="a2572037a7d3bd0813bd6819a5e6c0e911678db5fd3ab15a65370601df91891b")
version("5.5.1", sha256="35b9c07a7afaf9cf6f3bbe9dd147fa81b1b297af3e5e26e60c55629e83feaa48")
version("5.5.0", sha256="1b75c702799ac93027337f8fb61d7c27ba960e8ece60d907fc8c5ab3f15c3fe9")
version("5.4.3", sha256="79e27bd6c0a28e6a62b02dccc0b5d88a81f69fe58487e83f3b7ab47d6b64341b")
@@ -103,7 +105,8 @@ class HipifyClang(CMakePackage):
# the patch was added to install the targets in the correct directory structure
# this will fix the issue https://github.com/spack/spack/issues/30711
- patch("0001-install-hipify-clang-in-bin-dir-and-llvm-clangs-head.patch", when="@5.1.0:")
+ patch("0001-install-hipify-clang-in-bin-dir-and-llvm-clangs-head.patch", when="@5.1.0:5.5")
+ patch("0002-install-hipify-clang-in-bin-dir-and-llvm-clangs-head.patch", when="@5.6:")
depends_on("cmake@3.5:", type="build")
for ver in [
@@ -132,11 +135,13 @@ class HipifyClang(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
"master",
]:
depends_on("llvm-amdgpu@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
def setup_run_environment(self, env):
diff --git a/var/spack/repos/builtin/packages/hiprand/package.py b/var/spack/repos/builtin/packages/hiprand/package.py
index f55092df19..5f85c46cb9 100644
--- a/var/spack/repos/builtin/packages/hiprand/package.py
+++ b/var/spack/repos/builtin/packages/hiprand/package.py
@@ -22,6 +22,8 @@ class Hiprand(CMakePackage, CudaPackage, ROCmPackage):
version("develop", branch="develop")
version("master", branch="master")
+ version("5.6.1", sha256="a73d5578bc7f8dff0b8960e4bff97bc4fc28f508a19ed6acd1cfd4d3e76b47ee")
+ version("5.6.0", sha256="8c214e2f90337a5317a69950026bf337b1e567d43bb9ae64f2a802af2228c313")
version("5.5.1", sha256="5df9d78eae0991be5ec9f60e8d3530fabc23793d9f9cf274b075d689675db04e")
version("5.5.0", sha256="7c7dde7b989d5da9c0b0251233245f955b477c090462c7d34e3e0284c5fca761")
version("5.4.3", sha256="7d3d04476880ec90c088dff81f69aac8699eaef972476000e5c4726584ffa98f")
@@ -78,6 +80,8 @@ class Hiprand(CMakePackage, CudaPackage, ROCmPackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
"master",
"develop",
]:
diff --git a/var/spack/repos/builtin/packages/hipsolver/package.py b/var/spack/repos/builtin/packages/hipsolver/package.py
index c1ca0db616..1ef38160a6 100644
--- a/var/spack/repos/builtin/packages/hipsolver/package.py
+++ b/var/spack/repos/builtin/packages/hipsolver/package.py
@@ -26,7 +26,8 @@ class Hipsolver(CMakePackage, CudaPackage, ROCmPackage):
version("develop", branch="develop")
version("master", branch="master")
-
+ version("5.6.1", sha256="2e546bc7771f7bf0aa7892b69cded725941573e8b70614759c3d03c21eb78dde")
+ version("5.6.0", sha256="11fa51d210853d93d24d55b20367738e49711793412f58e8d7689710b92ae16c")
version("5.5.1", sha256="826bd64a4887176595bb7319d9a3612e7327602efe1f42aa3f2ad0e783d1a180")
version("5.5.0", sha256="0f45be0f90907381ae3e82424599e2ca2112d6411b4a64c72558d63f00409b83")
version("5.4.3", sha256="02a1bffecc494393f49f97174db7d2c101db557d32404923a44520876e682e3a")
@@ -106,6 +107,8 @@ class Hipsolver(CMakePackage, CudaPackage, ROCmPackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
"master",
"develop",
]:
diff --git a/var/spack/repos/builtin/packages/hipsparse/package.py b/var/spack/repos/builtin/packages/hipsparse/package.py
index f698d783bc..696094cb5a 100644
--- a/var/spack/repos/builtin/packages/hipsparse/package.py
+++ b/var/spack/repos/builtin/packages/hipsparse/package.py
@@ -19,6 +19,8 @@ class Hipsparse(CMakePackage, CudaPackage, ROCmPackage):
maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie")
libraries = ["libhipsparse"]
+ version("5.6.1", sha256="d636d0c5d1e38cc0c09b1e95380199ec82bd465b94bd6661f0c8d9374d9b565d")
+ version("5.6.0", sha256="3a6931b744ebaa4469a4c50d059a008403e4dc2a4f04dd69c3c6d20916b4a491")
version("5.5.1", sha256="3d291e4fe2c611d555e54de66149b204fe7ac59f5dd00a9ad93bc6dca0528880")
version("5.5.0", sha256="8122c8f17d899385de83efb7ac0d8a4fabfcd2aa21bbed63e63ea7adf0d22df6")
version("5.4.3", sha256="b373eccd03679a13fab4e740fc780da25cbd598abca3a1e5e3613ae14954f9db")
@@ -149,6 +151,8 @@ class Hipsparse(CMakePackage, CudaPackage, ROCmPackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
depends_on("rocsparse@" + ver, when="+rocm @" + ver)
diff --git a/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py b/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py
index 40ec2435f5..8a24050226 100644
--- a/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py
+++ b/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py
@@ -24,6 +24,8 @@ class HsaRocrDev(CMakePackage):
libraries = ["libhsa-runtime64"]
version("master", branch="master")
+ version("5.6.1", sha256="4de9a57c2092edf9398d671c8a2c60626eb7daf358caf710da70d9c105490221")
+ version("5.6.0", sha256="30875d440df9d8481ffb24d87755eae20a0efc1114849a72619ea954f1e9206c")
version("5.5.1", sha256="53d84ad5ba5086ed4ad67ad892c52c0e4eba8ddfa85c2dd341bf825f4d5fe4ee")
version("5.5.0", sha256="8dbc776b56f93ddaa2ca38bf3b88299b8091de7c1b3f2e481064896cf6808e6c")
version("5.4.3", sha256="a600eed848d47a7578c60da7e64eb92f29bbce2ec67932b251eafd4c2974cb67")
@@ -145,6 +147,8 @@ class HsaRocrDev(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
"master",
]:
depends_on("hsakmt-roct@" + ver, when="@" + ver)
@@ -153,7 +157,7 @@ class HsaRocrDev(CMakePackage):
depends_on(
"rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver)
)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
# Both 3.5.0 and 3.7.0 force INSTALL_RPATH in different ways
@@ -198,4 +202,6 @@ class HsaRocrDev(CMakePackage):
args.append(self.define("BITCODE_DIR", bitcode_dir))
+ if self.spec.satisfies("@5.6:"):
+ args.append("-DCMAKE_INSTALL_LIBDIR=lib")
return args
diff --git a/var/spack/repos/builtin/packages/hsakmt-roct/package.py b/var/spack/repos/builtin/packages/hsakmt-roct/package.py
index 1a2ce25a04..0bebaae6bf 100644
--- a/var/spack/repos/builtin/packages/hsakmt-roct/package.py
+++ b/var/spack/repos/builtin/packages/hsakmt-roct/package.py
@@ -22,6 +22,8 @@ class HsakmtRoct(CMakePackage):
maintainers("srekolam", "renjithravindrankannath")
version("master", branch="master")
+ version("5.6.1", sha256="d60b355bfd21a08e0e36270fd56f98d052c3c6edca47da887fa32bf32759c29b")
+ version("5.6.0", sha256="cd009c5c09f664f046c428ba9843582ab468f7b88d560747eb949d8d7f8c5567")
version("5.5.1", sha256="4ffde3fc1f91f24cdbf09263fd8e012a3995ad10854f4c1d866beab7b9f36bf4")
version("5.5.0", sha256="2b11fd8937c2b06cd4ddea2c3699fbf3d1651892c4c5957d38553b993dd9af18")
version("5.4.3", sha256="3799abbe7177fbff3b304e2a363e2b39e8864f8650ae569b2b88b9291f9a710c")
@@ -114,7 +116,7 @@ class HsakmtRoct(CMakePackage):
for ver in ["5.3.0", "5.4.0", "5.4.3"]:
depends_on("llvm-amdgpu@" + ver, type="test", when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
depends_on("llvm-amdgpu@" + ver, type="test", when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/llvm-amdgpu/001-Add-i1-mul-patterns.patch b/var/spack/repos/builtin/packages/llvm-amdgpu/001-Add-i1-mul-patterns.patch
new file mode 100644
index 0000000000..f93fcb99db
--- /dev/null
+++ b/var/spack/repos/builtin/packages/llvm-amdgpu/001-Add-i1-mul-patterns.patch
@@ -0,0 +1,2842 @@
+From a0f3d7f45075a3e9545c0c9fa25a9f8fc840cdd7 Mon Sep 17 00:00:00 2001
+From: Renjith Ravindran <Renjith.RavindranKannath@amd.com>
+Date: Mon, 25 Sep 2023 18:38:17 +0000
+Subject: [PATCH] i1 muls can sometimes happen after SCEV. They resulted in
+ ISel failures because we were missing the patterns for them.
+
+---
+ llvm/lib/Target/AMDGPU/SIInstructions.td | 10 +
+ llvm/test/CodeGen/AMDGPU/mul.ll | 2676 ++++++++++++++++++++--
+ 2 files changed, 2544 insertions(+), 142 deletions(-)
+
+diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
+index 03b2160a1..3bf4e42de 100644
+--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
++++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
+@@ -2372,6 +2372,11 @@ def : GCNPat <
+ (S_AND_B64 $src0, $src1)
+ >;
+
++def : GCNPat <
++ (i1 (mul i1:$src0, i1:$src1)),
++ (S_AND_B64 $src0, $src1)
++>;
++
+ def : GCNPat <
+ (i1 (or i1:$src0, i1:$src1)),
+ (S_OR_B64 $src0, $src1)
+@@ -2411,6 +2416,11 @@ def : GCNPat <
+ (S_AND_B32 $src0, $src1)
+ >;
+
++def : GCNPat <
++ (i1 (mul i1:$src0, i1:$src1)),
++ (S_AND_B32 $src0, $src1)
++>;
++
+ def : GCNPat <
+ (i1 (or i1:$src0, i1:$src1)),
+ (S_OR_B32 $src0, $src1)
+diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
+index 85dd59a0c..a8973d845 100644
+--- a/llvm/test/CodeGen/AMDGPU/mul.ll
++++ b/llvm/test/CodeGen/AMDGPU/mul.ll
+@@ -1,20 +1,129 @@
+-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,FUNC %s
+-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,FUNC %s
+-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=FUNC,GFX9PLUS %s
+-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=FUNC,GFX9PLUS %s
+-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=FUNC,GFX9PLUS %s
+-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=EG,FUNC %s
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
++; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
++; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
++; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
++; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
++; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
++; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
+
+ ; mul24 and mad24 are affected
+
+-; FUNC-LABEL: {{^}}test_mul_v2i32:
+-; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+-; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+-
+-; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+-; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+-
+ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
++; SI-LABEL: test_mul_v2i32:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
++; SI-NEXT: s_mov_b32 s7, 0xf000
++; SI-NEXT: s_mov_b32 s6, -1
++; SI-NEXT: s_mov_b32 s10, s6
++; SI-NEXT: s_mov_b32 s11, s7
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_mov_b32 s8, s2
++; SI-NEXT: s_mov_b32 s9, s3
++; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
++; SI-NEXT: s_mov_b32 s4, s0
++; SI-NEXT: s_mov_b32 s5, s1
++; SI-NEXT: s_waitcnt vmcnt(0)
++; SI-NEXT: v_mul_lo_u32 v1, v1, v3
++; SI-NEXT: v_mul_lo_u32 v0, v0, v2
++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: test_mul_v2i32:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; VI-NEXT: s_mov_b32 s7, 0xf000
++; VI-NEXT: s_mov_b32 s6, -1
++; VI-NEXT: s_mov_b32 s10, s6
++; VI-NEXT: s_mov_b32 s11, s7
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_mov_b32 s8, s2
++; VI-NEXT: s_mov_b32 s9, s3
++; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
++; VI-NEXT: s_mov_b32 s4, s0
++; VI-NEXT: s_mov_b32 s5, s1
++; VI-NEXT: s_waitcnt vmcnt(0)
++; VI-NEXT: v_mul_lo_u32 v1, v1, v3
++; VI-NEXT: v_mul_lo_u32 v0, v0, v2
++; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: test_mul_v2i32:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; GFX9-NEXT: s_mov_b32 s7, 0xf000
++; GFX9-NEXT: s_mov_b32 s6, -1
++; GFX9-NEXT: s_mov_b32 s10, s6
++; GFX9-NEXT: s_mov_b32 s11, s7
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mov_b32 s8, s2
++; GFX9-NEXT: s_mov_b32 s9, s3
++; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
++; GFX9-NEXT: s_mov_b32 s4, s0
++; GFX9-NEXT: s_mov_b32 s5, s1
++; GFX9-NEXT: s_waitcnt vmcnt(0)
++; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3
++; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2
++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: test_mul_v2i32:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; GFX10-NEXT: s_mov_b32 s6, -1
++; GFX10-NEXT: s_mov_b32 s7, 0x31016000
++; GFX10-NEXT: s_mov_b32 s10, s6
++; GFX10-NEXT: s_mov_b32 s11, s7
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mov_b32 s8, s2
++; GFX10-NEXT: s_mov_b32 s9, s3
++; GFX10-NEXT: s_mov_b32 s4, s0
++; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
++; GFX10-NEXT: s_mov_b32 s5, s1
++; GFX10-NEXT: s_waitcnt vmcnt(0)
++; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3
++; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: test_mul_v2i32:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
++; GFX11-NEXT: s_mov_b32 s6, -1
++; GFX11-NEXT: s_mov_b32 s7, 0x31016000
++; GFX11-NEXT: s_mov_b32 s10, s6
++; GFX11-NEXT: s_mov_b32 s11, s7
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mov_b32 s8, s2
++; GFX11-NEXT: s_mov_b32 s9, s3
++; GFX11-NEXT: s_mov_b32 s4, s0
++; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
++; GFX11-NEXT: s_mov_b32 s5, s1
++; GFX11-NEXT: s_waitcnt vmcnt(0)
++; GFX11-NEXT: v_mul_lo_u32 v1, v1, v3
++; GFX11-NEXT: v_mul_lo_u32 v0, v0, v2
++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: test_mul_v2i32:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
++; EG-NEXT: TEX 0 @6
++; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: Fetch clause starting at 6:
++; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
++; EG-NEXT: ALU clause starting at 8:
++; EG-NEXT: MOV * T0.X, KC0[2].Z,
++; EG-NEXT: ALU clause starting at 9:
++; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T0.W,
++; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
++; EG-NEXT: MULLO_INT * T0.X, T0.X, T0.Z,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
++entry:
+ %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
+ %a = load <2 x i32>, ptr addrspace(1) %in
+ %b = load <2 x i32>, ptr addrspace(1) %b_ptr
+@@ -23,18 +132,142 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
+ ret void
+ }
+
+-; FUNC-LABEL: {{^}}v_mul_v4i32:
+-; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+-; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+-; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+-; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+-
+-; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+-; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+-; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+-; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+-
+ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
++; SI-LABEL: v_mul_v4i32:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
++; SI-NEXT: s_mov_b32 s7, 0xf000
++; SI-NEXT: s_mov_b32 s6, -1
++; SI-NEXT: s_mov_b32 s10, s6
++; SI-NEXT: s_mov_b32 s11, s7
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_mov_b32 s8, s2
++; SI-NEXT: s_mov_b32 s9, s3
++; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
++; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
++; SI-NEXT: s_mov_b32 s4, s0
++; SI-NEXT: s_mov_b32 s5, s1
++; SI-NEXT: s_waitcnt vmcnt(0)
++; SI-NEXT: v_mul_lo_u32 v3, v3, v7
++; SI-NEXT: v_mul_lo_u32 v2, v2, v6
++; SI-NEXT: v_mul_lo_u32 v1, v1, v5
++; SI-NEXT: v_mul_lo_u32 v0, v0, v4
++; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: v_mul_v4i32:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; VI-NEXT: s_mov_b32 s7, 0xf000
++; VI-NEXT: s_mov_b32 s6, -1
++; VI-NEXT: s_mov_b32 s10, s6
++; VI-NEXT: s_mov_b32 s11, s7
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_mov_b32 s8, s2
++; VI-NEXT: s_mov_b32 s9, s3
++; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
++; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
++; VI-NEXT: s_mov_b32 s4, s0
++; VI-NEXT: s_mov_b32 s5, s1
++; VI-NEXT: s_waitcnt vmcnt(0)
++; VI-NEXT: v_mul_lo_u32 v3, v3, v7
++; VI-NEXT: v_mul_lo_u32 v2, v2, v6
++; VI-NEXT: v_mul_lo_u32 v1, v1, v5
++; VI-NEXT: v_mul_lo_u32 v0, v0, v4
++; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: v_mul_v4i32:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; GFX9-NEXT: s_mov_b32 s7, 0xf000
++; GFX9-NEXT: s_mov_b32 s6, -1
++; GFX9-NEXT: s_mov_b32 s10, s6
++; GFX9-NEXT: s_mov_b32 s11, s7
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mov_b32 s8, s2
++; GFX9-NEXT: s_mov_b32 s9, s3
++; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
++; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
++; GFX9-NEXT: s_mov_b32 s4, s0
++; GFX9-NEXT: s_mov_b32 s5, s1
++; GFX9-NEXT: s_waitcnt vmcnt(0)
++; GFX9-NEXT: v_mul_lo_u32 v3, v3, v7
++; GFX9-NEXT: v_mul_lo_u32 v2, v2, v6
++; GFX9-NEXT: v_mul_lo_u32 v1, v1, v5
++; GFX9-NEXT: v_mul_lo_u32 v0, v0, v4
++; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: v_mul_v4i32:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; GFX10-NEXT: s_mov_b32 s6, -1
++; GFX10-NEXT: s_mov_b32 s7, 0x31016000
++; GFX10-NEXT: s_mov_b32 s10, s6
++; GFX10-NEXT: s_mov_b32 s11, s7
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mov_b32 s8, s2
++; GFX10-NEXT: s_mov_b32 s9, s3
++; GFX10-NEXT: s_clause 0x1
++; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
++; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
++; GFX10-NEXT: s_mov_b32 s4, s0
++; GFX10-NEXT: s_mov_b32 s5, s1
++; GFX10-NEXT: s_waitcnt vmcnt(0)
++; GFX10-NEXT: v_mul_lo_u32 v3, v3, v7
++; GFX10-NEXT: v_mul_lo_u32 v2, v2, v6
++; GFX10-NEXT: v_mul_lo_u32 v1, v1, v5
++; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4
++; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: v_mul_v4i32:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
++; GFX11-NEXT: s_mov_b32 s6, -1
++; GFX11-NEXT: s_mov_b32 s7, 0x31016000
++; GFX11-NEXT: s_mov_b32 s10, s6
++; GFX11-NEXT: s_mov_b32 s11, s7
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mov_b32 s8, s2
++; GFX11-NEXT: s_mov_b32 s9, s3
++; GFX11-NEXT: s_clause 0x1
++; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
++; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16
++; GFX11-NEXT: s_mov_b32 s4, s0
++; GFX11-NEXT: s_mov_b32 s5, s1
++; GFX11-NEXT: s_waitcnt vmcnt(0)
++; GFX11-NEXT: v_mul_lo_u32 v3, v3, v7
++; GFX11-NEXT: v_mul_lo_u32 v2, v2, v6
++; GFX11-NEXT: v_mul_lo_u32 v1, v1, v5
++; GFX11-NEXT: v_mul_lo_u32 v0, v0, v4
++; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: v_mul_v4i32:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
++; EG-NEXT: TEX 1 @6
++; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: Fetch clause starting at 6:
++; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
++; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
++; EG-NEXT: ALU clause starting at 10:
++; EG-NEXT: MOV * T0.X, KC0[2].Z,
++; EG-NEXT: ALU clause starting at 11:
++; EG-NEXT: MULLO_INT * T0.W, T0.W, T1.W,
++; EG-NEXT: MULLO_INT * T0.Z, T0.Z, T1.Z,
++; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.Y,
++; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
++; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
++entry:
+ %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
+ %a = load <4 x i32>, ptr addrspace(1) %in
+ %b = load <4 x i32>, ptr addrspace(1) %b_ptr
+@@ -43,24 +276,232 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
+ ret void
+ }
+
+-; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32:
+-; GCN: s_load_dword
+-; GCN: s_load_dword
+-; GCN: s_mul_i32
+-; GCN: buffer_store_dword
+ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) {
++; SI-LABEL: s_trunc_i64_mul_to_i32:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_load_dword s7, s[0:1], 0xd
++; SI-NEXT: s_mov_b32 s3, 0xf000
++; SI-NEXT: s_mov_b32 s2, -1
++; SI-NEXT: s_mov_b32 s0, s4
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_mul_i32 s4, s7, s6
++; SI-NEXT: s_mov_b32 s1, s5
++; SI-NEXT: v_mov_b32_e32 v0, s4
++; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: s_trunc_i64_mul_to_i32:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_load_dword s7, s[0:1], 0x34
++; VI-NEXT: s_mov_b32 s3, 0xf000
++; VI-NEXT: s_mov_b32 s2, -1
++; VI-NEXT: s_mov_b32 s0, s4
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_mul_i32 s4, s7, s6
++; VI-NEXT: s_mov_b32 s1, s5
++; VI-NEXT: v_mov_b32_e32 v0, s4
++; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: s_trunc_i64_mul_to_i32:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34
++; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1
++; GFX9-NEXT: s_mov_b32 s3, 0xf000
++; GFX9-NEXT: s_mov_b32 s2, -1
++; GFX9-NEXT: s_mov_b32 s0, s4
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mul_i32 s4, s7, s6
++; GFX9-NEXT: s_mov_b32 s1, s5
++; GFX9-NEXT: v_mov_b32_e32 v0, s4
++; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: s_trunc_i64_mul_to_i32:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_clause 0x1
++; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
++; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mov_b32 s7, 0x31016000
++; GFX10-NEXT: s_mul_i32 s0, s2, s6
++; GFX10-NEXT: s_mov_b32 s6, -1
++; GFX10-NEXT: v_mov_b32_e32 v0, s0
++; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: s_trunc_i64_mul_to_i32:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_clause 0x1
++; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
++; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mov_b32 s7, 0x31016000
++; GFX11-NEXT: s_mul_i32 s0, s0, s6
++; GFX11-NEXT: s_mov_b32 s6, -1
++; GFX11-NEXT: v_mov_b32_e32 v0, s0
++; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: s_trunc_i64_mul_to_i32:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: ALU clause starting at 4:
++; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
++; EG-NEXT: MULLO_INT * T1.X, KC0[3].Y, KC0[2].W,
++entry:
+ %mul = mul i64 %b, %a
+ %trunc = trunc i64 %mul to i32
+ store i32 %trunc, ptr addrspace(1) %out, align 8
+ ret void
+ }
+
+-; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32:
+-; GCN: s_load_dword
+-; GCN: s_load_dword
+-; GCN: v_mul_lo_u32
+-; GCN: buffer_store_dword
+ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
++; SI-LABEL: v_trunc_i64_mul_to_i32:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
++; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
++; SI-NEXT: s_mov_b32 s3, 0xf000
++; SI-NEXT: s_mov_b32 s2, -1
++; SI-NEXT: s_mov_b32 s14, s2
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_mov_b32 s12, s6
++; SI-NEXT: s_mov_b32 s13, s7
++; SI-NEXT: s_mov_b32 s15, s3
++; SI-NEXT: s_mov_b32 s10, s2
++; SI-NEXT: s_mov_b32 s11, s3
++; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
++; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
++; SI-NEXT: s_mov_b32 s0, s4
++; SI-NEXT: s_mov_b32 s1, s5
++; SI-NEXT: s_waitcnt vmcnt(0)
++; SI-NEXT: v_mul_lo_u32 v0, v1, v0
++; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: v_trunc_i64_mul_to_i32:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
++; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
++; VI-NEXT: s_mov_b32 s3, 0xf000
++; VI-NEXT: s_mov_b32 s2, -1
++; VI-NEXT: s_mov_b32 s14, s2
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_mov_b32 s12, s6
++; VI-NEXT: s_mov_b32 s13, s7
++; VI-NEXT: s_mov_b32 s15, s3
++; VI-NEXT: s_mov_b32 s10, s2
++; VI-NEXT: s_mov_b32 s11, s3
++; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
++; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0
++; VI-NEXT: s_mov_b32 s0, s4
++; VI-NEXT: s_mov_b32 s1, s5
++; VI-NEXT: s_waitcnt vmcnt(0)
++; VI-NEXT: v_mul_lo_u32 v0, v1, v0
++; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: v_trunc_i64_mul_to_i32:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
++; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
++; GFX9-NEXT: s_mov_b32 s3, 0xf000
++; GFX9-NEXT: s_mov_b32 s2, -1
++; GFX9-NEXT: s_mov_b32 s14, s2
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mov_b32 s12, s6
++; GFX9-NEXT: s_mov_b32 s13, s7
++; GFX9-NEXT: s_mov_b32 s15, s3
++; GFX9-NEXT: s_mov_b32 s10, s2
++; GFX9-NEXT: s_mov_b32 s11, s3
++; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0
++; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0
++; GFX9-NEXT: s_mov_b32 s0, s4
++; GFX9-NEXT: s_mov_b32 s1, s5
++; GFX9-NEXT: s_waitcnt vmcnt(0)
++; GFX9-NEXT: v_mul_lo_u32 v0, v1, v0
++; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: v_trunc_i64_mul_to_i32:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_clause 0x1
++; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
++; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
++; GFX10-NEXT: s_mov_b32 s2, -1
++; GFX10-NEXT: s_mov_b32 s3, 0x31016000
++; GFX10-NEXT: s_mov_b32 s14, s2
++; GFX10-NEXT: s_mov_b32 s15, s3
++; GFX10-NEXT: s_mov_b32 s10, s2
++; GFX10-NEXT: s_mov_b32 s11, s3
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mov_b32 s12, s6
++; GFX10-NEXT: s_mov_b32 s13, s7
++; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0
++; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0
++; GFX10-NEXT: s_mov_b32 s0, s4
++; GFX10-NEXT: s_mov_b32 s1, s5
++; GFX10-NEXT: s_waitcnt vmcnt(0)
++; GFX10-NEXT: v_mul_lo_u32 v0, v1, v0
++; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: v_trunc_i64_mul_to_i32:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_clause 0x1
++; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
++; GFX11-NEXT: s_mov_b32 s10, -1
++; GFX11-NEXT: s_mov_b32 s11, 0x31016000
++; GFX11-NEXT: s_mov_b32 s14, s10
++; GFX11-NEXT: s_mov_b32 s15, s11
++; GFX11-NEXT: s_mov_b32 s2, s10
++; GFX11-NEXT: s_mov_b32 s3, s11
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mov_b32 s12, s6
++; GFX11-NEXT: s_mov_b32 s13, s7
++; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
++; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
++; GFX11-NEXT: s_mov_b32 s8, s4
++; GFX11-NEXT: s_mov_b32 s9, s5
++; GFX11-NEXT: s_waitcnt vmcnt(0)
++; GFX11-NEXT: v_mul_lo_u32 v0, v1, v0
++; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: v_trunc_i64_mul_to_i32:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
++; EG-NEXT: TEX 1 @6
++; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: Fetch clause starting at 6:
++; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
++; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
++; EG-NEXT: ALU clause starting at 10:
++; EG-NEXT: MOV T0.X, KC0[2].Z,
++; EG-NEXT: MOV * T1.X, KC0[2].W,
++; EG-NEXT: ALU clause starting at 12:
++; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
++; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
++entry:
+ %a = load i64, ptr addrspace(1) %aptr, align 8
+ %b = load i64, ptr addrspace(1) %bptr, align 8
+ %mul = mul i64 %b, %a
+@@ -71,13 +512,93 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
+
+ ; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top
+ ; 32-bits of both arguments are sign bits.
+-; FUNC-LABEL: {{^}}mul64_sext_c:
+-; EG-DAG: MULLO_INT
+-; EG-DAG: MULHI_INT
+-; SI-DAG: s_mulk_i32
+-; SI-DAG: v_mul_hi_i32
+-; VI: v_mad_i64_i32
++
+ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
++; SI-LABEL: mul64_sext_c:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dword s4, s[0:1], 0xb
++; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
++; SI-NEXT: v_mov_b32_e32 v0, 0x50
++; SI-NEXT: s_mov_b32 s3, 0xf000
++; SI-NEXT: s_mov_b32 s2, -1
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: v_mul_hi_i32 v1, s4, v0
++; SI-NEXT: s_mulk_i32 s4, 0x50
++; SI-NEXT: v_mov_b32_e32 v0, s4
++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: mul64_sext_c:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
++; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
++; VI-NEXT: v_mov_b32_e32 v0, 0x50
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0
++; VI-NEXT: s_mov_b32 s3, 0xf000
++; VI-NEXT: s_mov_b32 s2, -1
++; VI-NEXT: s_nop 2
++; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: mul64_sext_c:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
++; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
++; GFX9-NEXT: s_mov_b32 s7, 0xf000
++; GFX9-NEXT: s_mov_b32 s6, -1
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mul_hi_i32 s0, s2, 0x50
++; GFX9-NEXT: s_mulk_i32 s2, 0x50
++; GFX9-NEXT: v_mov_b32_e32 v0, s2
++; GFX9-NEXT: v_mov_b32_e32 v1, s0
++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: mul64_sext_c:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_clause 0x1
++; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
++; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
++; GFX10-NEXT: s_mov_b32 s7, 0x31016000
++; GFX10-NEXT: s_mov_b32 s6, -1
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mul_i32 s0, s2, 0x50
++; GFX10-NEXT: s_mul_hi_i32 s1, s2, 0x50
++; GFX10-NEXT: v_mov_b32_e32 v0, s0
++; GFX10-NEXT: v_mov_b32_e32 v1, s1
++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: mul64_sext_c:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_clause 0x1
++; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mul_i32 s3, s2, 0x50
++; GFX11-NEXT: s_mul_hi_i32 s2, s2, 0x50
++; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
++; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
++; GFX11-NEXT: s_mov_b32 s3, 0x31016000
++; GFX11-NEXT: s_mov_b32 s2, -1
++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: mul64_sext_c:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: ALU clause starting at 4:
++; EG-NEXT: MULHI_INT * T0.Y, KC0[2].Z, literal.x,
++; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
++; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
++; EG-NEXT: MULLO_INT * T0.X, KC0[2].Z, literal.y,
++; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
+ entry:
+ %0 = sext i32 %in to i64
+ %1 = mul i64 %0, 80
+@@ -85,14 +606,125 @@ entry:
+ ret void
+ }
+
+-; FUNC-LABEL: {{^}}v_mul64_sext_c:
+-; EG-DAG: MULLO_INT
+-; EG-DAG: MULHI_INT
+-; SI-DAG: v_mul_lo_u32
+-; SI-DAG: v_mul_hi_i32
+-; VI: v_mad_i64_i32
+-; GCN: s_endpgm
+ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) {
++; SI-LABEL: v_mul64_sext_c:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
++; SI-NEXT: s_mov_b32 s7, 0xf000
++; SI-NEXT: s_mov_b32 s6, -1
++; SI-NEXT: s_mov_b32 s10, s6
++; SI-NEXT: s_mov_b32 s11, s7
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_mov_b32 s8, s2
++; SI-NEXT: s_mov_b32 s9, s3
++; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
++; SI-NEXT: s_movk_i32 s2, 0x50
++; SI-NEXT: s_mov_b32 s4, s0
++; SI-NEXT: s_mov_b32 s5, s1
++; SI-NEXT: s_waitcnt vmcnt(0)
++; SI-NEXT: v_mul_hi_i32 v1, v0, s2
++; SI-NEXT: v_mul_lo_u32 v0, v0, s2
++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: v_mul64_sext_c:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; VI-NEXT: s_mov_b32 s7, 0xf000
++; VI-NEXT: s_mov_b32 s6, -1
++; VI-NEXT: s_mov_b32 s10, s6
++; VI-NEXT: s_mov_b32 s11, s7
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_mov_b32 s8, s2
++; VI-NEXT: s_mov_b32 s9, s3
++; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
++; VI-NEXT: s_movk_i32 s2, 0x50
++; VI-NEXT: s_mov_b32 s4, s0
++; VI-NEXT: s_mov_b32 s5, s1
++; VI-NEXT: s_waitcnt vmcnt(0)
++; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0
++; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: v_mul64_sext_c:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; GFX9-NEXT: s_mov_b32 s7, 0xf000
++; GFX9-NEXT: s_mov_b32 s6, -1
++; GFX9-NEXT: s_mov_b32 s10, s6
++; GFX9-NEXT: s_mov_b32 s11, s7
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mov_b32 s8, s2
++; GFX9-NEXT: s_mov_b32 s9, s3
++; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
++; GFX9-NEXT: s_movk_i32 s2, 0x50
++; GFX9-NEXT: s_mov_b32 s4, s0
++; GFX9-NEXT: s_mov_b32 s5, s1
++; GFX9-NEXT: s_waitcnt vmcnt(0)
++; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2
++; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2
++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: v_mul64_sext_c:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; GFX10-NEXT: s_mov_b32 s6, -1
++; GFX10-NEXT: s_mov_b32 s7, 0x31016000
++; GFX10-NEXT: s_mov_b32 s10, s6
++; GFX10-NEXT: s_mov_b32 s11, s7
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mov_b32 s8, s2
++; GFX10-NEXT: s_mov_b32 s9, s3
++; GFX10-NEXT: s_mov_b32 s4, s0
++; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
++; GFX10-NEXT: s_mov_b32 s5, s1
++; GFX10-NEXT: s_waitcnt vmcnt(0)
++; GFX10-NEXT: v_mul_hi_i32 v1, 0x50, v0
++; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0
++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: v_mul64_sext_c:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
++; GFX11-NEXT: s_mov_b32 s6, -1
++; GFX11-NEXT: s_mov_b32 s7, 0x31016000
++; GFX11-NEXT: s_mov_b32 s10, s6
++; GFX11-NEXT: s_mov_b32 s11, s7
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mov_b32 s8, s2
++; GFX11-NEXT: s_mov_b32 s9, s3
++; GFX11-NEXT: s_mov_b32 s4, s0
++; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
++; GFX11-NEXT: s_mov_b32 s5, s1
++; GFX11-NEXT: s_waitcnt vmcnt(0)
++; GFX11-NEXT: v_mul_hi_i32 v1, 0x50, v0
++; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0
++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: v_mul64_sext_c:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
++; EG-NEXT: TEX 0 @6
++; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: Fetch clause starting at 6:
++; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
++; EG-NEXT: ALU clause starting at 8:
++; EG-NEXT: MOV * T0.X, KC0[2].Z,
++; EG-NEXT: ALU clause starting at 9:
++; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x,
++; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
++; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
++; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y,
++; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
++entry:
+ %val = load i32, ptr addrspace(1) %in, align 4
+ %ext = sext i32 %val to i64
+ %mul = mul i64 %ext, 80
+@@ -100,12 +732,122 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
+ ret void
+ }
+
+-; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm:
+-; SI-DAG: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, 9
+-; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
+-; VI: v_mad_i64_i32 v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, 9, 0
+-; GCN: s_endpgm
+ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) {
++; SI-LABEL: v_mul64_sext_inline_imm:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
++; SI-NEXT: s_mov_b32 s7, 0xf000
++; SI-NEXT: s_mov_b32 s6, -1
++; SI-NEXT: s_mov_b32 s10, s6
++; SI-NEXT: s_mov_b32 s11, s7
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_mov_b32 s8, s2
++; SI-NEXT: s_mov_b32 s9, s3
++; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
++; SI-NEXT: s_mov_b32 s4, s0
++; SI-NEXT: s_mov_b32 s5, s1
++; SI-NEXT: s_waitcnt vmcnt(0)
++; SI-NEXT: v_mul_hi_i32 v1, v0, 9
++; SI-NEXT: v_mul_lo_u32 v0, v0, 9
++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: v_mul64_sext_inline_imm:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; VI-NEXT: s_mov_b32 s7, 0xf000
++; VI-NEXT: s_mov_b32 s6, -1
++; VI-NEXT: s_mov_b32 s10, s6
++; VI-NEXT: s_mov_b32 s11, s7
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_mov_b32 s8, s2
++; VI-NEXT: s_mov_b32 s9, s3
++; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
++; VI-NEXT: s_mov_b32 s4, s0
++; VI-NEXT: s_mov_b32 s5, s1
++; VI-NEXT: s_waitcnt vmcnt(0)
++; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0
++; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: v_mul64_sext_inline_imm:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; GFX9-NEXT: s_mov_b32 s7, 0xf000
++; GFX9-NEXT: s_mov_b32 s6, -1
++; GFX9-NEXT: s_mov_b32 s10, s6
++; GFX9-NEXT: s_mov_b32 s11, s7
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mov_b32 s8, s2
++; GFX9-NEXT: s_mov_b32 s9, s3
++; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
++; GFX9-NEXT: s_mov_b32 s4, s0
++; GFX9-NEXT: s_mov_b32 s5, s1
++; GFX9-NEXT: s_waitcnt vmcnt(0)
++; GFX9-NEXT: v_mul_hi_i32 v1, v0, 9
++; GFX9-NEXT: v_mul_lo_u32 v0, v0, 9
++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: v_mul64_sext_inline_imm:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; GFX10-NEXT: s_mov_b32 s6, -1
++; GFX10-NEXT: s_mov_b32 s7, 0x31016000
++; GFX10-NEXT: s_mov_b32 s10, s6
++; GFX10-NEXT: s_mov_b32 s11, s7
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mov_b32 s8, s2
++; GFX10-NEXT: s_mov_b32 s9, s3
++; GFX10-NEXT: s_mov_b32 s4, s0
++; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
++; GFX10-NEXT: s_mov_b32 s5, s1
++; GFX10-NEXT: s_waitcnt vmcnt(0)
++; GFX10-NEXT: v_mul_hi_i32 v1, v0, 9
++; GFX10-NEXT: v_mul_lo_u32 v0, v0, 9
++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: v_mul64_sext_inline_imm:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
++; GFX11-NEXT: s_mov_b32 s6, -1
++; GFX11-NEXT: s_mov_b32 s7, 0x31016000
++; GFX11-NEXT: s_mov_b32 s10, s6
++; GFX11-NEXT: s_mov_b32 s11, s7
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mov_b32 s8, s2
++; GFX11-NEXT: s_mov_b32 s9, s3
++; GFX11-NEXT: s_mov_b32 s4, s0
++; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
++; GFX11-NEXT: s_mov_b32 s5, s1
++; GFX11-NEXT: s_waitcnt vmcnt(0)
++; GFX11-NEXT: v_mul_hi_i32 v1, v0, 9
++; GFX11-NEXT: v_mul_lo_u32 v0, v0, 9
++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: v_mul64_sext_inline_imm:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
++; EG-NEXT: TEX 0 @6
++; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: Fetch clause starting at 6:
++; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
++; EG-NEXT: ALU clause starting at 8:
++; EG-NEXT: MOV * T0.X, KC0[2].Z,
++; EG-NEXT: ALU clause starting at 9:
++; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x,
++; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
++; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
++; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y,
++; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44)
++entry:
+ %val = load i32, ptr addrspace(1) %in, align 4
+ %ext = sext i32 %val to i64
+ %mul = mul i64 %ext, 9
+@@ -113,22 +855,202 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
+ ret void
+ }
+
+-; FUNC-LABEL: {{^}}s_mul_i32:
+-; GCN: s_load_dword [[SRC0:s[0-9]+]],
+-; GCN: s_load_dword [[SRC1:s[0-9]+]],
+-; GCN: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]]
+-; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+-; GCN: buffer_store_dword [[VRESULT]],
+-; GCN: s_endpgm
+ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
++; SI-LABEL: s_mul_i32:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dword s4, s[0:1], 0x13
++; SI-NEXT: s_load_dword s5, s[0:1], 0x1c
++; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
++; SI-NEXT: s_mov_b32 s3, 0xf000
++; SI-NEXT: s_mov_b32 s2, -1
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_mul_i32 s4, s4, s5
++; SI-NEXT: v_mov_b32_e32 v0, s4
++; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: s_mul_i32:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
++; VI-NEXT: s_load_dword s5, s[0:1], 0x70
++; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
++; VI-NEXT: s_mov_b32 s3, 0xf000
++; VI-NEXT: s_mov_b32 s2, -1
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_mul_i32 s4, s4, s5
++; VI-NEXT: v_mov_b32_e32 v0, s4
++; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: s_mul_i32:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
++; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70
++; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
++; GFX9-NEXT: s_mov_b32 s7, 0xf000
++; GFX9-NEXT: s_mov_b32 s6, -1
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mul_i32 s0, s2, s3
++; GFX9-NEXT: v_mov_b32_e32 v0, s0
++; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: s_mul_i32:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_clause 0x2
++; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c
++; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70
++; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
++; GFX10-NEXT: s_mov_b32 s7, 0x31016000
++; GFX10-NEXT: s_mov_b32 s6, -1
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mul_i32 s0, s2, s3
++; GFX10-NEXT: v_mov_b32_e32 v0, s0
++; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: s_mul_i32:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_clause 0x2
++; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
++; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mul_i32 s2, s2, s3
++; GFX11-NEXT: s_mov_b32 s3, 0x31016000
++; GFX11-NEXT: v_mov_b32_e32 v0, s2
++; GFX11-NEXT: s_mov_b32 s2, -1
++; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: s_mul_i32:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: ALU clause starting at 4:
++; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
++; EG-NEXT: MULLO_INT * T1.X, KC0[4].Z, KC0[6].W,
++entry:
+ %mul = mul i32 %a, %b
+ store i32 %mul, ptr addrspace(1) %out, align 4
+ ret void
+ }
+
+-; FUNC-LABEL: {{^}}v_mul_i32:
+-; GCN: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
++; SI-LABEL: v_mul_i32:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
++; SI-NEXT: s_mov_b32 s7, 0xf000
++; SI-NEXT: s_mov_b32 s6, -1
++; SI-NEXT: s_mov_b32 s10, s6
++; SI-NEXT: s_mov_b32 s11, s7
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_mov_b32 s8, s2
++; SI-NEXT: s_mov_b32 s9, s3
++; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
++; SI-NEXT: s_mov_b32 s4, s0
++; SI-NEXT: s_mov_b32 s5, s1
++; SI-NEXT: s_waitcnt vmcnt(0)
++; SI-NEXT: v_mul_lo_u32 v0, v0, v1
++; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: v_mul_i32:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; VI-NEXT: s_mov_b32 s7, 0xf000
++; VI-NEXT: s_mov_b32 s6, -1
++; VI-NEXT: s_mov_b32 s10, s6
++; VI-NEXT: s_mov_b32 s11, s7
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_mov_b32 s8, s2
++; VI-NEXT: s_mov_b32 s9, s3
++; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
++; VI-NEXT: s_mov_b32 s4, s0
++; VI-NEXT: s_mov_b32 s5, s1
++; VI-NEXT: s_waitcnt vmcnt(0)
++; VI-NEXT: v_mul_lo_u32 v0, v0, v1
++; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: v_mul_i32:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; GFX9-NEXT: s_mov_b32 s7, 0xf000
++; GFX9-NEXT: s_mov_b32 s6, -1
++; GFX9-NEXT: s_mov_b32 s10, s6
++; GFX9-NEXT: s_mov_b32 s11, s7
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mov_b32 s8, s2
++; GFX9-NEXT: s_mov_b32 s9, s3
++; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
++; GFX9-NEXT: s_mov_b32 s4, s0
++; GFX9-NEXT: s_mov_b32 s5, s1
++; GFX9-NEXT: s_waitcnt vmcnt(0)
++; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1
++; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: v_mul_i32:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; GFX10-NEXT: s_mov_b32 s6, -1
++; GFX10-NEXT: s_mov_b32 s7, 0x31016000
++; GFX10-NEXT: s_mov_b32 s10, s6
++; GFX10-NEXT: s_mov_b32 s11, s7
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mov_b32 s8, s2
++; GFX10-NEXT: s_mov_b32 s9, s3
++; GFX10-NEXT: s_mov_b32 s4, s0
++; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
++; GFX10-NEXT: s_mov_b32 s5, s1
++; GFX10-NEXT: s_waitcnt vmcnt(0)
++; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1
++; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: v_mul_i32:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
++; GFX11-NEXT: s_mov_b32 s6, -1
++; GFX11-NEXT: s_mov_b32 s7, 0x31016000
++; GFX11-NEXT: s_mov_b32 s10, s6
++; GFX11-NEXT: s_mov_b32 s11, s7
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mov_b32 s8, s2
++; GFX11-NEXT: s_mov_b32 s9, s3
++; GFX11-NEXT: s_mov_b32 s4, s0
++; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
++; GFX11-NEXT: s_mov_b32 s5, s1
++; GFX11-NEXT: s_waitcnt vmcnt(0)
++; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
++; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: v_mul_i32:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
++; EG-NEXT: TEX 0 @6
++; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: Fetch clause starting at 6:
++; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
++; EG-NEXT: ALU clause starting at 8:
++; EG-NEXT: MOV * T0.X, KC0[2].Z,
++; EG-NEXT: ALU clause starting at 9:
++; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
++; EG-NEXT: MULLO_INT * T0.X, T0.X, T0.Y,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
++entry:
+ %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+ %a = load i32, ptr addrspace(1) %in
+ %b = load i32, ptr addrspace(1) %b_ptr
+@@ -137,6 +1059,298 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
+ ret void
+ }
+
++define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind {
++; SI-LABEL: s_mul_i1:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dword s2, s[0:1], 0x13
++; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
++; SI-NEXT: s_load_dword s3, s[0:1], 0x1c
++; SI-NEXT: s_mov_b32 s7, 0xf000
++; SI-NEXT: s_mov_b32 s6, -1
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_bitcmp1_b32 s2, 0
++; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
++; SI-NEXT: s_bitcmp1_b32 s3, 0
++; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
++; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
++; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
++; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: s_mul_i1:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
++; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
++; VI-NEXT: s_load_dword s3, s[0:1], 0x70
++; VI-NEXT: s_mov_b32 s7, 0xf000
++; VI-NEXT: s_mov_b32 s6, -1
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_bitcmp1_b32 s2, 0
++; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
++; VI-NEXT: s_bitcmp1_b32 s3, 0
++; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
++; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
++; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
++; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: s_mul_i1:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
++; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
++; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70
++; GFX9-NEXT: s_mov_b32 s7, 0xf000
++; GFX9-NEXT: s_mov_b32 s6, -1
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_bitcmp1_b32 s2, 0
++; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
++; GFX9-NEXT: s_bitcmp1_b32 s3, 0
++; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
++; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
++; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
++; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: s_mul_i1:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_clause 0x2
++; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c
++; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70
++; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
++; GFX10-NEXT: s_mov_b32 s7, 0x31016000
++; GFX10-NEXT: s_mov_b32 s6, -1
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_bitcmp1_b32 s2, 0
++; GFX10-NEXT: s_cselect_b32 s0, -1, 0
++; GFX10-NEXT: s_bitcmp1_b32 s3, 0
++; GFX10-NEXT: s_cselect_b32 s1, -1, 0
++; GFX10-NEXT: s_and_b32 s0, s0, s1
++; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
++; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: s_mul_i1:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_clause 0x2
++; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
++; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_bitcmp1_b32 s2, 0
++; GFX11-NEXT: s_cselect_b32 s2, -1, 0
++; GFX11-NEXT: s_bitcmp1_b32 s3, 0
++; GFX11-NEXT: s_cselect_b32 s3, -1, 0
++; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
++; GFX11-NEXT: s_and_b32 s2, s2, s3
++; GFX11-NEXT: s_mov_b32 s3, 0x31016000
++; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
++; GFX11-NEXT: s_mov_b32 s2, -1
++; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: s_mul_i1:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 0, @10, KC0[], KC1[]
++; EG-NEXT: TEX 1 @6
++; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: Fetch clause starting at 6:
++; EG-NEXT: VTX_READ_8 T1.X, T0.X, 72, #3
++; EG-NEXT: VTX_READ_8 T0.X, T0.X, 108, #3
++; EG-NEXT: ALU clause starting at 10:
++; EG-NEXT: MOV * T0.X, 0.0,
++; EG-NEXT: ALU clause starting at 11:
++; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
++; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X,
++; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
++; EG-NEXT: AND_INT T1.W, PS, 1,
++; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
++; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
++; EG-NEXT: LSHL T0.X, PV.W, PS,
++; EG-NEXT: LSHL * T0.W, literal.x, PS,
++; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
++; EG-NEXT: MOV T0.Y, 0.0,
++; EG-NEXT: MOV * T0.Z, 0.0,
++; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
++entry:
++ %mul = mul i1 %a, %b
++ store i1 %mul, ptr addrspace(1) %out, align 4
++ ret void
++}
++
++define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
++; SI-LABEL: v_mul_i1:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
++; SI-NEXT: s_mov_b32 s7, 0xf000
++; SI-NEXT: s_mov_b32 s6, -1
++; SI-NEXT: s_mov_b32 s10, s6
++; SI-NEXT: s_mov_b32 s11, s7
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_mov_b32 s8, s2
++; SI-NEXT: s_mov_b32 s9, s3
++; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
++; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
++; SI-NEXT: s_mov_b32 s4, s0
++; SI-NEXT: s_mov_b32 s5, s1
++; SI-NEXT: s_waitcnt vmcnt(1)
++; SI-NEXT: v_and_b32_e32 v0, 1, v0
++; SI-NEXT: s_waitcnt vmcnt(0)
++; SI-NEXT: v_and_b32_e32 v1, 1, v1
++; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
++; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
++; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
++; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
++; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: v_mul_i1:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; VI-NEXT: s_mov_b32 s7, 0xf000
++; VI-NEXT: s_mov_b32 s6, -1
++; VI-NEXT: s_mov_b32 s10, s6
++; VI-NEXT: s_mov_b32 s11, s7
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_mov_b32 s8, s2
++; VI-NEXT: s_mov_b32 s9, s3
++; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
++; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
++; VI-NEXT: s_mov_b32 s4, s0
++; VI-NEXT: s_mov_b32 s5, s1
++; VI-NEXT: s_waitcnt vmcnt(1)
++; VI-NEXT: v_and_b32_e32 v0, 1, v0
++; VI-NEXT: s_waitcnt vmcnt(0)
++; VI-NEXT: v_and_b32_e32 v1, 1, v1
++; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
++; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
++; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
++; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
++; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: v_mul_i1:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; GFX9-NEXT: s_mov_b32 s7, 0xf000
++; GFX9-NEXT: s_mov_b32 s6, -1
++; GFX9-NEXT: s_mov_b32 s10, s6
++; GFX9-NEXT: s_mov_b32 s11, s7
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mov_b32 s8, s2
++; GFX9-NEXT: s_mov_b32 s9, s3
++; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
++; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
++; GFX9-NEXT: s_mov_b32 s4, s0
++; GFX9-NEXT: s_mov_b32 s5, s1
++; GFX9-NEXT: s_waitcnt vmcnt(1)
++; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
++; GFX9-NEXT: s_waitcnt vmcnt(0)
++; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
++; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
++; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
++; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
++; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
++; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: v_mul_i1:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
++; GFX10-NEXT: s_mov_b32 s2, -1
++; GFX10-NEXT: s_mov_b32 s3, 0x31016000
++; GFX10-NEXT: s_mov_b32 s10, s2
++; GFX10-NEXT: s_mov_b32 s11, s3
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mov_b32 s8, s6
++; GFX10-NEXT: s_mov_b32 s9, s7
++; GFX10-NEXT: s_clause 0x1
++; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
++; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
++; GFX10-NEXT: s_mov_b32 s1, s5
++; GFX10-NEXT: s_waitcnt vmcnt(1)
++; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
++; GFX10-NEXT: s_waitcnt vmcnt(0)
++; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
++; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
++; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
++; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0
++; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
++; GFX10-NEXT: s_mov_b32 s0, s4
++; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: v_mul_i1:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
++; GFX11-NEXT: s_mov_b32 s2, -1
++; GFX11-NEXT: s_mov_b32 s3, 0x31016000
++; GFX11-NEXT: s_mov_b32 s10, s2
++; GFX11-NEXT: s_mov_b32 s11, s3
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mov_b32 s8, s6
++; GFX11-NEXT: s_mov_b32 s9, s7
++; GFX11-NEXT: s_clause 0x1
++; GFX11-NEXT: buffer_load_u8 v0, off, s[8:11], 0
++; GFX11-NEXT: buffer_load_u8 v1, off, s[8:11], 0 offset:4
++; GFX11-NEXT: s_mov_b32 s1, s5
++; GFX11-NEXT: s_waitcnt vmcnt(1)
++; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
++; GFX11-NEXT: s_waitcnt vmcnt(0)
++; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
++; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
++; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
++; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
++; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
++; GFX11-NEXT: s_mov_b32 s0, s4
++; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: v_mul_i1:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
++; EG-NEXT: TEX 1 @6
++; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: Fetch clause starting at 6:
++; EG-NEXT: VTX_READ_8 T1.X, T0.X, 4, #1
++; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
++; EG-NEXT: ALU clause starting at 10:
++; EG-NEXT: MOV * T0.X, KC0[2].Z,
++; EG-NEXT: ALU clause starting at 11:
++; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
++; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X,
++; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
++; EG-NEXT: AND_INT T1.W, PS, 1,
++; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
++; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
++; EG-NEXT: LSHL T0.X, PV.W, PS,
++; EG-NEXT: LSHL * T0.W, literal.x, PS,
++; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
++; EG-NEXT: MOV T0.Y, 0.0,
++; EG-NEXT: MOV * T0.Z, 0.0,
++; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
++entry:
++ %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
++ %a = load i1, ptr addrspace(1) %in
++ %b = load i1, ptr addrspace(1) %b_ptr
++ %result = mul i1 %a, %b
++ store i1 %result, ptr addrspace(1) %out
++ ret void
++}
++
+ ; A standard 64-bit multiply. The expansion should be around 6 instructions.
+ ; It would be difficult to match the expansion correctly without writing
+ ; a really complicated list of FileCheck expressions. I don't want
+@@ -144,21 +1358,294 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
+ ; so this test just uses FUNC-LABEL to make sure the compiler does not
+ ; crash with a 'failed to select' error.
+
+-; FUNC-LABEL: {{^}}s_mul_i64:
+-; GFX9PLUS-DAG: s_mul_i32
+-; GFX9PLUS-DAG: s_mul_hi_u32
+-; GFX9PLUS-DAG: s_mul_i32
+-; GFX9PLUS-DAG: s_mul_i32
+-; GFX9PLUS: s_endpgm
+ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
++; SI-LABEL: s_mul_i64:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
++; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
++; SI-NEXT: s_mov_b32 s3, 0xf000
++; SI-NEXT: s_mov_b32 s2, -1
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_mov_b32 s0, s4
++; SI-NEXT: v_mov_b32_e32 v0, s8
++; SI-NEXT: v_mul_hi_u32 v0, s6, v0
++; SI-NEXT: s_mul_i32 s4, s6, s9
++; SI-NEXT: s_mov_b32 s1, s5
++; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
++; SI-NEXT: s_mul_i32 s4, s7, s8
++; SI-NEXT: v_add_i32_e32 v1, vcc, s4, v0
++; SI-NEXT: s_mul_i32 s4, s6, s8
++; SI-NEXT: v_mov_b32_e32 v0, s4
++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: s_mul_i64:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
++; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
++; VI-NEXT: s_mov_b32 s3, 0xf000
++; VI-NEXT: s_mov_b32 s2, -1
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_mov_b32 s0, s4
++; VI-NEXT: v_mov_b32_e32 v0, s8
++; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s6, v0, 0
++; VI-NEXT: s_mul_i32 s4, s6, s9
++; VI-NEXT: s_mov_b32 s1, s5
++; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
++; VI-NEXT: s_mul_i32 s4, s7, s8
++; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
++; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: s_mul_i64:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
++; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
++; GFX9-NEXT: s_mov_b32 s3, 0xf000
++; GFX9-NEXT: s_mov_b32 s2, -1
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mov_b32 s0, s4
++; GFX9-NEXT: s_mov_b32 s1, s5
++; GFX9-NEXT: s_mul_i32 s4, s6, s9
++; GFX9-NEXT: s_mul_hi_u32 s5, s6, s8
++; GFX9-NEXT: s_add_i32 s4, s5, s4
++; GFX9-NEXT: s_mul_i32 s5, s7, s8
++; GFX9-NEXT: s_add_i32 s4, s4, s5
++; GFX9-NEXT: s_mul_i32 s5, s6, s8
++; GFX9-NEXT: v_mov_b32_e32 v0, s5
++; GFX9-NEXT: v_mov_b32_e32 v1, s4
++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: s_mul_i64:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_clause 0x1
++; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
++; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mul_i32 s0, s6, s3
++; GFX10-NEXT: s_mul_hi_u32 s1, s6, s2
++; GFX10-NEXT: s_mov_b32 s3, 0x31016000
++; GFX10-NEXT: s_add_i32 s0, s1, s0
++; GFX10-NEXT: s_mul_i32 s1, s7, s2
++; GFX10-NEXT: s_mul_i32 s2, s6, s2
++; GFX10-NEXT: s_add_i32 s0, s0, s1
++; GFX10-NEXT: v_mov_b32_e32 v0, s2
++; GFX10-NEXT: v_mov_b32_e32 v1, s0
++; GFX10-NEXT: s_mov_b32 s2, -1
++; GFX10-NEXT: s_mov_b32 s0, s4
++; GFX10-NEXT: s_mov_b32 s1, s5
++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: s_mul_i64:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_clause 0x1
++; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
++; GFX11-NEXT: s_mov_b32 s3, 0x31016000
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mul_i32 s1, s6, s1
++; GFX11-NEXT: s_mul_hi_u32 s2, s6, s0
++; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
++; GFX11-NEXT: s_add_i32 s1, s2, s1
++; GFX11-NEXT: s_mul_i32 s2, s7, s0
++; GFX11-NEXT: s_mul_i32 s0, s6, s0
++; GFX11-NEXT: s_add_i32 s1, s1, s2
++; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
++; GFX11-NEXT: s_mov_b32 s2, -1
++; GFX11-NEXT: s_mov_b32 s0, s4
++; GFX11-NEXT: s_mov_b32 s1, s5
++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: s_mul_i64:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: ALU clause starting at 4:
++; EG-NEXT: MULHI * T0.X, KC0[2].W, KC0[3].Y,
++; EG-NEXT: MULLO_INT * T0.Y, KC0[2].W, KC0[3].Z,
++; EG-NEXT: ADD_INT T0.W, T0.X, PS,
++; EG-NEXT: MULLO_INT * T0.X, KC0[3].X, KC0[3].Y,
++; EG-NEXT: ADD_INT * T0.Y, PV.W, PS,
++; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
++; EG-NEXT: MULLO_INT * T0.X, KC0[2].W, KC0[3].Y,
++entry:
+ %mul = mul i64 %a, %b
+ store i64 %mul, ptr addrspace(1) %out, align 8
+ ret void
+ }
+
+-; FUNC-LABEL: {{^}}v_mul_i64:
+-; GCN: v_mul_lo_u32
+ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
++; SI-LABEL: v_mul_i64:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
++; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
++; SI-NEXT: s_mov_b32 s3, 0xf000
++; SI-NEXT: s_mov_b32 s2, -1
++; SI-NEXT: s_mov_b32 s10, s2
++; SI-NEXT: s_mov_b32 s11, s3
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_mov_b32 s12, s6
++; SI-NEXT: s_mov_b32 s13, s7
++; SI-NEXT: s_mov_b32 s14, s2
++; SI-NEXT: s_mov_b32 s15, s3
++; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
++; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
++; SI-NEXT: s_mov_b32 s0, s4
++; SI-NEXT: s_mov_b32 s1, s5
++; SI-NEXT: s_waitcnt vmcnt(0)
++; SI-NEXT: v_mul_lo_u32 v1, v2, v1
++; SI-NEXT: v_mul_hi_u32 v4, v2, v0
++; SI-NEXT: v_mul_lo_u32 v3, v3, v0
++; SI-NEXT: v_mul_lo_u32 v0, v2, v0
++; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4
++; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3
++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: v_mul_i64:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
++; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
++; VI-NEXT: s_mov_b32 s3, 0xf000
++; VI-NEXT: s_mov_b32 s2, -1
++; VI-NEXT: s_mov_b32 s10, s2
++; VI-NEXT: s_mov_b32 s11, s3
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_mov_b32 s12, s6
++; VI-NEXT: s_mov_b32 s13, s7
++; VI-NEXT: s_mov_b32 s14, s2
++; VI-NEXT: s_mov_b32 s15, s3
++; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
++; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
++; VI-NEXT: s_mov_b32 s0, s4
++; VI-NEXT: s_mov_b32 s1, s5
++; VI-NEXT: s_waitcnt vmcnt(0)
++; VI-NEXT: v_mul_lo_u32 v4, v2, v1
++; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0
++; VI-NEXT: v_mul_lo_u32 v0, v3, v0
++; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2
++; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
++; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: v_mul_i64:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
++; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
++; GFX9-NEXT: s_mov_b32 s3, 0xf000
++; GFX9-NEXT: s_mov_b32 s2, -1
++; GFX9-NEXT: s_mov_b32 s10, s2
++; GFX9-NEXT: s_mov_b32 s11, s3
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mov_b32 s12, s6
++; GFX9-NEXT: s_mov_b32 s13, s7
++; GFX9-NEXT: s_mov_b32 s14, s2
++; GFX9-NEXT: s_mov_b32 s15, s3
++; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
++; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
++; GFX9-NEXT: s_mov_b32 s0, s4
++; GFX9-NEXT: s_mov_b32 s1, s5
++; GFX9-NEXT: s_waitcnt vmcnt(0)
++; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1
++; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0
++; GFX9-NEXT: v_mul_lo_u32 v3, v3, v0
++; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0
++; GFX9-NEXT: v_add_u32_e32 v1, v4, v1
++; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: v_mul_i64:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_clause 0x1
++; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
++; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
++; GFX10-NEXT: s_mov_b32 s2, -1
++; GFX10-NEXT: s_mov_b32 s3, 0x31016000
++; GFX10-NEXT: s_mov_b32 s10, s2
++; GFX10-NEXT: s_mov_b32 s11, s3
++; GFX10-NEXT: s_mov_b32 s14, s2
++; GFX10-NEXT: s_mov_b32 s15, s3
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mov_b32 s12, s6
++; GFX10-NEXT: s_mov_b32 s13, s7
++; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
++; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
++; GFX10-NEXT: s_mov_b32 s0, s4
++; GFX10-NEXT: s_mov_b32 s1, s5
++; GFX10-NEXT: s_waitcnt vmcnt(0)
++; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1
++; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0
++; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0
++; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0
++; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1
++; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: v_mul_i64:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_clause 0x1
++; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
++; GFX11-NEXT: s_mov_b32 s10, -1
++; GFX11-NEXT: s_mov_b32 s11, 0x31016000
++; GFX11-NEXT: s_mov_b32 s2, s10
++; GFX11-NEXT: s_mov_b32 s3, s11
++; GFX11-NEXT: s_mov_b32 s14, s10
++; GFX11-NEXT: s_mov_b32 s15, s11
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mov_b32 s12, s6
++; GFX11-NEXT: s_mov_b32 s13, s7
++; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
++; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0
++; GFX11-NEXT: s_mov_b32 s8, s4
++; GFX11-NEXT: s_mov_b32 s9, s5
++; GFX11-NEXT: s_waitcnt vmcnt(0)
++; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1
++; GFX11-NEXT: v_mul_hi_u32 v4, v2, v0
++; GFX11-NEXT: v_mul_lo_u32 v3, v3, v0
++; GFX11-NEXT: v_mul_lo_u32 v0, v2, v0
++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
++; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1
++; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3
++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: v_mul_i64:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
++; EG-NEXT: TEX 1 @6
++; EG-NEXT: ALU 7, @12, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: Fetch clause starting at 6:
++; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
++; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
++; EG-NEXT: ALU clause starting at 10:
++; EG-NEXT: MOV T0.X, KC0[2].Z,
++; EG-NEXT: MOV * T1.X, KC0[2].W,
++; EG-NEXT: ALU clause starting at 12:
++; EG-NEXT: MULHI * T0.Z, T0.X, T1.X,
++; EG-NEXT: MULLO_INT * T0.W, T0.X, T1.Y,
++; EG-NEXT: ADD_INT T0.W, T0.Z, PS,
++; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.X,
++; EG-NEXT: ADD_INT * T0.Y, PV.W, PS,
++; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
++; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
++entry:
+ %a = load i64, ptr addrspace(1) %aptr, align 8
+ %b = load i64, ptr addrspace(1) %bptr, align 8
+ %mul = mul i64 %a, %b
+@@ -166,9 +1653,220 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
+ ret void
+ }
+
+-; FUNC-LABEL: {{^}}mul32_in_branch:
+-; GCN: s_mul_i32
+ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) {
++; SI-LABEL: mul32_in_branch:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_cmp_lg_u32 s2, 0
++; SI-NEXT: s_cbranch_scc0 .LBB13_2
++; SI-NEXT: ; %bb.1: ; %else
++; SI-NEXT: s_mul_i32 s6, s2, s3
++; SI-NEXT: s_mov_b64 s[4:5], 0
++; SI-NEXT: s_branch .LBB13_3
++; SI-NEXT: .LBB13_2:
++; SI-NEXT: s_mov_b64 s[4:5], -1
++; SI-NEXT: ; implicit-def: $sgpr6
++; SI-NEXT: .LBB13_3: ; %Flow
++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
++; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_mov_b64 vcc, vcc
++; SI-NEXT: s_cbranch_vccnz .LBB13_5
++; SI-NEXT: ; %bb.4: ; %if
++; SI-NEXT: s_mov_b32 s7, 0xf000
++; SI-NEXT: s_mov_b32 s6, -1
++; SI-NEXT: s_mov_b32 s4, s2
++; SI-NEXT: s_mov_b32 s5, s3
++; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
++; SI-NEXT: s_branch .LBB13_6
++; SI-NEXT: .LBB13_5:
++; SI-NEXT: v_mov_b32_e32 v0, s6
++; SI-NEXT: .LBB13_6: ; %endif
++; SI-NEXT: s_mov_b32 s3, 0xf000
++; SI-NEXT: s_mov_b32 s2, -1
++; SI-NEXT: s_waitcnt vmcnt(0)
++; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: mul32_in_branch:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_cmp_lg_u32 s2, 0
++; VI-NEXT: s_cbranch_scc0 .LBB13_2
++; VI-NEXT: ; %bb.1: ; %else
++; VI-NEXT: s_mul_i32 s6, s2, s3
++; VI-NEXT: s_mov_b64 s[4:5], 0
++; VI-NEXT: s_branch .LBB13_3
++; VI-NEXT: .LBB13_2:
++; VI-NEXT: s_mov_b64 s[4:5], -1
++; VI-NEXT: ; implicit-def: $sgpr6
++; VI-NEXT: .LBB13_3: ; %Flow
++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
++; VI-NEXT: s_cbranch_vccnz .LBB13_5
++; VI-NEXT: ; %bb.4: ; %if
++; VI-NEXT: s_mov_b32 s7, 0xf000
++; VI-NEXT: s_mov_b32 s6, -1
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_mov_b32 s4, s2
++; VI-NEXT: s_mov_b32 s5, s3
++; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
++; VI-NEXT: s_branch .LBB13_6
++; VI-NEXT: .LBB13_5:
++; VI-NEXT: v_mov_b32_e32 v0, s6
++; VI-NEXT: .LBB13_6: ; %endif
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_mov_b32 s3, 0xf000
++; VI-NEXT: s_mov_b32 s2, -1
++; VI-NEXT: s_waitcnt vmcnt(0)
++; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: mul32_in_branch:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_cmp_lg_u32 s2, 0
++; GFX9-NEXT: s_cbranch_scc0 .LBB13_2
++; GFX9-NEXT: ; %bb.1: ; %else
++; GFX9-NEXT: s_mul_i32 s6, s2, s3
++; GFX9-NEXT: s_mov_b64 s[4:5], 0
++; GFX9-NEXT: s_branch .LBB13_3
++; GFX9-NEXT: .LBB13_2:
++; GFX9-NEXT: s_mov_b64 s[4:5], -1
++; GFX9-NEXT: ; implicit-def: $sgpr6
++; GFX9-NEXT: .LBB13_3: ; %Flow
++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
++; GFX9-NEXT: s_cbranch_vccnz .LBB13_5
++; GFX9-NEXT: ; %bb.4: ; %if
++; GFX9-NEXT: s_mov_b32 s7, 0xf000
++; GFX9-NEXT: s_mov_b32 s6, -1
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mov_b32 s4, s2
++; GFX9-NEXT: s_mov_b32 s5, s3
++; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0
++; GFX9-NEXT: s_branch .LBB13_6
++; GFX9-NEXT: .LBB13_5:
++; GFX9-NEXT: v_mov_b32_e32 v0, s6
++; GFX9-NEXT: .LBB13_6: ; %endif
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mov_b32 s3, 0xf000
++; GFX9-NEXT: s_mov_b32 s2, -1
++; GFX9-NEXT: s_waitcnt vmcnt(0)
++; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: mul32_in_branch:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
++; GFX10-NEXT: s_mov_b32 s4, 0
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_cmp_lg_u32 s2, 0
++; GFX10-NEXT: s_cbranch_scc0 .LBB13_2
++; GFX10-NEXT: ; %bb.1: ; %else
++; GFX10-NEXT: s_mul_i32 s5, s2, s3
++; GFX10-NEXT: s_branch .LBB13_3
++; GFX10-NEXT: .LBB13_2:
++; GFX10-NEXT: s_mov_b32 s4, -1
++; GFX10-NEXT: ; implicit-def: $sgpr5
++; GFX10-NEXT: .LBB13_3: ; %Flow
++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
++; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
++; GFX10-NEXT: s_cbranch_vccnz .LBB13_5
++; GFX10-NEXT: ; %bb.4: ; %if
++; GFX10-NEXT: s_mov_b32 s7, 0x31016000
++; GFX10-NEXT: s_mov_b32 s6, -1
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mov_b32 s4, s2
++; GFX10-NEXT: s_mov_b32 s5, s3
++; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0
++; GFX10-NEXT: s_branch .LBB13_6
++; GFX10-NEXT: .LBB13_5:
++; GFX10-NEXT: v_mov_b32_e32 v0, s5
++; GFX10-NEXT: .LBB13_6: ; %endif
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mov_b32 s3, 0x31016000
++; GFX10-NEXT: s_mov_b32 s2, -1
++; GFX10-NEXT: s_waitcnt vmcnt(0)
++; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: mul32_in_branch:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
++; GFX11-NEXT: s_mov_b32 s4, 0
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_cmp_lg_u32 s2, 0
++; GFX11-NEXT: s_cbranch_scc0 .LBB13_2
++; GFX11-NEXT: ; %bb.1: ; %else
++; GFX11-NEXT: s_mul_i32 s5, s2, s3
++; GFX11-NEXT: s_branch .LBB13_3
++; GFX11-NEXT: .LBB13_2:
++; GFX11-NEXT: s_mov_b32 s4, -1
++; GFX11-NEXT: ; implicit-def: $sgpr5
++; GFX11-NEXT: .LBB13_3: ; %Flow
++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
++; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
++; GFX11-NEXT: s_cbranch_vccnz .LBB13_5
++; GFX11-NEXT: ; %bb.4: ; %if
++; GFX11-NEXT: s_mov_b32 s7, 0x31016000
++; GFX11-NEXT: s_mov_b32 s6, -1
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mov_b32 s4, s2
++; GFX11-NEXT: s_mov_b32 s5, s3
++; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
++; GFX11-NEXT: s_branch .LBB13_6
++; GFX11-NEXT: .LBB13_5:
++; GFX11-NEXT: v_mov_b32_e32 v0, s5
++; GFX11-NEXT: .LBB13_6: ; %endif
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mov_b32 s3, 0x31016000
++; GFX11-NEXT: s_mov_b32 s2, -1
++; GFX11-NEXT: s_waitcnt vmcnt(0)
++; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: mul32_in_branch:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[]
++; EG-NEXT: JUMP @3 POP:1
++; EG-NEXT: ALU_POP_AFTER 4, @18, KC0[CB0:0-32], KC1[]
++; EG-NEXT: ALU_PUSH_BEFORE 2, @23, KC0[CB0:0-32], KC1[]
++; EG-NEXT: JUMP @8 POP:1
++; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
++; EG-NEXT: TEX 0 @12
++; EG-NEXT: POP @8 POP:1
++; EG-NEXT: ALU 1, @27, KC0[], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: Fetch clause starting at 12:
++; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
++; EG-NEXT: ALU clause starting at 14:
++; EG-NEXT: MOV T0.W, literal.x,
++; EG-NEXT: SETNE_INT * T1.W, KC0[2].W, 0.0,
++; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
++; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
++; EG-NEXT: ALU clause starting at 18:
++; EG-NEXT: MOV T1.W, KC0[2].W,
++; EG-NEXT: MOV * T2.W, KC0[3].X,
++; EG-NEXT: MOV T0.W, literal.x,
++; EG-NEXT: MULLO_INT * T0.X, PV.W, PS,
++; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
++; EG-NEXT: ALU clause starting at 23:
++; EG-NEXT: MOV T1.W, KC0[2].Y,
++; EG-NEXT: SETE_INT * T0.W, T0.W, 0.0,
++; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
++; EG-NEXT: ALU clause starting at 26:
++; EG-NEXT: MOV * T0.X, KC0[2].Z,
++; EG-NEXT: ALU clause starting at 27:
++; EG-NEXT: LSHR * T1.X, T1.W, literal.x,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ entry:
+ %0 = icmp eq i32 %a, 0
+ br i1 %0, label %if, label %else
+@@ -187,12 +1885,227 @@ endif:
+ ret void
+ }
+
+-; FUNC-LABEL: {{^}}mul64_in_branch:
+-; SI-DAG: s_mul_i32
+-; SI-DAG: v_mul_hi_u32
+-; VI: v_mad_u64_u32
+-; GCN: s_endpgm
+ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) {
++; SI-LABEL: mul64_in_branch:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
++; SI-NEXT: s_mov_b64 s[8:9], 0
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
++; SI-NEXT: s_and_b64 vcc, exec, s[10:11]
++; SI-NEXT: s_cbranch_vccz .LBB14_4
++; SI-NEXT: ; %bb.1: ; %else
++; SI-NEXT: v_mov_b32_e32 v0, s6
++; SI-NEXT: v_mul_hi_u32 v0, s4, v0
++; SI-NEXT: s_mul_i32 s7, s4, s7
++; SI-NEXT: s_mul_i32 s5, s5, s6
++; SI-NEXT: s_mul_i32 s4, s4, s6
++; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0
++; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v0
++; SI-NEXT: v_mov_b32_e32 v0, s4
++; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
++; SI-NEXT: s_cbranch_vccnz .LBB14_3
++; SI-NEXT: .LBB14_2: ; %if
++; SI-NEXT: s_mov_b32 s7, 0xf000
++; SI-NEXT: s_mov_b32 s6, -1
++; SI-NEXT: s_mov_b32 s4, s2
++; SI-NEXT: s_mov_b32 s5, s3
++; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
++; SI-NEXT: .LBB14_3: ; %endif
++; SI-NEXT: s_mov_b32 s3, 0xf000
++; SI-NEXT: s_mov_b32 s2, -1
++; SI-NEXT: s_waitcnt vmcnt(0)
++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
++; SI-NEXT: s_endpgm
++; SI-NEXT: .LBB14_4:
++; SI-NEXT: ; implicit-def: $vgpr0_vgpr1
++; SI-NEXT: s_branch .LBB14_2
++;
++; VI-LABEL: mul64_in_branch:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
++; VI-NEXT: s_mov_b64 s[8:9], 0
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
++; VI-NEXT: s_cbranch_scc0 .LBB14_4
++; VI-NEXT: ; %bb.1: ; %else
++; VI-NEXT: v_mov_b32_e32 v0, s6
++; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0
++; VI-NEXT: s_mul_i32 s4, s4, s7
++; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
++; VI-NEXT: s_mul_i32 s4, s5, s6
++; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
++; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
++; VI-NEXT: s_cbranch_vccnz .LBB14_3
++; VI-NEXT: .LBB14_2: ; %if
++; VI-NEXT: s_mov_b32 s7, 0xf000
++; VI-NEXT: s_mov_b32 s6, -1
++; VI-NEXT: s_mov_b32 s4, s2
++; VI-NEXT: s_mov_b32 s5, s3
++; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
++; VI-NEXT: .LBB14_3: ; %endif
++; VI-NEXT: s_mov_b32 s3, 0xf000
++; VI-NEXT: s_mov_b32 s2, -1
++; VI-NEXT: s_waitcnt vmcnt(0)
++; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
++; VI-NEXT: s_endpgm
++; VI-NEXT: .LBB14_4:
++; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
++; VI-NEXT: s_branch .LBB14_2
++;
++; GFX9-LABEL: mul64_in_branch:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
++; GFX9-NEXT: s_mov_b64 s[8:9], 0
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
++; GFX9-NEXT: s_cbranch_scc0 .LBB14_3
++; GFX9-NEXT: ; %bb.1: ; %else
++; GFX9-NEXT: s_mul_i32 s7, s4, s7
++; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6
++; GFX9-NEXT: s_add_i32 s7, s10, s7
++; GFX9-NEXT: s_mul_i32 s5, s5, s6
++; GFX9-NEXT: s_add_i32 s5, s7, s5
++; GFX9-NEXT: s_mul_i32 s4, s4, s6
++; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9]
++; GFX9-NEXT: s_cbranch_vccnz .LBB14_4
++; GFX9-NEXT: .LBB14_2: ; %if
++; GFX9-NEXT: s_mov_b32 s7, 0xf000
++; GFX9-NEXT: s_mov_b32 s6, -1
++; GFX9-NEXT: s_mov_b32 s4, s2
++; GFX9-NEXT: s_mov_b32 s5, s3
++; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
++; GFX9-NEXT: s_branch .LBB14_5
++; GFX9-NEXT: .LBB14_3:
++; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5
++; GFX9-NEXT: s_branch .LBB14_2
++; GFX9-NEXT: .LBB14_4:
++; GFX9-NEXT: v_mov_b32_e32 v0, s4
++; GFX9-NEXT: v_mov_b32_e32 v1, s5
++; GFX9-NEXT: .LBB14_5: ; %endif
++; GFX9-NEXT: s_mov_b32 s3, 0xf000
++; GFX9-NEXT: s_mov_b32 s2, -1
++; GFX9-NEXT: s_waitcnt vmcnt(0)
++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: mul64_in_branch:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
++; GFX10-NEXT: s_cbranch_scc0 .LBB14_3
++; GFX10-NEXT: ; %bb.1: ; %else
++; GFX10-NEXT: s_mul_i32 s7, s4, s7
++; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6
++; GFX10-NEXT: s_mul_i32 s5, s5, s6
++; GFX10-NEXT: s_add_i32 s7, s8, s7
++; GFX10-NEXT: s_mul_i32 s4, s4, s6
++; GFX10-NEXT: s_add_i32 s5, s7, s5
++; GFX10-NEXT: s_mov_b32 s6, 0
++; GFX10-NEXT: s_cbranch_execnz .LBB14_4
++; GFX10-NEXT: .LBB14_2: ; %if
++; GFX10-NEXT: s_mov_b32 s7, 0x31016000
++; GFX10-NEXT: s_mov_b32 s6, -1
++; GFX10-NEXT: s_mov_b32 s4, s2
++; GFX10-NEXT: s_mov_b32 s5, s3
++; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
++; GFX10-NEXT: s_branch .LBB14_5
++; GFX10-NEXT: .LBB14_3:
++; GFX10-NEXT: s_mov_b32 s6, -1
++; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5
++; GFX10-NEXT: s_branch .LBB14_2
++; GFX10-NEXT: .LBB14_4:
++; GFX10-NEXT: v_mov_b32_e32 v0, s4
++; GFX10-NEXT: v_mov_b32_e32 v1, s5
++; GFX10-NEXT: .LBB14_5: ; %endif
++; GFX10-NEXT: s_mov_b32 s3, 0x31016000
++; GFX10-NEXT: s_mov_b32 s2, -1
++; GFX10-NEXT: s_waitcnt vmcnt(0)
++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: mul64_in_branch:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
++; GFX11-NEXT: s_cbranch_scc0 .LBB14_3
++; GFX11-NEXT: ; %bb.1: ; %else
++; GFX11-NEXT: s_mul_i32 s7, s4, s7
++; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6
++; GFX11-NEXT: s_mul_i32 s5, s5, s6
++; GFX11-NEXT: s_add_i32 s7, s8, s7
++; GFX11-NEXT: s_mul_i32 s4, s4, s6
++; GFX11-NEXT: s_add_i32 s5, s7, s5
++; GFX11-NEXT: s_mov_b32 s6, 0
++; GFX11-NEXT: s_cbranch_execnz .LBB14_4
++; GFX11-NEXT: .LBB14_2: ; %if
++; GFX11-NEXT: s_mov_b32 s7, 0x31016000
++; GFX11-NEXT: s_mov_b32 s6, -1
++; GFX11-NEXT: s_mov_b32 s4, s2
++; GFX11-NEXT: s_mov_b32 s5, s3
++; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0
++; GFX11-NEXT: s_branch .LBB14_5
++; GFX11-NEXT: .LBB14_3:
++; GFX11-NEXT: s_mov_b32 s6, -1
++; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5
++; GFX11-NEXT: s_branch .LBB14_2
++; GFX11-NEXT: .LBB14_4:
++; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
++; GFX11-NEXT: .LBB14_5: ; %endif
++; GFX11-NEXT: s_mov_b32 s3, 0x31016000
++; GFX11-NEXT: s_mov_b32 s2, -1
++; GFX11-NEXT: s_waitcnt vmcnt(0)
++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: mul64_in_branch:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[]
++; EG-NEXT: JUMP @3 POP:1
++; EG-NEXT: ALU_POP_AFTER 11, @19, KC0[CB0:0-32], KC1[]
++; EG-NEXT: ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[]
++; EG-NEXT: JUMP @8 POP:1
++; EG-NEXT: ALU 0, @34, KC0[CB0:0-32], KC1[]
++; EG-NEXT: TEX 0 @12
++; EG-NEXT: POP @8 POP:1
++; EG-NEXT: ALU 1, @35, KC0[], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: Fetch clause starting at 12:
++; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
++; EG-NEXT: ALU clause starting at 14:
++; EG-NEXT: OR_INT T0.W, KC0[2].W, KC0[3].X,
++; EG-NEXT: MOV * T1.W, literal.x,
++; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
++; EG-NEXT: SETNE_INT * T0.W, PV.W, 0.0,
++; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
++; EG-NEXT: ALU clause starting at 19:
++; EG-NEXT: MOV T0.W, KC0[2].W,
++; EG-NEXT: MOV * T1.W, KC0[3].Z,
++; EG-NEXT: MOV T2.W, KC0[3].Y,
++; EG-NEXT: MULLO_INT * T0.X, PV.W, PS,
++; EG-NEXT: MOV T1.W, KC0[3].X,
++; EG-NEXT: MULHI * T0.Y, T0.W, PV.W,
++; EG-NEXT: ADD_INT T3.W, PS, T0.X,
++; EG-NEXT: MULLO_INT * T0.X, PV.W, T2.W,
++; EG-NEXT: ADD_INT T0.Y, PV.W, PS,
++; EG-NEXT: MOV T1.W, literal.x,
++; EG-NEXT: MULLO_INT * T0.X, T0.W, T2.W,
++; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
++; EG-NEXT: ALU clause starting at 31:
++; EG-NEXT: MOV T0.W, KC0[2].Y,
++; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0,
++; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
++; EG-NEXT: ALU clause starting at 34:
++; EG-NEXT: MOV * T0.X, KC0[2].Z,
++; EG-NEXT: ALU clause starting at 35:
++; EG-NEXT: LSHR * T1.X, T0.W, literal.x,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+@@ -211,79 +2124,558 @@ endif:
+ ret void
+ }
+
+-; FIXME: Load dwordx4
+-; FUNC-LABEL: {{^}}s_mul_i128:
+-; GCN: s_load_dwordx4
+-; GCN: s_load_dwordx4
+-
+-; SI: v_mul_hi_u32
+-; SI: v_mul_hi_u32
+-; SI: s_mul_i32
+-; SI: v_mul_hi_u32
+-; SI: s_mul_i32
+-; SI: s_mul_i32
+-
+-; SI-DAG: s_mul_i32
+-; SI-DAG: v_mul_hi_u32
+-; SI-DAG: v_mul_hi_u32
+-; SI-DAG: s_mul_i32
+-; SI-DAG: s_mul_i32
+-; SI-DAG: v_mul_hi_u32
+-
+-; VI-DAG: v_mad_u64_u32
+-; VI-DAG: v_mad_u64_u32
+-; VI-DAG: v_mad_u64_u32
+-; VI-DAG: v_mad_u64_u32
+-; VI-DAG: v_mad_u64_u32
+-; VI-DAG: v_mad_u64_u32
+-; VI-DAG: s_mul_i32
+-; VI-DAG: s_mul_i32
+-; VI-DAG: s_mul_i32
+-; VI-DAG: s_mul_i32
+-
+-
+-; GCN: buffer_store_dwordx4
+ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
++; SI-LABEL: s_mul_i128:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x13
++; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x1f
++; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
++; SI-NEXT: s_mov_b32 s3, 0xf000
++; SI-NEXT: s_mov_b32 s2, -1
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: v_mov_b32_e32 v0, s6
++; SI-NEXT: v_mul_hi_u32 v0, s8, v0
++; SI-NEXT: v_mov_b32_e32 v1, s4
++; SI-NEXT: v_mul_hi_u32 v1, s10, v1
++; SI-NEXT: s_mul_i32 s7, s8, s7
++; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0
++; SI-NEXT: s_mul_i32 s7, s10, s5
++; SI-NEXT: s_mul_i32 s12, s9, s6
++; SI-NEXT: s_mul_i32 s6, s8, s6
++; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1
++; SI-NEXT: s_mul_i32 s7, s11, s4
++; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0
++; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1
++; SI-NEXT: s_mul_i32 s7, s10, s4
++; SI-NEXT: v_mov_b32_e32 v2, s6
++; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2
++; SI-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc
++; SI-NEXT: v_mov_b32_e32 v1, s8
++; SI-NEXT: v_mul_hi_u32 v5, s4, v1
++; SI-NEXT: v_mul_hi_u32 v1, s5, v1
++; SI-NEXT: v_mov_b32_e32 v3, s9
++; SI-NEXT: v_mul_hi_u32 v4, s4, v3
++; SI-NEXT: s_mul_i32 s7, s5, s8
++; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5
++; SI-NEXT: s_mul_i32 s6, s4, s9
++; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
++; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v5
++; SI-NEXT: v_mul_hi_u32 v3, s5, v3
++; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
++; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4
++; SI-NEXT: s_mul_i32 s5, s5, s9
++; SI-NEXT: v_addc_u32_e64 v5, s[6:7], 0, 0, vcc
++; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v4
++; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
++; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2
++; SI-NEXT: s_mul_i32 s4, s4, s8
++; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc
++; SI-NEXT: v_mov_b32_e32 v0, s4
++; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: s_mul_i128:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
++; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c
++; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
++; VI-NEXT: v_mov_b32_e32 v5, 0
++; VI-NEXT: s_mov_b32 s3, 0xf000
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: v_mov_b32_e32 v0, s6
++; VI-NEXT: v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0
++; VI-NEXT: s_mul_i32 s7, s8, s7
++; VI-NEXT: v_mov_b32_e32 v6, s8
++; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3
++; VI-NEXT: s_mul_i32 s12, s9, s6
++; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v6, 0
++; VI-NEXT: v_add_u32_e32 v3, vcc, s12, v3
++; VI-NEXT: v_mov_b32_e32 v4, v1
++; VI-NEXT: v_mad_u64_u32 v[6:7], s[6:7], s5, v6, v[4:5]
++; VI-NEXT: v_mov_b32_e32 v8, s4
++; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], s10, v8, v[2:3]
++; VI-NEXT: v_mov_b32_e32 v3, v7
++; VI-NEXT: v_mov_b32_e32 v7, v5
++; VI-NEXT: v_mov_b32_e32 v8, s9
++; VI-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s4, v8, v[6:7]
++; VI-NEXT: s_mul_i32 s8, s11, s4
++; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v2
++; VI-NEXT: v_mov_b32_e32 v2, v5
++; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v2
++; VI-NEXT: v_addc_u32_e64 v3, s[6:7], 0, 0, vcc
++; VI-NEXT: s_mul_i32 s8, s10, s5
++; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s5, v8, v[2:3]
++; VI-NEXT: v_add_u32_e32 v5, vcc, s8, v6
++; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
++; VI-NEXT: s_mov_b32 s2, -1
++; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
++; VI-NEXT: v_mov_b32_e32 v1, v4
++; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: s_mul_i128:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c
++; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c
++; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
++; GFX9-NEXT: s_mov_b32 s7, 0xf000
++; GFX9-NEXT: s_mov_b32 s6, -1
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: s_mul_i32 s0, s12, s11
++; GFX9-NEXT: s_mul_hi_u32 s1, s12, s10
++; GFX9-NEXT: s_mul_i32 s2, s14, s9
++; GFX9-NEXT: s_mul_hi_u32 s3, s14, s8
++; GFX9-NEXT: s_add_i32 s0, s1, s0
++; GFX9-NEXT: s_mul_i32 s1, s13, s10
++; GFX9-NEXT: s_add_i32 s2, s3, s2
++; GFX9-NEXT: s_mul_i32 s3, s15, s8
++; GFX9-NEXT: s_add_i32 s0, s0, s1
++; GFX9-NEXT: s_mul_i32 s1, s12, s10
++; GFX9-NEXT: s_add_i32 s2, s2, s3
++; GFX9-NEXT: s_mul_i32 s3, s14, s8
++; GFX9-NEXT: s_add_u32 s3, s3, s1
++; GFX9-NEXT: s_addc_u32 s2, s2, s0
++; GFX9-NEXT: s_mul_i32 s14, s9, s12
++; GFX9-NEXT: s_mul_hi_u32 s15, s8, s12
++; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12
++; GFX9-NEXT: s_add_u32 s14, s14, s15
++; GFX9-NEXT: s_mul_i32 s1, s8, s13
++; GFX9-NEXT: s_addc_u32 s11, s11, 0
++; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13
++; GFX9-NEXT: s_add_u32 s1, s1, s14
++; GFX9-NEXT: s_addc_u32 s10, s10, 0
++; GFX9-NEXT: s_add_u32 s10, s11, s10
++; GFX9-NEXT: s_addc_u32 s11, 0, 0
++; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13
++; GFX9-NEXT: s_mul_i32 s9, s9, s13
++; GFX9-NEXT: s_add_u32 s9, s9, s10
++; GFX9-NEXT: s_addc_u32 s10, s14, s11
++; GFX9-NEXT: s_mov_b32 s0, 0
++; GFX9-NEXT: s_add_u32 s9, s9, s3
++; GFX9-NEXT: s_addc_u32 s10, s10, s2
++; GFX9-NEXT: s_mul_i32 s2, s8, s12
++; GFX9-NEXT: s_mov_b32 s3, s0
++; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
++; GFX9-NEXT: v_mov_b32_e32 v0, s0
++; GFX9-NEXT: v_mov_b32_e32 v1, s1
++; GFX9-NEXT: v_mov_b32_e32 v2, s9
++; GFX9-NEXT: v_mov_b32_e32 v3, s10
++; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: s_mul_i128:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_clause 0x1
++; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
++; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c
++; GFX10-NEXT: s_mov_b32 s2, 0
++; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
++; GFX10-NEXT: s_mov_b32 s13, s2
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_mul_i32 s3, s8, s7
++; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6
++; GFX10-NEXT: s_mul_i32 s14, s10, s5
++; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4
++; GFX10-NEXT: s_mul_i32 s12, s9, s6
++; GFX10-NEXT: s_mul_i32 s11, s11, s4
++; GFX10-NEXT: s_add_i32 s3, s7, s3
++; GFX10-NEXT: s_add_i32 s7, s15, s14
++; GFX10-NEXT: s_mul_i32 s6, s8, s6
++; GFX10-NEXT: s_mul_i32 s10, s10, s4
++; GFX10-NEXT: s_add_i32 s3, s3, s12
++; GFX10-NEXT: s_add_i32 s7, s7, s11
++; GFX10-NEXT: s_mul_i32 s19, s5, s8
++; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8
++; GFX10-NEXT: s_add_u32 s6, s10, s6
++; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8
++; GFX10-NEXT: s_addc_u32 s7, s7, s3
++; GFX10-NEXT: s_mul_i32 s17, s4, s9
++; GFX10-NEXT: s_add_u32 s3, s19, s20
++; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9
++; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9
++; GFX10-NEXT: s_mul_i32 s5, s5, s9
++; GFX10-NEXT: s_addc_u32 s9, s18, 0
++; GFX10-NEXT: s_add_u32 s3, s17, s3
++; GFX10-NEXT: s_addc_u32 s10, s16, 0
++; GFX10-NEXT: s_mul_i32 s12, s4, s8
++; GFX10-NEXT: s_add_u32 s4, s9, s10
++; GFX10-NEXT: s_addc_u32 s8, 0, 0
++; GFX10-NEXT: s_add_u32 s4, s5, s4
++; GFX10-NEXT: s_addc_u32 s5, s21, s8
++; GFX10-NEXT: s_add_u32 s4, s4, s6
++; GFX10-NEXT: s_addc_u32 s5, s5, s7
++; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
++; GFX10-NEXT: v_mov_b32_e32 v2, s4
++; GFX10-NEXT: v_mov_b32_e32 v0, s2
++; GFX10-NEXT: v_mov_b32_e32 v1, s3
++; GFX10-NEXT: v_mov_b32_e32 v3, s5
++; GFX10-NEXT: s_mov_b32 s3, 0x31016000
++; GFX10-NEXT: s_mov_b32 s2, -1
++; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: s_mul_i128:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_clause 0x2
++; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x4c
++; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c
++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
++; GFX11-NEXT: s_mov_b32 s2, 0
++; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
++; GFX11-NEXT: s_mov_b32 s13, s2
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_mul_i32 s3, s8, s7
++; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6
++; GFX11-NEXT: s_mul_i32 s14, s10, s5
++; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4
++; GFX11-NEXT: s_mul_i32 s12, s9, s6
++; GFX11-NEXT: s_mul_i32 s11, s11, s4
++; GFX11-NEXT: s_add_i32 s3, s7, s3
++; GFX11-NEXT: s_add_i32 s7, s15, s14
++; GFX11-NEXT: s_mul_i32 s6, s8, s6
++; GFX11-NEXT: s_mul_i32 s10, s10, s4
++; GFX11-NEXT: s_add_i32 s3, s3, s12
++; GFX11-NEXT: s_add_i32 s7, s7, s11
++; GFX11-NEXT: s_mul_i32 s19, s5, s8
++; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8
++; GFX11-NEXT: s_add_u32 s6, s10, s6
++; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8
++; GFX11-NEXT: s_addc_u32 s7, s7, s3
++; GFX11-NEXT: s_mul_i32 s17, s4, s9
++; GFX11-NEXT: s_add_u32 s3, s19, s20
++; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9
++; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9
++; GFX11-NEXT: s_mul_i32 s5, s5, s9
++; GFX11-NEXT: s_addc_u32 s9, s18, 0
++; GFX11-NEXT: s_add_u32 s3, s17, s3
++; GFX11-NEXT: s_addc_u32 s10, s16, 0
++; GFX11-NEXT: s_mul_i32 s12, s4, s8
++; GFX11-NEXT: s_add_u32 s4, s9, s10
++; GFX11-NEXT: s_addc_u32 s8, 0, 0
++; GFX11-NEXT: s_add_u32 s4, s5, s4
++; GFX11-NEXT: s_addc_u32 s5, s21, s8
++; GFX11-NEXT: s_add_u32 s4, s4, s6
++; GFX11-NEXT: s_addc_u32 s5, s5, s7
++; GFX11-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
++; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
++; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
++; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
++; GFX11-NEXT: s_mov_b32 s3, 0x31016000
++; GFX11-NEXT: s_mov_b32 s2, -1
++; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: s_mul_i128:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 41, @4, KC0[CB0:0-32], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: ALU clause starting at 4:
++; EG-NEXT: MULLO_INT * T0.X, KC0[5].X, KC0[8].X,
++; EG-NEXT: MULHI * T0.Y, KC0[5].X, KC0[8].X,
++; EG-NEXT: MULLO_INT * T0.Z, KC0[8].Y, KC0[4].W,
++; EG-NEXT: MULLO_INT * T0.W, KC0[8].X, KC0[5].Y,
++; EG-NEXT: MULHI * T1.X, KC0[5].X, KC0[7].W,
++; EG-NEXT: MULHI * T1.Y, KC0[4].W, KC0[8].X,
++; EG-NEXT: MULHI * T1.Z, KC0[8].Y, KC0[4].W,
++; EG-NEXT: MULLO_INT * T1.W, KC0[8].Y, KC0[5].X,
++; EG-NEXT: MULHI * T2.X, KC0[7].W, KC0[5].Y,
++; EG-NEXT: MULLO_INT * T2.Y, KC0[5].X, KC0[7].W,
++; EG-NEXT: MULHI * T2.Z, KC0[4].W, KC0[7].W,
++; EG-NEXT: ADD_INT T2.W, T2.Y, PS,
++; EG-NEXT: MULLO_INT * T3.X, KC0[4].W, KC0[8].X,
++; EG-NEXT: ADDC_UINT T2.Z, T2.Y, T2.Z,
++; EG-NEXT: ADDC_UINT T3.W, PS, PV.W,
++; EG-NEXT: MULLO_INT * T2.Y, KC0[7].W, KC0[5].Z,
++; EG-NEXT: ADD_INT T2.X, T2.X, PS,
++; EG-NEXT: ADD_INT T2.Y, T1.Z, T1.W,
++; EG-NEXT: ADD_INT T1.Z, T1.Y, PV.W,
++; EG-NEXT: ADD_INT T1.W, T1.X, PV.Z, BS:VEC_120/SCL_212
++; EG-NEXT: MULLO_INT * T1.X, KC0[8].Z, KC0[4].W,
++; EG-NEXT: ADD_INT T4.X, PV.W, PV.Z,
++; EG-NEXT: ADDC_UINT T1.Y, PV.W, PV.Z,
++; EG-NEXT: ADD_INT T1.Z, PV.Y, PS,
++; EG-NEXT: ADD_INT T0.W, PV.X, T0.W,
++; EG-NEXT: MULLO_INT * T1.X, KC0[7].W, KC0[5].Y,
++; EG-NEXT: ADD_INT T2.Y, PV.Z, PV.W,
++; EG-NEXT: ADDC_UINT T1.Z, T0.Z, PS,
++; EG-NEXT: ADD_INT T0.W, T0.Y, PV.Y,
++; EG-NEXT: ADDC_UINT * T1.W, T0.X, PV.X,
++; EG-NEXT: ADD_INT T0.Y, T0.X, T4.X,
++; EG-NEXT: ADD_INT T0.Z, T0.Z, T1.X, BS:VEC_021/SCL_122
++; EG-NEXT: ADD_INT T0.W, PV.W, PS,
++; EG-NEXT: ADD_INT * T1.W, PV.Y, PV.Z,
++; EG-NEXT: ADD_INT T0.W, PV.W, PS,
++; EG-NEXT: ADDC_UINT * T1.W, PV.Y, PV.Z,
++; EG-NEXT: ADD_INT * T0.W, PV.W, PS,
++; EG-NEXT: ADD_INT * T0.Z, T0.Y, T0.Z,
++; EG-NEXT: ADD_INT * T0.Y, T3.X, T2.W,
++; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
++; EG-NEXT: MULLO_INT * T0.X, KC0[4].W, KC0[7].W,
++entry:
+ %mul = mul i128 %a, %b
+ store i128 %mul, ptr addrspace(1) %out
+ ret void
+ }
+
+-; FUNC-LABEL: {{^}}v_mul_i128:
+-; GCN: {{buffer|flat}}_load_dwordx4
+-; GCN: {{buffer|flat}}_load_dwordx4
+-
+-; SI-DAG: v_mul_lo_u32
+-; SI-DAG: v_mul_hi_u32
+-; SI-DAG: v_mul_hi_u32
+-; SI-DAG: v_mul_lo_u32
+-; SI-DAG: v_mul_hi_u32
+-; SI-DAG: v_mul_hi_u32
+-; SI-DAG: v_mul_lo_u32
+-; SI-DAG: v_mul_lo_u32
+-; SI-DAG: v_add_i32_e32
+-
+-; SI-DAG: v_mul_hi_u32
+-; SI-DAG: v_mul_lo_u32
+-; SI-DAG: v_mul_hi_u32
+-; SI-DAG: v_mul_lo_u32
+-; SI-DAG: v_mul_lo_u32
+-; SI-DAG: v_mul_lo_u32
+-; SI-DAG: v_mul_lo_u32
+-; SI-DAG: v_mul_lo_u32
+-
+-; VI-DAG: v_mad_u64_u32
+-; VI-DAG: v_mad_u64_u32
+-; VI-DAG: v_mad_u64_u32
+-; VI-DAG: v_mad_u64_u32
+-; VI-DAG: v_mad_u64_u32
+-; VI-DAG: v_mad_u64_u32
+-; VI-DAG: v_mul_lo_u32
+-; VI-DAG: v_mul_lo_u32
+-; VI-DAG: v_mul_lo_u32
+-
+-; GCN: {{buffer|flat}}_store_dwordx4
+ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
++; SI-LABEL: v_mul_i128:
++; SI: ; %bb.0: ; %entry
++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
++; SI-NEXT: s_mov_b32 s7, 0xf000
++; SI-NEXT: s_mov_b32 s6, 0
++; SI-NEXT: v_lshlrev_b32_e32 v8, 4, v0
++; SI-NEXT: v_mov_b32_e32 v9, 0
++; SI-NEXT: s_waitcnt lgkmcnt(0)
++; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
++; SI-NEXT: s_mov_b64 s[0:1], s[2:3]
++; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
++; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
++; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64
++; SI-NEXT: s_waitcnt vmcnt(0)
++; SI-NEXT: v_mul_lo_u32 v3, v4, v3
++; SI-NEXT: v_mul_hi_u32 v10, v4, v2
++; SI-NEXT: v_mul_lo_u32 v12, v6, v1
++; SI-NEXT: v_mul_hi_u32 v13, v6, v0
++; SI-NEXT: v_mul_lo_u32 v17, v1, v4
++; SI-NEXT: v_mul_hi_u32 v18, v0, v4
++; SI-NEXT: v_mul_lo_u32 v11, v5, v2
++; SI-NEXT: v_mul_lo_u32 v7, v7, v0
++; SI-NEXT: v_mul_hi_u32 v16, v1, v4
++; SI-NEXT: v_mul_lo_u32 v15, v0, v5
++; SI-NEXT: v_mul_hi_u32 v14, v0, v5
++; SI-NEXT: v_mul_hi_u32 v19, v1, v5
++; SI-NEXT: v_mul_lo_u32 v5, v1, v5
++; SI-NEXT: v_add_i32_e32 v1, vcc, v10, v3
++; SI-NEXT: v_add_i32_e32 v3, vcc, v13, v12
++; SI-NEXT: v_mul_lo_u32 v2, v4, v2
++; SI-NEXT: v_mul_lo_u32 v6, v6, v0
++; SI-NEXT: v_mul_lo_u32 v0, v0, v4
++; SI-NEXT: v_add_i32_e32 v4, vcc, v17, v18
++; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v16, vcc
++; SI-NEXT: v_add_i32_e32 v11, vcc, v1, v11
++; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7
++; SI-NEXT: v_add_i32_e32 v1, vcc, v15, v4
++; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v14, vcc
++; SI-NEXT: v_add_i32_e32 v2, vcc, v6, v2
++; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
++; SI-NEXT: v_add_i32_e32 v4, vcc, v10, v4
++; SI-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, vcc
++; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4
++; SI-NEXT: v_addc_u32_e32 v5, vcc, v19, v6, vcc
++; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2
++; SI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
++; SI-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64
++; SI-NEXT: s_endpgm
++;
++; VI-LABEL: v_mul_i128:
++; VI: ; %bb.0: ; %entry
++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
++; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0
++; VI-NEXT: v_mov_b32_e32 v11, 0
++; VI-NEXT: s_waitcnt lgkmcnt(0)
++; VI-NEXT: v_mov_b32_e32 v1, s1
++; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
++; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
++; VI-NEXT: v_mov_b32_e32 v3, s3
++; VI-NEXT: v_add_u32_e32 v8, vcc, s2, v2
++; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
++; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
++; VI-NEXT: flat_load_dwordx4 v[4:7], v[8:9]
++; VI-NEXT: s_waitcnt vmcnt(0)
++; VI-NEXT: v_mul_lo_u32 v10, v4, v3
++; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v4, v2, 0
++; VI-NEXT: v_mul_lo_u32 v14, v5, v2
++; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
++; VI-NEXT: v_mul_lo_u32 v15, v7, v0
++; VI-NEXT: v_add_u32_e32 v7, vcc, v13, v10
++; VI-NEXT: v_mov_b32_e32 v10, v3
++; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11]
++; VI-NEXT: v_add_u32_e32 v13, vcc, v7, v14
++; VI-NEXT: v_mov_b32_e32 v7, v4
++; VI-NEXT: v_mov_b32_e32 v4, v11
++; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v6, v0, v[12:13]
++; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4]
++; VI-NEXT: v_add_u32_e32 v11, vcc, v15, v13
++; VI-NEXT: v_mov_b32_e32 v0, v4
++; VI-NEXT: v_mul_lo_u32 v10, v6, v1
++; VI-NEXT: v_add_u32_e32 v6, vcc, v7, v0
++; VI-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, vcc
++; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
++; VI-NEXT: v_add_u32_e32 v5, vcc, v10, v11
++; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v12
++; VI-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc
++; VI-NEXT: flat_store_dwordx4 v[8:9], v[2:5]
++; VI-NEXT: s_endpgm
++;
++; GFX9-LABEL: v_mul_i128:
++; GFX9: ; %bb.0: ; %entry
++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
++; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0
++; GFX9-NEXT: v_mov_b32_e32 v10, 0
++; GFX9-NEXT: s_waitcnt lgkmcnt(0)
++; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1]
++; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3]
++; GFX9-NEXT: s_waitcnt vmcnt(0)
++; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
++; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2
++; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3
++; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10]
++; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0
++; GFX9-NEXT: v_mov_b32_e32 v4, v12
++; GFX9-NEXT: v_mov_b32_e32 v12, v10
++; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[11:12]
++; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14
++; GFX9-NEXT: v_mul_lo_u32 v17, v7, v0
++; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3]
++; GFX9-NEXT: v_mov_b32_e32 v0, v10
++; GFX9-NEXT: v_mul_lo_u32 v16, v6, v1
++; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v0
++; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc
++; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
++; GFX9-NEXT: v_add3_u32 v3, v17, v3, v16
++; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2
++; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc
++; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3]
++; GFX9-NEXT: s_endpgm
++;
++; GFX10-LABEL: v_mul_i128:
++; GFX10: ; %bb.0: ; %entry
++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
++; GFX10-NEXT: v_lshlrev_b32_e32 v14, 4, v0
++; GFX10-NEXT: v_mov_b32_e32 v10, 0
++; GFX10-NEXT: s_waitcnt lgkmcnt(0)
++; GFX10-NEXT: s_clause 0x1
++; GFX10-NEXT: global_load_dwordx4 v[0:3], v14, s[0:1]
++; GFX10-NEXT: global_load_dwordx4 v[4:7], v14, s[2:3]
++; GFX10-NEXT: s_waitcnt vmcnt(0)
++; GFX10-NEXT: v_mad_u64_u32 v[8:9], s0, v0, v4, 0
++; GFX10-NEXT: v_mul_lo_u32 v7, v7, v0
++; GFX10-NEXT: v_mad_u64_u32 v[11:12], s0, v1, v4, v[9:10]
++; GFX10-NEXT: v_mov_b32_e32 v9, v12
++; GFX10-NEXT: v_mov_b32_e32 v12, v10
++; GFX10-NEXT: v_mul_lo_u32 v10, v5, v2
++; GFX10-NEXT: v_mad_u64_u32 v[12:13], s0, v0, v5, v[11:12]
++; GFX10-NEXT: v_mul_lo_u32 v11, v4, v3
++; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v2, 0
++; GFX10-NEXT: v_mov_b32_e32 v4, v13
++; GFX10-NEXT: v_mul_lo_u32 v13, v6, v1
++; GFX10-NEXT: v_add3_u32 v3, v3, v11, v10
++; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v4
++; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s0, 0, 0, s0
++; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v6, v0, v[2:3]
++; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v5, v[9:10]
++; GFX10-NEXT: v_mov_b32_e32 v9, v12
++; GFX10-NEXT: v_add3_u32 v3, v7, v3, v13
++; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
++; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
++; GFX10-NEXT: global_store_dwordx4 v14, v[8:11], s[2:3]
++; GFX10-NEXT: s_endpgm
++;
++; GFX11-LABEL: v_mul_i128:
++; GFX11: ; %bb.0: ; %entry
++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
++; GFX11-NEXT: v_lshlrev_b32_e32 v16, 4, v0
++; GFX11-NEXT: v_mov_b32_e32 v10, 0
++; GFX11-NEXT: s_waitcnt lgkmcnt(0)
++; GFX11-NEXT: s_clause 0x1
++; GFX11-NEXT: global_load_b128 v[0:3], v16, s[0:1]
++; GFX11-NEXT: global_load_b128 v[4:7], v16, s[2:3]
++; GFX11-NEXT: s_waitcnt vmcnt(0)
++; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0
++; GFX11-NEXT: v_mul_lo_u32 v15, v5, v2
++; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3
++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
++; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10]
++; GFX11-NEXT: v_dual_mov_b32 v9, v12 :: v_dual_mov_b32 v12, v10
++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
++; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v0, v5, v[11:12]
++; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v4, v2, 0
++; GFX11-NEXT: v_mul_lo_u32 v4, v6, v1
++; GFX11-NEXT: v_mul_lo_u32 v12, v7, v0
++; GFX11-NEXT: v_mov_b32_e32 v2, v14
++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
++; GFX11-NEXT: v_add3_u32 v11, v11, v3, v15
++; GFX11-NEXT: v_add_co_u32 v2, s0, v9, v2
++; GFX11-NEXT: v_mov_b32_e32 v9, v13
++; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0
++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
++; GFX11-NEXT: v_mad_u64_u32 v[14:15], null, v6, v0, v[10:11]
++; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3]
++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
++; GFX11-NEXT: v_add3_u32 v0, v12, v15, v4
++; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v14
++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
++; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
++; GFX11-NEXT: global_store_b128 v16, v[8:11], s[2:3]
++; GFX11-NEXT: s_nop 0
++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
++; GFX11-NEXT: s_endpgm
++;
++; EG-LABEL: v_mul_i128:
++; EG: ; %bb.0: ; %entry
++; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
++; EG-NEXT: TEX 1 @6
++; EG-NEXT: ALU 41, @14, KC0[], KC1[]
++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
++; EG-NEXT: CF_END
++; EG-NEXT: PAD
++; EG-NEXT: Fetch clause starting at 6:
++; EG-NEXT: VTX_READ_128 T2.XYZW, T1.X, 0, #1
++; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
++; EG-NEXT: ALU clause starting at 10:
++; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
++; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
++; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
++; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
++; EG-NEXT: ALU clause starting at 14:
++; EG-NEXT: MULLO_INT * T1.Y, T0.Y, T2.Y,
++; EG-NEXT: MULHI * T1.Z, T0.Y, T2.Y,
++; EG-NEXT: MULLO_INT * T1.W, T2.Z, T0.X,
++; EG-NEXT: MULLO_INT * T3.X, T2.Y, T0.Z,
++; EG-NEXT: MULHI * T3.Y, T0.Y, T2.X,
++; EG-NEXT: MULHI * T3.Z, T0.X, T2.Y,
++; EG-NEXT: MULHI * T3.W, T2.Z, T0.X,
++; EG-NEXT: MULLO_INT * T2.Z, T2.Z, T0.Y,
++; EG-NEXT: MULHI * T4.X, T2.X, T0.Z,
++; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T2.X,
++; EG-NEXT: MULHI * T4.Y, T0.X, T2.X,
++; EG-NEXT: ADD_INT T4.W, T0.Y, PS,
++; EG-NEXT: MULLO_INT * T2.Y, T0.X, T2.Y,
++; EG-NEXT: ADDC_UINT T4.Z, T0.Y, T4.Y,
++; EG-NEXT: ADDC_UINT T5.W, PS, PV.W,
++; EG-NEXT: MULLO_INT * T0.Y, T2.X, T0.W,
++; EG-NEXT: ADD_INT T4.X, T4.X, PS,
++; EG-NEXT: ADD_INT T0.Y, T3.W, T2.Z,
++; EG-NEXT: ADD_INT T2.Z, T3.Z, PV.W,
++; EG-NEXT: ADD_INT T0.W, T3.Y, PV.Z,
++; EG-NEXT: MULLO_INT * T2.W, T2.W, T0.X,
++; EG-NEXT: ADD_INT T5.X, PV.W, PV.Z,
++; EG-NEXT: ADDC_UINT T3.Y, PV.W, PV.Z,
++; EG-NEXT: ADD_INT T2.Z, PV.Y, PS,
++; EG-NEXT: ADD_INT T0.W, PV.X, T3.X,
++; EG-NEXT: MULLO_INT * T0.Y, T2.X, T0.Z,
++; EG-NEXT: ADD_INT T4.Y, PV.Z, PV.W,
++; EG-NEXT: ADDC_UINT T0.Z, T1.W, PS,
++; EG-NEXT: ADD_INT T0.W, T1.Z, PV.Y,
++; EG-NEXT: ADDC_UINT * T2.W, T1.Y, PV.X,
++; EG-NEXT: ADD_INT T1.Y, T1.Y, T5.X,
++; EG-NEXT: ADD_INT T1.Z, T1.W, T0.Y,
++; EG-NEXT: ADD_INT T0.W, PV.W, PS,
++; EG-NEXT: ADD_INT * T1.W, PV.Y, PV.Z,
++; EG-NEXT: ADD_INT T0.W, PV.W, PS,
++; EG-NEXT: ADDC_UINT * T1.W, PV.Y, PV.Z,
++; EG-NEXT: ADD_INT * T0.W, PV.W, PS,
++; EG-NEXT: ADD_INT * T0.Z, T1.Y, T1.Z,
++; EG-NEXT: ADD_INT * T0.Y, T2.Y, T4.W,
++; EG-NEXT: LSHR T1.X, T1.X, literal.x,
++; EG-NEXT: MULLO_INT * T0.X, T0.X, T2.X,
++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
++entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.a = getelementptr inbounds i128, ptr addrspace(1) %aptr, i32 %tid
+ %gep.b = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
+--
+2.31.1
+
diff --git a/var/spack/repos/builtin/packages/llvm-amdgpu/package.py b/var/spack/repos/builtin/packages/llvm-amdgpu/package.py
index 977db83012..d69575d933 100644
--- a/var/spack/repos/builtin/packages/llvm-amdgpu/package.py
+++ b/var/spack/repos/builtin/packages/llvm-amdgpu/package.py
@@ -24,6 +24,8 @@ class LlvmAmdgpu(CMakePackage):
maintainers("srekolam", "renjithravindrankannath", "haampie")
version("master", branch="amd-stg-open")
+ version("5.6.1", sha256="045e43c0c4a3f4f2f1db9fb603a4f1ea3d56e128147e19ba17909eb57d7f08e5")
+ version("5.6.0", sha256="e922bd492b54d99e56ed88c81e2009ed6472059a180b10cc56ce1f9bd2d7b6ed")
version("5.5.1", sha256="7d7181f20f89cb0715191aa32914186c67a34258c13457055570d47e15296553")
version("5.5.0", sha256="5dc6c99f612b69ff73145bee17524e3712990100e16445b71634106acf7927cf")
version("5.4.3", sha256="a844d3cc01613f6284a75d44db67c495ac1e9b600eacbb1eb13d2649f5d5404d")
@@ -160,7 +162,13 @@ class LlvmAmdgpu(CMakePackage):
# as per 5.2.0 llvm code. It used to be llvm/bin/../lib/libdevice.
# Below patch is to look in the old path.
patch("adjust-openmp-bitcode-directory-for-llvm-link.patch", when="@5.2.0:")
- patch("patch-llvm-5.5.0.patch", when="@5.5")
+ patch("patch-llvm-5.5.0.patch", when="@5.5:")
+
+ # i1 multiplies can sometimes be produced after SCEV (scalar evolution).
+ # They caused ISel failures because the patterns for them were missing.
+ # The upstream fix is targeted at the ROCm 6.1 release.
+ # Keep this patch until https://github.com/llvm/llvm-project/pull/67291 is merged.
+ patch("001-Add-i1-mul-patterns.patch", when="@5.6:")
conflicts("^cmake@3.19.0")
@@ -169,6 +177,8 @@ class LlvmAmdgpu(CMakePackage):
# Add device libs sources so they can be an external LLVM project
for d_version, d_shasum in [
+ ("5.6.1", "f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c"),
+ ("5.6.0", "efb5dcdca9b3a9fbe408d494fb4a23e0b78417eb5fa8eebd4a5d226088f28921"),
("5.5.1", "3b5f6dd85f0e3371f6078da7b59bf77d5b210e30f1cc66ef1e2de6bbcb775833"),
("5.5.0", "5ab95aeb9c8bed0514f96f7847e21e165ed901ed826cdc9382c14d199cbadbd3"),
("5.4.3", "f4f7281f2cea6d268fcc3662b37410957d4f0bc23e0df9f60b12eb0fcdf9e26e"),
diff --git a/var/spack/repos/builtin/packages/llvm/package.py b/var/spack/repos/builtin/packages/llvm/package.py
index dc5a8ed5fe..df0e762fc5 100644
--- a/var/spack/repos/builtin/packages/llvm/package.py
+++ b/var/spack/repos/builtin/packages/llvm/package.py
@@ -243,6 +243,8 @@ class Llvm(CMakePackage, CudaPackage):
description="Enable zstd support for static analyzer / lld",
)
+ provides("libllvm@16", when="@16.0.0:16")
+ provides("libllvm@15", when="@15.0.0:15")
provides("libllvm@14", when="@14.0.0:14")
provides("libllvm@13", when="@13.0.0:13")
provides("libllvm@12", when="@12.0.0:12")
diff --git a/var/spack/repos/builtin/packages/migraphx/0005-Adding-half-include-directory-path-migraphx.patch b/var/spack/repos/builtin/packages/migraphx/0005-Adding-half-include-directory-path-migraphx.patch
new file mode 100644
index 0000000000..b11445bdca
--- /dev/null
+++ b/var/spack/repos/builtin/packages/migraphx/0005-Adding-half-include-directory-path-migraphx.patch
@@ -0,0 +1,48 @@
+From 612664789657444daa88f8f28a183928e01595d0 Mon Sep 17 00:00:00 2001
+From: Renjith Ravindran <Renjith.RavindranKannath@amd.com>
+Date: Mon, 25 Sep 2023 19:30:19 +0000
+Subject: [PATCH] Adding-half-include-directory-path
+
+---
+ CMakeLists.txt | 4 +++-
+ cmake/PythonModules.cmake | 2 +-
+ 2 files changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 4601cdd..9cd48ad 100755
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -51,7 +51,7 @@ set(CMAKE_BUILD_RPATH "${CMAKE_BINARY_DIR}/lib")
+ project(migraphx)
+ find_package(ROCM REQUIRED)
+
+-find_path(HALF_INCLUDE_DIR half.hpp PATH_SUFFIXES half)
++find_path(HALF_INCLUDE_DIR half.hpp)
+ if (NOT HALF_INCLUDE_DIR)
+ message(FATAL_ERROR "Could not find half.hpp - Please check that the install path of half.hpp has been added to CMAKE_PREFIX_PATH")
+ else()
+@@ -272,6 +272,8 @@ add_subdirectory(docs)
+ add_subdirectory(test)
+ add_subdirectory(tools)
+
++target_include_directories(migraphx PUBLIC "${NLOHMANN_JSON_INCLUDE} ${HALF_INCLUDE_DIR}")
++
+ set(DEST_DIR ${CMAKE_BINARY_DIR})
+ file(GLOB backend_files ${CMAKE_SOURCE_DIR}/src/py/backend/*.py)
+ file(MAKE_DIRECTORY ${DEST_DIR}/lib/onnx_migraphx)
+diff --git a/cmake/PythonModules.cmake b/cmake/PythonModules.cmake
+index b5818ce..b4bfbb3 100755
+--- a/cmake/PythonModules.cmake
++++ b/cmake/PythonModules.cmake
+@@ -76,7 +76,7 @@ function(py_add_module NAME)
+ )
+
+ endfunction()
+-set(PYTHON_SEARCH_VERSIONS 2.7 3.5 3.6 3.7 3.8 3.9 3.10)
++set(PYTHON_SEARCH_VERSIONS 3.5 3.6 3.7 3.8 3.9 3.10)
+ set(PYTHON_DISABLE_VERSIONS "" CACHE STRING "")
+ foreach(PYTHON_DISABLE_VERSION ${PYTHON_DISABLE_VERSIONS})
+ list(REMOVE_ITEM PYTHON_SEARCH_VERSIONS ${PYTHON_DISABLE_VERSION})
+--
+2.31.1
+
diff --git a/var/spack/repos/builtin/packages/migraphx/package.py b/var/spack/repos/builtin/packages/migraphx/package.py
index a0179de5ad..81bf1bff2b 100644
--- a/var/spack/repos/builtin/packages/migraphx/package.py
+++ b/var/spack/repos/builtin/packages/migraphx/package.py
@@ -19,6 +19,8 @@ class Migraphx(CMakePackage):
maintainers("srekolam", "renjithravindrankannath")
libraries = ["libmigraphx"]
+ version("5.6.1", sha256="b108c33f07572ffd880b20f6de06f1934ab2a1b41ae69095612322ac412fa91c")
+ version("5.6.0", sha256="eaec90535d62002fd5bb264677ad4a7e30c55f18d2a287680d0495c7e60432b2")
version("5.5.1", sha256="e71c4744f8ef6a1a99c179bbad94b8fe9bd7686eaa9397f376b70988c3341f0c")
version("5.5.0", sha256="6084eb596b170f5e38f22b5fa37e66aa43a8cbc626712c9f03cde48c8fecfc8f")
version("5.4.3", sha256="f83e7bbe5d6d0951fb2cf0abf7e8b3530e9a5e45f7cec6d760da055d6905d568")
@@ -110,19 +112,21 @@ class Migraphx(CMakePackage):
return url
- patch("0001-Adding-nlohmann-json-include-directory.patch", when="@3.9.0:")
+ patch("0001-Adding-nlohmann-json-include-directory.patch", when="@3.9.0:5.5")
# Restrict Python 2.7 usage to fix the issue below
# https://github.com/spack/spack/issues/24429
patch("0002-restrict-python-2.7-usage.patch", when="@3.9.0:5.1.3")
patch("0003-restrict-python-2.7-usage.patch", when="@5.2.0:5.4")
- patch("0004-restrict-python2.7-usage-for-5.5.0.patch", when="@5.5.0:")
+ patch("0004-restrict-python2.7-usage-for-5.5.0.patch", when="@5.5.0")
+ patch("0005-Adding-half-include-directory-path-migraphx.patch", when="@5.6.0:")
depends_on("cmake@3.5:", type="build")
depends_on("protobuf", type="link")
depends_on("blaze", type="build")
depends_on("nlohmann-json", type="link")
depends_on("msgpack-c", type="link")
- depends_on("half@1.12.0", type="link")
+ depends_on("half@1.12.0", type="link", when="@:5.5")
+ depends_on("half@2:", when="@5.6:")
depends_on("python@3.5:", type="build")
depends_on("py-pybind11", type="build", when="@:4.0.0")
depends_on("py-pybind11@2.6:", type="build", when="@4.1.0:")
@@ -154,6 +158,8 @@ class Migraphx(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
depends_on("hip@" + ver, when="@" + ver)
@@ -193,3 +199,11 @@ class Migraphx(CMakePackage):
if "@5.5.0:" in self.spec:
args.append(self.define("CMAKE_CXX_FLAGS", "-I{0}".format(abspath)))
return args
+
+ def test(self):
+ if self.spec.satisfies("@:5.5.0"):
+ print("Skipping: stand-alone tests")
+ return
+ test_dir = join_path(self.spec["migraphx"].prefix, "bin")
+ with working_dir(test_dir, create=True):
+ self.run_test("UnitTests")
diff --git a/var/spack/repos/builtin/packages/miopen-hip/package.py b/var/spack/repos/builtin/packages/miopen-hip/package.py
index 79ed4c27d3..4843ae1173 100644
--- a/var/spack/repos/builtin/packages/miopen-hip/package.py
+++ b/var/spack/repos/builtin/packages/miopen-hip/package.py
@@ -19,7 +19,8 @@ class MiopenHip(CMakePackage):
maintainers("srekolam", "renjithravindrankannath")
libraries = ["libMIOpen"]
-
+ version("5.6.1", sha256="ff627d68ed9e52433a3c808b5d3ff179a398b77ce81b00cfea7b2c4da5162c6c")
+ version("5.6.0", sha256="d620ddab5b488bdf81242654fefa337c6b71dc410c2ff26d30a4ee86a8d22d11")
version("5.5.1", sha256="2cd75071b8ee876c69a94f028b6c8a9346d6d2fde7d4b64e6d635f3b6c994262")
version("5.5.0", sha256="791087242551669e546225e36123c21663f0dad14dbcfd6d0ce0e7bad0ab0de1")
version("5.4.3", sha256="37ffe2ed3d7942da8ea2f6bdb85c7a2f58e3ccd31767db158a322769d3604efd")
@@ -144,6 +145,8 @@ class MiopenHip(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
depends_on("hip@" + ver, when="@" + ver)
@@ -153,12 +156,11 @@ class MiopenHip(CMakePackage):
for ver in ["5.1.0", "5.1.3", "5.2.0", "5.2.1", "5.2.3", "5.3.0", "5.3.3"]:
depends_on("mlirmiopen@" + ver, when="@" + ver)
- for ver in ["5.4.0", "5.4.3", "5.5.0", "5.5.1"]:
+ for ver in ["5.5.1", "5.6.0", "5.6.1"]:
depends_on("nlohmann-json", type="link")
+ depends_on("composable-kernel@" + ver, when="@" + ver)
for ver in ["5.4.0", "5.4.3", "5.5.0"]:
depends_on("rocmlir@" + ver, when="@" + ver)
- for ver in ["5.5.1"]:
- depends_on("composable-kernel@" + ver, when="@" + ver)
def setup_build_environment(self, env):
if "@3.9.0:" in self.spec:
@@ -209,7 +211,12 @@ class MiopenHip(CMakePackage):
)
if self.spec.satisfies("@5.4.0:5.5.0"):
args.append(self.define("MIOPEN_USE_COMPOSABLEKERNEL", "OFF"))
+ args.append(self.define("MIOPEN_USE_MLIR", "ON"))
+ args.append(self.define("MIOPEN_ENABLE_AI_KERNEL_TUNING", "OFF"))
if self.spec.satisfies("@5.5.1:"):
args.append(self.define("MIOPEN_USE_COMPOSABLEKERNEL", "ON"))
- args.append(self.define("MIOPEN_USE_MLIR", "OFF"))
+ args.append(self.define("MIOPEN_ENABLE_AI_KERNEL_TUNING", "OFF"))
+ args.append(
+ "-DNLOHMANN_JSON_INCLUDE={0}".format(self.spec["nlohmann-json"].prefix.include)
+ )
return args
diff --git a/var/spack/repos/builtin/packages/mivisionx/package.py b/var/spack/repos/builtin/packages/mivisionx/package.py
index 9d3a16959b..bd1a40a872 100644
--- a/var/spack/repos/builtin/packages/mivisionx/package.py
+++ b/var/spack/repos/builtin/packages/mivisionx/package.py
@@ -25,6 +25,8 @@ class Mivisionx(CMakePackage):
url = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/archive/rocm-{0}.tar.gz"
return url.format(version)
+ version("5.6.1", sha256="b2ff95c1488e244f379482631dae4f9ab92d94a513d180e03607aa1e184b5b0a")
+ version("5.6.0", sha256="34c184e202b1a6da2398b66e33c384d5bafd8f8291089c18539715c5cb73eb1f")
version("5.5.1", sha256="e8209f87a57c4222003a936240e7152bbfa496862113358f29d4c3e80d4cdf56")
version("5.5.0", sha256="af266550ecccad80f08954f23e47e8264eb338b0928a5314bd6efca349fc5a14")
version("5.4.3", sha256="4da82974962a70c326ce2427c664517b1efdff436efe222e6bc28817c222a082")
@@ -115,6 +117,8 @@ class Mivisionx(CMakePackage):
variant("opencl", default=False, description="Use OPENCL as the backend")
variant("hip", default=True, description="Use HIP as backend")
+ conflicts("+opencl", when="@5.6.0:")
+
def patch(self):
if self.spec.satisfies("@4.2.0"):
filter_file(
@@ -255,13 +259,16 @@ class Mivisionx(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("miopen-hip@" + ver, when="@" + ver)
- for ver in ["5.3.3", "5.4.0", "5.4.3", "5.5.0", "5.5.1"]:
+ for ver in ["5.3.3", "5.4.0", "5.4.3", "5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("migraphx@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
+ depends_on("python@3.5:", type="build")
def flag_handler(self, name, flags):
spec = self.spec
diff --git a/var/spack/repos/builtin/packages/rccl/package.py b/var/spack/repos/builtin/packages/rccl/package.py
index 6545452cf3..677b077b4b 100644
--- a/var/spack/repos/builtin/packages/rccl/package.py
+++ b/var/spack/repos/builtin/packages/rccl/package.py
@@ -21,6 +21,8 @@ class Rccl(CMakePackage):
maintainers("srekolam", "renjithravindrankannath")
libraries = ["librccl"]
+ version("5.6.1", sha256="27ec6b86a1a329684d808f728c1fce134517ac8e6e7047689f95dbf8386c077e")
+ version("5.6.0", sha256="cce13c8a9e233e7ddf91a67b1626b7aaeaf818fefe61af8de6b6b6ff47cb358c")
version("5.5.1", sha256="f6b9dc6dafeb49d95c085825876b09317d8252771c746ccf5aa19a9204a404b2")
version("5.5.0", sha256="be2964b408741d046bcd606d339a233d1d1deac7b841647ec53d6d62d71452ba")
version("5.4.3", sha256="a2524f602bd7b3b6afeb8ba9aff660216ee807fa836e46442d068b5ed5f51a4d")
@@ -143,6 +145,8 @@ class Rccl(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
depends_on("hip@" + ver, when="@" + ver)
@@ -174,6 +178,8 @@ class Rccl(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("numactl@2:", when="@" + ver)
for ver in [
@@ -190,12 +196,15 @@ class Rccl(CMakePackage):
"5.3.3",
"5.4.0",
"5.4.3",
+ "5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("rocm-smi-lib@" + ver, when="@" + ver)
depends_on("chrpath", when="@5.3.0:")
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
depends_on("googletest@1.11.0:", when="@5.3:")
diff --git a/var/spack/repos/builtin/packages/rdc/package.py b/var/spack/repos/builtin/packages/rdc/package.py
index bb6314b420..8f88417ebf 100644
--- a/var/spack/repos/builtin/packages/rdc/package.py
+++ b/var/spack/repos/builtin/packages/rdc/package.py
@@ -26,6 +26,8 @@ class Rdc(CMakePackage):
url = "https://github.com/RadeonOpenCompute/rdc/archive/rocm-{0}.tar.gz"
return url.format(version)
+ version("5.6.1", sha256="9e9f57cebbc5ae386a405957ed2c17344cdb42db5e1a71285f2c9bc09eea6519")
+ version("5.6.0", sha256="5213cd89215463862f6a1e9480ebe017944a6bb6b0db1722628afaa34af57991")
version("5.5.1", sha256="a58a319ee702cf61cf71a4eba647c231392f68449b35419d941079c6de944844")
version("5.5.0", sha256="56e85e77581963fbcfcc43e091a91773de470152347808ae730bcaf92c9f5ee8")
version("5.4.3", sha256="c44f0b070b5650bc78e2eb968aae57a8ac1e1fd160e897055b79f3026c4fbad3")
@@ -130,6 +132,8 @@ class Rdc(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("rocm-smi-lib@" + ver, type=("build", "link"), when="@" + ver)
@@ -147,10 +151,12 @@ class Rdc(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
def patch(self):
diff --git a/var/spack/repos/builtin/packages/rocalution/package.py b/var/spack/repos/builtin/packages/rocalution/package.py
index c8afbc226a..b0ba2021ba 100644
--- a/var/spack/repos/builtin/packages/rocalution/package.py
+++ b/var/spack/repos/builtin/packages/rocalution/package.py
@@ -24,7 +24,8 @@ class Rocalution(CMakePackage):
maintainers("cgmb", "srekolam", "renjithravindrankannath")
libraries = ["librocalution_hip"]
-
+ version("5.6.1", sha256="7197b3617a0c91e90adaa32003c04d247a5f585d216e77493d20984ba215addb")
+ version("5.6.0", sha256="7397a2039e9615c0cf6776c33c4083c00b185b5d5c4149c89fea25a8976a3097")
version("5.5.1", sha256="4612e30a0290b1732c8862eea655122abc2d22ce4345b8498fe4127697e880b4")
version("5.5.0", sha256="626e966b67b83a1ef79f9bf27aba998c49cf65c4208092516aa1e32a6cbd8c36")
version("5.4.3", sha256="39d00951a9b3cbdc4205a7e3ce75c026d9428c71c784815288c445f84a7f8a0e")
@@ -155,6 +156,8 @@ class Rocalution(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocprim@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocblas/package.py b/var/spack/repos/builtin/packages/rocblas/package.py
index 9727be5ab3..0b59eadd7e 100644
--- a/var/spack/repos/builtin/packages/rocblas/package.py
+++ b/var/spack/repos/builtin/packages/rocblas/package.py
@@ -21,6 +21,8 @@ class Rocblas(CMakePackage):
version("develop", branch="develop")
version("master", branch="master")
+ version("5.6.1", sha256="73896ebd445162a69af97f9fd462684609b4e0cf617eab450cd4558b4a23941e")
+ version("5.6.0", sha256="6a70b27eede02c45f46095a6ce8421af9a774a565e39f5e1074783ecf00c1ea7")
version("5.5.1", sha256="7916a8d238d51cc239949d799f0b61c9d5cd63c6ccaed0e16749489b89ca8ff3")
version("5.5.0", sha256="b5260517f199e806ae18f2c4495f163884e0d7a0a7c67af0770f7428ea50f898")
version("5.4.3", sha256="d82cd334b7a9b40d16ec4f4bb1fb5662382dcbfc86ee5e262413ed63d9e6a701")
@@ -174,6 +176,8 @@ class Rocblas(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("llvm-amdgpu@" + ver, type="build", when="@" + ver)
@@ -191,6 +195,8 @@ class Rocblas(CMakePackage):
depends_on("py-wheel", type="build")
depends_on("py-msgpack", type="build")
depends_on("py-pip", type="build")
+ depends_on("py-joblib", type="build", when="@5.6:")
+ depends_on("procps", type="build", when="@5.6:")
for t_version, t_commit in [
("@3.5.0", "f842a1a4427624eff6cbddb2405c36dec9a210cd"),
@@ -218,6 +224,8 @@ class Rocblas(CMakePackage):
("@5.4.3", "5aec08937473b27865fa969bb38a83bcf9463c2b"),
("@5.5.0", "38d444a9f2b6cddfeaeedcb39a5688150fa27093"),
("@5.5.1", "38d444a9f2b6cddfeaeedcb39a5688150fa27093"),
+ ("@5.6.0", "7d0a9d040c3bbae893df7ecef6a19d9cd1c304aa"),
+ ("@5.6.1", "7d0a9d040c3bbae893df7ecef6a19d9cd1c304aa"),
]:
resource(
name="Tensile",
diff --git a/var/spack/repos/builtin/packages/rocfft/package.py b/var/spack/repos/builtin/packages/rocfft/package.py
index bc5251a6c0..63c0548ce3 100644
--- a/var/spack/repos/builtin/packages/rocfft/package.py
+++ b/var/spack/repos/builtin/packages/rocfft/package.py
@@ -18,6 +18,8 @@ class Rocfft(CMakePackage):
maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie")
libraries = ["librocfft"]
+ version("5.6.1", sha256="a65861e453587c3e6393da75b0b1976508c61f968aecda77fbec920fea48489e")
+ version("5.6.0", sha256="e3d4a6c1bdac78f9a22033f57011af783d560308103f73542f9e0e4dd133d38a")
version("5.5.1", sha256="57423a64f5cdb1c37ff0891b6c17b59f73198d46be42db4ae23781ef2c0cd49d")
version("5.5.0", sha256="9288152e66504b06082e4eed8cdb791b4f9ae2836b3defbeb4d2b54901b96485")
version("5.4.3", sha256="ed9664adc9825c237327497bc4b23f020d50be7645647f14a45f4d943dd506e7")
@@ -155,6 +157,8 @@ class Rocfft(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py b/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py
index 3c1f1d9e82..af829cf7ad 100644
--- a/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py
+++ b/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py
@@ -18,6 +18,8 @@ class RocmBandwidthTest(CMakePackage):
maintainers("srekolam", "renjithravindrankannath")
version("master", branch="master")
+ version("5.6.1", sha256="849af715d08dfd89e7aa5e4453b624151db1cafaa567ab5fa36a77948b90bf0d")
+ version("5.6.0", sha256="ae2f7263a21a3a650068f43e3112b2b765eea80a5af2297572f850c77f83c85e")
version("5.5.1", sha256="768b3da49fe7d4bb4e6536a8ee15be9f5e865d961e813ed4a407f32402685e1f")
version("5.5.0", sha256="1070ce14d45f34c2c6b2fb003184f3ae735ccfd640e9df1c228988b2a5a82949")
version("5.4.3", sha256="a2f5a75bf47db1e39a4626a9f5cd2d120bcafe56b1baf2455d794f7a4734993e")
@@ -128,12 +130,14 @@ class RocmBandwidthTest(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
"master",
]:
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
depends_on("hsakmt-roct@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
build_targets = ["package"]
diff --git a/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py b/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py
index 702c627d34..941b1900f0 100644
--- a/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py
+++ b/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py
@@ -16,6 +16,8 @@ class RocmClangOcl(CMakePackage):
maintainers("srekolam", "renjithravindrankannath")
version("master", branch="master")
+ version("5.6.1", sha256="c41deb1b564d939fc897b2bbdb13570b2234fa4c052a39783f5ad2dd1052f901")
+ version("5.6.0", sha256="1afc47dee02d73c10de422f254067f4ef3ff921c4a1204d54ecc40e61fc63497")
version("5.5.1", sha256="bfa62ad14830e2bd5afbc346685216c69f8cbef0eb449954f793178e10b19a38")
version("5.5.0", sha256="43a5459165693301ba2ebcc41b2b0705df9a3a47571d43bdc2cc49cfdd0833a7")
version("5.4.3", sha256="689e0354ea685bd488116de8eb902b902492e9ace184c3109b97b9a43f8b2d59")
@@ -126,6 +128,8 @@ class RocmClangOcl(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
"master",
]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
@@ -135,7 +139,7 @@ class RocmClangOcl(CMakePackage):
depends_on(
"rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver)
)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
test_src_dir = "test"
diff --git a/var/spack/repos/builtin/packages/rocm-cmake/package.py b/var/spack/repos/builtin/packages/rocm-cmake/package.py
index 30cbfc397f..c833db6755 100644
--- a/var/spack/repos/builtin/packages/rocm-cmake/package.py
+++ b/var/spack/repos/builtin/packages/rocm-cmake/package.py
@@ -13,12 +13,14 @@ class RocmCmake(CMakePackage):
homepage = "https://github.com/RadeonOpenCompute/rocm-cmake"
git = "https://github.com/RadeonOpenCompute/rocm-cmake.git"
- url = "https://github.com/RadeonOpenCompute/rocm-cmake/archive/rocm-5.5.0.tar.gz"
+ url = "https://github.com/RadeonOpenCompute/rocm-cmake/archive/rocm-5.6.0.tar.gz"
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
version("master", branch="master")
+ version("5.6.1", sha256="98bf5fe2e6e12f55d122807d0060f1bb19c80d63d2c2f6fee579c40bfd244fa6")
+ version("5.6.0", sha256="a118ca937856a4d0039955a8aef2466ef1fd1f08f7f7221cda53e1b5d02e476a")
version("5.5.1", sha256="60113412b35d94e20e8100ed3db688c35801991b4b8fa282fdc6fd6fd413fb6e")
version("5.5.0", sha256="b7884c346737eba70ae11044e41598b2482a92e21f3e0719b1ca11619f02a20b")
version("5.4.3", sha256="c185b3a10d191d73b76770ca0f9d6bdc355ee91fe0c9016a3779c9cfe042ba0f")
@@ -104,7 +106,7 @@ class RocmCmake(CMakePackage):
depends_on("cmake@3:", type="build")
depends_on("cmake@3.6:", type="build", when="@4.1.0:")
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
test_src_dir = "test"
diff --git a/var/spack/repos/builtin/packages/rocm-core/package.py b/var/spack/repos/builtin/packages/rocm-core/package.py
index fe2f3bfbeb..45d947ce0e 100644
--- a/var/spack/repos/builtin/packages/rocm-core/package.py
+++ b/var/spack/repos/builtin/packages/rocm-core/package.py
@@ -19,6 +19,8 @@ class RocmCore(CMakePackage):
maintainers("srekolam", "renjithravindrankannath")
libraries = ["librocm-core"]
+ version("5.6.1", sha256="eeef75e16e05380ccbc8df17a02dc141a66dddaadb444a97f7278f78067c498c")
+ version("5.6.0", sha256="3c3d47c8b774968d768d42810a3fed42d058b7d6da248d5295df2a7ffb262568")
version("5.5.1", sha256="bc73060432ffdc2e210394835d383890b9652476074ef4708d447473f273ce76")
version("5.5.0", sha256="684d3312bb14f05dc280cf136f5eddff38ba340cd85c383d6a217d8e27d3d57d")
diff --git a/var/spack/repos/builtin/packages/rocm-dbgapi/package.py b/var/spack/repos/builtin/packages/rocm-dbgapi/package.py
index 046ca8913c..6b28adb40c 100644
--- a/var/spack/repos/builtin/packages/rocm-dbgapi/package.py
+++ b/var/spack/repos/builtin/packages/rocm-dbgapi/package.py
@@ -23,6 +23,8 @@ class RocmDbgapi(CMakePackage):
libraries = ["librocm-dbgapi"]
version("master", branch="amd-master")
+ version("5.6.1", sha256="c7241bf94bdb97a4cf1befbf25b8c35720797710da6f6b5b9d6a4094c1bc9c8b")
+ version("5.6.0", sha256="9b66e47f4eccb3c8bbc324aade92aac6139539dda449427b7823d0c45341afc8")
version("5.5.1", sha256="c41dfc62591bcf42003fe744d8bd03a51311d54e4b012f946ca0ede0c14dd977")
version("5.5.0", sha256="ce572340a3fe99e4f1538eb614933153456003f8dfe9306a5735cdd25b451e25")
version("5.4.3", sha256="d647c9121a50f2c54367c567d8f39a145cb135e1ceed931581659f57f49f61e5")
@@ -134,12 +136,14 @@ class RocmDbgapi(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
"master",
]:
depends_on("hsa-rocr-dev@" + ver, type="build", when="@" + ver)
depends_on("comgr@" + ver, type=("build", "link"), when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
@classmethod
diff --git a/var/spack/repos/builtin/packages/rocm-debug-agent/package.py b/var/spack/repos/builtin/packages/rocm-debug-agent/package.py
index 13f631e9e6..4b5850f0d0 100644
--- a/var/spack/repos/builtin/packages/rocm-debug-agent/package.py
+++ b/var/spack/repos/builtin/packages/rocm-debug-agent/package.py
@@ -18,6 +18,8 @@ class RocmDebugAgent(CMakePackage):
maintainers("srekolam", "renjithravindrankannath")
libraries = ["librocm-debug-agent"]
+ version("5.6.1", sha256="d3b1d5d757489ed3cc66d351cec56b7b850aaa7ecf6a55b0350b89c3dee3153a")
+ version("5.6.0", sha256="0bed788f07906afeb9092d0bec184a7963233ac9d8ccd20b4afeb624a1d20698")
version("5.5.1", sha256="1bb66734f11bb57df6efa507f0217651446653bf28b3ca36acfcf94511a7c2bc")
version("5.5.0", sha256="4f2431a395a77a06dc417ed1e9188731b031a0c680e62c6eee19d60965317f5a")
version("5.4.3", sha256="b2c9ac198ea3cbf35e7e80f57c5d81c461de78b821d07b637ea4037a65cdf49f")
@@ -138,6 +140,8 @@ class RocmDebugAgent(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
depends_on("hsakmt-roct@" + ver, when="@" + ver)
@@ -167,11 +171,13 @@ class RocmDebugAgent(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("rocm-dbgapi@" + ver, when="@" + ver)
depends_on("hip@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
# https://github.com/ROCm-Developer-Tools/rocr_debug_agent/pull/4
diff --git a/var/spack/repos/builtin/packages/rocm-device-libs/package.py b/var/spack/repos/builtin/packages/rocm-device-libs/package.py
index 24d9c1e826..cb784a050f 100644
--- a/var/spack/repos/builtin/packages/rocm-device-libs/package.py
+++ b/var/spack/repos/builtin/packages/rocm-device-libs/package.py
@@ -18,6 +18,8 @@ class RocmDeviceLibs(CMakePackage):
maintainers("srekolam", "renjithravindrankannath", "haampie")
version("master", branch="amd-stg-open")
+ version("5.6.1", sha256="f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c")
+ version("5.6.0", sha256="efb5dcdca9b3a9fbe408d494fb4a23e0b78417eb5fa8eebd4a5d226088f28921")
version("5.5.1", sha256="3b5f6dd85f0e3371f6078da7b59bf77d5b210e30f1cc66ef1e2de6bbcb775833")
version("5.5.0", sha256="5ab95aeb9c8bed0514f96f7847e21e165ed901ed826cdc9382c14d199cbadbd3")
version("5.4.3", sha256="f4f7281f2cea6d268fcc3662b37410957d4f0bc23e0df9f60b12eb0fcdf9e26e")
@@ -138,11 +140,13 @@ class RocmDeviceLibs(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
"master",
]:
depends_on("llvm-amdgpu@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
def cmake_args(self):
diff --git a/var/spack/repos/builtin/packages/rocm-gdb/package.py b/var/spack/repos/builtin/packages/rocm-gdb/package.py
index b7c58074af..a752f0c4d7 100644
--- a/var/spack/repos/builtin/packages/rocm-gdb/package.py
+++ b/var/spack/repos/builtin/packages/rocm-gdb/package.py
@@ -16,6 +16,8 @@ class RocmGdb(AutotoolsPackage):
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
+ version("5.6.1", sha256="d2b40d4c5aa41a6ce2a84307627b30d16a458672e03e13f9d27c12f2dc3f21d6")
+ version("5.6.0", sha256="997ef1883aac2769552bc7082c70b837f4e98b57d24c133cea52b9c92fb0dee1")
version("5.5.1", sha256="359258548bc7e6abff16bb13c301339fb96560b2b961433c9e0712e4aaf2d9e1")
version("5.5.0", sha256="d3b100e332facd9635e328f5efd9f0565250edbe05be986baa2e0470a19bcd79")
version("5.4.3", sha256="28c1ce39fb1fabe61f86f6e3c6940c10f9a8b8de77f7bb4fdd73b04e172f85f6")
@@ -135,11 +137,13 @@ class RocmGdb(AutotoolsPackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("rocm-dbgapi@" + ver, type="link", when="@" + ver)
depends_on("comgr@" + ver, type="link", when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
build_directory = "spack-build"
diff --git a/var/spack/repos/builtin/packages/rocm-opencl/package.py b/var/spack/repos/builtin/packages/rocm-opencl/package.py
index ddea6c7a8c..f79496e91d 100644
--- a/var/spack/repos/builtin/packages/rocm-opencl/package.py
+++ b/var/spack/repos/builtin/packages/rocm-opencl/package.py
@@ -29,6 +29,8 @@ class RocmOpencl(CMakePackage):
return url.format(version)
version("master", branch="main")
+ version("5.6.1", sha256="ec26049f7d93c95050c27ba65472736665ec7a40f25920a868616b2970f6b845")
+ version("5.6.0", sha256="52ab260d00d279c2a86c353901ffd88ee61b934ad89e9eb480f210656705f04e")
version("5.5.1", sha256="a8a62a7c6fc5398406d2203b8cb75621a24944688e545d917033d87de2724498")
version("5.5.0", sha256="0df9fa0b8aa0c8e6711d34eec0fdf1ed356adcd9625bc8f1ce9b3e72090f3e4f")
version("5.4.3", sha256="b0f8339c844a2e62773bd85cd1e7c5ecddfe71d7c8e8d604e1a1d60900c30873")
@@ -116,6 +118,8 @@ class RocmOpencl(CMakePackage):
depends_on("numactl", type="link", when="@3.7.0:")
for d_version, d_shasum in [
+ ("5.6.1", "cc9a99c7e4de3d9360c0a471b27d626e84a39c9e60e0aff1e8e1500d82391819"),
+ ("5.6.0", "864f87323e793e60b16905284fba381a7182b960dd4a37fb67420c174442c03c"),
("5.5.1", "1375fc7723cfaa0ae22a78682186d4804188b0a54990bfd9c0b8eb421b85e37e"),
("5.5.0", "efbae9a1ef2ab3de5ca44091e9bb78522e76759c43524c1349114f9596cc61d1"),
("5.4.3", "71d9668619ab57ec8a4564d11860438c5aad5bd161a3e58fbc49555fbd59182d"),
@@ -186,12 +190,14 @@ class RocmOpencl(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
"master",
]:
depends_on("comgr@" + ver, type="build", when="@" + ver)
depends_on("hsa-rocr-dev@" + ver, type="link", when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
@classmethod
diff --git a/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py b/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py
index d918f3d5f0..dedba382c5 100644
--- a/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py
+++ b/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py
@@ -37,6 +37,8 @@ aomp = [
"7f90634fb621169b21bcbd920c2e299acc88ba0eeb1a33fd40ae26e13201b652",
"23cc7d1c82e35c74f48285a0a1c27e7b3cae1767568bb7b9367ea21f53dd6598",
"9ec03a69cc462ada43e1fd4ca905a765b08c10e0911fb7a202c893cc577855e6",
+ "0673820a81986c9e2f28f15bbb45ad18934bca56a9d08aae6c49ec3895b38487",
+ "6c051bf7625f682ba3d2ea80b46a38ca2cbcd20f5d89ae3433602d3e7ef0403a",
]
devlib = [
@@ -62,6 +64,8 @@ devlib = [
"f4f7281f2cea6d268fcc3662b37410957d4f0bc23e0df9f60b12eb0fcdf9e26e",
"5ab95aeb9c8bed0514f96f7847e21e165ed901ed826cdc9382c14d199cbadbd3",
"3b5f6dd85f0e3371f6078da7b59bf77d5b210e30f1cc66ef1e2de6bbcb775833",
+ "efb5dcdca9b3a9fbe408d494fb4a23e0b78417eb5fa8eebd4a5d226088f28921",
+ "f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c",
]
llvm = [
@@ -87,6 +91,8 @@ llvm = [
"a844d3cc01613f6284a75d44db67c495ac1e9b600eacbb1eb13d2649f5d5404d",
"5dc6c99f612b69ff73145bee17524e3712990100e16445b71634106acf7927cf",
"7d7181f20f89cb0715191aa32914186c67a34258c13457055570d47e15296553",
+ "e922bd492b54d99e56ed88c81e2009ed6472059a180b10cc56ce1f9bd2d7b6ed",
+ "045e43c0c4a3f4f2f1db9fb603a4f1ea3d56e128147e19ba17909eb57d7f08e5",
]
flang = [
@@ -112,6 +118,8 @@ flang = [
"b283d76244d19ab16c9d087ee7de0d340036e9c842007aa9d288aa4e6bf3749f",
"a18522588686672150c7862f2b23048a429baa4a66010c4196e969cc77bd152c",
"7c3b4eb3e95b9e2f91234f202a76034628d230a92e57b7c5ee9dcca1097bec46",
+ "fcefebddca0b373da81ff84f0f5469a1ef77a05430a5195d0f2e6399d3af31c3",
+ "5ebcbca2e03bd0686e677f44ea551e97bd9395c6b119f832fa784818733aa652",
]
extras = [
@@ -137,6 +145,8 @@ extras = [
"d393f27a85c9229433b50daee8154e11517160beb1049c1de9c55fc31dd11fac",
"8f49026a80eb8685cbfb6d3d3b9898dd083df4d71893984ae5330d4804c685fb",
"8955aa9d039fd6c1ff2e26d7298f0bf09bbcf03f09c6df92c91a9ab2510df9da",
+ "017bfed52fbe08185d8dbde79377918454215683562519a9e47acf403d9a1c29",
+ "437e2017cfe2ab73b15ada0fc1ea88f794f0b108cc5410f457268ae7e4e8985a",
]
versions = [
@@ -162,6 +172,8 @@ versions = [
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]
versions_dict = dict() # type: Dict[str,Dict[str,str]]
components = ["aomp", "devlib", "llvm", "flang", "extras"]
@@ -183,6 +195,8 @@ class RocmOpenmpExtras(Package):
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath", "estewart08")
+ version("5.6.1", sha256=versions_dict["5.6.1"]["aomp"])
+ version("5.6.0", sha256=versions_dict["5.6.0"]["aomp"])
version("5.5.1", sha256=versions_dict["5.5.1"]["aomp"])
version("5.5.0", sha256=versions_dict["5.5.0"]["aomp"])
version("5.4.3", sha256=versions_dict["5.4.3"]["aomp"])
@@ -237,13 +251,15 @@ class RocmOpenmpExtras(Package):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hsakmt-roct@" + ver, when="@" + ver)
depends_on("comgr@" + ver, when="@" + ver)
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
depends_on("llvm-amdgpu@{0} ~openmp".format(ver), when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
# tag changed to 'rocm-' in 4.0.0
diff --git a/var/spack/repos/builtin/packages/rocm-smi-lib/package.py b/var/spack/repos/builtin/packages/rocm-smi-lib/package.py
index 80ced92254..11ad3aa2ab 100644
--- a/var/spack/repos/builtin/packages/rocm-smi-lib/package.py
+++ b/var/spack/repos/builtin/packages/rocm-smi-lib/package.py
@@ -25,6 +25,8 @@ class RocmSmiLib(CMakePackage):
libraries = ["librocm_smi64"]
version("master", branch="master")
+ version("5.6.1", sha256="9e94f9a941202c3d7ce917fd1cd78c4e0f06f48d6c929f3aa916378ccef1e02c")
+ version("5.6.0", sha256="88be875948a29454b8aacced8bb8ad967502a7a074ecbc579ed673c1650a2f7e")
version("5.5.1", sha256="37f32350bfaf6c697312628696d1b1d5fd9165f183882759bc6cb9a5d65b9430")
version("5.5.0", sha256="0703f49b1c2924cc1d3f613258eabdff1925cb5bcf7cf22bb6b955dd065e4ce8")
version("5.4.3", sha256="34d550272e420684230ceb7845aefcef79b155e51cf9ec55e31fdba2a4ed177b")
@@ -112,14 +114,14 @@ class RocmSmiLib(CMakePackage):
depends_on("cmake@3:", type="build")
depends_on("python@3:", type=("build", "run"), when="@3.9.0:")
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
patch("disable_pdf_generation_with_doxygen_and_latex.patch", when="@4.5.2:")
def cmake_args(self):
args = [
self.define_from_variant("BUILD_SHARED_LIBS", "shared"),
- self.define("CMAKE_INSTALL_LIBDIR", self.prefix.lib),
+ self.define("CMAKE_INSTALL_LIBDIR", "lib"),
]
return args
diff --git a/var/spack/repos/builtin/packages/rocm-tensile/package.py b/var/spack/repos/builtin/packages/rocm-tensile/package.py
index 19d3b3d63d..29a6e82e09 100644
--- a/var/spack/repos/builtin/packages/rocm-tensile/package.py
+++ b/var/spack/repos/builtin/packages/rocm-tensile/package.py
@@ -18,6 +18,8 @@ class RocmTensile(CMakePackage):
maintainers("srekolam", "renjithravindrankannath", "haampie")
+ version("5.6.1", sha256="3e78c933563fade8781a1dca2079bff135af2f5d2c6eb0147797d2c1f24d006c")
+ version("5.6.0", sha256="383728ecf49def59ab9a7f8a1d1e2eaf8b528e36b461e27030a2aab1a1ed80cb")
version("5.5.1", sha256="b65cb7335abe51ba33be9d46a5ede992b4e5932fa33797397899a6bf33a770e9")
version("5.5.0", sha256="70fd736d40bb4c3461f07c77ad3ae6c485e3e842671ce9b223d023d836884ae2")
version("5.4.3", sha256="a4c5e62edd33ea6b8528eb3f017a14c28eaa67c540f5c9023f6a245340198b0f")
@@ -157,6 +159,8 @@ class RocmTensile(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("rocm-cmake@" + ver, type="build", when="@" + ver)
depends_on("hip@" + ver, when="@" + ver)
@@ -174,6 +178,8 @@ class RocmTensile(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("rocm-openmp-extras@" + ver, when="@" + ver)
@@ -201,6 +207,8 @@ class RocmTensile(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("rocm-smi-lib@" + ver, type="build", when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocm-validation-suite/007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.6.patch b/var/spack/repos/builtin/packages/rocm-validation-suite/007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.6.patch
new file mode 100644
index 0000000000..7acd960614
--- /dev/null
+++ b/var/spack/repos/builtin/packages/rocm-validation-suite/007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.6.patch
@@ -0,0 +1,532 @@
+From 795e7474acf23eb2f7815fd54ffdd3fd41ff8c35 Mon Sep 17 00:00:00 2001
+From: Renjith Ravindran <Renjith.RavindranKannath@amd.com>
+Date: Tue, 12 Sep 2023 07:00:31 +0000
+Subject: [PATCH] 5.6 Patch to add rocm-smi library and include path
+
+---
+ CMakeLists.txt | 105 ++++-----------------------------
+ babel.so/CMakeLists.txt | 16 ++---
+ cmake_modules/tests_unit.cmake | 2 +-
+ edp.so/CMakeLists.txt | 3 +-
+ gm.so/CMakeLists.txt | 4 +-
+ gpup.so/CMakeLists.txt | 2 +-
+ gst.so/CMakeLists.txt | 4 +-
+ iet.so/CMakeLists.txt | 6 +-
+ mem.so/CMakeLists.txt | 4 +-
+ pbqt.so/CMakeLists.txt | 2 +-
+ pebb.so/CMakeLists.txt | 2 +-
+ peqt.so/CMakeLists.txt | 4 +-
+ perf.so/CMakeLists.txt | 4 +-
+ pesm.so/CMakeLists.txt | 2 +-
+ rcqt.so/CMakeLists.txt | 2 +-
+ rvs/CMakeLists.txt | 2 +-
+ rvs/tests.cmake | 2 +-
+ rvslib/CMakeLists.txt | 2 +-
+ smqt.so/CMakeLists.txt | 2 +-
+ testif.so/CMakeLists.txt | 2 +-
+ 20 files changed, 45 insertions(+), 127 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index a12eb41..900657a 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -68,13 +68,12 @@ endif(rocblas_FOUND)
+ # variables since we will pass them as cmake params appropriately, and
+ # all find_packages relevant to this build will be in ROCM path hence appending it to CMAKE_PREFIX_PATH
+ set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCM install path")
+-set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "CMAKE installation directory")
+-set(CMAKE_PACKAGING_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Prefix used in built packages")
++set (CMAKE_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" )
+ list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}")
+-set(ROCR_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Runtime" FORCE)
+-set(ROCR_LIB_DIR "${ROCM_PATH}/lib" CACHE PATH "Contains library files exported by ROC Runtime" FORCE)
+-set(HIP_INC_DIR "${ROCM_PATH}" CACHE PATH "Contains header files exported by ROC Runtime")
+-set(ROCT_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Trunk" FORCE)
++set(ROCR_INC_DIR "${HSA_PATH}/include" CACHE PATH "Contains header files exported by ROC Runtime")
++set(ROCR_LIB_DIR "${HSA_PATH}/lib" CACHE PATH "Contains library files exported by ROC Runtime")
++set(HIP_INC_DIR "${HIP_PATH}" CACHE PATH "Contains header files exported by ROC Runtime")
++set(ROCT_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Trunk")
+
+
+ #
+@@ -193,8 +192,6 @@ set(RVS_ROCBLAS "0" CACHE STRING "1 = use local rocBLAS")
+ set(RVS_ROCMSMI "0" CACHE STRING "1 = use local rocm_smi_lib")
+
+ set(RVS_LIB_DIR "${CMAKE_BINARY_DIR}/rvslib" CACHE PATH "Contains RVS library")
+-set(YAML_INC_DIR "${CMAKE_BINARY_DIR}/yaml-src/include" CACHE PATH "Contains header files exported by yaml-cpp")
+-set(YAML_LIB_DIR "${CMAKE_BINARY_DIR}/yaml-build" CACHE PATH "Contains library files exported by yaml-cpp")
+
+ if (${RVS_OS_TYPE} STREQUAL "centos")
+ set(ROCT_LIB_DIR "${ROCM_PATH}/lib64" CACHE PATH "Contains library files exported by ROC Trunk")
+@@ -238,86 +235,6 @@ if (NOT DEFINED CPACK_GENERATOR )
+ endif()
+ message (STATUS "CPACK_GENERATOR ${CPACK_GENERATOR}" )
+
+-
+-################################################################################
+-# Download and unpack yaml-cpp at configure time
+-configure_file(CMakeYamlDownload.cmake yaml-download/CMakeLists.txt)
+-execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+- RESULT_VARIABLE result
+- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/yaml-download )
+-if(result)
+- message(FATAL_ERROR "CMake step for yaml-download failed: ${result}")
+-endif()
+-execute_process(COMMAND ${CMAKE_COMMAND} --build .
+- RESULT_VARIABLE result
+- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/yaml-download )
+-if(result)
+- message(FATAL_ERROR "Build step for yaml-download failed: ${result}")
+-endif()
+-execute_process(COMMAND ${CMAKE_COMMAND} ${CMAKE_BINARY_DIR}/yaml-src -B${CMAKE_BINARY_DIR}/yaml-build
+- RESULT_VARIABLE result
+- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/yaml-src )
+-if(result)
+- message(FATAL_ERROR "Config step for yaml-src failed: ${result}")
+-endif()
+-
+-add_custom_target(rvs_yaml_target
+- DEPENDS ${CMAKE_BINARY_DIR}/yaml-build/libyaml-cpp.a
+-)
+-
+-add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/yaml-build/libyaml-cpp.a
+- COMMAND make -C ${CMAKE_BINARY_DIR}/yaml-build
+- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/yaml-src
+- COMMENT "Generating yaml-cpp targets"
+- VERBATIM)
+-
+-################################################################################
+-## GOOGLE TEST
+-if(RVS_BUILD_TESTS)
+- # Download and unpack googletest at configure time
+- configure_file(CMakeGtestDownload.cmake googletest-download/CMakeLists.txt)
+- execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+- RESULT_VARIABLE result
+- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download )
+- if(result)
+- message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+- endif()
+- execute_process(COMMAND ${CMAKE_COMMAND} --build .
+- RESULT_VARIABLE result
+- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download )
+- if(result)
+- message(FATAL_ERROR "Build step for googletest failed: ${result}")
+- endif()
+- execute_process(COMMAND ${CMAKE_COMMAND} ${CMAKE_BINARY_DIR}/googletest-src -B${CMAKE_BINARY_DIR}/googletest-build
+- RESULT_VARIABLE result
+- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-src )
+- if(result)
+- message(FATAL_ERROR "Config step for googletest-src failed: ${result}")
+- endif()
+-
+- add_custom_target(rvs_gtest_target
+- DEPENDS ${CMAKE_BINARY_DIR}/googletest-build/lib/libgtest_main.a
+- )
+-
+- add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/googletest-build/lib/libgtest_main.a
+- COMMAND make -C ${CMAKE_BINARY_DIR}/googletest-build
+- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-src
+- COMMENT "Generating googletest targets"
+- VERBATIM)
+-
+- ## Set default unit test framework include path
+- if (NOT DEFINED UT_INC)
+- set (UT_INC "${CMAKE_BINARY_DIR}/googletest-src/googletest/include")
+- message ("UT_INC ${UT_INC}")
+- endif ()
+-
+- ## Set default unit test framework include path
+- if (NOT DEFINED UT_LIB)
+- set (UT_LIB "${CMAKE_BINARY_DIR}/googletest-build/lib")
+- message ("UT_LIB ${UT_LIB}")
+- endif()
+-
+-endif()
+ ################################################################################
+ ## rocBLAS
+
+@@ -441,8 +358,8 @@ if (RVS_ROCBLAS EQUAL 1)
+ set(ROCBLAS_INC_DIR "${CMAKE_BINARY_DIR}/rvs_rblas-src/build/release/rocblas-install")
+ set(ROCBLAS_LIB_DIR "${CMAKE_BINARY_DIR}/rvs_rblas-src/build/release/rocblas-install/lib/")
+ else()
+- set(ROCBLAS_INC_DIR "${ROCM_PATH}/include")
+- set(ROCBLAS_LIB_DIR "${ROCM_PATH}/lib")
++ set(ROCBLAS_INC_DIR "${ROCBLAS_DIR}/include")
++ set(ROCBLAS_LIB_DIR "${ROCBLAS_DIR}/lib")
+ endif()
+
+ if (RVS_ROCMSMI EQUAL 1)
+@@ -457,8 +374,8 @@ else()
+ set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
+ else()
+ message( STATUS "ROCBLAS REORG Enabled Version: ${RVS_ROCBLAS_VERSION_FLAT}" )
+- set(ROCM_SMI_INC_DIR "${ROCM_PATH}/include")
+- set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/lib")
++ set(ROCM_SMI_INC_DIR "${ROCM_SMI_DIR}/include")
++ set(ROCM_SMI_LIB_DIR "${ROCM_SMI_DIR}/lib")
+ endif()
+ endif()
+ set(ROCM_SMI_LIB "rocm_smi64" CACHE STRING "rocm_smi library name")
+@@ -502,7 +419,7 @@ if (RVS_BUILD_TESTS)
+ add_subdirectory(testif.so)
+ endif()
+
+-add_dependencies(rvshelper rvs_bin_folder rvs_doc rvs_yaml_target)
++add_dependencies(rvshelper rvs_bin_folder rvs_doc)
+
+
+ add_dependencies(pesm rvslib rvslibrt)
+@@ -537,7 +454,7 @@ if (RVS_BUILD_TESTS)
+ WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+ COMMENT "Create the bintest directory"
+ VERBATIM)
+- add_dependencies(rvshelper rvs_bintest_folder rvs_gtest_target)
++ add_dependencies(rvshelper rvs_bintest_folder)
+ endif()
+
+ add_custom_target(rvs_doc ALL
+diff --git a/babel.so/CMakeLists.txt b/babel.so/CMakeLists.txt
+index 7290cef..ebd55ad 100644
+--- a/babel.so/CMakeLists.txt
++++ b/babel.so/CMakeLists.txt
+@@ -107,13 +107,13 @@ set(HIP_HCC_LIB "amdhip64")
+ add_compile_options(-DRVS_ROCBLAS_VERSION_FLAT=${RVS_ROCBLAS_VERSION_FLAT})
+
+ # Determine Roc Runtime header files are accessible
+-if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime.h)
+- message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR})
++if(NOT EXISTS ${HIP_PATH}/include/hip/hip_runtime.h)
++ message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH})
+ RETURN()
+ endif()
+
+-if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime_api.h)
+- message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR})
++if(NOT EXISTS ${HIP_PATH}/include/hip/hip_runtime_api.h)
++ message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH})
+ RETURN()
+ endif()
+
+@@ -133,16 +133,16 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+
+
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
+- message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
++if(NOT EXISTS "${HIP_PATH}/lib/lib${HIP_HCC_LIB}.so")
++ message("ERROR: ROC Runtime libraries can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH})
+ RETURN()
+ endif()
+
+ ## define include directories
+-include_directories(./ ../ ${ROCR_INC_DIR} ${HIP_INC_DIR})
++include_directories(./ ../ ${HIP_PATH})
+
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${HIP_PATH}/lib/ ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so)
+
+diff --git a/cmake_modules/tests_unit.cmake b/cmake_modules/tests_unit.cmake
+index 586f453..c8b6560 100644
+--- a/cmake_modules/tests_unit.cmake
++++ b/cmake_modules/tests_unit.cmake
+@@ -27,7 +27,7 @@
+ ## define additional unit testing include directories
+ include_directories(${UT_INC})
+ ## define additional unit testing lib directories
+-link_directories(${UT_LIB} ${RVS_LIB_DIR})
++link_directories(${UT_LIB} ${RVS_LIB_DIR} ${ROCM_SMI_LIB_DIR})
+
+ file(GLOB TESTSOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} test/test*.cpp )
+ #message ( "TESTSOURCES: ${TESTSOURCES}" )
+diff --git a/edp.so/CMakeLists.txt b/edp.so/CMakeLists.txt
+index a933061..d117e03 100644
+--- a/edp.so/CMakeLists.txt
++++ b/edp.so/CMakeLists.txt
+@@ -129,6 +129,7 @@ endif()
+
+
+ if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
++ message("${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so not found")
+ message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+ RETURN()
+ endif()
+@@ -136,7 +137,7 @@ endif()
+ ## define include directories
+ include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpciaccess.so libpci.so libm.so)
+
+diff --git a/gm.so/CMakeLists.txt b/gm.so/CMakeLists.txt
+index afaafcb..7c0cd79 100644
+--- a/gm.so/CMakeLists.txt
++++ b/gm.so/CMakeLists.txt
+@@ -122,7 +122,7 @@ include_directories(./ ../ ${ROCM_SMI_INC_DIR})
+ # Add directories to look for library files to link
+ link_directories(${RVS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so librocm_smi64.so)
+
+ ## define source files
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/worker.cpp)
+@@ -133,7 +133,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${ROCM_SMI_LIB})
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS})
+ add_dependencies(${RVS_TARGET} rvslibrt rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/gpup.so/CMakeLists.txt b/gpup.so/CMakeLists.txt
+index ca1674b..a9e4d16 100644
+--- a/gpup.so/CMakeLists.txt
++++ b/gpup.so/CMakeLists.txt
+@@ -111,7 +111,7 @@ endif()
+ ## define include directories
+ include_directories(./ ../ include ../include)
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslibrt rvslib libpci.so libm.so)
+
+diff --git a/gst.so/CMakeLists.txt b/gst.so/CMakeLists.txt
+index d85eadb..ca7fff4 100644
+--- a/gst.so/CMakeLists.txt
++++ b/gst.so/CMakeLists.txt
+@@ -137,7 +137,7 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+
+
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so")
+ message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+ RETURN()
+ endif()
+@@ -145,7 +145,7 @@ endif()
+ ## define include directories
+ include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib/ ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so)
+
+diff --git a/iet.so/CMakeLists.txt b/iet.so/CMakeLists.txt
+index 3263d12..62f4318 100644
+--- a/iet.so/CMakeLists.txt
++++ b/iet.so/CMakeLists.txt
+@@ -140,7 +140,7 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+ endif()
+
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so")
+ message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+ RETURN()
+ endif()
+@@ -159,7 +159,7 @@ include_directories(./ ../ ${ROCM_SMI_INC_DIR} ${ROCBLAS_INC_DIR} ${ROCR_INC_DIR
+ # Add directories to look for library files to link
+ link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH})
+ ## additional libraries
+-set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so)
++set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so librocm_smi64.so)
+
+ set(SOURCES src/rvs_module.cpp src/action.cpp src/iet_worker.cpp )
+
+@@ -168,7 +168,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES})
+ set_target_properties(${RVS_TARGET} PROPERTIES
+ SUFFIX .so.${LIB_VERSION_STRING}
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB} ${ROCM_SMI_LIB})
++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_INC_DIR}/lib/ ${HIP_HCC_LIB} ${ROCBLAS_LIB} ${ROCM_SMI_LIB_DIR})
+ add_dependencies(${RVS_TARGET} rvslibrt rvslib)
+
+ add_custom_command(TARGET ${RVS_TARGET} POST_BUILD
+diff --git a/mem.so/CMakeLists.txt b/mem.so/CMakeLists.txt
+index 5a0f401..3fc4f51 100644
+--- a/mem.so/CMakeLists.txt
++++ b/mem.so/CMakeLists.txt
+@@ -134,7 +134,7 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+
+
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so")
+ message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+ RETURN()
+ endif()
+@@ -143,7 +143,7 @@ endif()
+ include_directories(./ ../ ${ROCR_INC_DIR} ${HIP_INC_DIR})
+
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so)
+
+diff --git a/pbqt.so/CMakeLists.txt b/pbqt.so/CMakeLists.txt
+index d75211d..80abe22 100644
+--- a/pbqt.so/CMakeLists.txt
++++ b/pbqt.so/CMakeLists.txt
+@@ -138,7 +138,7 @@ endif()
+ ## define include directories
+ include_directories(./ ../ pci ${ROCR_INC_DIR})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HSAKMT_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so)
+
+diff --git a/pebb.so/CMakeLists.txt b/pebb.so/CMakeLists.txt
+index 7ba031c..e64be8e 100644
+--- a/pebb.so/CMakeLists.txt
++++ b/pebb.so/CMakeLists.txt
+@@ -139,7 +139,7 @@ endif()
+ ## define include directories
+ include_directories(./ ../ pci ${ROCR_INC_DIR})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HSAKMT_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR} )
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so)
+
+diff --git a/peqt.so/CMakeLists.txt b/peqt.so/CMakeLists.txt
+index 2248d91..7f5912d 100644
+--- a/peqt.so/CMakeLists.txt
++++ b/peqt.so/CMakeLists.txt
+@@ -107,9 +107,9 @@ else()
+ endif()
+
+ ## define include directories
+-include_directories(./ ../)
++include_directories(./ ../ ${HSA_PATH})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${HSA_PATH}/lib/ ${HSAKMT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslibrt rvslib libpci.so libm.so)
+
+diff --git a/perf.so/CMakeLists.txt b/perf.so/CMakeLists.txt
+index b319396..b9abe15 100644
+--- a/perf.so/CMakeLists.txt
++++ b/perf.so/CMakeLists.txt
+@@ -137,7 +137,7 @@ if(DEFINED RVS_ROCMSMI)
+ endif()
+
+
+-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so")
++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so")
+ message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR})
+ RETURN()
+ endif()
+@@ -145,7 +145,7 @@ endif()
+ ## define include directories
+ include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR})
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so)
+
+diff --git a/pesm.so/CMakeLists.txt b/pesm.so/CMakeLists.txt
+index ff60729..e7a2402 100644
+--- a/pesm.so/CMakeLists.txt
++++ b/pesm.so/CMakeLists.txt
+@@ -109,7 +109,7 @@ endif()
+ ## define include directories
+ include_directories(./ ../ pci)
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so)
+
+diff --git a/rcqt.so/CMakeLists.txt b/rcqt.so/CMakeLists.txt
+index 32e1004..ac826ea 100644
+--- a/rcqt.so/CMakeLists.txt
++++ b/rcqt.so/CMakeLists.txt
+@@ -110,7 +110,7 @@ endif()
+ ## define include directories
+ include_directories(./ ../)
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ASAN_LIB_PATH} ${HSAKMT_LIB_DIR} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslibrt rvslib)
+
+diff --git a/rvs/CMakeLists.txt b/rvs/CMakeLists.txt
+index b350429..c855a32 100644
+--- a/rvs/CMakeLists.txt
++++ b/rvs/CMakeLists.txt
+@@ -115,7 +115,7 @@ endif()
+ ## define include directories
+ include_directories(./ ../ ${YAML_INC_DIR} ${YAML_LIB_DIR}/include)
+ ## define lib directories
+-link_directories(${CMAKE_CURRENT_BINARY_DIR} ${RVS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${CMAKE_CURRENT_BINARY_DIR} ${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS libdl.so "${YAML_LIB_DIR}/libyaml-cpp.a" libpthread.so)
+
+diff --git a/rvs/tests.cmake b/rvs/tests.cmake
+index 32301c8..a058749 100644
+--- a/rvs/tests.cmake
++++ b/rvs/tests.cmake
+@@ -179,7 +179,7 @@ add_test(NAME unit.ttf.rvs.config.noconfig
+ ## define include directories
+ include_directories(${UT_INC})
+ ## define lib directories
+-link_directories(${UT_LIB})
++link_directories(${UT_LIB} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries for unit tests
+ set (PROJECT_TEST_LINK_LIBS ${PROJECT_LINK_LIBS} libpci.so)
+
+diff --git a/rvslib/CMakeLists.txt b/rvslib/CMakeLists.txt
+index 31e6143..4ffed0f 100644
+--- a/rvslib/CMakeLists.txt
++++ b/rvslib/CMakeLists.txt
+@@ -115,7 +115,7 @@ endif()
+
+ ## define include directories
+ include_directories(./ ../
+- ${ROCM_SMI_INC_DIR} ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR}
++ ${ROCM_SMI_INC_DIR} ${HIP_PATH} ${ROCBLAS_INC_DIR}
+ )
+ link_directories(${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+
+diff --git a/smqt.so/CMakeLists.txt b/smqt.so/CMakeLists.txt
+index e6b8ec4..722f329 100644
+--- a/smqt.so/CMakeLists.txt
++++ b/smqt.so/CMakeLists.txt
+@@ -108,7 +108,7 @@ endif()
+ ## define include directories
+ include_directories(./ ../ pci)
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS rvslibrt rvslib libpci.so libm.so)
+
+diff --git a/testif.so/CMakeLists.txt b/testif.so/CMakeLists.txt
+index ed7d3d3..f09951e 100644
+--- a/testif.so/CMakeLists.txt
++++ b/testif.so/CMakeLists.txt
+@@ -110,7 +110,7 @@ endif()
+ ## define include directories
+ include_directories(./ ../ pci)
+ # Add directories to look for library files to link
+-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH})
++link_directories(${RVS_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR})
+ ## additional libraries
+ set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so)
+
+--
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/rocm-validation-suite/package.py b/var/spack/repos/builtin/packages/rocm-validation-suite/package.py
index c3f5c88c9a..dfefd8ef75 100644
--- a/var/spack/repos/builtin/packages/rocm-validation-suite/package.py
+++ b/var/spack/repos/builtin/packages/rocm-validation-suite/package.py
@@ -21,6 +21,8 @@ class RocmValidationSuite(CMakePackage):
maintainers("srekolam", "renjithravindrankannath")
+ version("5.6.1", sha256="d5e4100e2d07311dfa101563c15d026a8130442cdee8af9ef861832cd7866c0d")
+ version("5.6.0", sha256="54cc5167055870570c97ee7114f48d24d5415f984e0c9d7b58b83467e0cf18fb")
version("5.5.1", sha256="0fbfaa9f68642b590ef04f9778013925bbf3f17bdcd35d4c85a8ffd091169a6e")
version("5.5.0", sha256="296add772171db67ab8838d2db1ea56df21e895c0348c038768e40146e4fe86a")
version("5.4.3", sha256="1f0888e559104a4b8c2f5322f7463e425f2baaf12aeb1a8982a5974516e7b667")
@@ -111,7 +113,11 @@ class RocmValidationSuite(CMakePackage):
patch("006-library-path.patch", when="@4.5.0:5.2")
patch(
"007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.3.patch",
- when="@5.3.0:",
+ when="@5.3.0:5.5",
+ )
+ patch(
+ "007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.6.patch",
+ when="@5.6:",
)
depends_on("cmake@3.5:", type="build")
@@ -150,6 +156,8 @@ class RocmValidationSuite(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocminfo@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocminfo/package.py b/var/spack/repos/builtin/packages/rocminfo/package.py
index 1d45ed1f86..92fcd8c826 100644
--- a/var/spack/repos/builtin/packages/rocminfo/package.py
+++ b/var/spack/repos/builtin/packages/rocminfo/package.py
@@ -18,6 +18,8 @@ class Rocminfo(CMakePackage):
maintainers("srekolam", "renjithravindrankannath", "haampie")
version("master", branch="master")
+ version("5.6.1", sha256="780b186ac7410a503eca1060f4bbc35db1b7b4d1d714d15c7534cd26d8af7b54")
+ version("5.6.0", sha256="87d98a736e4f7510d1475d35717842068d826096a0af7c15a395bcf9d36d7fa0")
version("5.5.1", sha256="bcab27bb3595d5a4c981e2416458d169e85c27e603c22e743d9240473bfbe98a")
version("5.5.0", sha256="b6107d362b70e20a10911741eb44247139b4eb43489f7fa648daff880b6de37f")
version("5.4.3", sha256="72159eed31f8deee0df9228b9e306a18fe9efdd4d6c0eead871cad4617874170")
@@ -128,12 +130,14 @@ class Rocminfo(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
"master",
]:
depends_on("hsakmt-roct@" + ver, when="@" + ver)
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
def cmake_args(self):
diff --git a/var/spack/repos/builtin/packages/rocprim/package.py b/var/spack/repos/builtin/packages/rocprim/package.py
index daa6154482..5394f73958 100644
--- a/var/spack/repos/builtin/packages/rocprim/package.py
+++ b/var/spack/repos/builtin/packages/rocprim/package.py
@@ -16,6 +16,8 @@ class Rocprim(CMakePackage):
maintainers("cgmb", "srekolam", "renjithravindrankannath")
+ version("5.6.1", sha256="e9ec1b0039c07cf3096653a04224fe5fe755afc6ba000f6838b3a8bc84df27de")
+ version("5.6.0", sha256="360d6ece3c4a3c289dd88043432026fb989e982ae4d05230d8cdc858bcd50466")
version("5.5.1", sha256="63cdc682afb39efd18f097faf695ce64c851c4a550a8ad96fa89d694451b6a42")
version("5.5.0", sha256="968d9059f93d3f0f8a602f7b989e54e36cff2f9136486b6869e4534a5bf8c7d9")
version("5.4.3", sha256="7be6314a46195912d3203e7e59cb8880a46ed7c1fd221e92fadedd20532e0e48")
@@ -138,6 +140,8 @@ class Rocprim(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("comgr@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocrand/package.py b/var/spack/repos/builtin/packages/rocrand/package.py
index 893e3e4851..eb6496d338 100644
--- a/var/spack/repos/builtin/packages/rocrand/package.py
+++ b/var/spack/repos/builtin/packages/rocrand/package.py
@@ -25,6 +25,8 @@ class Rocrand(CMakePackage):
version("develop", branch="develop")
version("master", branch="master")
+ version("5.6.1", sha256="6bf71e687ffa0fcc1b00e3567dd43da4147a82390f1b2db5e6f1f594dee6066d")
+ version("5.6.0", sha256="cc894d2f1af55e16b62c179062063946609c656043556189c656a115fd7d6f5f")
version("5.5.1", sha256="e8bed3741b19e296bd698fc55b43686206f42f4deea6ace71513e0c48258cc6e")
version("5.5.0", sha256="0481e7ef74c181026487a532d1c17e62dd468e508106edde0279ca1adeee6f9a")
version("5.4.3", sha256="463aa760e9f74e45b326765040bb8a8a4fa27aaeaa5e5df16f8289125f88a619")
@@ -193,6 +195,8 @@ class Rocrand(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocsolver/package.py b/var/spack/repos/builtin/packages/rocsolver/package.py
index f8c081299f..3b1cfcb511 100644
--- a/var/spack/repos/builtin/packages/rocsolver/package.py
+++ b/var/spack/repos/builtin/packages/rocsolver/package.py
@@ -39,6 +39,8 @@ class Rocsolver(CMakePackage):
version("develop", branch="develop")
version("master", branch="master")
+ version("5.6.1", sha256="6a8f366218aee599a0e56755030f94ee690b34f30e6d602748632226c5dc21bb")
+ version("5.6.0", sha256="54baa7f35f3c53da9005054e6f7aeecece5526dafcb277af32cbcb3996b0cbbc")
version("5.5.1", sha256="8bf843e42d2e89203ea5fdb6e6082cea90da8d02920ab4c09bcc2b6f69909760")
version("5.5.0", sha256="6775aa5b96731208c12c5b450cf218d4c262a80b7ea20c2c3034c448bb2ca4d2")
version("5.4.3", sha256="5308b68ea72f465239a4bb2ed1a0507f0df7c98d3df3fd1f392e6d9ed7975232")
@@ -132,7 +134,7 @@ class Rocsolver(CMakePackage):
# Backport https://github.com/ROCmSoftwarePlatform/rocSOLVER/commit/2bbfb8976f6e4d667499c77e41a6433850063e88
patch("fmt-8.1-compatibility.patch", when="@4.5.0:5.1.3")
# Maximize compatibility with other libraries that are using fmt.
- patch("fmt-9-compatibility.patch", when="@5.2.0:")
+ patch("fmt-9-compatibility.patch", when="@5.2.0:5.5")
def check(self):
exe = join_path(self.build_directory, "clients", "staging", "rocsolver-test")
@@ -173,9 +175,13 @@ class Rocsolver(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocblas@" + ver, when="@" + ver)
+ for ver in ["5.6.0", "5.6.1"]:
+ depends_on("rocsparse@5.2:", when="@5.6:")
for tgt in itertools.chain(["auto"], amdgpu_targets):
depends_on("rocblas amdgpu_target={0}".format(tgt), when="amdgpu_target={0}".format(tgt))
diff --git a/var/spack/repos/builtin/packages/rocsparse/package.py b/var/spack/repos/builtin/packages/rocsparse/package.py
index 8f3693b469..4fb8fb1646 100644
--- a/var/spack/repos/builtin/packages/rocsparse/package.py
+++ b/var/spack/repos/builtin/packages/rocsparse/package.py
@@ -32,6 +32,9 @@ class Rocsparse(CMakePackage):
sticky=True,
)
variant("test", default=False, description="Build rocsparse-test client")
+
+ version("5.6.1", sha256="6a50a64354507f1374e1a86aa7f5c07d1aaa96ac193ac292c279153087bb5d54")
+ version("5.6.0", sha256="5797db3deb4a532e691447e3e8c923b93bd9fe4c468f3a88f00cecd80bebcae4")
version("5.5.1", sha256="1dd2d18898dfebdf898e8fe7d1c1198e8f8451fd70ff12a1990ec1419cf359e1")
version("5.5.0", sha256="cbee79b637691bc710c1c83fbaa91db7498d38d4df873be23e28ed5617acde72")
version("5.4.3", sha256="9fb633f235eb0567cc54fae6bdc779f16bf0bb4e6f5bdddb40312c6d11ca8478")
@@ -142,6 +145,8 @@ class Rocsparse(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocprim@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/rocthrust/package.py b/var/spack/repos/builtin/packages/rocthrust/package.py
index 62bf4fb3da..196bd7eaa1 100644
--- a/var/spack/repos/builtin/packages/rocthrust/package.py
+++ b/var/spack/repos/builtin/packages/rocthrust/package.py
@@ -19,6 +19,8 @@ class Rocthrust(CMakePackage):
maintainers("cgmb", "srekolam", "renjithravindrankannath")
+ version("5.6.1", sha256="63df61d5ab46d4cfda6066d748274bacecc77151692e372e6f7df5e91852bdc2")
+ version("5.6.0", sha256="e52a27bcb4add38a5f0f3a5c7e409c230bf4ba9afae19bd2e06c2be00d39db59")
version("5.5.1", sha256="66f126e5ea46ca761533411f81e83402773f95d3184cb7645ca73df227413023")
version("5.5.0", sha256="c031f71cd4b6eaf98664fd2ad50fc18f7ccbfa67be415dca425169d2d1c81e9e")
version("5.4.3", sha256="d133e14ea6d27d358d1bd4d31b79fb1562d1aea7c400e5a2d28d0f159cb6c8a8")
@@ -142,6 +144,8 @@ class Rocthrust(CMakePackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocprim@" + ver, when="@" + ver)
diff --git a/var/spack/repos/builtin/packages/roctracer-dev-api/package.py b/var/spack/repos/builtin/packages/roctracer-dev-api/package.py
index 505dc254c1..c7a80816c4 100644
--- a/var/spack/repos/builtin/packages/roctracer-dev-api/package.py
+++ b/var/spack/repos/builtin/packages/roctracer-dev-api/package.py
@@ -17,6 +17,8 @@ class RoctracerDevApi(Package):
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
+ version("5.6.1", sha256="007c498be25b067ad9a7631a2b0892f9129150ee9714e471a921225875d45e69")
+ version("5.6.0", sha256="cbcfe4fa2e8b627006b320a93992fb3078696d8ef2ef049b4b880b6b7d57e13e")
version("5.5.1", sha256="3afc31ebfdb14b0365185ca6b9326a83b1503a94a51d910f5ce7ced192d8c133")
version("5.5.0", sha256="fe9ad95628fa96639db6fc33f78d334c814c7161b4a754598f5a4a7852625777")
version("5.4.3", sha256="6b5111be5efd4d7fd6935ca99b06fab19b43d97a58d26fc1fe6e783c4de9a926")
diff --git a/var/spack/repos/builtin/packages/roctracer-dev/package.py b/var/spack/repos/builtin/packages/roctracer-dev/package.py
index b50574667e..328aa0844b 100644
--- a/var/spack/repos/builtin/packages/roctracer-dev/package.py
+++ b/var/spack/repos/builtin/packages/roctracer-dev/package.py
@@ -20,6 +20,8 @@ class RoctracerDev(CMakePackage, ROCmPackage):
maintainers("srekolam", "renjithravindrankannath")
libraries = ["libroctracer64"]
+ version("5.6.1", sha256="007c498be25b067ad9a7631a2b0892f9129150ee9714e471a921225875d45e69")
+ version("5.6.0", sha256="cbcfe4fa2e8b627006b320a93992fb3078696d8ef2ef049b4b880b6b7d57e13e")
version("5.5.1", sha256="3afc31ebfdb14b0365185ca6b9326a83b1503a94a51d910f5ce7ced192d8c133")
version("5.5.0", sha256="fe9ad95628fa96639db6fc33f78d334c814c7161b4a754598f5a4a7852625777")
version("5.4.3", sha256="6b5111be5efd4d7fd6935ca99b06fab19b43d97a58d26fc1fe6e783c4de9a926")
@@ -72,6 +74,8 @@ class RoctracerDev(CMakePackage, ROCmPackage):
"5.4.3",
"5.5.0",
"5.5.1",
+ "5.6.0",
+ "5.6.1",
]:
depends_on("hsakmt-roct@" + ver, when="@" + ver)
depends_on("hsa-rocr-dev@" + ver, when="@" + ver)
@@ -94,7 +98,7 @@ class RoctracerDev(CMakePackage, ROCmPackage):
]:
depends_on("rocprofiler-dev@" + ver, when="@" + ver)
- for ver in ["5.5.0", "5.5.1"]:
+ for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]:
depends_on("rocm-core@" + ver, when="@" + ver)
patch("0001-include-rocprofiler-dev-path.patch", when="@5.3:5.4")
diff --git a/var/spack/repos/builtin/packages/rocwmma/0001-add-rocm-smi-lib-path-for-building-tests.patch b/var/spack/repos/builtin/packages/rocwmma/0001-add-rocm-smi-lib-path-for-building-tests.patch
new file mode 100644
index 0000000000..cfa3cb4180
--- /dev/null
+++ b/var/spack/repos/builtin/packages/rocwmma/0001-add-rocm-smi-lib-path-for-building-tests.patch
@@ -0,0 +1,31 @@
+From 099ac638f41d9224f649fe23a64783bb408a2b09 Mon Sep 17 00:00:00 2001
+From: Sreenivasa Murthy Kolam <sreenivasamurthy.kolam@amd.com>
+Date: Wed, 30 Aug 2023 09:41:15 +0000
+Subject: [PATCH] add rocm-smi-lib path for building tests
+
+---
+ test/CMakeLists.txt | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
+index 85f98d0..269f517 100644
+--- a/test/CMakeLists.txt
++++ b/test/CMakeLists.txt
+@@ -69,11 +69,12 @@ function(add_rocwmma_test TEST_TARGET TEST_SOURCE)
+
+ list(APPEND TEST_SOURCE ${ARGN})
+ add_executable(${TEST_TARGET} ${TEST_SOURCE})
+- target_link_libraries(${TEST_TARGET} rocwmma gtest)
++ target_link_libraries(${TEST_TARGET} rocwmma gtest ${ROCM_SMI_DIR}/lib)
+ target_link_libraries(${TEST_TARGET} OpenMP::OpenMP_CXX "-L${HIP_CLANG_ROOT}/lib" "-Wl,-rpath=${HIP_CLANG_ROOT}/lib")
+ target_include_directories(${TEST_TARGET} PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}
+- ${ROCWMMA_TEST_INCLUDE_DIRS})
++ ${ROCWMMA_TEST_INCLUDE_DIRS}
++ ${ROCM_SMI_DIR}/include)
+
+ # Add support to include extended test coverage
+ if(ROCWMMA_BUILD_EXTENDED_TESTS)
+--
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/rocwmma/package.py b/var/spack/repos/builtin/packages/rocwmma/package.py
index 774bfe6b26..96978f7862 100644
--- a/var/spack/repos/builtin/packages/rocwmma/package.py
+++ b/var/spack/repos/builtin/packages/rocwmma/package.py
@@ -25,7 +25,8 @@ class Rocwmma(CMakePackage):
tags = ["rocm"]
maintainers("srekolam", "renjithravindrankannath")
-
+ version("5.6.1", sha256="41a5159ee1ad5fc411fe6220f37bd754e26d3883c24c0f2378f50ef628bc1b8f")
+ version("5.6.0", sha256="78b6ab10fce71d10a9d762b2eaab3390eb13b05c764f47a3b0a303ec3d37acf8")
version("5.5.1", sha256="ada30d5e52df5da0d3f4e212a25efb492dbedc129628f4db4ef4ed77667da228")
version("5.5.0", sha256="b9e1938cba111eeea295414c42de34d54a878f0d41a26e433809d60c12d31dbf")
version("5.4.3", sha256="0968366c83b78a9d058d483be536aba03e79b300ccb6890d3da43298be54c288")
@@ -59,16 +60,33 @@ class Rocwmma(CMakePackage):
depends_on("googletest@1.10.0:", type="test")
- for ver in ["5.2.0", "5.2.1", "5.2.3", "5.3.0", "5.3.3", "5.4.0", "5.4.3", "5.5.0", "5.5.1"]:
+ for ver in [
+ "5.2.0",
+ "5.2.1",
+ "5.2.3",
+ "5.3.0",
+ "5.3.3",
+ "5.4.0",
+ "5.4.3",
+ "5.5.0",
+ "5.5.1",
+ "5.6.0",
+ "5.6.1",
+ ]:
depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver)
depends_on("llvm-amdgpu@" + ver, type="build", when="@" + ver)
depends_on("hip@" + ver, when="@" + ver)
depends_on("rocblas@" + ver, type="build", when="@" + ver)
depends_on("rocm-openmp-extras@" + ver, type="build", when="@" + ver)
+ for ver in ["5.6.0", "5.6.1"]:
+ depends_on("rocm-smi-lib@" + ver, when="@" + ver)
+
for tgt in itertools.chain(["auto"], amdgpu_targets):
depends_on("rocblas amdgpu_target={0}".format(tgt), when="amdgpu_target={0}".format(tgt))
+ patch("0001-add-rocm-smi-lib-path-for-building-tests.patch", when="@5.6:")
+
def setup_build_environment(self, env):
env.set("CXX", self.spec["hip"].hipcc)
@@ -93,5 +111,7 @@ class Rocwmma(CMakePackage):
tgt = self.spec.variants["amdgpu_target"]
if "auto" not in tgt:
args.append(self.define_from_variant("AMDGPU_TARGETS", "amdgpu_target"))
+ if self.spec.satisfies("@5.6.0:"):
+ args.append(self.define("ROCM_SMI_DIR", self.spec["rocm-smi-lib"].prefix))
return args