diff options
author | renjithravindrankannath <94420380+renjithravindrankannath@users.noreply.github.com> | 2023-10-02 11:36:51 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-02 11:36:51 -0700 |
commit | 615312fceeae6402b6aa01cd42fde8511a5fb284 (patch) | |
tree | 4fbd0f03c107a21f746406e47e0d595f73ae9353 | |
parent | 453625014da8a471a34cc10b0c18ddd0e141f130 (diff) | |
download | spack-615312fceeae6402b6aa01cd42fde8511a5fb284.tar.gz spack-615312fceeae6402b6aa01cd42fde8511a5fb284.tar.bz2 spack-615312fceeae6402b6aa01cd42fde8511a5fb284.tar.xz spack-615312fceeae6402b6aa01cd42fde8511a5fb284.zip |
Rocm 5.6.0 & 5.6.1 release updates (#39673)
* 5.6.0 updates
* Rocm 5.6.0 updates
* Style and audit corrections for 5.6
* Patching smi path for tests.
* Style correction
* 5.6.1 updates
* Updated hip tests for ci build failure
Updated hiprand with the release tag
Taken care the review comment rocsolver
* Adding rocm-smi path for 5.6
* Adding the patch file
* Setting library directory uniform
* gl depends on mesa but it should not be llvm variant
* Fix for the issue 39520 by setting CMAKE_INSTALL_LIBDIR=lib
* i1 muls can sometimes happen after SCEV. They resulted in
ISel failures because we were missing the patterns for them.
* 5.6.0 & 5.6.1 updates for migraphx, miopen-hip, mivisionx
* Revert "5.6.0 & 5.6.1 updates for migraphx, miopen-hip, mivisionx"
This reverts commit f54c9c6c67a4e5a54859f59d6550eb8e542d6c26.
* Revert operator mixup fix
* Splitting compiler-rt-linkage-for-host and operator mixup patch
* Adding missing patch for reverting operator mixup
* 5.6 update for composable-kernel,migraphx,miopen-hip and mivisionx
* Updating rvs, rcd and rccl for 5.6.1. adding comment for llvm patch
53 files changed, 4047 insertions, 61 deletions
diff --git a/var/spack/repos/builtin/packages/comgr/package.py b/var/spack/repos/builtin/packages/comgr/package.py index 93a5fcf740..a17bcc7e94 100644 --- a/var/spack/repos/builtin/packages/comgr/package.py +++ b/var/spack/repos/builtin/packages/comgr/package.py @@ -21,6 +21,8 @@ class Comgr(CMakePackage): libraries = ["libamd_comgr"] version("master", branch="amd-stg-open") + version("5.6.1", sha256="0a85d84619f98be26ca7a32c71f94ed3c4e9866133789eabb451be64ce739300") + version("5.6.0", sha256="9396a7238b547ee68146c669b10b9d5de8f1d76527c649133c75d8076a185a72") version("5.5.1", sha256="0fbb15fe5a95c2e141ccd360bc413e1feda283334781540a6e5095ab27fd8019") version("5.5.0", sha256="97dfff03226ce0902b9d5d1c8c7bebb7a15978a81b6e9c750bf2d2473890bd42") version("5.4.3", sha256="8af18035550977fe0aa9cca8dfacbe65fe292e971de5a0e160710bafda05a81f") @@ -142,6 +144,8 @@ class Comgr(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", "master", ]: # llvm libs are linked statically, so this *could* be a build dep @@ -153,7 +157,7 @@ class Comgr(CMakePackage): "rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver) ) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) root_cmakelists_dir = join_path("lib", "comgr") diff --git a/var/spack/repos/builtin/packages/composable-kernel/package.py b/var/spack/repos/builtin/packages/composable-kernel/package.py index 57bfc6a17c..efa05197f9 100644 --- a/var/spack/repos/builtin/packages/composable-kernel/package.py +++ b/var/spack/repos/builtin/packages/composable-kernel/package.py @@ -17,6 +17,8 @@ class ComposableKernel(CMakePackage): maintainers("srekolam", "afzpatel") version("master", branch="develop") + version("5.6.1", commit="f5ec04f091fa5c48c67d7bacec36a414d0be06a5") + version("5.6.0", commit="f0fd02634c2f8f8c70f5a0ab2a8c84db5e36eeca") version("5.5.1", commit="ac9e01e2cc3721be24619807adc444e1f59a9d25") version("5.5.0", commit="8b76b832420a3d69708401de6607a033163edcce") version("5.4.3", commit="bb3d9546f186e39cefedc3e7f01d88924ba20168") @@ -40,7 +42,7 @@ class ComposableKernel(CMakePackage): depends_on("pkgconfig", type="build") depends_on("cmake@3.16:", type="build") - for ver in ["master", "5.5.1", "5.5.0", "5.4.3", "5.4.0"]: + for ver in ["master", "5.6.1", "5.6.0", "5.5.1", "5.5.0", "5.4.3", "5.4.0"]: depends_on("hip@" + ver, when="@" + ver) depends_on("llvm-amdgpu@" + ver, when="@" + ver) depends_on("rocm-cmake@" + ver, when="@" + ver, type="build") diff --git a/var/spack/repos/builtin/packages/hip-rocclr/package.py b/var/spack/repos/builtin/packages/hip-rocclr/package.py index 3c2239d179..e6a4b3bbdf 100644 --- a/var/spack/repos/builtin/packages/hip-rocclr/package.py +++ b/var/spack/repos/builtin/packages/hip-rocclr/package.py @@ -27,6 +27,8 @@ class HipRocclr(CMakePackage): return url.format(version) version("master", branch="main") + version("5.6.1", sha256="cc9a99c7e4de3d9360c0a471b27d626e84a39c9e60e0aff1e8e1500d82391819") + version("5.6.0", sha256="864f87323e793e60b16905284fba381a7182b960dd4a37fb67420c174442c03c") version("5.5.1", sha256="1375fc7723cfaa0ae22a78682186d4804188b0a54990bfd9c0b8eb421b85e37e") version("5.5.0", sha256="efbae9a1ef2ab3de5ca44091e9bb78522e76759c43524c1349114f9596cc61d1") version("5.4.3", sha256="71d9668619ab57ec8a4564d11860438c5aad5bd161a3e58fbc49555fbd59182d") @@ -140,6 +142,8 @@ class HipRocclr(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", "master", ]: depends_on("hsakmt-roct@" + ver, when="@" + ver) @@ -162,6 +166,8 @@ class HipRocclr(CMakePackage): # Add opencl sources thru the below for d_version, d_shasum in [ + ("5.6.1", "ec26049f7d93c95050c27ba65472736665ec7a40f25920a868616b2970f6b845"), + ("5.6.0", "52ab260d00d279c2a86c353901ffd88ee61b934ad89e9eb480f210656705f04e"), ("5.5.1", "a8a62a7c6fc5398406d2203b8cb75621a24944688e545d917033d87de2724498"), ("5.5.0", "0df9fa0b8aa0c8e6711d34eec0fdf1ed356adcd9625bc8f1ce9b3e72090f3e4f"), ("5.4.3", "b0f8339c844a2e62773bd85cd1e7c5ecddfe71d7c8e8d604e1a1d60900c30873"), diff --git a/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.5.6.0.patch b/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.5.6.0.patch new file mode 100644 index 0000000000..dfca3691f1 --- /dev/null +++ b/var/spack/repos/builtin/packages/hip/0014-remove-compiler-rt-linkage-for-host.5.6.0.patch @@ -0,0 +1,75 @@ +From cd4283eab943a3018237035afea61f1b5e0042cd Mon Sep 17 00:00:00 2001 +From: Renjith Ravindran <Renjith.RavindranKannath@amd.com> +Date: Wed, 27 Sep 2023 06:38:18 +0000 +Subject: [PATCH] Remove-compiler-rt-linkage-for-host + +--- + clr/hipamd/CMakeLists.txt | 6 ++++-- + clr/hipamd/hip-config.cmake.in | 1 - + hipcc/bin/hipcc.pl | 11 ++++++++--- + 3 files changed, 12 insertions(+), 6 deletions(-) + +diff --git a/clr/hipamd/CMakeLists.txt b/clr/hipamd/CMakeLists.txt +index c14a9ad..ca49f7f 100755 +--- a/clr/hipamd/CMakeLists.txt ++++ b/clr/hipamd/CMakeLists.txt +@@ -400,8 +400,10 @@ if (NOT ${HIPCC_BIN_DIR} STREQUAL "") + install(PROGRAMS ${HIPCC_BIN_DIR}/hipcc.pl DESTINATION bin) + install(PROGRAMS ${HIPCC_BIN_DIR}/hipconfig.pl DESTINATION bin) + install(PROGRAMS ${HIPCC_BIN_DIR}/hipvars.pm DESTINATION bin) +- install(PROGRAMS ${HIPCC_BIN_DIR}/hipcc.bat DESTINATION bin) +- install(PROGRAMS ${HIPCC_BIN_DIR}/hipconfig.bat DESTINATION bin) ++ if(WIN32) ++ install(PROGRAMS ${HIPCC_BIN_DIR}/hipcc.bat DESTINATION bin) ++ install(PROGRAMS ${HIPCC_BIN_DIR}/hipconfig.bat DESTINATION bin) ++ endif() + endif() + + ############################# +diff --git a/clr/hipamd/hip-config.cmake.in b/clr/hipamd/hip-config.cmake.in +index 537a599..7d10273 100755 +--- a/clr/hipamd/hip-config.cmake.in ++++ b/clr/hipamd/hip-config.cmake.in +@@ -245,7 +245,6 @@ if(HIP_COMPILER STREQUAL "clang") + # Add support for __fp16 and _Float16, explicitly link with compiler-rt + if( "${CLANGRT_BUILTINS_FETCH_EXIT_CODE}" STREQUAL "0" ) + # CLANG_RT Builtins found Successfully Set interface link libraries property +- set_property(TARGET hip::host APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}") + set_property(TARGET hip::device APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}") + else() + message(STATUS "clangrt builtins lib not found: ${CLANGRT_BUILTINS_FETCH_EXIT_CODE}") +diff --git a/hipcc/bin/hipcc.pl b/hipcc/bin/hipcc.pl +index 56dcda2..c7ae60b 100755 +--- a/hipcc/bin/hipcc.pl ++++ b/hipcc/bin/hipcc.pl +@@ -155,11 +155,15 @@ if ($HIP_PLATFORM eq "amd") { + if($isWindows) { + $execExtension = ".exe"; + } +- $HIPCC="$HIP_CLANG_PATH/clang++" . $execExtension; ++ # llvm_path is set inside the hip recipe ++ $LLVM_PATH= $ENV{'LLVM_PATH'}; ++ $HIPCC="${LLVM_PATH}/bin/clang++" . $execExtension; ++ + + # If $HIPCC clang++ is not compiled, use clang instead + if ( ! -e $HIPCC ) { +- $HIPCC="$HIP_CLANG_PATH/clang" . $execExtension; ++ $LLVM_PATH= $ENV{'LLVM_PATH'}; ++ $HIPCC="${LLVM_PATH}/bin/clang" . $execExtension; + $HIPLDFLAGS = "--driver-mode=g++"; + } + # to avoid using dk linker or MSVC linker +@@ -483,7 +487,8 @@ if($HIP_PLATFORM eq "amd"){ + $targetsStr = $ENV{HCC_AMDGPU_TARGET}; + } elsif (not $isWindows) { + # Else try using rocm_agent_enumerator +- $ROCM_AGENT_ENUM = "${ROCM_PATH}/bin/rocm_agent_enumerator"; ++ $ROCMINFO_PATH = $ENV{'ROCMINFO_PATH'} // $ROCMINFO_PATH; ++ $ROCM_AGENT_ENUM = "${ROCMINFO_PATH}/bin/rocm_agent_enumerator"; + $targetsStr = `${ROCM_AGENT_ENUM} -t GPU`; + $targetsStr =~ s/\n/,/g; + } +-- +2.31.1 + diff --git a/var/spack/repos/builtin/packages/hip/0015-reverting-operator-mixup-fix-for-slate.patch b/var/spack/repos/builtin/packages/hip/0015-reverting-operator-mixup-fix-for-slate.patch new file mode 100644 index 0000000000..36bfadfe94 --- /dev/null +++ b/var/spack/repos/builtin/packages/hip/0015-reverting-operator-mixup-fix-for-slate.patch @@ -0,0 +1,107 @@ +From 1d7f7eb9a52af2b83d3cb06bb4fe0f31eb47ce7f Mon Sep 17 00:00:00 2001 +From: Renjith Ravindran <Renjith.RavindranKannath@amd.com> +Date: Wed, 27 Sep 2023 07:07:01 +0000 +Subject: [PATCH] Reverting operator mixup fix for slate + +--- + .../include/hip/amd_detail/amd_hip_complex.h | 17 ++++------ + .../hip/amd_detail/amd_hip_vector_types.h | 31 +++++++++++-------- + 2 files changed, 24 insertions(+), 24 deletions(-) + +diff --git a/clr/hipamd/include/hip/amd_detail/amd_hip_complex.h b/clr/hipamd/include/hip/amd_detail/amd_hip_complex.h +index 9d9dfd5..eba6eb5 100644 +--- a/clr/hipamd/include/hip/amd_detail/amd_hip_complex.h ++++ b/clr/hipamd/include/hip/amd_detail/amd_hip_complex.h +@@ -106,20 +106,15 @@ THE SOFTWARE. + return lhs; \ + } + +-#define COMPLEX_MUL_PREOP_OVERLOAD(type) \ +- __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \ +- type temp{lhs}; \ +- lhs.x = rhs.x * temp.x - rhs.y * temp.y; \ +- lhs.y = rhs.y * temp.x + rhs.x * temp.y; \ +- return lhs; \ ++#define COMPLEX_MUL_PREOP_OVERLOAD(type) \ ++ __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \ ++ lhs = lhs * rhs; \ ++ return lhs; \ + } + + #define COMPLEX_DIV_PREOP_OVERLOAD(type) \ +- __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \ +- type temp; \ +- temp.x = (lhs.x*rhs.x + lhs.y * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \ +- temp.y = (lhs.y * rhs.x - lhs.x * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \ +- lhs = temp; \ ++ __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \ ++ lhs = lhs / rhs; \ + return lhs; \ + } + +diff --git a/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h b/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h +index 8215fb0..dfd3b39 100644 +--- a/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h ++++ b/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h +@@ -544,13 +544,6 @@ template <typename __T> struct is_scalar : public integral_constant<bool, __is_s + data *= x.data; + return *this; + } +- +- friend __HOST_DEVICE__ inline constexpr HIP_vector_type operator*( +- HIP_vector_type x, const HIP_vector_type& y) noexcept +- { +- return HIP_vector_type{ x } *= y; +- } +- + template< + typename U, + typename std::enable_if< +@@ -561,12 +554,6 @@ template <typename __T> struct is_scalar : public integral_constant<bool, __is_s + return *this *= HIP_vector_type{x}; + } + +- friend __HOST_DEVICE__ inline constexpr HIP_vector_type operator/( +- HIP_vector_type x, const HIP_vector_type& y) noexcept +- { +- return HIP_vector_type{ x } /= y; +- } +- + __HOST_DEVICE__ + HIP_vector_type& operator/=(const HIP_vector_type& x) noexcept + { +@@ -722,6 +709,15 @@ template <typename __T> struct is_scalar : public integral_constant<bool, __is_s + return HIP_vector_type<T, n>{x} -= y; + } + ++ template<typename T, unsigned int n> ++ __HOST_DEVICE__ ++ inline ++ constexpr ++ HIP_vector_type<T, n> operator*( ++ const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept ++ { ++ return HIP_vector_type<T, n>{x} *= y; ++ } + template<typename T, unsigned int n, typename U> + __HOST_DEVICE__ + inline +@@ -741,6 +737,15 @@ template <typename __T> struct is_scalar : public integral_constant<bool, __is_s + return HIP_vector_type<T, n>{x} *= y; + } + ++ template<typename T, unsigned int n> ++ __HOST_DEVICE__ ++ inline ++ constexpr ++ HIP_vector_type<T, n> operator/( ++ const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept ++ { ++ return HIP_vector_type<T, n>{x} /= y; ++ } + template<typename T, unsigned int n, typename U> + __HOST_DEVICE__ + inline +-- +2.31.1 + diff --git a/var/spack/repos/builtin/packages/hip/package.py b/var/spack/repos/builtin/packages/hip/package.py index 2bf04a1983..5e1d6744cb 100644 --- a/var/spack/repos/builtin/packages/hip/package.py +++ b/var/spack/repos/builtin/packages/hip/package.py @@ -25,6 +25,8 @@ class Hip(CMakePackage): libraries = ["libamdhip64"] version("master", branch="master") + version("5.6.1", sha256="4b3c4dfcf8595da0e1b8c3e8067b1ccebeaac337762ff098db14375fa8dd4487") + version("5.6.0", sha256="a8237768c1ae70029d972376f8d279f4de18a1e6106fff6215d1e16847bc375e") version("5.5.1", sha256="1f5f6bb72d8d64335ccc8242ef2e2ea8efeb380cce2997f475b1ee77528d9fb4") version("5.5.0", sha256="5b0d0253e62f85cc21d043513f7c11c64e4a4ec416159668f0b160d732d09a3c") version("5.4.3", sha256="23e51d3af517cd63019f8d199e46b84d5a18251d148e727f3985e8d99ccb0e58") @@ -162,6 +164,8 @@ class Hip(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hsakmt-roct@" + ver, when="@" + ver) depends_on("hsa-rocr-dev@" + ver, when="@" + ver) @@ -170,10 +174,10 @@ class Hip(CMakePackage): depends_on("rocminfo@" + ver, when="@" + ver) depends_on("roctracer-dev-api@" + ver, when="@" + ver) - for ver in ["5.4.0", "5.4.3", "5.5.0", "5.5.1"]: + for ver in ["5.4.0", "5.4.3", "5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("hipify-clang", when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) # hipcc likes to add `-lnuma` by default :( # ref https://github.com/ROCm-Developer-Tools/HIP/pull/2202 @@ -269,6 +273,55 @@ class Hip(CMakePackage): placement="rocclr", when="@{0}".format(d_version), ) + # Add hip-clr sources thru the below + for d_version, d_shasum in [ + ("5.6.1", "0b88af1e99643899d11b1c8cf8a3c46601051b328a5e0ffbd44ee88b7eb0db33"), + ("5.6.0", "8dcd99110737a294f67a805639cf372890c8ca16c7603caaa793e71e84478fe4"), + ]: + resource( + name="clr", + url="https://github.com/ROCm-Developer-Tools/clr/archive/refs/tags/rocm-{0}.tar.gz".format( + d_version + ), + sha256=d_shasum, + expand=True, + destination="", + placement="clr", + when="@{0}".format(d_version), + ) + + # Add hipcc sources thru the below + for d_version, d_shasum in [ + ("5.6.1", "5800fac92b841ef6f52acda78d9bf86f83970bec0fb848a6265d239bdb7eb51a"), + ("5.6.0", "fdb7fdc9e4648376120330f034ee8353038d34c8a015f9eb0c208c56eeddd097"), + ]: + resource( + name="hipcc", + url="https://github.com/ROCm-Developer-Tools/HIPCC/archive/refs/tags/rocm-{0}.tar.gz".format( + d_version + ), + sha256=d_shasum, + expand=True, + destination="", + placement="hipcc", + when="@{0}".format(d_version), + ) + # Add hiptests sources thru the below + for d_version, d_shasum in [ + ("5.6.1", "5b3002ddfafda162329e4d9e6ac1200eeb48ff08e666b342aa8aeca30750f48b"), + ("5.6.0", "8cf4509bf9c0747dab8ed8fec1365a9156792034b517207a0b2d63270429fd2e"), + ]: + resource( + name="hip-tests", + url="https://github.com/ROCm-Developer-Tools/hip-tests/archive/refs/tags/rocm-{0}.tar.gz".format( + d_version + ), + sha256=d_shasum, + expand=True, + destination="", + placement="hip-tests", + when="@{0}".format(d_version), + ) # Note: the ROCm ecosystem expects `lib/` and `bin/` folders with symlinks # in the parent directory of the package, which is incompatible with spack. # In hipcc the ROCM_PATH variable is used to point to the parent directory @@ -331,10 +384,11 @@ class Hip(CMakePackage): patch("0005-Disable-tests-4.1.0.patch", when="@4.1.0:4.3.2") patch("Add_missing_open_cl_header_file_for_4.3.0.patch", when="@4.3.0:4.3.2") - patch("0014-hip-test-file-reorg-5.4.0.patch", when="@5.4.0:") - patch("0016-hip-sample-fix-hipMalloc-call.patch", when="@5.4.3:") + patch("0014-hip-test-file-reorg-5.4.0.patch", when="@5.4.0:5.5") + patch("0016-hip-sample-fix-hipMalloc-call.patch", when="@5.4.3:5.5") patch("0014-remove-compiler-rt-linkage-for-host.5.5.0.patch", when="@5.5") - + patch("0014-remove-compiler-rt-linkage-for-host.5.6.0.patch", when="@5.6:") + patch("0015-reverting-operator-mixup-fix-for-slate.patch", when="@5.6:") # See https://github.com/ROCm-Developer-Tools/HIP/pull/3206 patch( "https://github.com/ROCm-Developer-Tools/HIP/commit/50ee82f6bc4aad10908ce09198c9f7ebfb2a3561.patch?full_index=1", @@ -346,8 +400,10 @@ class Hip(CMakePackage): def root_cmakelists_dir(self): if self.spec.satisfies("@:4.3.2"): return self.stage.source_path - else: + elif self.spec.satisfies("@4.5:5.5"): return "hipamd" + else: + return "clr" def get_paths(self): if self.spec.external: @@ -393,6 +449,7 @@ class Hip(CMakePackage): "llvm-amdgpu": rocm_prefix.llvm, "hsa-rocr-dev": rocm_prefix.hsa, "rocminfo": rocm_prefix, + "comgr": rocm_prefix, "rocm-device-libs": rocm_prefix, } @@ -405,6 +462,7 @@ class Hip(CMakePackage): "llvm-amdgpu": self.spec["llvm-amdgpu"].prefix, "hsa-rocr-dev": self.spec["hsa-rocr-dev"].prefix, "rocminfo": self.spec["rocminfo"].prefix, + "comgr": self.spec["comgr"].prefix, "rocm-device-libs": self.spec["llvm-amdgpu"].prefix, } @@ -476,6 +534,7 @@ class Hip(CMakePackage): # hiprtcCreateProgram: # https://github.com/RadeonOpenCompute/ROCm-CompilerSupport/blob/rocm-4.0.0/lib/comgr/src/comgr-env.cpp env.set("LLVM_PATH", paths["llvm-amdgpu"]) + env.set("COMGR_PATH", paths["comgr"]) # Finally we have to set --rocm-path=<prefix> ourselves, which is not # the same as --hip-device-lib-path (set by hipcc). It's used to set @@ -525,13 +584,20 @@ class Hip(CMakePackage): "hip-config.cmake.in", string=True, ) - if self.spec.satisfies("@5.2: +rocm"): + if self.spec.satisfies("@5.2:5.4 +rocm"): filter_file( '"${ROCM_PATH}/llvm"', self.spec["llvm-amdgpu"].prefix, "hipamd/hip-config.cmake.in", string=True, ) + if self.spec.satisfies("@5.6: +rocm"): + filter_file( + '"${ROCM_PATH}/llvm"', + self.spec["llvm-amdgpu"].prefix, + "clr/hipamd/hip-config.cmake.in", + string=True, + ) perl = self.spec["perl"].command kwargs = {"ignore_absent": False, "backup": False, "string": False} @@ -552,13 +618,13 @@ class Hip(CMakePackage): "roc-obj-ls", "hipvars.pm", ] - elif self.spec.satisfies("@4.5.0:"): + elif self.spec.satisfies("@4.5.0:5.5"): files = [] - filter_file(match, substitute, *files, **kwargs) - # This guy is used during the cmake phase, so we have to fix the - # shebang already here in case it is too long. - filter_shebang("hipconfig") - if self.spec.satisfies("@4.5.0:"): + filter_file(match, substitute, *files, **kwargs) + # This guy is used during the cmake phase, so we have to fix the + # shebang already here in case it is too long. + filter_shebang("hipconfig") + if self.spec.satisfies("@4.5.0:5.5"): perl = self.spec["perl"].command kwargs = {"ignore_absent": False, "backup": False, "string": False} with working_dir("hipamd/bin"): @@ -566,6 +632,18 @@ class Hip(CMakePackage): substitute = "#!{perl}".format(perl=perl) files = ["roc-obj-extract", "roc-obj-ls"] filter_file(match, substitute, *files, **kwargs) + if self.spec.satisfies("@5.6.0:"): + perl = self.spec["perl"].command + kwargs = {"ignore_absent": False, "backup": False, "string": False} + match = "^#!/usr/bin/perl" + substitute = "#!{perl}".format(perl=perl) + with working_dir("clr/hipamd/bin"): + files = ["roc-obj-extract", "roc-obj-ls"] + filter_file(match, substitute, *files, **kwargs) + with working_dir("hipcc/bin"): + files = [] + filter_file(match, substitute, *files, **kwargs) + filter_shebang("hipconfig") if "@3.7.0: +rocm" in self.spec: numactl = self.spec["numactl"].prefix.lib kwargs = {"ignore_absent": False, "backup": False, "string": False} @@ -573,7 +651,16 @@ class Hip(CMakePackage): with working_dir("bin"): match = " -lnuma" substitute = " -L{numactl} -lnuma".format(numactl=numactl) - filter_file(match, substitute, "hipcc", **kwargs) + if self.spec.satisfies("@4.5.0:5.5"): + filter_file(match, substitute, "hipcc", **kwargs) + if "@5.6.0: +rocm" in self.spec: + numactl = self.spec["numactl"].prefix.lib + kwargs = {"ignore_absent": False, "backup": False, "string": False} + + with working_dir("hipcc/src"): + match = " -lnuma" + substitute = " -L{numactl} -lnuma".format(numactl=numactl) + filter_file(match, substitute, "hipBin_amd.h", **kwargs) def flag_handler(self, name, flags): if name == "cxxflags" and self.spec.satisfies("@3.7.0:4.3.2"): @@ -609,21 +696,30 @@ class Hip(CMakePackage): if "@4.5.0:" in self.spec: args.append(self.define("HIP_COMMON_DIR", self.stage.source_path)) args.append(self.define("HIP_CATCH_TEST", "OFF")) - args.append(self.define("ROCCLR_PATH", self.stage.source_path + "/rocclr")) - args.append(self.define("AMD_OPENCL_PATH", self.stage.source_path + "/opencl")) + if "@4.5.0:5.5" in self.spec: + args.append(self.define("ROCCLR_PATH", self.stage.source_path + "rocclr")) + args.append(self.define("AMD_OPENCL_PATH", self.stage.source_path + "opencl")) if "@5.3.0:" in self.spec: args.append("-DCMAKE_INSTALL_LIBDIR=lib") - + if "@5.6.0:" in self.spec: + args.append(self.define("ROCCLR_PATH", self.stage.source_path + "/clr/rocclr")) + args.append(self.define("AMD_OPENCL_PATH", self.stage.source_path + "/clr/opencl")) + args.append(self.define("HIPCC_BIN_DIR", self.stage.source_path + "/hipcc/bin")), + args.append(self.define("CLR_BUILD_HIP", True)), + args.append(self.define("CLR_BUILD_OCL", False)), return args - test_src_dir = "samples" - @run_after("install") def cache_test_sources(self): """Copy the tests source files after the package is installed to an install test subdirectory for use during `spack test run`.""" if self.spec.satisfies("@:5.1.0"): return + else: + if "@:5.5" in self.spec: + self.test_src_dir = "samples" + else: + self.test_src_dir = "hip-tests/samples" self.cache_extra_test_sources([self.test_src_dir]) def test_samples(self): diff --git a/var/spack/repos/builtin/packages/hipblas/package.py b/var/spack/repos/builtin/packages/hipblas/package.py index 08998400ee..b0261bd5db 100644 --- a/var/spack/repos/builtin/packages/hipblas/package.py +++ b/var/spack/repos/builtin/packages/hipblas/package.py @@ -22,6 +22,8 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage): version("develop", branch="develop") version("master", branch="master") + version("5.6.1", sha256="f9da82fbefc68b84081ea0ed0139b91d2a540357fcf505c7f1d57eab01eb327c") + version("5.6.0", sha256="9453a31324e10ba528f8f4755d2c270d0ed9baa33e980d8f8383204d8e28a563") version("5.5.1", sha256="5920c9a9c83cf7e2b42d1f99f5d5091cac7f6c0a040a737e869e57b92d7045a9") version("5.5.0", sha256="b080c25cb61531228d26badcdca856c46c640035c058bfc1c9f63de65f418cd5") version("5.4.3", sha256="5acac147aafc15c249c2f24c19459135ed68b506403aa92e602b67cfc10c38b7") @@ -167,6 +169,8 @@ class Hipblas(CMakePackage, CudaPackage, ROCmPackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", "master", "develop", ]: diff --git a/var/spack/repos/builtin/packages/hipcub/package.py b/var/spack/repos/builtin/packages/hipcub/package.py index 94e6055705..cb878d1823 100644 --- a/var/spack/repos/builtin/packages/hipcub/package.py +++ b/var/spack/repos/builtin/packages/hipcub/package.py @@ -15,7 +15,8 @@ class Hipcub(CMakePackage, CudaPackage, ROCmPackage): tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") - + version("5.6.1", sha256="4b9479daa40424c9ddbc14ce967aa170680f8ca1ed01a514e6e30ccfa22552ce") + version("5.6.0", sha256="5e74ddbf833f39836bf9ec6c6750348c7386a85ca67aaf9bb54d16c9e1959031") version("5.5.1", sha256="ad83f3f1ed85ead9e3012906957c125a896168be913f6fb6af298228fc571480") version("5.5.0", sha256="3eec838119326a67eb4cc006c706e328f3a51a01e98bbfb518df8fe4a4707e13") version("5.4.3", sha256="cf528d9acb4f9b9c3aad439ae76bfc3d02be6e7a74d96099544e5d54e1a23675") @@ -148,6 +149,8 @@ class Hipcub(CMakePackage, CudaPackage, ROCmPackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("rocprim@" + ver, when="+rocm @" + ver) depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) diff --git a/var/spack/repos/builtin/packages/hipfft/package.py b/var/spack/repos/builtin/packages/hipfft/package.py index 92e3db29cd..046d908e3e 100644 --- a/var/spack/repos/builtin/packages/hipfft/package.py +++ b/var/spack/repos/builtin/packages/hipfft/package.py @@ -22,7 +22,8 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage): maintainers("renjithravindrankannath", "srekolam") version("master", branch="master") - + version("5.6.1", sha256="d2ae36b8eacd39b865e8a7972b8eb86bcea2de4ac90711bba7e29b39b01eaa74") + version("5.6.0", sha256="c7f425b693caf9371b42226d86392335d993a117d23219b6ba1fd13523cb8261") version("5.5.1", sha256="3addd15a459752ad657e84c2a7b6b6289600d1d0a5f90d6e0946ba11e8148fc0") version("5.5.0", sha256="47ec6f7da7346c312b80daaa8f763e86c7bdc33ac8617cfa3344068e5b20dd9e") version("5.4.3", sha256="ae37f40b6019a11f10646ef193716836f366d269eab3c5cc2ed09af85355b945") @@ -116,6 +117,8 @@ class Hipfft(CMakePackage, CudaPackage, ROCmPackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) depends_on("rocfft@" + ver, when="+rocm @" + ver) diff --git a/var/spack/repos/builtin/packages/hipfort/package.py b/var/spack/repos/builtin/packages/hipfort/package.py index ec1e64ce13..da688d9c1f 100644 --- a/var/spack/repos/builtin/packages/hipfort/package.py +++ b/var/spack/repos/builtin/packages/hipfort/package.py @@ -15,7 +15,8 @@ class Hipfort(CMakePackage): tags = ["rocm"] maintainers("cgmb", "srekolam", "renjithravindrankannath") - + version("5.6.1", sha256="a55345cc9ccaf0cd69d306b8eb9ec2a02c220a57e9c396443cc7273aa3377adc") + version("5.6.0", sha256="03176a099bc81e212ad1bf9d86f35561f8f2d21a2f126732d7620e1ea59888d5") version("5.5.1", sha256="abc59f7b81cbefbe3555cbf1bf0d80e8aa65901c70799748c40870fe6f3fea60") version("5.5.0", sha256="cae75ffeac129639cabebfe2f95f254c83d6c0a6cffd98142ea3537a132e42bb") version("5.4.3", sha256="1954a1cba351d566872ced5549b2ced7ab6332221e2b98dba3c07180dce8f173") @@ -118,6 +119,8 @@ class Hipfort(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hip@" + ver, type="build", when="@" + ver) diff --git a/var/spack/repos/builtin/packages/hipify-clang/0002-install-hipify-clang-in-bin-dir-and-llvm-clangs-head.patch b/var/spack/repos/builtin/packages/hipify-clang/0002-install-hipify-clang-in-bin-dir-and-llvm-clangs-head.patch new file mode 100644 index 0000000000..c2fad6d3f2 --- /dev/null +++ b/var/spack/repos/builtin/packages/hipify-clang/0002-install-hipify-clang-in-bin-dir-and-llvm-clangs-head.patch @@ -0,0 +1,13 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 80c8a3f..d2b88c0 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -137,7 +137,7 @@ install( + # install all folders under clang/version/ in CMAKE_INSTALL_PREFIX path + install( + DIRECTORY ${LLVM_DIR}/../../clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}/ +- DESTINATION . ++ DESTINATION ${CMAKE_INSTALL_PREFIX}/include + COMPONENT clang-resource-headers + FILES_MATCHING + PATTERN "*.h" diff --git a/var/spack/repos/builtin/packages/hipify-clang/package.py b/var/spack/repos/builtin/packages/hipify-clang/package.py index 65dd8df60f..dd6b99ee71 100644 --- a/var/spack/repos/builtin/packages/hipify-clang/package.py +++ b/var/spack/repos/builtin/packages/hipify-clang/package.py @@ -18,6 +18,8 @@ class HipifyClang(CMakePackage): maintainers("srekolam", "renjithravindrankannath") version("master", branch="master") + version("5.6.1", sha256="ec3a4f276556f9fd924ea3c89be11b6c6ddf999cdd4387f669e38e41ee0042e8") + version("5.6.0", sha256="a2572037a7d3bd0813bd6819a5e6c0e911678db5fd3ab15a65370601df91891b") version("5.5.1", sha256="35b9c07a7afaf9cf6f3bbe9dd147fa81b1b297af3e5e26e60c55629e83feaa48") version("5.5.0", sha256="1b75c702799ac93027337f8fb61d7c27ba960e8ece60d907fc8c5ab3f15c3fe9") version("5.4.3", sha256="79e27bd6c0a28e6a62b02dccc0b5d88a81f69fe58487e83f3b7ab47d6b64341b") @@ -103,7 +105,8 @@ class HipifyClang(CMakePackage): # the patch was added to install the targets in the correct directory structure # this will fix the issue https://github.com/spack/spack/issues/30711 - patch("0001-install-hipify-clang-in-bin-dir-and-llvm-clangs-head.patch", when="@5.1.0:") + patch("0001-install-hipify-clang-in-bin-dir-and-llvm-clangs-head.patch", when="@5.1.0:5.5") + patch("0002-install-hipify-clang-in-bin-dir-and-llvm-clangs-head.patch", when="@5.6:") depends_on("cmake@3.5:", type="build") for ver in [ @@ -132,11 +135,13 @@ class HipifyClang(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", "master", ]: depends_on("llvm-amdgpu@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) def setup_run_environment(self, env): diff --git a/var/spack/repos/builtin/packages/hiprand/package.py b/var/spack/repos/builtin/packages/hiprand/package.py index f55092df19..5f85c46cb9 100644 --- a/var/spack/repos/builtin/packages/hiprand/package.py +++ b/var/spack/repos/builtin/packages/hiprand/package.py @@ -22,6 +22,8 @@ class Hiprand(CMakePackage, CudaPackage, ROCmPackage): version("develop", branch="develop") version("master", branch="master") + version("5.6.1", sha256="a73d5578bc7f8dff0b8960e4bff97bc4fc28f508a19ed6acd1cfd4d3e76b47ee") + version("5.6.0", sha256="8c214e2f90337a5317a69950026bf337b1e567d43bb9ae64f2a802af2228c313") version("5.5.1", sha256="5df9d78eae0991be5ec9f60e8d3530fabc23793d9f9cf274b075d689675db04e") version("5.5.0", sha256="7c7dde7b989d5da9c0b0251233245f955b477c090462c7d34e3e0284c5fca761") version("5.4.3", sha256="7d3d04476880ec90c088dff81f69aac8699eaef972476000e5c4726584ffa98f") @@ -78,6 +80,8 @@ class Hiprand(CMakePackage, CudaPackage, ROCmPackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", "master", "develop", ]: diff --git a/var/spack/repos/builtin/packages/hipsolver/package.py b/var/spack/repos/builtin/packages/hipsolver/package.py index c1ca0db616..1ef38160a6 100644 --- a/var/spack/repos/builtin/packages/hipsolver/package.py +++ b/var/spack/repos/builtin/packages/hipsolver/package.py @@ -26,7 +26,8 @@ class Hipsolver(CMakePackage, CudaPackage, ROCmPackage): version("develop", branch="develop") version("master", branch="master") - + version("5.6.1", sha256="2e546bc7771f7bf0aa7892b69cded725941573e8b70614759c3d03c21eb78dde") + version("5.6.0", sha256="11fa51d210853d93d24d55b20367738e49711793412f58e8d7689710b92ae16c") version("5.5.1", sha256="826bd64a4887176595bb7319d9a3612e7327602efe1f42aa3f2ad0e783d1a180") version("5.5.0", sha256="0f45be0f90907381ae3e82424599e2ca2112d6411b4a64c72558d63f00409b83") version("5.4.3", sha256="02a1bffecc494393f49f97174db7d2c101db557d32404923a44520876e682e3a") @@ -106,6 +107,8 @@ class Hipsolver(CMakePackage, CudaPackage, ROCmPackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", "master", "develop", ]: diff --git a/var/spack/repos/builtin/packages/hipsparse/package.py b/var/spack/repos/builtin/packages/hipsparse/package.py index f698d783bc..696094cb5a 100644 --- a/var/spack/repos/builtin/packages/hipsparse/package.py +++ b/var/spack/repos/builtin/packages/hipsparse/package.py @@ -19,6 +19,8 @@ class Hipsparse(CMakePackage, CudaPackage, ROCmPackage): maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie") libraries = ["libhipsparse"] + version("5.6.1", sha256="d636d0c5d1e38cc0c09b1e95380199ec82bd465b94bd6661f0c8d9374d9b565d") + version("5.6.0", sha256="3a6931b744ebaa4469a4c50d059a008403e4dc2a4f04dd69c3c6d20916b4a491") version("5.5.1", sha256="3d291e4fe2c611d555e54de66149b204fe7ac59f5dd00a9ad93bc6dca0528880") version("5.5.0", sha256="8122c8f17d899385de83efb7ac0d8a4fabfcd2aa21bbed63e63ea7adf0d22df6") version("5.4.3", sha256="b373eccd03679a13fab4e740fc780da25cbd598abca3a1e5e3613ae14954f9db") @@ -149,6 +151,8 @@ class Hipsparse(CMakePackage, CudaPackage, ROCmPackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) depends_on("rocsparse@" + ver, when="+rocm @" + ver) diff --git a/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py b/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py index 40ec2435f5..8a24050226 100644 --- a/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py +++ b/var/spack/repos/builtin/packages/hsa-rocr-dev/package.py @@ -24,6 +24,8 @@ class HsaRocrDev(CMakePackage): libraries = ["libhsa-runtime64"] version("master", branch="master") + version("5.6.1", sha256="4de9a57c2092edf9398d671c8a2c60626eb7daf358caf710da70d9c105490221") + version("5.6.0", sha256="30875d440df9d8481ffb24d87755eae20a0efc1114849a72619ea954f1e9206c") version("5.5.1", sha256="53d84ad5ba5086ed4ad67ad892c52c0e4eba8ddfa85c2dd341bf825f4d5fe4ee") version("5.5.0", sha256="8dbc776b56f93ddaa2ca38bf3b88299b8091de7c1b3f2e481064896cf6808e6c") version("5.4.3", sha256="a600eed848d47a7578c60da7e64eb92f29bbce2ec67932b251eafd4c2974cb67") @@ -145,6 +147,8 @@ class HsaRocrDev(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", "master", ]: depends_on("hsakmt-roct@" + ver, when="@" + ver) @@ -153,7 +157,7 @@ class HsaRocrDev(CMakePackage): depends_on( "rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver) ) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) # Both 3.5.0 and 3.7.0 force INSTALL_RPATH in different ways @@ -198,4 +202,6 @@ class HsaRocrDev(CMakePackage): args.append(self.define("BITCODE_DIR", bitcode_dir)) + if self.spec.satisfies("@5.6:"): + args.append("-DCMAKE_INSTALL_LIBDIR=lib") return args diff --git a/var/spack/repos/builtin/packages/hsakmt-roct/package.py b/var/spack/repos/builtin/packages/hsakmt-roct/package.py index 1a2ce25a04..0bebaae6bf 100644 --- a/var/spack/repos/builtin/packages/hsakmt-roct/package.py +++ b/var/spack/repos/builtin/packages/hsakmt-roct/package.py @@ -22,6 +22,8 @@ class HsakmtRoct(CMakePackage): maintainers("srekolam", "renjithravindrankannath") version("master", branch="master") + version("5.6.1", sha256="d60b355bfd21a08e0e36270fd56f98d052c3c6edca47da887fa32bf32759c29b") + version("5.6.0", sha256="cd009c5c09f664f046c428ba9843582ab468f7b88d560747eb949d8d7f8c5567") version("5.5.1", sha256="4ffde3fc1f91f24cdbf09263fd8e012a3995ad10854f4c1d866beab7b9f36bf4") version("5.5.0", sha256="2b11fd8937c2b06cd4ddea2c3699fbf3d1651892c4c5957d38553b993dd9af18") version("5.4.3", sha256="3799abbe7177fbff3b304e2a363e2b39e8864f8650ae569b2b88b9291f9a710c") @@ -114,7 +116,7 @@ class HsakmtRoct(CMakePackage): for ver in ["5.3.0", "5.4.0", "5.4.3"]: depends_on("llvm-amdgpu@" + ver, type="test", when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) depends_on("llvm-amdgpu@" + ver, type="test", when="@" + ver) diff --git a/var/spack/repos/builtin/packages/llvm-amdgpu/001-Add-i1-mul-patterns.patch b/var/spack/repos/builtin/packages/llvm-amdgpu/001-Add-i1-mul-patterns.patch new file mode 100644 index 0000000000..f93fcb99db --- /dev/null +++ b/var/spack/repos/builtin/packages/llvm-amdgpu/001-Add-i1-mul-patterns.patch @@ -0,0 +1,2842 @@ +From a0f3d7f45075a3e9545c0c9fa25a9f8fc840cdd7 Mon Sep 17 00:00:00 2001 +From: Renjith Ravindran <Renjith.RavindranKannath@amd.com> +Date: Mon, 25 Sep 2023 18:38:17 +0000 +Subject: [PATCH] i1 muls can sometimes happen after SCEV. They resulted in + ISel failures because we were missing the patterns for them. + +--- + llvm/lib/Target/AMDGPU/SIInstructions.td | 10 + + llvm/test/CodeGen/AMDGPU/mul.ll | 2676 ++++++++++++++++++++-- + 2 files changed, 2544 insertions(+), 142 deletions(-) + +diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td +index 03b2160a1..3bf4e42de 100644 +--- a/llvm/lib/Target/AMDGPU/SIInstructions.td ++++ b/llvm/lib/Target/AMDGPU/SIInstructions.td +@@ -2372,6 +2372,11 @@ def : GCNPat < + (S_AND_B64 $src0, $src1) + >; + ++def : GCNPat < ++ (i1 (mul i1:$src0, i1:$src1)), ++ (S_AND_B64 $src0, $src1) ++>; ++ + def : GCNPat < + (i1 (or i1:$src0, i1:$src1)), + (S_OR_B64 $src0, $src1) +@@ -2411,6 +2416,11 @@ def : GCNPat < + (S_AND_B32 $src0, $src1) + >; + ++def : GCNPat < ++ (i1 (mul i1:$src0, i1:$src1)), ++ (S_AND_B32 $src0, $src1) ++>; ++ + def : GCNPat < + (i1 (or i1:$src0, i1:$src1)), + (S_OR_B32 $src0, $src1) +diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll +index 85dd59a0c..a8973d845 100644 +--- a/llvm/test/CodeGen/AMDGPU/mul.ll ++++ b/llvm/test/CodeGen/AMDGPU/mul.ll +@@ -1,20 +1,129 @@ +-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,FUNC %s +-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,FUNC %s +-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=FUNC,GFX9PLUS %s +-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=FUNC,GFX9PLUS %s +-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=FUNC,GFX9PLUS %s +-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=EG,FUNC %s ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s ++; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s ++; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ++; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s ++; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s ++; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s + + ; mul24 and mad24 are affected + +-; FUNC-LABEL: {{^}}test_mul_v2i32: +-; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +-; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +- +-; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +-; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +- + define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ++; SI-LABEL: test_mul_v2i32: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ++; SI-NEXT: s_mov_b32 s7, 0xf000 ++; SI-NEXT: s_mov_b32 s6, -1 ++; SI-NEXT: s_mov_b32 s10, s6 ++; SI-NEXT: s_mov_b32 s11, s7 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_mov_b32 s8, s2 ++; SI-NEXT: s_mov_b32 s9, s3 ++; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ++; SI-NEXT: s_mov_b32 s4, s0 ++; SI-NEXT: s_mov_b32 s5, s1 ++; SI-NEXT: s_waitcnt vmcnt(0) ++; SI-NEXT: v_mul_lo_u32 v1, v1, v3 ++; SI-NEXT: v_mul_lo_u32 v0, v0, v2 ++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: test_mul_v2i32: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; VI-NEXT: s_mov_b32 s7, 0xf000 ++; VI-NEXT: s_mov_b32 s6, -1 ++; VI-NEXT: s_mov_b32 s10, s6 ++; VI-NEXT: s_mov_b32 s11, s7 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_mov_b32 s8, s2 ++; VI-NEXT: s_mov_b32 s9, s3 ++; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ++; VI-NEXT: s_mov_b32 s4, s0 ++; VI-NEXT: s_mov_b32 s5, s1 ++; VI-NEXT: s_waitcnt vmcnt(0) ++; VI-NEXT: v_mul_lo_u32 v1, v1, v3 ++; VI-NEXT: v_mul_lo_u32 v0, v0, v2 ++; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: test_mul_v2i32: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; GFX9-NEXT: s_mov_b32 s7, 0xf000 ++; GFX9-NEXT: s_mov_b32 s6, -1 ++; GFX9-NEXT: s_mov_b32 s10, s6 ++; GFX9-NEXT: s_mov_b32 s11, s7 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mov_b32 s8, s2 ++; GFX9-NEXT: s_mov_b32 s9, s3 ++; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ++; GFX9-NEXT: s_mov_b32 s4, s0 ++; GFX9-NEXT: s_mov_b32 s5, s1 ++; GFX9-NEXT: s_waitcnt vmcnt(0) ++; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 ++; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2 ++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: test_mul_v2i32: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; GFX10-NEXT: s_mov_b32 s6, -1 ++; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s10, s6 ++; GFX10-NEXT: s_mov_b32 s11, s7 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mov_b32 s8, s2 ++; GFX10-NEXT: s_mov_b32 s9, s3 ++; GFX10-NEXT: s_mov_b32 s4, s0 ++; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ++; GFX10-NEXT: s_mov_b32 s5, s1 ++; GFX10-NEXT: s_waitcnt vmcnt(0) ++; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3 ++; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 ++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: test_mul_v2i32: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ++; GFX11-NEXT: s_mov_b32 s6, -1 ++; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s10, s6 ++; GFX11-NEXT: s_mov_b32 s11, s7 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mov_b32 s8, s2 ++; GFX11-NEXT: s_mov_b32 s9, s3 ++; GFX11-NEXT: s_mov_b32 s4, s0 ++; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 ++; GFX11-NEXT: s_mov_b32 s5, s1 ++; GFX11-NEXT: s_waitcnt vmcnt(0) ++; GFX11-NEXT: v_mul_lo_u32 v1, v1, v3 ++; GFX11-NEXT: v_mul_lo_u32 v0, v0, v2 ++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: test_mul_v2i32: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ++; EG-NEXT: TEX 0 @6 ++; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: Fetch clause starting at 6: ++; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 ++; EG-NEXT: ALU clause starting at 8: ++; EG-NEXT: MOV * T0.X, KC0[2].Z, ++; EG-NEXT: ALU clause starting at 9: ++; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T0.W, ++; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, ++; EG-NEXT: MULLO_INT * T0.X, T0.X, T0.Z, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ++entry: + %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 + %a = load <2 x i32>, ptr addrspace(1) %in + %b = load <2 x i32>, ptr addrspace(1) %b_ptr +@@ -23,18 +132,142 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 + ret void + } + +-; FUNC-LABEL: {{^}}v_mul_v4i32: +-; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +-; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +-; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +-; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +- +-; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +-; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +-; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +-; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +- + define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ++; SI-LABEL: v_mul_v4i32: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ++; SI-NEXT: s_mov_b32 s7, 0xf000 ++; SI-NEXT: s_mov_b32 s6, -1 ++; SI-NEXT: s_mov_b32 s10, s6 ++; SI-NEXT: s_mov_b32 s11, s7 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_mov_b32 s8, s2 ++; SI-NEXT: s_mov_b32 s9, s3 ++; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ++; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ++; SI-NEXT: s_mov_b32 s4, s0 ++; SI-NEXT: s_mov_b32 s5, s1 ++; SI-NEXT: s_waitcnt vmcnt(0) ++; SI-NEXT: v_mul_lo_u32 v3, v3, v7 ++; SI-NEXT: v_mul_lo_u32 v2, v2, v6 ++; SI-NEXT: v_mul_lo_u32 v1, v1, v5 ++; SI-NEXT: v_mul_lo_u32 v0, v0, v4 ++; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: v_mul_v4i32: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; VI-NEXT: s_mov_b32 s7, 0xf000 ++; VI-NEXT: s_mov_b32 s6, -1 ++; VI-NEXT: s_mov_b32 s10, s6 ++; VI-NEXT: s_mov_b32 s11, s7 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_mov_b32 s8, s2 ++; VI-NEXT: s_mov_b32 s9, s3 ++; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ++; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ++; VI-NEXT: s_mov_b32 s4, s0 ++; VI-NEXT: s_mov_b32 s5, s1 ++; VI-NEXT: s_waitcnt vmcnt(0) ++; VI-NEXT: v_mul_lo_u32 v3, v3, v7 ++; VI-NEXT: v_mul_lo_u32 v2, v2, v6 ++; VI-NEXT: v_mul_lo_u32 v1, v1, v5 ++; VI-NEXT: v_mul_lo_u32 v0, v0, v4 ++; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: v_mul_v4i32: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; GFX9-NEXT: s_mov_b32 s7, 0xf000 ++; GFX9-NEXT: s_mov_b32 s6, -1 ++; GFX9-NEXT: s_mov_b32 s10, s6 ++; GFX9-NEXT: s_mov_b32 s11, s7 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mov_b32 s8, s2 ++; GFX9-NEXT: s_mov_b32 s9, s3 ++; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ++; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ++; GFX9-NEXT: s_mov_b32 s4, s0 ++; GFX9-NEXT: s_mov_b32 s5, s1 ++; GFX9-NEXT: s_waitcnt vmcnt(0) ++; GFX9-NEXT: v_mul_lo_u32 v3, v3, v7 ++; GFX9-NEXT: v_mul_lo_u32 v2, v2, v6 ++; GFX9-NEXT: v_mul_lo_u32 v1, v1, v5 ++; GFX9-NEXT: v_mul_lo_u32 v0, v0, v4 ++; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: v_mul_v4i32: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; GFX10-NEXT: s_mov_b32 s6, -1 ++; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s10, s6 ++; GFX10-NEXT: s_mov_b32 s11, s7 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mov_b32 s8, s2 ++; GFX10-NEXT: s_mov_b32 s9, s3 ++; GFX10-NEXT: s_clause 0x1 ++; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ++; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ++; GFX10-NEXT: s_mov_b32 s4, s0 ++; GFX10-NEXT: s_mov_b32 s5, s1 ++; GFX10-NEXT: s_waitcnt vmcnt(0) ++; GFX10-NEXT: v_mul_lo_u32 v3, v3, v7 ++; GFX10-NEXT: v_mul_lo_u32 v2, v2, v6 ++; GFX10-NEXT: v_mul_lo_u32 v1, v1, v5 ++; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4 ++; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: v_mul_v4i32: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ++; GFX11-NEXT: s_mov_b32 s6, -1 ++; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s10, s6 ++; GFX11-NEXT: s_mov_b32 s11, s7 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mov_b32 s8, s2 ++; GFX11-NEXT: s_mov_b32 s9, s3 ++; GFX11-NEXT: s_clause 0x1 ++; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 ++; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16 ++; GFX11-NEXT: s_mov_b32 s4, s0 ++; GFX11-NEXT: s_mov_b32 s5, s1 ++; GFX11-NEXT: s_waitcnt vmcnt(0) ++; GFX11-NEXT: v_mul_lo_u32 v3, v3, v7 ++; GFX11-NEXT: v_mul_lo_u32 v2, v2, v6 ++; GFX11-NEXT: v_mul_lo_u32 v1, v1, v5 ++; GFX11-NEXT: v_mul_lo_u32 v0, v0, v4 ++; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: v_mul_v4i32: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ++; EG-NEXT: TEX 1 @6 ++; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: Fetch clause starting at 6: ++; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 ++; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 ++; EG-NEXT: ALU clause starting at 10: ++; EG-NEXT: MOV * T0.X, KC0[2].Z, ++; EG-NEXT: ALU clause starting at 11: ++; EG-NEXT: MULLO_INT * T0.W, T0.W, T1.W, ++; EG-NEXT: MULLO_INT * T0.Z, T0.Z, T1.Z, ++; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.Y, ++; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, ++; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ++entry: + %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 + %a = load <4 x i32>, ptr addrspace(1) %in + %b = load <4 x i32>, ptr addrspace(1) %b_ptr +@@ -43,24 +276,232 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % + ret void + } + +-; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32: +-; GCN: s_load_dword +-; GCN: s_load_dword +-; GCN: s_mul_i32 +-; GCN: buffer_store_dword + define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) { ++; SI-LABEL: s_trunc_i64_mul_to_i32: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_load_dword s7, s[0:1], 0xd ++; SI-NEXT: s_mov_b32 s3, 0xf000 ++; SI-NEXT: s_mov_b32 s2, -1 ++; SI-NEXT: s_mov_b32 s0, s4 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_mul_i32 s4, s7, s6 ++; SI-NEXT: s_mov_b32 s1, s5 ++; SI-NEXT: v_mov_b32_e32 v0, s4 ++; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: s_trunc_i64_mul_to_i32: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_load_dword s7, s[0:1], 0x34 ++; VI-NEXT: s_mov_b32 s3, 0xf000 ++; VI-NEXT: s_mov_b32 s2, -1 ++; VI-NEXT: s_mov_b32 s0, s4 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_mul_i32 s4, s7, s6 ++; VI-NEXT: s_mov_b32 s1, s5 ++; VI-NEXT: v_mov_b32_e32 v0, s4 ++; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: s_trunc_i64_mul_to_i32: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 ++; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 ++; GFX9-NEXT: s_mov_b32 s3, 0xf000 ++; GFX9-NEXT: s_mov_b32 s2, -1 ++; GFX9-NEXT: s_mov_b32 s0, s4 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mul_i32 s4, s7, s6 ++; GFX9-NEXT: s_mov_b32 s1, s5 ++; GFX9-NEXT: v_mov_b32_e32 v0, s4 ++; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: s_trunc_i64_mul_to_i32: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_clause 0x1 ++; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ++; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX10-NEXT: s_mul_i32 s0, s2, s6 ++; GFX10-NEXT: s_mov_b32 s6, -1 ++; GFX10-NEXT: v_mov_b32_e32 v0, s0 ++; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: s_trunc_i64_mul_to_i32: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_clause 0x1 ++; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ++; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX11-NEXT: s_mul_i32 s0, s0, s6 ++; GFX11-NEXT: s_mov_b32 s6, -1 ++; GFX11-NEXT: v_mov_b32_e32 v0, s0 ++; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: s_trunc_i64_mul_to_i32: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: ALU clause starting at 4: ++; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ++; EG-NEXT: MULLO_INT * T1.X, KC0[3].Y, KC0[2].W, ++entry: + %mul = mul i64 %b, %a + %trunc = trunc i64 %mul to i32 + store i32 %trunc, ptr addrspace(1) %out, align 8 + ret void + } + +-; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32: +-; GCN: s_load_dword +-; GCN: s_load_dword +-; GCN: v_mul_lo_u32 +-; GCN: buffer_store_dword + define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ++; SI-LABEL: v_trunc_i64_mul_to_i32: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ++; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ++; SI-NEXT: s_mov_b32 s3, 0xf000 ++; SI-NEXT: s_mov_b32 s2, -1 ++; SI-NEXT: s_mov_b32 s14, s2 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_mov_b32 s12, s6 ++; SI-NEXT: s_mov_b32 s13, s7 ++; SI-NEXT: s_mov_b32 s15, s3 ++; SI-NEXT: s_mov_b32 s10, s2 ++; SI-NEXT: s_mov_b32 s11, s3 ++; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ++; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 ++; SI-NEXT: s_mov_b32 s0, s4 ++; SI-NEXT: s_mov_b32 s1, s5 ++; SI-NEXT: s_waitcnt vmcnt(0) ++; SI-NEXT: v_mul_lo_u32 v0, v1, v0 ++; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: v_trunc_i64_mul_to_i32: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ++; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ++; VI-NEXT: s_mov_b32 s3, 0xf000 ++; VI-NEXT: s_mov_b32 s2, -1 ++; VI-NEXT: s_mov_b32 s14, s2 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_mov_b32 s12, s6 ++; VI-NEXT: s_mov_b32 s13, s7 ++; VI-NEXT: s_mov_b32 s15, s3 ++; VI-NEXT: s_mov_b32 s10, s2 ++; VI-NEXT: s_mov_b32 s11, s3 ++; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ++; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 ++; VI-NEXT: s_mov_b32 s0, s4 ++; VI-NEXT: s_mov_b32 s1, s5 ++; VI-NEXT: s_waitcnt vmcnt(0) ++; VI-NEXT: v_mul_lo_u32 v0, v1, v0 ++; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: v_trunc_i64_mul_to_i32: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ++; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ++; GFX9-NEXT: s_mov_b32 s3, 0xf000 ++; GFX9-NEXT: s_mov_b32 s2, -1 ++; GFX9-NEXT: s_mov_b32 s14, s2 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mov_b32 s12, s6 ++; GFX9-NEXT: s_mov_b32 s13, s7 ++; GFX9-NEXT: s_mov_b32 s15, s3 ++; GFX9-NEXT: s_mov_b32 s10, s2 ++; GFX9-NEXT: s_mov_b32 s11, s3 ++; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 ++; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 ++; GFX9-NEXT: s_mov_b32 s0, s4 ++; GFX9-NEXT: s_mov_b32 s1, s5 ++; GFX9-NEXT: s_waitcnt vmcnt(0) ++; GFX9-NEXT: v_mul_lo_u32 v0, v1, v0 ++; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: v_trunc_i64_mul_to_i32: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_clause 0x1 ++; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ++; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ++; GFX10-NEXT: s_mov_b32 s2, -1 ++; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s14, s2 ++; GFX10-NEXT: s_mov_b32 s15, s3 ++; GFX10-NEXT: s_mov_b32 s10, s2 ++; GFX10-NEXT: s_mov_b32 s11, s3 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mov_b32 s12, s6 ++; GFX10-NEXT: s_mov_b32 s13, s7 ++; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 ++; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 ++; GFX10-NEXT: s_mov_b32 s0, s4 ++; GFX10-NEXT: s_mov_b32 s1, s5 ++; GFX10-NEXT: s_waitcnt vmcnt(0) ++; GFX10-NEXT: v_mul_lo_u32 v0, v1, v0 ++; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: v_trunc_i64_mul_to_i32: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_clause 0x1 ++; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ++; GFX11-NEXT: s_mov_b32 s10, -1 ++; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s14, s10 ++; GFX11-NEXT: s_mov_b32 s15, s11 ++; GFX11-NEXT: s_mov_b32 s2, s10 ++; GFX11-NEXT: s_mov_b32 s3, s11 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mov_b32 s12, s6 ++; GFX11-NEXT: s_mov_b32 s13, s7 ++; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 ++; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 ++; GFX11-NEXT: s_mov_b32 s8, s4 ++; GFX11-NEXT: s_mov_b32 s9, s5 ++; GFX11-NEXT: s_waitcnt vmcnt(0) ++; GFX11-NEXT: v_mul_lo_u32 v0, v1, v0 ++; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: v_trunc_i64_mul_to_i32: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] ++; EG-NEXT: TEX 1 @6 ++; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: Fetch clause starting at 6: ++; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 ++; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ++; EG-NEXT: ALU clause starting at 10: ++; EG-NEXT: MOV T0.X, KC0[2].Z, ++; EG-NEXT: MOV * T1.X, KC0[2].W, ++; EG-NEXT: ALU clause starting at 12: ++; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, ++; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ++entry: + %a = load i64, ptr addrspace(1) %aptr, align 8 + %b = load i64, ptr addrspace(1) %bptr, align 8 + %mul = mul i64 %b, %a +@@ -71,13 +512,93 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add + + ; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top + ; 32-bits of both arguments are sign bits. +-; FUNC-LABEL: {{^}}mul64_sext_c: +-; EG-DAG: MULLO_INT +-; EG-DAG: MULHI_INT +-; SI-DAG: s_mulk_i32 +-; SI-DAG: v_mul_hi_i32 +-; VI: v_mad_i64_i32 ++ + define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ++; SI-LABEL: mul64_sext_c: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dword s4, s[0:1], 0xb ++; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ++; SI-NEXT: v_mov_b32_e32 v0, 0x50 ++; SI-NEXT: s_mov_b32 s3, 0xf000 ++; SI-NEXT: s_mov_b32 s2, -1 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: v_mul_hi_i32 v1, s4, v0 ++; SI-NEXT: s_mulk_i32 s4, 0x50 ++; SI-NEXT: v_mov_b32_e32 v0, s4 ++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: mul64_sext_c: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ++; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ++; VI-NEXT: v_mov_b32_e32 v0, 0x50 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0 ++; VI-NEXT: s_mov_b32 s3, 0xf000 ++; VI-NEXT: s_mov_b32 s2, -1 ++; VI-NEXT: s_nop 2 ++; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: mul64_sext_c: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ++; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ++; GFX9-NEXT: s_mov_b32 s7, 0xf000 ++; GFX9-NEXT: s_mov_b32 s6, -1 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mul_hi_i32 s0, s2, 0x50 ++; GFX9-NEXT: s_mulk_i32 s2, 0x50 ++; GFX9-NEXT: v_mov_b32_e32 v0, s2 ++; GFX9-NEXT: v_mov_b32_e32 v1, s0 ++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: mul64_sext_c: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_clause 0x1 ++; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ++; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ++; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s6, -1 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 ++; GFX10-NEXT: s_mul_hi_i32 s1, s2, 0x50 ++; GFX10-NEXT: v_mov_b32_e32 v0, s0 ++; GFX10-NEXT: v_mov_b32_e32 v1, s1 ++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: mul64_sext_c: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_clause 0x1 ++; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 ++; GFX11-NEXT: s_mul_hi_i32 s2, s2, 0x50 ++; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ++; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 ++; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s2, -1 ++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: mul64_sext_c: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: ALU clause starting at 4: ++; EG-NEXT: MULHI_INT * T0.Y, KC0[2].Z, literal.x, ++; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) ++; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, ++; EG-NEXT: MULLO_INT * T0.X, KC0[2].Z, literal.y, ++; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) + entry: + %0 = sext i32 %in to i64 + %1 = mul i64 %0, 80 +@@ -85,14 +606,125 @@ entry: + ret void + } + +-; FUNC-LABEL: {{^}}v_mul64_sext_c: +-; EG-DAG: MULLO_INT +-; EG-DAG: MULHI_INT +-; SI-DAG: v_mul_lo_u32 +-; SI-DAG: v_mul_hi_i32 +-; VI: v_mad_i64_i32 +-; GCN: s_endpgm + define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { ++; SI-LABEL: v_mul64_sext_c: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ++; SI-NEXT: s_mov_b32 s7, 0xf000 ++; SI-NEXT: s_mov_b32 s6, -1 ++; SI-NEXT: s_mov_b32 s10, s6 ++; SI-NEXT: s_mov_b32 s11, s7 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_mov_b32 s8, s2 ++; SI-NEXT: s_mov_b32 s9, s3 ++; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ++; SI-NEXT: s_movk_i32 s2, 0x50 ++; SI-NEXT: s_mov_b32 s4, s0 ++; SI-NEXT: s_mov_b32 s5, s1 ++; SI-NEXT: s_waitcnt vmcnt(0) ++; SI-NEXT: v_mul_hi_i32 v1, v0, s2 ++; SI-NEXT: v_mul_lo_u32 v0, v0, s2 ++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: v_mul64_sext_c: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; VI-NEXT: s_mov_b32 s7, 0xf000 ++; VI-NEXT: s_mov_b32 s6, -1 ++; VI-NEXT: s_mov_b32 s10, s6 ++; VI-NEXT: s_mov_b32 s11, s7 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_mov_b32 s8, s2 ++; VI-NEXT: s_mov_b32 s9, s3 ++; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ++; VI-NEXT: s_movk_i32 s2, 0x50 ++; VI-NEXT: s_mov_b32 s4, s0 ++; VI-NEXT: s_mov_b32 s5, s1 ++; VI-NEXT: s_waitcnt vmcnt(0) ++; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0 ++; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: v_mul64_sext_c: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; GFX9-NEXT: s_mov_b32 s7, 0xf000 ++; GFX9-NEXT: s_mov_b32 s6, -1 ++; GFX9-NEXT: s_mov_b32 s10, s6 ++; GFX9-NEXT: s_mov_b32 s11, s7 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mov_b32 s8, s2 ++; GFX9-NEXT: s_mov_b32 s9, s3 ++; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 ++; GFX9-NEXT: s_movk_i32 s2, 0x50 ++; GFX9-NEXT: s_mov_b32 s4, s0 ++; GFX9-NEXT: s_mov_b32 s5, s1 ++; GFX9-NEXT: s_waitcnt vmcnt(0) ++; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2 ++; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 ++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: v_mul64_sext_c: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; GFX10-NEXT: s_mov_b32 s6, -1 ++; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s10, s6 ++; GFX10-NEXT: s_mov_b32 s11, s7 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mov_b32 s8, s2 ++; GFX10-NEXT: s_mov_b32 s9, s3 ++; GFX10-NEXT: s_mov_b32 s4, s0 ++; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 ++; GFX10-NEXT: s_mov_b32 s5, s1 ++; GFX10-NEXT: s_waitcnt vmcnt(0) ++; GFX10-NEXT: v_mul_hi_i32 v1, 0x50, v0 ++; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0 ++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: v_mul64_sext_c: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ++; GFX11-NEXT: s_mov_b32 s6, -1 ++; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s10, s6 ++; GFX11-NEXT: s_mov_b32 s11, s7 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mov_b32 s8, s2 ++; GFX11-NEXT: s_mov_b32 s9, s3 ++; GFX11-NEXT: s_mov_b32 s4, s0 ++; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 ++; GFX11-NEXT: s_mov_b32 s5, s1 ++; GFX11-NEXT: s_waitcnt vmcnt(0) ++; GFX11-NEXT: v_mul_hi_i32 v1, 0x50, v0 ++; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0 ++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: v_mul64_sext_c: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ++; EG-NEXT: TEX 0 @6 ++; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: Fetch clause starting at 6: ++; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ++; EG-NEXT: ALU clause starting at 8: ++; EG-NEXT: MOV * T0.X, KC0[2].Z, ++; EG-NEXT: ALU clause starting at 9: ++; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x, ++; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) ++; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, ++; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y, ++; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) ++entry: + %val = load i32, ptr addrspace(1) %in, align 4 + %ext = sext i32 %val to i64 + %mul = mul i64 %ext, 80 +@@ -100,12 +732,122 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 + ret void + } + +-; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm: +-; SI-DAG: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, 9 +-; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9 +-; VI: v_mad_i64_i32 v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, 9, 0 +-; GCN: s_endpgm + define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) { ++; SI-LABEL: v_mul64_sext_inline_imm: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ++; SI-NEXT: s_mov_b32 s7, 0xf000 ++; SI-NEXT: s_mov_b32 s6, -1 ++; SI-NEXT: s_mov_b32 s10, s6 ++; SI-NEXT: s_mov_b32 s11, s7 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_mov_b32 s8, s2 ++; SI-NEXT: s_mov_b32 s9, s3 ++; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ++; SI-NEXT: s_mov_b32 s4, s0 ++; SI-NEXT: s_mov_b32 s5, s1 ++; SI-NEXT: s_waitcnt vmcnt(0) ++; SI-NEXT: v_mul_hi_i32 v1, v0, 9 ++; SI-NEXT: v_mul_lo_u32 v0, v0, 9 ++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: v_mul64_sext_inline_imm: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; VI-NEXT: s_mov_b32 s7, 0xf000 ++; VI-NEXT: s_mov_b32 s6, -1 ++; VI-NEXT: s_mov_b32 s10, s6 ++; VI-NEXT: s_mov_b32 s11, s7 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_mov_b32 s8, s2 ++; VI-NEXT: s_mov_b32 s9, s3 ++; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ++; VI-NEXT: s_mov_b32 s4, s0 ++; VI-NEXT: s_mov_b32 s5, s1 ++; VI-NEXT: s_waitcnt vmcnt(0) ++; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0 ++; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: v_mul64_sext_inline_imm: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; GFX9-NEXT: s_mov_b32 s7, 0xf000 ++; GFX9-NEXT: s_mov_b32 s6, -1 ++; GFX9-NEXT: s_mov_b32 s10, s6 ++; GFX9-NEXT: s_mov_b32 s11, s7 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mov_b32 s8, s2 ++; GFX9-NEXT: s_mov_b32 s9, s3 ++; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 ++; GFX9-NEXT: s_mov_b32 s4, s0 ++; GFX9-NEXT: s_mov_b32 s5, s1 ++; GFX9-NEXT: s_waitcnt vmcnt(0) ++; GFX9-NEXT: v_mul_hi_i32 v1, v0, 9 ++; GFX9-NEXT: v_mul_lo_u32 v0, v0, 9 ++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: v_mul64_sext_inline_imm: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; GFX10-NEXT: s_mov_b32 s6, -1 ++; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s10, s6 ++; GFX10-NEXT: s_mov_b32 s11, s7 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mov_b32 s8, s2 ++; GFX10-NEXT: s_mov_b32 s9, s3 ++; GFX10-NEXT: s_mov_b32 s4, s0 ++; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 ++; GFX10-NEXT: s_mov_b32 s5, s1 ++; GFX10-NEXT: s_waitcnt vmcnt(0) ++; GFX10-NEXT: v_mul_hi_i32 v1, v0, 9 ++; GFX10-NEXT: v_mul_lo_u32 v0, v0, 9 ++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: v_mul64_sext_inline_imm: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ++; GFX11-NEXT: s_mov_b32 s6, -1 ++; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s10, s6 ++; GFX11-NEXT: s_mov_b32 s11, s7 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mov_b32 s8, s2 ++; GFX11-NEXT: s_mov_b32 s9, s3 ++; GFX11-NEXT: s_mov_b32 s4, s0 ++; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 ++; GFX11-NEXT: s_mov_b32 s5, s1 ++; GFX11-NEXT: s_waitcnt vmcnt(0) ++; GFX11-NEXT: v_mul_hi_i32 v1, v0, 9 ++; GFX11-NEXT: v_mul_lo_u32 v0, v0, 9 ++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: v_mul64_sext_inline_imm: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ++; EG-NEXT: TEX 0 @6 ++; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: Fetch clause starting at 6: ++; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ++; EG-NEXT: ALU clause starting at 8: ++; EG-NEXT: MOV * T0.X, KC0[2].Z, ++; EG-NEXT: ALU clause starting at 9: ++; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x, ++; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) ++; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, ++; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y, ++; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44) ++entry: + %val = load i32, ptr addrspace(1) %in, align 4 + %ext = sext i32 %val to i64 + %mul = mul i64 %ext, 9 +@@ -113,22 +855,202 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad + ret void + } + +-; FUNC-LABEL: {{^}}s_mul_i32: +-; GCN: s_load_dword [[SRC0:s[0-9]+]], +-; GCN: s_load_dword [[SRC1:s[0-9]+]], +-; GCN: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]] +-; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +-; GCN: buffer_store_dword [[VRESULT]], +-; GCN: s_endpgm + define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind { ++; SI-LABEL: s_mul_i32: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dword s4, s[0:1], 0x13 ++; SI-NEXT: s_load_dword s5, s[0:1], 0x1c ++; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ++; SI-NEXT: s_mov_b32 s3, 0xf000 ++; SI-NEXT: s_mov_b32 s2, -1 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_mul_i32 s4, s4, s5 ++; SI-NEXT: v_mov_b32_e32 v0, s4 ++; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: s_mul_i32: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dword s4, s[0:1], 0x4c ++; VI-NEXT: s_load_dword s5, s[0:1], 0x70 ++; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ++; VI-NEXT: s_mov_b32 s3, 0xf000 ++; VI-NEXT: s_mov_b32 s2, -1 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_mul_i32 s4, s4, s5 ++; VI-NEXT: v_mov_b32_e32 v0, s4 ++; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: s_mul_i32: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c ++; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 ++; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ++; GFX9-NEXT: s_mov_b32 s7, 0xf000 ++; GFX9-NEXT: s_mov_b32 s6, -1 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mul_i32 s0, s2, s3 ++; GFX9-NEXT: v_mov_b32_e32 v0, s0 ++; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: s_mul_i32: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_clause 0x2 ++; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c ++; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 ++; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ++; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s6, -1 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mul_i32 s0, s2, s3 ++; GFX10-NEXT: v_mov_b32_e32 v0, s0 ++; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: s_mul_i32: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_clause 0x2 ++; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c ++; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 ++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mul_i32 s2, s2, s3 ++; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX11-NEXT: v_mov_b32_e32 v0, s2 ++; GFX11-NEXT: s_mov_b32 s2, -1 ++; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: s_mul_i32: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: ALU clause starting at 4: ++; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ++; EG-NEXT: MULLO_INT * T1.X, KC0[4].Z, KC0[6].W, ++entry: + %mul = mul i32 %a, %b + store i32 %mul, ptr addrspace(1) %out, align 4 + ret void + } + +-; FUNC-LABEL: {{^}}v_mul_i32: +-; GCN: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ++; SI-LABEL: v_mul_i32: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ++; SI-NEXT: s_mov_b32 s7, 0xf000 ++; SI-NEXT: s_mov_b32 s6, -1 ++; SI-NEXT: s_mov_b32 s10, s6 ++; SI-NEXT: s_mov_b32 s11, s7 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_mov_b32 s8, s2 ++; SI-NEXT: s_mov_b32 s9, s3 ++; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ++; SI-NEXT: s_mov_b32 s4, s0 ++; SI-NEXT: s_mov_b32 s5, s1 ++; SI-NEXT: s_waitcnt vmcnt(0) ++; SI-NEXT: v_mul_lo_u32 v0, v0, v1 ++; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: v_mul_i32: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; VI-NEXT: s_mov_b32 s7, 0xf000 ++; VI-NEXT: s_mov_b32 s6, -1 ++; VI-NEXT: s_mov_b32 s10, s6 ++; VI-NEXT: s_mov_b32 s11, s7 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_mov_b32 s8, s2 ++; VI-NEXT: s_mov_b32 s9, s3 ++; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ++; VI-NEXT: s_mov_b32 s4, s0 ++; VI-NEXT: s_mov_b32 s5, s1 ++; VI-NEXT: s_waitcnt vmcnt(0) ++; VI-NEXT: v_mul_lo_u32 v0, v0, v1 ++; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: v_mul_i32: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; GFX9-NEXT: s_mov_b32 s7, 0xf000 ++; GFX9-NEXT: s_mov_b32 s6, -1 ++; GFX9-NEXT: s_mov_b32 s10, s6 ++; GFX9-NEXT: s_mov_b32 s11, s7 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mov_b32 s8, s2 ++; GFX9-NEXT: s_mov_b32 s9, s3 ++; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ++; GFX9-NEXT: s_mov_b32 s4, s0 ++; GFX9-NEXT: s_mov_b32 s5, s1 ++; GFX9-NEXT: s_waitcnt vmcnt(0) ++; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1 ++; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: v_mul_i32: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; GFX10-NEXT: s_mov_b32 s6, -1 ++; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s10, s6 ++; GFX10-NEXT: s_mov_b32 s11, s7 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mov_b32 s8, s2 ++; GFX10-NEXT: s_mov_b32 s9, s3 ++; GFX10-NEXT: s_mov_b32 s4, s0 ++; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ++; GFX10-NEXT: s_mov_b32 s5, s1 ++; GFX10-NEXT: s_waitcnt vmcnt(0) ++; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1 ++; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: v_mul_i32: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ++; GFX11-NEXT: s_mov_b32 s6, -1 ++; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s10, s6 ++; GFX11-NEXT: s_mov_b32 s11, s7 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mov_b32 s8, s2 ++; GFX11-NEXT: s_mov_b32 s9, s3 ++; GFX11-NEXT: s_mov_b32 s4, s0 ++; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 ++; GFX11-NEXT: s_mov_b32 s5, s1 ++; GFX11-NEXT: s_waitcnt vmcnt(0) ++; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1 ++; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: v_mul_i32: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ++; EG-NEXT: TEX 0 @6 ++; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: Fetch clause starting at 6: ++; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 ++; EG-NEXT: ALU clause starting at 8: ++; EG-NEXT: MOV * T0.X, KC0[2].Z, ++; EG-NEXT: ALU clause starting at 9: ++; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, ++; EG-NEXT: MULLO_INT * T0.X, T0.X, T0.Y, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ++entry: + %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 + %a = load i32, ptr addrspace(1) %in + %b = load i32, ptr addrspace(1) %b_ptr +@@ -137,6 +1059,298 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in + ret void + } + ++define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind { ++; SI-LABEL: s_mul_i1: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dword s2, s[0:1], 0x13 ++; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ++; SI-NEXT: s_load_dword s3, s[0:1], 0x1c ++; SI-NEXT: s_mov_b32 s7, 0xf000 ++; SI-NEXT: s_mov_b32 s6, -1 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_bitcmp1_b32 s2, 0 ++; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ++; SI-NEXT: s_bitcmp1_b32 s3, 0 ++; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ++; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ++; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ++; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: s_mul_i1: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dword s2, s[0:1], 0x4c ++; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ++; VI-NEXT: s_load_dword s3, s[0:1], 0x70 ++; VI-NEXT: s_mov_b32 s7, 0xf000 ++; VI-NEXT: s_mov_b32 s6, -1 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_bitcmp1_b32 s2, 0 ++; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ++; VI-NEXT: s_bitcmp1_b32 s3, 0 ++; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ++; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ++; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ++; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: s_mul_i1: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c ++; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ++; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 ++; GFX9-NEXT: s_mov_b32 s7, 0xf000 ++; GFX9-NEXT: s_mov_b32 s6, -1 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_bitcmp1_b32 s2, 0 ++; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ++; GFX9-NEXT: s_bitcmp1_b32 s3, 0 ++; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ++; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ++; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ++; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: s_mul_i1: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_clause 0x2 ++; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c ++; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 ++; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ++; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s6, -1 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_bitcmp1_b32 s2, 0 ++; GFX10-NEXT: s_cselect_b32 s0, -1, 0 ++; GFX10-NEXT: s_bitcmp1_b32 s3, 0 ++; GFX10-NEXT: s_cselect_b32 s1, -1, 0 ++; GFX10-NEXT: s_and_b32 s0, s0, s1 ++; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ++; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: s_mul_i1: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_clause 0x2 ++; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c ++; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 ++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_bitcmp1_b32 s2, 0 ++; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ++; GFX11-NEXT: s_bitcmp1_b32 s3, 0 ++; GFX11-NEXT: s_cselect_b32 s3, -1, 0 ++; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ++; GFX11-NEXT: s_and_b32 s2, s2, s3 ++; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ++; GFX11-NEXT: s_mov_b32 s2, -1 ++; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: s_mul_i1: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 0, @10, KC0[], KC1[] ++; EG-NEXT: TEX 1 @6 ++; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: Fetch clause starting at 6: ++; EG-NEXT: VTX_READ_8 T1.X, T0.X, 72, #3 ++; EG-NEXT: VTX_READ_8 T0.X, T0.X, 108, #3 ++; EG-NEXT: ALU clause starting at 10: ++; EG-NEXT: MOV * T0.X, 0.0, ++; EG-NEXT: ALU clause starting at 11: ++; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, ++; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X, ++; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ++; EG-NEXT: AND_INT T1.W, PS, 1, ++; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ++; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ++; EG-NEXT: LSHL T0.X, PV.W, PS, ++; EG-NEXT: LSHL * T0.W, literal.x, PS, ++; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) ++; EG-NEXT: MOV T0.Y, 0.0, ++; EG-NEXT: MOV * T0.Z, 0.0, ++; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ++entry: ++ %mul = mul i1 %a, %b ++ store i1 %mul, ptr addrspace(1) %out, align 4 ++ ret void ++} ++ ++define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ++; SI-LABEL: v_mul_i1: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ++; SI-NEXT: s_mov_b32 s7, 0xf000 ++; SI-NEXT: s_mov_b32 s6, -1 ++; SI-NEXT: s_mov_b32 s10, s6 ++; SI-NEXT: s_mov_b32 s11, s7 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_mov_b32 s8, s2 ++; SI-NEXT: s_mov_b32 s9, s3 ++; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ++; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 ++; SI-NEXT: s_mov_b32 s4, s0 ++; SI-NEXT: s_mov_b32 s5, s1 ++; SI-NEXT: s_waitcnt vmcnt(1) ++; SI-NEXT: v_and_b32_e32 v0, 1, v0 ++; SI-NEXT: s_waitcnt vmcnt(0) ++; SI-NEXT: v_and_b32_e32 v1, 1, v1 ++; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ++; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 ++; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ++; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ++; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: v_mul_i1: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; VI-NEXT: s_mov_b32 s7, 0xf000 ++; VI-NEXT: s_mov_b32 s6, -1 ++; VI-NEXT: s_mov_b32 s10, s6 ++; VI-NEXT: s_mov_b32 s11, s7 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_mov_b32 s8, s2 ++; VI-NEXT: s_mov_b32 s9, s3 ++; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ++; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 ++; VI-NEXT: s_mov_b32 s4, s0 ++; VI-NEXT: s_mov_b32 s5, s1 ++; VI-NEXT: s_waitcnt vmcnt(1) ++; VI-NEXT: v_and_b32_e32 v0, 1, v0 ++; VI-NEXT: s_waitcnt vmcnt(0) ++; VI-NEXT: v_and_b32_e32 v1, 1, v1 ++; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ++; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 ++; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ++; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ++; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: v_mul_i1: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; GFX9-NEXT: s_mov_b32 s7, 0xf000 ++; GFX9-NEXT: s_mov_b32 s6, -1 ++; GFX9-NEXT: s_mov_b32 s10, s6 ++; GFX9-NEXT: s_mov_b32 s11, s7 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mov_b32 s8, s2 ++; GFX9-NEXT: s_mov_b32 s9, s3 ++; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ++; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 ++; GFX9-NEXT: s_mov_b32 s4, s0 ++; GFX9-NEXT: s_mov_b32 s5, s1 ++; GFX9-NEXT: s_waitcnt vmcnt(1) ++; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ++; GFX9-NEXT: s_waitcnt vmcnt(0) ++; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 ++; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ++; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 ++; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ++; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ++; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: v_mul_i1: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ++; GFX10-NEXT: s_mov_b32 s2, -1 ++; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s10, s2 ++; GFX10-NEXT: s_mov_b32 s11, s3 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mov_b32 s8, s6 ++; GFX10-NEXT: s_mov_b32 s9, s7 ++; GFX10-NEXT: s_clause 0x1 ++; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ++; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 ++; GFX10-NEXT: s_mov_b32 s1, s5 ++; GFX10-NEXT: s_waitcnt vmcnt(1) ++; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ++; GFX10-NEXT: s_waitcnt vmcnt(0) ++; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ++; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ++; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 ++; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0 ++; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ++; GFX10-NEXT: s_mov_b32 s0, s4 ++; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: v_mul_i1: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ++; GFX11-NEXT: s_mov_b32 s2, -1 ++; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s10, s2 ++; GFX11-NEXT: s_mov_b32 s11, s3 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mov_b32 s8, s6 ++; GFX11-NEXT: s_mov_b32 s9, s7 ++; GFX11-NEXT: s_clause 0x1 ++; GFX11-NEXT: buffer_load_u8 v0, off, s[8:11], 0 ++; GFX11-NEXT: buffer_load_u8 v1, off, s[8:11], 0 offset:4 ++; GFX11-NEXT: s_mov_b32 s1, s5 ++; GFX11-NEXT: s_waitcnt vmcnt(1) ++; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ++; GFX11-NEXT: s_waitcnt vmcnt(0) ++; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ++; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ++; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 ++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ++; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 ++; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ++; GFX11-NEXT: s_mov_b32 s0, s4 ++; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: v_mul_i1: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] ++; EG-NEXT: TEX 1 @6 ++; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: Fetch clause starting at 6: ++; EG-NEXT: VTX_READ_8 T1.X, T0.X, 4, #1 ++; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 ++; EG-NEXT: ALU clause starting at 10: ++; EG-NEXT: MOV * T0.X, KC0[2].Z, ++; EG-NEXT: ALU clause starting at 11: ++; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, ++; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, ++; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ++; EG-NEXT: AND_INT T1.W, PS, 1, ++; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ++; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ++; EG-NEXT: LSHL T0.X, PV.W, PS, ++; EG-NEXT: LSHL * T0.W, literal.x, PS, ++; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) ++; EG-NEXT: MOV T0.Y, 0.0, ++; EG-NEXT: MOV * T0.Z, 0.0, ++; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ++entry: ++ %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 ++ %a = load i1, ptr addrspace(1) %in ++ %b = load i1, ptr addrspace(1) %b_ptr ++ %result = mul i1 %a, %b ++ store i1 %result, ptr addrspace(1) %out ++ ret void ++} ++ + ; A standard 64-bit multiply. The expansion should be around 6 instructions. + ; It would be difficult to match the expansion correctly without writing + ; a really complicated list of FileCheck expressions. I don't want +@@ -144,21 +1358,294 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in + ; so this test just uses FUNC-LABEL to make sure the compiler does not + ; crash with a 'failed to select' error. + +-; FUNC-LABEL: {{^}}s_mul_i64: +-; GFX9PLUS-DAG: s_mul_i32 +-; GFX9PLUS-DAG: s_mul_hi_u32 +-; GFX9PLUS-DAG: s_mul_i32 +-; GFX9PLUS-DAG: s_mul_i32 +-; GFX9PLUS: s_endpgm + define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { ++; SI-LABEL: s_mul_i64: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ++; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ++; SI-NEXT: s_mov_b32 s3, 0xf000 ++; SI-NEXT: s_mov_b32 s2, -1 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_mov_b32 s0, s4 ++; SI-NEXT: v_mov_b32_e32 v0, s8 ++; SI-NEXT: v_mul_hi_u32 v0, s6, v0 ++; SI-NEXT: s_mul_i32 s4, s6, s9 ++; SI-NEXT: s_mov_b32 s1, s5 ++; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ++; SI-NEXT: s_mul_i32 s4, s7, s8 ++; SI-NEXT: v_add_i32_e32 v1, vcc, s4, v0 ++; SI-NEXT: s_mul_i32 s4, s6, s8 ++; SI-NEXT: v_mov_b32_e32 v0, s4 ++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: s_mul_i64: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ++; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ++; VI-NEXT: s_mov_b32 s3, 0xf000 ++; VI-NEXT: s_mov_b32 s2, -1 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_mov_b32 s0, s4 ++; VI-NEXT: v_mov_b32_e32 v0, s8 ++; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s6, v0, 0 ++; VI-NEXT: s_mul_i32 s4, s6, s9 ++; VI-NEXT: s_mov_b32 s1, s5 ++; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 ++; VI-NEXT: s_mul_i32 s4, s7, s8 ++; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 ++; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: s_mul_i64: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ++; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ++; GFX9-NEXT: s_mov_b32 s3, 0xf000 ++; GFX9-NEXT: s_mov_b32 s2, -1 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mov_b32 s0, s4 ++; GFX9-NEXT: s_mov_b32 s1, s5 ++; GFX9-NEXT: s_mul_i32 s4, s6, s9 ++; GFX9-NEXT: s_mul_hi_u32 s5, s6, s8 ++; GFX9-NEXT: s_add_i32 s4, s5, s4 ++; GFX9-NEXT: s_mul_i32 s5, s7, s8 ++; GFX9-NEXT: s_add_i32 s4, s4, s5 ++; GFX9-NEXT: s_mul_i32 s5, s6, s8 ++; GFX9-NEXT: v_mov_b32_e32 v0, s5 ++; GFX9-NEXT: v_mov_b32_e32 v1, s4 ++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: s_mul_i64: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_clause 0x1 ++; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ++; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mul_i32 s0, s6, s3 ++; GFX10-NEXT: s_mul_hi_u32 s1, s6, s2 ++; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX10-NEXT: s_add_i32 s0, s1, s0 ++; GFX10-NEXT: s_mul_i32 s1, s7, s2 ++; GFX10-NEXT: s_mul_i32 s2, s6, s2 ++; GFX10-NEXT: s_add_i32 s0, s0, s1 ++; GFX10-NEXT: v_mov_b32_e32 v0, s2 ++; GFX10-NEXT: v_mov_b32_e32 v1, s0 ++; GFX10-NEXT: s_mov_b32 s2, -1 ++; GFX10-NEXT: s_mov_b32 s0, s4 ++; GFX10-NEXT: s_mov_b32 s1, s5 ++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: s_mul_i64: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_clause 0x1 ++; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ++; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mul_i32 s1, s6, s1 ++; GFX11-NEXT: s_mul_hi_u32 s2, s6, s0 ++; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ++; GFX11-NEXT: s_add_i32 s1, s2, s1 ++; GFX11-NEXT: s_mul_i32 s2, s7, s0 ++; GFX11-NEXT: s_mul_i32 s0, s6, s0 ++; GFX11-NEXT: s_add_i32 s1, s1, s2 ++; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ++; GFX11-NEXT: s_mov_b32 s2, -1 ++; GFX11-NEXT: s_mov_b32 s0, s4 ++; GFX11-NEXT: s_mov_b32 s1, s5 ++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: s_mul_i64: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: ALU clause starting at 4: ++; EG-NEXT: MULHI * T0.X, KC0[2].W, KC0[3].Y, ++; EG-NEXT: MULLO_INT * T0.Y, KC0[2].W, KC0[3].Z, ++; EG-NEXT: ADD_INT T0.W, T0.X, PS, ++; EG-NEXT: MULLO_INT * T0.X, KC0[3].X, KC0[3].Y, ++; EG-NEXT: ADD_INT * T0.Y, PV.W, PS, ++; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ++; EG-NEXT: MULLO_INT * T0.X, KC0[2].W, KC0[3].Y, ++entry: + %mul = mul i64 %a, %b + store i64 %mul, ptr addrspace(1) %out, align 8 + ret void + } + +-; FUNC-LABEL: {{^}}v_mul_i64: +-; GCN: v_mul_lo_u32 + define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { ++; SI-LABEL: v_mul_i64: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ++; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ++; SI-NEXT: s_mov_b32 s3, 0xf000 ++; SI-NEXT: s_mov_b32 s2, -1 ++; SI-NEXT: s_mov_b32 s10, s2 ++; SI-NEXT: s_mov_b32 s11, s3 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_mov_b32 s12, s6 ++; SI-NEXT: s_mov_b32 s13, s7 ++; SI-NEXT: s_mov_b32 s14, s2 ++; SI-NEXT: s_mov_b32 s15, s3 ++; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ++; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 ++; SI-NEXT: s_mov_b32 s0, s4 ++; SI-NEXT: s_mov_b32 s1, s5 ++; SI-NEXT: s_waitcnt vmcnt(0) ++; SI-NEXT: v_mul_lo_u32 v1, v2, v1 ++; SI-NEXT: v_mul_hi_u32 v4, v2, v0 ++; SI-NEXT: v_mul_lo_u32 v3, v3, v0 ++; SI-NEXT: v_mul_lo_u32 v0, v2, v0 ++; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ++; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: v_mul_i64: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ++; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ++; VI-NEXT: s_mov_b32 s3, 0xf000 ++; VI-NEXT: s_mov_b32 s2, -1 ++; VI-NEXT: s_mov_b32 s10, s2 ++; VI-NEXT: s_mov_b32 s11, s3 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_mov_b32 s12, s6 ++; VI-NEXT: s_mov_b32 s13, s7 ++; VI-NEXT: s_mov_b32 s14, s2 ++; VI-NEXT: s_mov_b32 s15, s3 ++; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ++; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 ++; VI-NEXT: s_mov_b32 s0, s4 ++; VI-NEXT: s_mov_b32 s1, s5 ++; VI-NEXT: s_waitcnt vmcnt(0) ++; VI-NEXT: v_mul_lo_u32 v4, v2, v1 ++; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0 ++; VI-NEXT: v_mul_lo_u32 v0, v3, v0 ++; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ++; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ++; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: v_mul_i64: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ++; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ++; GFX9-NEXT: s_mov_b32 s3, 0xf000 ++; GFX9-NEXT: s_mov_b32 s2, -1 ++; GFX9-NEXT: s_mov_b32 s10, s2 ++; GFX9-NEXT: s_mov_b32 s11, s3 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mov_b32 s12, s6 ++; GFX9-NEXT: s_mov_b32 s13, s7 ++; GFX9-NEXT: s_mov_b32 s14, s2 ++; GFX9-NEXT: s_mov_b32 s15, s3 ++; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ++; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 ++; GFX9-NEXT: s_mov_b32 s0, s4 ++; GFX9-NEXT: s_mov_b32 s1, s5 ++; GFX9-NEXT: s_waitcnt vmcnt(0) ++; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 ++; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0 ++; GFX9-NEXT: v_mul_lo_u32 v3, v3, v0 ++; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0 ++; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 ++; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: v_mul_i64: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_clause 0x1 ++; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ++; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ++; GFX10-NEXT: s_mov_b32 s2, -1 ++; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s10, s2 ++; GFX10-NEXT: s_mov_b32 s11, s3 ++; GFX10-NEXT: s_mov_b32 s14, s2 ++; GFX10-NEXT: s_mov_b32 s15, s3 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mov_b32 s12, s6 ++; GFX10-NEXT: s_mov_b32 s13, s7 ++; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ++; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 ++; GFX10-NEXT: s_mov_b32 s0, s4 ++; GFX10-NEXT: s_mov_b32 s1, s5 ++; GFX10-NEXT: s_waitcnt vmcnt(0) ++; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1 ++; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0 ++; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0 ++; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0 ++; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1 ++; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: v_mul_i64: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_clause 0x1 ++; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ++; GFX11-NEXT: s_mov_b32 s10, -1 ++; GFX11-NEXT: s_mov_b32 s11, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s2, s10 ++; GFX11-NEXT: s_mov_b32 s3, s11 ++; GFX11-NEXT: s_mov_b32 s14, s10 ++; GFX11-NEXT: s_mov_b32 s15, s11 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mov_b32 s12, s6 ++; GFX11-NEXT: s_mov_b32 s13, s7 ++; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 ++; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0 ++; GFX11-NEXT: s_mov_b32 s8, s4 ++; GFX11-NEXT: s_mov_b32 s9, s5 ++; GFX11-NEXT: s_waitcnt vmcnt(0) ++; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1 ++; GFX11-NEXT: v_mul_hi_u32 v4, v2, v0 ++; GFX11-NEXT: v_mul_lo_u32 v3, v3, v0 ++; GFX11-NEXT: v_mul_lo_u32 v0, v2, v0 ++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ++; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1 ++; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3 ++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: v_mul_i64: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] ++; EG-NEXT: TEX 1 @6 ++; EG-NEXT: ALU 7, @12, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: Fetch clause starting at 6: ++; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 ++; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 ++; EG-NEXT: ALU clause starting at 10: ++; EG-NEXT: MOV T0.X, KC0[2].Z, ++; EG-NEXT: MOV * T1.X, KC0[2].W, ++; EG-NEXT: ALU clause starting at 12: ++; EG-NEXT: MULHI * T0.Z, T0.X, T1.X, ++; EG-NEXT: MULLO_INT * T0.W, T0.X, T1.Y, ++; EG-NEXT: ADD_INT T0.W, T0.Z, PS, ++; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.X, ++; EG-NEXT: ADD_INT * T0.Y, PV.W, PS, ++; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, ++; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ++entry: + %a = load i64, ptr addrspace(1) %aptr, align 8 + %b = load i64, ptr addrspace(1) %bptr, align 8 + %mul = mul i64 %a, %b +@@ -166,9 +1653,220 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap + ret void + } + +-; FUNC-LABEL: {{^}}mul32_in_branch: +-; GCN: s_mul_i32 + define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) { ++; SI-LABEL: mul32_in_branch: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_cmp_lg_u32 s2, 0 ++; SI-NEXT: s_cbranch_scc0 .LBB13_2 ++; SI-NEXT: ; %bb.1: ; %else ++; SI-NEXT: s_mul_i32 s6, s2, s3 ++; SI-NEXT: s_mov_b64 s[4:5], 0 ++; SI-NEXT: s_branch .LBB13_3 ++; SI-NEXT: .LBB13_2: ++; SI-NEXT: s_mov_b64 s[4:5], -1 ++; SI-NEXT: ; implicit-def: $sgpr6 ++; SI-NEXT: .LBB13_3: ; %Flow ++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ++; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_mov_b64 vcc, vcc ++; SI-NEXT: s_cbranch_vccnz .LBB13_5 ++; SI-NEXT: ; %bb.4: ; %if ++; SI-NEXT: s_mov_b32 s7, 0xf000 ++; SI-NEXT: s_mov_b32 s6, -1 ++; SI-NEXT: s_mov_b32 s4, s2 ++; SI-NEXT: s_mov_b32 s5, s3 ++; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ++; SI-NEXT: s_branch .LBB13_6 ++; SI-NEXT: .LBB13_5: ++; SI-NEXT: v_mov_b32_e32 v0, s6 ++; SI-NEXT: .LBB13_6: ; %endif ++; SI-NEXT: s_mov_b32 s3, 0xf000 ++; SI-NEXT: s_mov_b32 s2, -1 ++; SI-NEXT: s_waitcnt vmcnt(0) ++; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: mul32_in_branch: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_cmp_lg_u32 s2, 0 ++; VI-NEXT: s_cbranch_scc0 .LBB13_2 ++; VI-NEXT: ; %bb.1: ; %else ++; VI-NEXT: s_mul_i32 s6, s2, s3 ++; VI-NEXT: s_mov_b64 s[4:5], 0 ++; VI-NEXT: s_branch .LBB13_3 ++; VI-NEXT: .LBB13_2: ++; VI-NEXT: s_mov_b64 s[4:5], -1 ++; VI-NEXT: ; implicit-def: $sgpr6 ++; VI-NEXT: .LBB13_3: ; %Flow ++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ++; VI-NEXT: s_cbranch_vccnz .LBB13_5 ++; VI-NEXT: ; %bb.4: ; %if ++; VI-NEXT: s_mov_b32 s7, 0xf000 ++; VI-NEXT: s_mov_b32 s6, -1 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_mov_b32 s4, s2 ++; VI-NEXT: s_mov_b32 s5, s3 ++; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ++; VI-NEXT: s_branch .LBB13_6 ++; VI-NEXT: .LBB13_5: ++; VI-NEXT: v_mov_b32_e32 v0, s6 ++; VI-NEXT: .LBB13_6: ; %endif ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_mov_b32 s3, 0xf000 ++; VI-NEXT: s_mov_b32 s2, -1 ++; VI-NEXT: s_waitcnt vmcnt(0) ++; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: mul32_in_branch: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ++; GFX9-NEXT: s_cbranch_scc0 .LBB13_2 ++; GFX9-NEXT: ; %bb.1: ; %else ++; GFX9-NEXT: s_mul_i32 s6, s2, s3 ++; GFX9-NEXT: s_mov_b64 s[4:5], 0 ++; GFX9-NEXT: s_branch .LBB13_3 ++; GFX9-NEXT: .LBB13_2: ++; GFX9-NEXT: s_mov_b64 s[4:5], -1 ++; GFX9-NEXT: ; implicit-def: $sgpr6 ++; GFX9-NEXT: .LBB13_3: ; %Flow ++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ++; GFX9-NEXT: s_cbranch_vccnz .LBB13_5 ++; GFX9-NEXT: ; %bb.4: ; %if ++; GFX9-NEXT: s_mov_b32 s7, 0xf000 ++; GFX9-NEXT: s_mov_b32 s6, -1 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mov_b32 s4, s2 ++; GFX9-NEXT: s_mov_b32 s5, s3 ++; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 ++; GFX9-NEXT: s_branch .LBB13_6 ++; GFX9-NEXT: .LBB13_5: ++; GFX9-NEXT: v_mov_b32_e32 v0, s6 ++; GFX9-NEXT: .LBB13_6: ; %endif ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mov_b32 s3, 0xf000 ++; GFX9-NEXT: s_mov_b32 s2, -1 ++; GFX9-NEXT: s_waitcnt vmcnt(0) ++; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: mul32_in_branch: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ++; GFX10-NEXT: s_mov_b32 s4, 0 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ++; GFX10-NEXT: s_cbranch_scc0 .LBB13_2 ++; GFX10-NEXT: ; %bb.1: ; %else ++; GFX10-NEXT: s_mul_i32 s5, s2, s3 ++; GFX10-NEXT: s_branch .LBB13_3 ++; GFX10-NEXT: .LBB13_2: ++; GFX10-NEXT: s_mov_b32 s4, -1 ++; GFX10-NEXT: ; implicit-def: $sgpr5 ++; GFX10-NEXT: .LBB13_3: ; %Flow ++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ++; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ++; GFX10-NEXT: s_cbranch_vccnz .LBB13_5 ++; GFX10-NEXT: ; %bb.4: ; %if ++; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s6, -1 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mov_b32 s4, s2 ++; GFX10-NEXT: s_mov_b32 s5, s3 ++; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 ++; GFX10-NEXT: s_branch .LBB13_6 ++; GFX10-NEXT: .LBB13_5: ++; GFX10-NEXT: v_mov_b32_e32 v0, s5 ++; GFX10-NEXT: .LBB13_6: ; %endif ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s2, -1 ++; GFX10-NEXT: s_waitcnt vmcnt(0) ++; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: mul32_in_branch: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 ++; GFX11-NEXT: s_mov_b32 s4, 0 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_cmp_lg_u32 s2, 0 ++; GFX11-NEXT: s_cbranch_scc0 .LBB13_2 ++; GFX11-NEXT: ; %bb.1: ; %else ++; GFX11-NEXT: s_mul_i32 s5, s2, s3 ++; GFX11-NEXT: s_branch .LBB13_3 ++; GFX11-NEXT: .LBB13_2: ++; GFX11-NEXT: s_mov_b32 s4, -1 ++; GFX11-NEXT: ; implicit-def: $sgpr5 ++; GFX11-NEXT: .LBB13_3: ; %Flow ++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ++; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ++; GFX11-NEXT: s_cbranch_vccnz .LBB13_5 ++; GFX11-NEXT: ; %bb.4: ; %if ++; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s6, -1 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mov_b32 s4, s2 ++; GFX11-NEXT: s_mov_b32 s5, s3 ++; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ++; GFX11-NEXT: s_branch .LBB13_6 ++; GFX11-NEXT: .LBB13_5: ++; GFX11-NEXT: v_mov_b32_e32 v0, s5 ++; GFX11-NEXT: .LBB13_6: ; %endif ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s2, -1 ++; GFX11-NEXT: s_waitcnt vmcnt(0) ++; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: mul32_in_branch: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[] ++; EG-NEXT: JUMP @3 POP:1 ++; EG-NEXT: ALU_POP_AFTER 4, @18, KC0[CB0:0-32], KC1[] ++; EG-NEXT: ALU_PUSH_BEFORE 2, @23, KC0[CB0:0-32], KC1[] ++; EG-NEXT: JUMP @8 POP:1 ++; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[] ++; EG-NEXT: TEX 0 @12 ++; EG-NEXT: POP @8 POP:1 ++; EG-NEXT: ALU 1, @27, KC0[], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: Fetch clause starting at 12: ++; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ++; EG-NEXT: ALU clause starting at 14: ++; EG-NEXT: MOV T0.W, literal.x, ++; EG-NEXT: SETNE_INT * T1.W, KC0[2].W, 0.0, ++; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) ++; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, ++; EG-NEXT: ALU clause starting at 18: ++; EG-NEXT: MOV T1.W, KC0[2].W, ++; EG-NEXT: MOV * T2.W, KC0[3].X, ++; EG-NEXT: MOV T0.W, literal.x, ++; EG-NEXT: MULLO_INT * T0.X, PV.W, PS, ++; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) ++; EG-NEXT: ALU clause starting at 23: ++; EG-NEXT: MOV T1.W, KC0[2].Y, ++; EG-NEXT: SETE_INT * T0.W, T0.W, 0.0, ++; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, ++; EG-NEXT: ALU clause starting at 26: ++; EG-NEXT: MOV * T0.X, KC0[2].Z, ++; EG-NEXT: ALU clause starting at 27: ++; EG-NEXT: LSHR * T1.X, T1.W, literal.x, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + entry: + %0 = icmp eq i32 %a, 0 + br i1 %0, label %if, label %else +@@ -187,12 +1885,227 @@ endif: + ret void + } + +-; FUNC-LABEL: {{^}}mul64_in_branch: +-; SI-DAG: s_mul_i32 +-; SI-DAG: v_mul_hi_u32 +-; VI: v_mad_u64_u32 +-; GCN: s_endpgm + define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ++; SI-LABEL: mul64_in_branch: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ++; SI-NEXT: s_mov_b64 s[8:9], 0 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 ++; SI-NEXT: s_and_b64 vcc, exec, s[10:11] ++; SI-NEXT: s_cbranch_vccz .LBB14_4 ++; SI-NEXT: ; %bb.1: ; %else ++; SI-NEXT: v_mov_b32_e32 v0, s6 ++; SI-NEXT: v_mul_hi_u32 v0, s4, v0 ++; SI-NEXT: s_mul_i32 s7, s4, s7 ++; SI-NEXT: s_mul_i32 s5, s5, s6 ++; SI-NEXT: s_mul_i32 s4, s4, s6 ++; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 ++; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v0 ++; SI-NEXT: v_mov_b32_e32 v0, s4 ++; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] ++; SI-NEXT: s_cbranch_vccnz .LBB14_3 ++; SI-NEXT: .LBB14_2: ; %if ++; SI-NEXT: s_mov_b32 s7, 0xf000 ++; SI-NEXT: s_mov_b32 s6, -1 ++; SI-NEXT: s_mov_b32 s4, s2 ++; SI-NEXT: s_mov_b32 s5, s3 ++; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ++; SI-NEXT: .LBB14_3: ; %endif ++; SI-NEXT: s_mov_b32 s3, 0xf000 ++; SI-NEXT: s_mov_b32 s2, -1 ++; SI-NEXT: s_waitcnt vmcnt(0) ++; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ++; SI-NEXT: s_endpgm ++; SI-NEXT: .LBB14_4: ++; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ++; SI-NEXT: s_branch .LBB14_2 ++; ++; VI-LABEL: mul64_in_branch: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ++; VI-NEXT: s_mov_b64 s[8:9], 0 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 ++; VI-NEXT: s_cbranch_scc0 .LBB14_4 ++; VI-NEXT: ; %bb.1: ; %else ++; VI-NEXT: v_mov_b32_e32 v0, s6 ++; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0 ++; VI-NEXT: s_mul_i32 s4, s4, s7 ++; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 ++; VI-NEXT: s_mul_i32 s4, s5, s6 ++; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 ++; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9] ++; VI-NEXT: s_cbranch_vccnz .LBB14_3 ++; VI-NEXT: .LBB14_2: ; %if ++; VI-NEXT: s_mov_b32 s7, 0xf000 ++; VI-NEXT: s_mov_b32 s6, -1 ++; VI-NEXT: s_mov_b32 s4, s2 ++; VI-NEXT: s_mov_b32 s5, s3 ++; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ++; VI-NEXT: .LBB14_3: ; %endif ++; VI-NEXT: s_mov_b32 s3, 0xf000 ++; VI-NEXT: s_mov_b32 s2, -1 ++; VI-NEXT: s_waitcnt vmcnt(0) ++; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ++; VI-NEXT: s_endpgm ++; VI-NEXT: .LBB14_4: ++; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ++; VI-NEXT: s_branch .LBB14_2 ++; ++; GFX9-LABEL: mul64_in_branch: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ++; GFX9-NEXT: s_mov_b64 s[8:9], 0 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 ++; GFX9-NEXT: s_cbranch_scc0 .LBB14_3 ++; GFX9-NEXT: ; %bb.1: ; %else ++; GFX9-NEXT: s_mul_i32 s7, s4, s7 ++; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6 ++; GFX9-NEXT: s_add_i32 s7, s10, s7 ++; GFX9-NEXT: s_mul_i32 s5, s5, s6 ++; GFX9-NEXT: s_add_i32 s5, s7, s5 ++; GFX9-NEXT: s_mul_i32 s4, s4, s6 ++; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9] ++; GFX9-NEXT: s_cbranch_vccnz .LBB14_4 ++; GFX9-NEXT: .LBB14_2: ; %if ++; GFX9-NEXT: s_mov_b32 s7, 0xf000 ++; GFX9-NEXT: s_mov_b32 s6, -1 ++; GFX9-NEXT: s_mov_b32 s4, s2 ++; GFX9-NEXT: s_mov_b32 s5, s3 ++; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ++; GFX9-NEXT: s_branch .LBB14_5 ++; GFX9-NEXT: .LBB14_3: ++; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 ++; GFX9-NEXT: s_branch .LBB14_2 ++; GFX9-NEXT: .LBB14_4: ++; GFX9-NEXT: v_mov_b32_e32 v0, s4 ++; GFX9-NEXT: v_mov_b32_e32 v1, s5 ++; GFX9-NEXT: .LBB14_5: ; %endif ++; GFX9-NEXT: s_mov_b32 s3, 0xf000 ++; GFX9-NEXT: s_mov_b32 s2, -1 ++; GFX9-NEXT: s_waitcnt vmcnt(0) ++; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: mul64_in_branch: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 ++; GFX10-NEXT: s_cbranch_scc0 .LBB14_3 ++; GFX10-NEXT: ; %bb.1: ; %else ++; GFX10-NEXT: s_mul_i32 s7, s4, s7 ++; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6 ++; GFX10-NEXT: s_mul_i32 s5, s5, s6 ++; GFX10-NEXT: s_add_i32 s7, s8, s7 ++; GFX10-NEXT: s_mul_i32 s4, s4, s6 ++; GFX10-NEXT: s_add_i32 s5, s7, s5 ++; GFX10-NEXT: s_mov_b32 s6, 0 ++; GFX10-NEXT: s_cbranch_execnz .LBB14_4 ++; GFX10-NEXT: .LBB14_2: ; %if ++; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s6, -1 ++; GFX10-NEXT: s_mov_b32 s4, s2 ++; GFX10-NEXT: s_mov_b32 s5, s3 ++; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ++; GFX10-NEXT: s_branch .LBB14_5 ++; GFX10-NEXT: .LBB14_3: ++; GFX10-NEXT: s_mov_b32 s6, -1 ++; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 ++; GFX10-NEXT: s_branch .LBB14_2 ++; GFX10-NEXT: .LBB14_4: ++; GFX10-NEXT: v_mov_b32_e32 v0, s4 ++; GFX10-NEXT: v_mov_b32_e32 v1, s5 ++; GFX10-NEXT: .LBB14_5: ; %endif ++; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s2, -1 ++; GFX10-NEXT: s_waitcnt vmcnt(0) ++; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: mul64_in_branch: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 ++; GFX11-NEXT: s_cbranch_scc0 .LBB14_3 ++; GFX11-NEXT: ; %bb.1: ; %else ++; GFX11-NEXT: s_mul_i32 s7, s4, s7 ++; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6 ++; GFX11-NEXT: s_mul_i32 s5, s5, s6 ++; GFX11-NEXT: s_add_i32 s7, s8, s7 ++; GFX11-NEXT: s_mul_i32 s4, s4, s6 ++; GFX11-NEXT: s_add_i32 s5, s7, s5 ++; GFX11-NEXT: s_mov_b32 s6, 0 ++; GFX11-NEXT: s_cbranch_execnz .LBB14_4 ++; GFX11-NEXT: .LBB14_2: ; %if ++; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s6, -1 ++; GFX11-NEXT: s_mov_b32 s4, s2 ++; GFX11-NEXT: s_mov_b32 s5, s3 ++; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0 ++; GFX11-NEXT: s_branch .LBB14_5 ++; GFX11-NEXT: .LBB14_3: ++; GFX11-NEXT: s_mov_b32 s6, -1 ++; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 ++; GFX11-NEXT: s_branch .LBB14_2 ++; GFX11-NEXT: .LBB14_4: ++; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ++; GFX11-NEXT: .LBB14_5: ; %endif ++; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s2, -1 ++; GFX11-NEXT: s_waitcnt vmcnt(0) ++; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: mul64_in_branch: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[] ++; EG-NEXT: JUMP @3 POP:1 ++; EG-NEXT: ALU_POP_AFTER 11, @19, KC0[CB0:0-32], KC1[] ++; EG-NEXT: ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[] ++; EG-NEXT: JUMP @8 POP:1 ++; EG-NEXT: ALU 0, @34, KC0[CB0:0-32], KC1[] ++; EG-NEXT: TEX 0 @12 ++; EG-NEXT: POP @8 POP:1 ++; EG-NEXT: ALU 1, @35, KC0[], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: Fetch clause starting at 12: ++; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 ++; EG-NEXT: ALU clause starting at 14: ++; EG-NEXT: OR_INT T0.W, KC0[2].W, KC0[3].X, ++; EG-NEXT: MOV * T1.W, literal.x, ++; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) ++; EG-NEXT: SETNE_INT * T0.W, PV.W, 0.0, ++; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0, ++; EG-NEXT: ALU clause starting at 19: ++; EG-NEXT: MOV T0.W, KC0[2].W, ++; EG-NEXT: MOV * T1.W, KC0[3].Z, ++; EG-NEXT: MOV T2.W, KC0[3].Y, ++; EG-NEXT: MULLO_INT * T0.X, PV.W, PS, ++; EG-NEXT: MOV T1.W, KC0[3].X, ++; EG-NEXT: MULHI * T0.Y, T0.W, PV.W, ++; EG-NEXT: ADD_INT T3.W, PS, T0.X, ++; EG-NEXT: MULLO_INT * T0.X, PV.W, T2.W, ++; EG-NEXT: ADD_INT T0.Y, PV.W, PS, ++; EG-NEXT: MOV T1.W, literal.x, ++; EG-NEXT: MULLO_INT * T0.X, T0.W, T2.W, ++; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) ++; EG-NEXT: ALU clause starting at 31: ++; EG-NEXT: MOV T0.W, KC0[2].Y, ++; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0, ++; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, ++; EG-NEXT: ALU clause starting at 34: ++; EG-NEXT: MOV * T0.X, KC0[2].Z, ++; EG-NEXT: ALU clause starting at 35: ++; EG-NEXT: LSHR * T1.X, T0.W, literal.x, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else +@@ -211,79 +2124,558 @@ endif: + ret void + } + +-; FIXME: Load dwordx4 +-; FUNC-LABEL: {{^}}s_mul_i128: +-; GCN: s_load_dwordx4 +-; GCN: s_load_dwordx4 +- +-; SI: v_mul_hi_u32 +-; SI: v_mul_hi_u32 +-; SI: s_mul_i32 +-; SI: v_mul_hi_u32 +-; SI: s_mul_i32 +-; SI: s_mul_i32 +- +-; SI-DAG: s_mul_i32 +-; SI-DAG: v_mul_hi_u32 +-; SI-DAG: v_mul_hi_u32 +-; SI-DAG: s_mul_i32 +-; SI-DAG: s_mul_i32 +-; SI-DAG: v_mul_hi_u32 +- +-; VI-DAG: v_mad_u64_u32 +-; VI-DAG: v_mad_u64_u32 +-; VI-DAG: v_mad_u64_u32 +-; VI-DAG: v_mad_u64_u32 +-; VI-DAG: v_mad_u64_u32 +-; VI-DAG: v_mad_u64_u32 +-; VI-DAG: s_mul_i32 +-; VI-DAG: s_mul_i32 +-; VI-DAG: s_mul_i32 +-; VI-DAG: s_mul_i32 +- +- +-; GCN: buffer_store_dwordx4 + define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 { ++; SI-LABEL: s_mul_i128: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x13 ++; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x1f ++; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ++; SI-NEXT: s_mov_b32 s3, 0xf000 ++; SI-NEXT: s_mov_b32 s2, -1 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: v_mov_b32_e32 v0, s6 ++; SI-NEXT: v_mul_hi_u32 v0, s8, v0 ++; SI-NEXT: v_mov_b32_e32 v1, s4 ++; SI-NEXT: v_mul_hi_u32 v1, s10, v1 ++; SI-NEXT: s_mul_i32 s7, s8, s7 ++; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 ++; SI-NEXT: s_mul_i32 s7, s10, s5 ++; SI-NEXT: s_mul_i32 s12, s9, s6 ++; SI-NEXT: s_mul_i32 s6, s8, s6 ++; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 ++; SI-NEXT: s_mul_i32 s7, s11, s4 ++; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 ++; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 ++; SI-NEXT: s_mul_i32 s7, s10, s4 ++; SI-NEXT: v_mov_b32_e32 v2, s6 ++; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 ++; SI-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc ++; SI-NEXT: v_mov_b32_e32 v1, s8 ++; SI-NEXT: v_mul_hi_u32 v5, s4, v1 ++; SI-NEXT: v_mul_hi_u32 v1, s5, v1 ++; SI-NEXT: v_mov_b32_e32 v3, s9 ++; SI-NEXT: v_mul_hi_u32 v4, s4, v3 ++; SI-NEXT: s_mul_i32 s7, s5, s8 ++; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 ++; SI-NEXT: s_mul_i32 s6, s4, s9 ++; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc ++; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v5 ++; SI-NEXT: v_mul_hi_u32 v3, s5, v3 ++; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ++; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ++; SI-NEXT: s_mul_i32 s5, s5, s9 ++; SI-NEXT: v_addc_u32_e64 v5, s[6:7], 0, 0, vcc ++; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v4 ++; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ++; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ++; SI-NEXT: s_mul_i32 s4, s4, s8 ++; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc ++; SI-NEXT: v_mov_b32_e32 v0, s4 ++; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: s_mul_i128: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c ++; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c ++; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ++; VI-NEXT: v_mov_b32_e32 v5, 0 ++; VI-NEXT: s_mov_b32 s3, 0xf000 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: v_mov_b32_e32 v0, s6 ++; VI-NEXT: v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0 ++; VI-NEXT: s_mul_i32 s7, s8, s7 ++; VI-NEXT: v_mov_b32_e32 v6, s8 ++; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3 ++; VI-NEXT: s_mul_i32 s12, s9, s6 ++; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v6, 0 ++; VI-NEXT: v_add_u32_e32 v3, vcc, s12, v3 ++; VI-NEXT: v_mov_b32_e32 v4, v1 ++; VI-NEXT: v_mad_u64_u32 v[6:7], s[6:7], s5, v6, v[4:5] ++; VI-NEXT: v_mov_b32_e32 v8, s4 ++; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], s10, v8, v[2:3] ++; VI-NEXT: v_mov_b32_e32 v3, v7 ++; VI-NEXT: v_mov_b32_e32 v7, v5 ++; VI-NEXT: v_mov_b32_e32 v8, s9 ++; VI-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s4, v8, v[6:7] ++; VI-NEXT: s_mul_i32 s8, s11, s4 ++; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v2 ++; VI-NEXT: v_mov_b32_e32 v2, v5 ++; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ++; VI-NEXT: v_addc_u32_e64 v3, s[6:7], 0, 0, vcc ++; VI-NEXT: s_mul_i32 s8, s10, s5 ++; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s5, v8, v[2:3] ++; VI-NEXT: v_add_u32_e32 v5, vcc, s8, v6 ++; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ++; VI-NEXT: s_mov_b32 s2, -1 ++; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ++; VI-NEXT: v_mov_b32_e32 v1, v4 ++; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: s_mul_i128: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c ++; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c ++; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ++; GFX9-NEXT: s_mov_b32 s7, 0xf000 ++; GFX9-NEXT: s_mov_b32 s6, -1 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: s_mul_i32 s0, s12, s11 ++; GFX9-NEXT: s_mul_hi_u32 s1, s12, s10 ++; GFX9-NEXT: s_mul_i32 s2, s14, s9 ++; GFX9-NEXT: s_mul_hi_u32 s3, s14, s8 ++; GFX9-NEXT: s_add_i32 s0, s1, s0 ++; GFX9-NEXT: s_mul_i32 s1, s13, s10 ++; GFX9-NEXT: s_add_i32 s2, s3, s2 ++; GFX9-NEXT: s_mul_i32 s3, s15, s8 ++; GFX9-NEXT: s_add_i32 s0, s0, s1 ++; GFX9-NEXT: s_mul_i32 s1, s12, s10 ++; GFX9-NEXT: s_add_i32 s2, s2, s3 ++; GFX9-NEXT: s_mul_i32 s3, s14, s8 ++; GFX9-NEXT: s_add_u32 s3, s3, s1 ++; GFX9-NEXT: s_addc_u32 s2, s2, s0 ++; GFX9-NEXT: s_mul_i32 s14, s9, s12 ++; GFX9-NEXT: s_mul_hi_u32 s15, s8, s12 ++; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12 ++; GFX9-NEXT: s_add_u32 s14, s14, s15 ++; GFX9-NEXT: s_mul_i32 s1, s8, s13 ++; GFX9-NEXT: s_addc_u32 s11, s11, 0 ++; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13 ++; GFX9-NEXT: s_add_u32 s1, s1, s14 ++; GFX9-NEXT: s_addc_u32 s10, s10, 0 ++; GFX9-NEXT: s_add_u32 s10, s11, s10 ++; GFX9-NEXT: s_addc_u32 s11, 0, 0 ++; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 ++; GFX9-NEXT: s_mul_i32 s9, s9, s13 ++; GFX9-NEXT: s_add_u32 s9, s9, s10 ++; GFX9-NEXT: s_addc_u32 s10, s14, s11 ++; GFX9-NEXT: s_mov_b32 s0, 0 ++; GFX9-NEXT: s_add_u32 s9, s9, s3 ++; GFX9-NEXT: s_addc_u32 s10, s10, s2 ++; GFX9-NEXT: s_mul_i32 s2, s8, s12 ++; GFX9-NEXT: s_mov_b32 s3, s0 ++; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ++; GFX9-NEXT: v_mov_b32_e32 v0, s0 ++; GFX9-NEXT: v_mov_b32_e32 v1, s1 ++; GFX9-NEXT: v_mov_b32_e32 v2, s9 ++; GFX9-NEXT: v_mov_b32_e32 v3, s10 ++; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: s_mul_i128: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_clause 0x1 ++; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c ++; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c ++; GFX10-NEXT: s_mov_b32 s2, 0 ++; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ++; GFX10-NEXT: s_mov_b32 s13, s2 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_mul_i32 s3, s8, s7 ++; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6 ++; GFX10-NEXT: s_mul_i32 s14, s10, s5 ++; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4 ++; GFX10-NEXT: s_mul_i32 s12, s9, s6 ++; GFX10-NEXT: s_mul_i32 s11, s11, s4 ++; GFX10-NEXT: s_add_i32 s3, s7, s3 ++; GFX10-NEXT: s_add_i32 s7, s15, s14 ++; GFX10-NEXT: s_mul_i32 s6, s8, s6 ++; GFX10-NEXT: s_mul_i32 s10, s10, s4 ++; GFX10-NEXT: s_add_i32 s3, s3, s12 ++; GFX10-NEXT: s_add_i32 s7, s7, s11 ++; GFX10-NEXT: s_mul_i32 s19, s5, s8 ++; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8 ++; GFX10-NEXT: s_add_u32 s6, s10, s6 ++; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8 ++; GFX10-NEXT: s_addc_u32 s7, s7, s3 ++; GFX10-NEXT: s_mul_i32 s17, s4, s9 ++; GFX10-NEXT: s_add_u32 s3, s19, s20 ++; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9 ++; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9 ++; GFX10-NEXT: s_mul_i32 s5, s5, s9 ++; GFX10-NEXT: s_addc_u32 s9, s18, 0 ++; GFX10-NEXT: s_add_u32 s3, s17, s3 ++; GFX10-NEXT: s_addc_u32 s10, s16, 0 ++; GFX10-NEXT: s_mul_i32 s12, s4, s8 ++; GFX10-NEXT: s_add_u32 s4, s9, s10 ++; GFX10-NEXT: s_addc_u32 s8, 0, 0 ++; GFX10-NEXT: s_add_u32 s4, s5, s4 ++; GFX10-NEXT: s_addc_u32 s5, s21, s8 ++; GFX10-NEXT: s_add_u32 s4, s4, s6 ++; GFX10-NEXT: s_addc_u32 s5, s5, s7 ++; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] ++; GFX10-NEXT: v_mov_b32_e32 v2, s4 ++; GFX10-NEXT: v_mov_b32_e32 v0, s2 ++; GFX10-NEXT: v_mov_b32_e32 v1, s3 ++; GFX10-NEXT: v_mov_b32_e32 v3, s5 ++; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX10-NEXT: s_mov_b32 s2, -1 ++; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: s_mul_i128: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_clause 0x2 ++; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x4c ++; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c ++; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ++; GFX11-NEXT: s_mov_b32 s2, 0 ++; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ++; GFX11-NEXT: s_mov_b32 s13, s2 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_mul_i32 s3, s8, s7 ++; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6 ++; GFX11-NEXT: s_mul_i32 s14, s10, s5 ++; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4 ++; GFX11-NEXT: s_mul_i32 s12, s9, s6 ++; GFX11-NEXT: s_mul_i32 s11, s11, s4 ++; GFX11-NEXT: s_add_i32 s3, s7, s3 ++; GFX11-NEXT: s_add_i32 s7, s15, s14 ++; GFX11-NEXT: s_mul_i32 s6, s8, s6 ++; GFX11-NEXT: s_mul_i32 s10, s10, s4 ++; GFX11-NEXT: s_add_i32 s3, s3, s12 ++; GFX11-NEXT: s_add_i32 s7, s7, s11 ++; GFX11-NEXT: s_mul_i32 s19, s5, s8 ++; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8 ++; GFX11-NEXT: s_add_u32 s6, s10, s6 ++; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8 ++; GFX11-NEXT: s_addc_u32 s7, s7, s3 ++; GFX11-NEXT: s_mul_i32 s17, s4, s9 ++; GFX11-NEXT: s_add_u32 s3, s19, s20 ++; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9 ++; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9 ++; GFX11-NEXT: s_mul_i32 s5, s5, s9 ++; GFX11-NEXT: s_addc_u32 s9, s18, 0 ++; GFX11-NEXT: s_add_u32 s3, s17, s3 ++; GFX11-NEXT: s_addc_u32 s10, s16, 0 ++; GFX11-NEXT: s_mul_i32 s12, s4, s8 ++; GFX11-NEXT: s_add_u32 s4, s9, s10 ++; GFX11-NEXT: s_addc_u32 s8, 0, 0 ++; GFX11-NEXT: s_add_u32 s4, s5, s4 ++; GFX11-NEXT: s_addc_u32 s5, s21, s8 ++; GFX11-NEXT: s_add_u32 s4, s4, s6 ++; GFX11-NEXT: s_addc_u32 s5, s5, s7 ++; GFX11-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] ++; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ++; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 ++; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 ++; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ++; GFX11-NEXT: s_mov_b32 s2, -1 ++; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: s_mul_i128: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 41, @4, KC0[CB0:0-32], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: ALU clause starting at 4: ++; EG-NEXT: MULLO_INT * T0.X, KC0[5].X, KC0[8].X, ++; EG-NEXT: MULHI * T0.Y, KC0[5].X, KC0[8].X, ++; EG-NEXT: MULLO_INT * T0.Z, KC0[8].Y, KC0[4].W, ++; EG-NEXT: MULLO_INT * T0.W, KC0[8].X, KC0[5].Y, ++; EG-NEXT: MULHI * T1.X, KC0[5].X, KC0[7].W, ++; EG-NEXT: MULHI * T1.Y, KC0[4].W, KC0[8].X, ++; EG-NEXT: MULHI * T1.Z, KC0[8].Y, KC0[4].W, ++; EG-NEXT: MULLO_INT * T1.W, KC0[8].Y, KC0[5].X, ++; EG-NEXT: MULHI * T2.X, KC0[7].W, KC0[5].Y, ++; EG-NEXT: MULLO_INT * T2.Y, KC0[5].X, KC0[7].W, ++; EG-NEXT: MULHI * T2.Z, KC0[4].W, KC0[7].W, ++; EG-NEXT: ADD_INT T2.W, T2.Y, PS, ++; EG-NEXT: MULLO_INT * T3.X, KC0[4].W, KC0[8].X, ++; EG-NEXT: ADDC_UINT T2.Z, T2.Y, T2.Z, ++; EG-NEXT: ADDC_UINT T3.W, PS, PV.W, ++; EG-NEXT: MULLO_INT * T2.Y, KC0[7].W, KC0[5].Z, ++; EG-NEXT: ADD_INT T2.X, T2.X, PS, ++; EG-NEXT: ADD_INT T2.Y, T1.Z, T1.W, ++; EG-NEXT: ADD_INT T1.Z, T1.Y, PV.W, ++; EG-NEXT: ADD_INT T1.W, T1.X, PV.Z, BS:VEC_120/SCL_212 ++; EG-NEXT: MULLO_INT * T1.X, KC0[8].Z, KC0[4].W, ++; EG-NEXT: ADD_INT T4.X, PV.W, PV.Z, ++; EG-NEXT: ADDC_UINT T1.Y, PV.W, PV.Z, ++; EG-NEXT: ADD_INT T1.Z, PV.Y, PS, ++; EG-NEXT: ADD_INT T0.W, PV.X, T0.W, ++; EG-NEXT: MULLO_INT * T1.X, KC0[7].W, KC0[5].Y, ++; EG-NEXT: ADD_INT T2.Y, PV.Z, PV.W, ++; EG-NEXT: ADDC_UINT T1.Z, T0.Z, PS, ++; EG-NEXT: ADD_INT T0.W, T0.Y, PV.Y, ++; EG-NEXT: ADDC_UINT * T1.W, T0.X, PV.X, ++; EG-NEXT: ADD_INT T0.Y, T0.X, T4.X, ++; EG-NEXT: ADD_INT T0.Z, T0.Z, T1.X, BS:VEC_021/SCL_122 ++; EG-NEXT: ADD_INT T0.W, PV.W, PS, ++; EG-NEXT: ADD_INT * T1.W, PV.Y, PV.Z, ++; EG-NEXT: ADD_INT T0.W, PV.W, PS, ++; EG-NEXT: ADDC_UINT * T1.W, PV.Y, PV.Z, ++; EG-NEXT: ADD_INT * T0.W, PV.W, PS, ++; EG-NEXT: ADD_INT * T0.Z, T0.Y, T0.Z, ++; EG-NEXT: ADD_INT * T0.Y, T3.X, T2.W, ++; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ++; EG-NEXT: MULLO_INT * T0.X, KC0[4].W, KC0[7].W, ++entry: + %mul = mul i128 %a, %b + store i128 %mul, ptr addrspace(1) %out + ret void + } + +-; FUNC-LABEL: {{^}}v_mul_i128: +-; GCN: {{buffer|flat}}_load_dwordx4 +-; GCN: {{buffer|flat}}_load_dwordx4 +- +-; SI-DAG: v_mul_lo_u32 +-; SI-DAG: v_mul_hi_u32 +-; SI-DAG: v_mul_hi_u32 +-; SI-DAG: v_mul_lo_u32 +-; SI-DAG: v_mul_hi_u32 +-; SI-DAG: v_mul_hi_u32 +-; SI-DAG: v_mul_lo_u32 +-; SI-DAG: v_mul_lo_u32 +-; SI-DAG: v_add_i32_e32 +- +-; SI-DAG: v_mul_hi_u32 +-; SI-DAG: v_mul_lo_u32 +-; SI-DAG: v_mul_hi_u32 +-; SI-DAG: v_mul_lo_u32 +-; SI-DAG: v_mul_lo_u32 +-; SI-DAG: v_mul_lo_u32 +-; SI-DAG: v_mul_lo_u32 +-; SI-DAG: v_mul_lo_u32 +- +-; VI-DAG: v_mad_u64_u32 +-; VI-DAG: v_mad_u64_u32 +-; VI-DAG: v_mad_u64_u32 +-; VI-DAG: v_mad_u64_u32 +-; VI-DAG: v_mad_u64_u32 +-; VI-DAG: v_mad_u64_u32 +-; VI-DAG: v_mul_lo_u32 +-; VI-DAG: v_mul_lo_u32 +-; VI-DAG: v_mul_lo_u32 +- +-; GCN: {{buffer|flat}}_store_dwordx4 + define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 { ++; SI-LABEL: v_mul_i128: ++; SI: ; %bb.0: ; %entry ++; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb ++; SI-NEXT: s_mov_b32 s7, 0xf000 ++; SI-NEXT: s_mov_b32 s6, 0 ++; SI-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ++; SI-NEXT: v_mov_b32_e32 v9, 0 ++; SI-NEXT: s_waitcnt lgkmcnt(0) ++; SI-NEXT: s_mov_b64 s[4:5], s[0:1] ++; SI-NEXT: s_mov_b64 s[0:1], s[2:3] ++; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ++; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 ++; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64 ++; SI-NEXT: s_waitcnt vmcnt(0) ++; SI-NEXT: v_mul_lo_u32 v3, v4, v3 ++; SI-NEXT: v_mul_hi_u32 v10, v4, v2 ++; SI-NEXT: v_mul_lo_u32 v12, v6, v1 ++; SI-NEXT: v_mul_hi_u32 v13, v6, v0 ++; SI-NEXT: v_mul_lo_u32 v17, v1, v4 ++; SI-NEXT: v_mul_hi_u32 v18, v0, v4 ++; SI-NEXT: v_mul_lo_u32 v11, v5, v2 ++; SI-NEXT: v_mul_lo_u32 v7, v7, v0 ++; SI-NEXT: v_mul_hi_u32 v16, v1, v4 ++; SI-NEXT: v_mul_lo_u32 v15, v0, v5 ++; SI-NEXT: v_mul_hi_u32 v14, v0, v5 ++; SI-NEXT: v_mul_hi_u32 v19, v1, v5 ++; SI-NEXT: v_mul_lo_u32 v5, v1, v5 ++; SI-NEXT: v_add_i32_e32 v1, vcc, v10, v3 ++; SI-NEXT: v_add_i32_e32 v3, vcc, v13, v12 ++; SI-NEXT: v_mul_lo_u32 v2, v4, v2 ++; SI-NEXT: v_mul_lo_u32 v6, v6, v0 ++; SI-NEXT: v_mul_lo_u32 v0, v0, v4 ++; SI-NEXT: v_add_i32_e32 v4, vcc, v17, v18 ++; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v16, vcc ++; SI-NEXT: v_add_i32_e32 v11, vcc, v1, v11 ++; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ++; SI-NEXT: v_add_i32_e32 v1, vcc, v15, v4 ++; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v14, vcc ++; SI-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ++; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc ++; SI-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ++; SI-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, vcc ++; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ++; SI-NEXT: v_addc_u32_e32 v5, vcc, v19, v6, vcc ++; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ++; SI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ++; SI-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64 ++; SI-NEXT: s_endpgm ++; ++; VI-LABEL: v_mul_i128: ++; VI: ; %bb.0: ; %entry ++; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ++; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ++; VI-NEXT: v_mov_b32_e32 v11, 0 ++; VI-NEXT: s_waitcnt lgkmcnt(0) ++; VI-NEXT: v_mov_b32_e32 v1, s1 ++; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ++; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ++; VI-NEXT: v_mov_b32_e32 v3, s3 ++; VI-NEXT: v_add_u32_e32 v8, vcc, s2, v2 ++; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc ++; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ++; VI-NEXT: flat_load_dwordx4 v[4:7], v[8:9] ++; VI-NEXT: s_waitcnt vmcnt(0) ++; VI-NEXT: v_mul_lo_u32 v10, v4, v3 ++; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v4, v2, 0 ++; VI-NEXT: v_mul_lo_u32 v14, v5, v2 ++; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 ++; VI-NEXT: v_mul_lo_u32 v15, v7, v0 ++; VI-NEXT: v_add_u32_e32 v7, vcc, v13, v10 ++; VI-NEXT: v_mov_b32_e32 v10, v3 ++; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11] ++; VI-NEXT: v_add_u32_e32 v13, vcc, v7, v14 ++; VI-NEXT: v_mov_b32_e32 v7, v4 ++; VI-NEXT: v_mov_b32_e32 v4, v11 ++; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v6, v0, v[12:13] ++; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4] ++; VI-NEXT: v_add_u32_e32 v11, vcc, v15, v13 ++; VI-NEXT: v_mov_b32_e32 v0, v4 ++; VI-NEXT: v_mul_lo_u32 v10, v6, v1 ++; VI-NEXT: v_add_u32_e32 v6, vcc, v7, v0 ++; VI-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, vcc ++; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] ++; VI-NEXT: v_add_u32_e32 v5, vcc, v10, v11 ++; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v12 ++; VI-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc ++; VI-NEXT: flat_store_dwordx4 v[8:9], v[2:5] ++; VI-NEXT: s_endpgm ++; ++; GFX9-LABEL: v_mul_i128: ++; GFX9: ; %bb.0: ; %entry ++; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ++; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 ++; GFX9-NEXT: v_mov_b32_e32 v10, 0 ++; GFX9-NEXT: s_waitcnt lgkmcnt(0) ++; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] ++; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] ++; GFX9-NEXT: s_waitcnt vmcnt(0) ++; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 ++; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2 ++; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3 ++; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10] ++; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 ++; GFX9-NEXT: v_mov_b32_e32 v4, v12 ++; GFX9-NEXT: v_mov_b32_e32 v12, v10 ++; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[11:12] ++; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14 ++; GFX9-NEXT: v_mul_lo_u32 v17, v7, v0 ++; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3] ++; GFX9-NEXT: v_mov_b32_e32 v0, v10 ++; GFX9-NEXT: v_mul_lo_u32 v16, v6, v1 ++; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v0 ++; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc ++; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] ++; GFX9-NEXT: v_add3_u32 v3, v17, v3, v16 ++; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 ++; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc ++; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] ++; GFX9-NEXT: s_endpgm ++; ++; GFX10-LABEL: v_mul_i128: ++; GFX10: ; %bb.0: ; %entry ++; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ++; GFX10-NEXT: v_lshlrev_b32_e32 v14, 4, v0 ++; GFX10-NEXT: v_mov_b32_e32 v10, 0 ++; GFX10-NEXT: s_waitcnt lgkmcnt(0) ++; GFX10-NEXT: s_clause 0x1 ++; GFX10-NEXT: global_load_dwordx4 v[0:3], v14, s[0:1] ++; GFX10-NEXT: global_load_dwordx4 v[4:7], v14, s[2:3] ++; GFX10-NEXT: s_waitcnt vmcnt(0) ++; GFX10-NEXT: v_mad_u64_u32 v[8:9], s0, v0, v4, 0 ++; GFX10-NEXT: v_mul_lo_u32 v7, v7, v0 ++; GFX10-NEXT: v_mad_u64_u32 v[11:12], s0, v1, v4, v[9:10] ++; GFX10-NEXT: v_mov_b32_e32 v9, v12 ++; GFX10-NEXT: v_mov_b32_e32 v12, v10 ++; GFX10-NEXT: v_mul_lo_u32 v10, v5, v2 ++; GFX10-NEXT: v_mad_u64_u32 v[12:13], s0, v0, v5, v[11:12] ++; GFX10-NEXT: v_mul_lo_u32 v11, v4, v3 ++; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v2, 0 ++; GFX10-NEXT: v_mov_b32_e32 v4, v13 ++; GFX10-NEXT: v_mul_lo_u32 v13, v6, v1 ++; GFX10-NEXT: v_add3_u32 v3, v3, v11, v10 ++; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v4 ++; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s0, 0, 0, s0 ++; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v6, v0, v[2:3] ++; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v5, v[9:10] ++; GFX10-NEXT: v_mov_b32_e32 v9, v12 ++; GFX10-NEXT: v_add3_u32 v3, v7, v3, v13 ++; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 ++; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo ++; GFX10-NEXT: global_store_dwordx4 v14, v[8:11], s[2:3] ++; GFX10-NEXT: s_endpgm ++; ++; GFX11-LABEL: v_mul_i128: ++; GFX11: ; %bb.0: ; %entry ++; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c ++; GFX11-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ++; GFX11-NEXT: v_mov_b32_e32 v10, 0 ++; GFX11-NEXT: s_waitcnt lgkmcnt(0) ++; GFX11-NEXT: s_clause 0x1 ++; GFX11-NEXT: global_load_b128 v[0:3], v16, s[0:1] ++; GFX11-NEXT: global_load_b128 v[4:7], v16, s[2:3] ++; GFX11-NEXT: s_waitcnt vmcnt(0) ++; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0 ++; GFX11-NEXT: v_mul_lo_u32 v15, v5, v2 ++; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3 ++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ++; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10] ++; GFX11-NEXT: v_dual_mov_b32 v9, v12 :: v_dual_mov_b32 v12, v10 ++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ++; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v0, v5, v[11:12] ++; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v4, v2, 0 ++; GFX11-NEXT: v_mul_lo_u32 v4, v6, v1 ++; GFX11-NEXT: v_mul_lo_u32 v12, v7, v0 ++; GFX11-NEXT: v_mov_b32_e32 v2, v14 ++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ++; GFX11-NEXT: v_add3_u32 v11, v11, v3, v15 ++; GFX11-NEXT: v_add_co_u32 v2, s0, v9, v2 ++; GFX11-NEXT: v_mov_b32_e32 v9, v13 ++; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0 ++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ++; GFX11-NEXT: v_mad_u64_u32 v[14:15], null, v6, v0, v[10:11] ++; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3] ++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ++; GFX11-NEXT: v_add3_u32 v0, v12, v15, v4 ++; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v14 ++; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ++; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo ++; GFX11-NEXT: global_store_b128 v16, v[8:11], s[2:3] ++; GFX11-NEXT: s_nop 0 ++; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ++; GFX11-NEXT: s_endpgm ++; ++; EG-LABEL: v_mul_i128: ++; EG: ; %bb.0: ; %entry ++; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] ++; EG-NEXT: TEX 1 @6 ++; EG-NEXT: ALU 41, @14, KC0[], KC1[] ++; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 ++; EG-NEXT: CF_END ++; EG-NEXT: PAD ++; EG-NEXT: Fetch clause starting at 6: ++; EG-NEXT: VTX_READ_128 T2.XYZW, T1.X, 0, #1 ++; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 ++; EG-NEXT: ALU clause starting at 10: ++; EG-NEXT: LSHL * T0.W, T0.X, literal.x, ++; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ++; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, ++; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, ++; EG-NEXT: ALU clause starting at 14: ++; EG-NEXT: MULLO_INT * T1.Y, T0.Y, T2.Y, ++; EG-NEXT: MULHI * T1.Z, T0.Y, T2.Y, ++; EG-NEXT: MULLO_INT * T1.W, T2.Z, T0.X, ++; EG-NEXT: MULLO_INT * T3.X, T2.Y, T0.Z, ++; EG-NEXT: MULHI * T3.Y, T0.Y, T2.X, ++; EG-NEXT: MULHI * T3.Z, T0.X, T2.Y, ++; EG-NEXT: MULHI * T3.W, T2.Z, T0.X, ++; EG-NEXT: MULLO_INT * T2.Z, T2.Z, T0.Y, ++; EG-NEXT: MULHI * T4.X, T2.X, T0.Z, ++; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T2.X, ++; EG-NEXT: MULHI * T4.Y, T0.X, T2.X, ++; EG-NEXT: ADD_INT T4.W, T0.Y, PS, ++; EG-NEXT: MULLO_INT * T2.Y, T0.X, T2.Y, ++; EG-NEXT: ADDC_UINT T4.Z, T0.Y, T4.Y, ++; EG-NEXT: ADDC_UINT T5.W, PS, PV.W, ++; EG-NEXT: MULLO_INT * T0.Y, T2.X, T0.W, ++; EG-NEXT: ADD_INT T4.X, T4.X, PS, ++; EG-NEXT: ADD_INT T0.Y, T3.W, T2.Z, ++; EG-NEXT: ADD_INT T2.Z, T3.Z, PV.W, ++; EG-NEXT: ADD_INT T0.W, T3.Y, PV.Z, ++; EG-NEXT: MULLO_INT * T2.W, T2.W, T0.X, ++; EG-NEXT: ADD_INT T5.X, PV.W, PV.Z, ++; EG-NEXT: ADDC_UINT T3.Y, PV.W, PV.Z, ++; EG-NEXT: ADD_INT T2.Z, PV.Y, PS, ++; EG-NEXT: ADD_INT T0.W, PV.X, T3.X, ++; EG-NEXT: MULLO_INT * T0.Y, T2.X, T0.Z, ++; EG-NEXT: ADD_INT T4.Y, PV.Z, PV.W, ++; EG-NEXT: ADDC_UINT T0.Z, T1.W, PS, ++; EG-NEXT: ADD_INT T0.W, T1.Z, PV.Y, ++; EG-NEXT: ADDC_UINT * T2.W, T1.Y, PV.X, ++; EG-NEXT: ADD_INT T1.Y, T1.Y, T5.X, ++; EG-NEXT: ADD_INT T1.Z, T1.W, T0.Y, ++; EG-NEXT: ADD_INT T0.W, PV.W, PS, ++; EG-NEXT: ADD_INT * T1.W, PV.Y, PV.Z, ++; EG-NEXT: ADD_INT T0.W, PV.W, PS, ++; EG-NEXT: ADDC_UINT * T1.W, PV.Y, PV.Z, ++; EG-NEXT: ADD_INT * T0.W, PV.W, PS, ++; EG-NEXT: ADD_INT * T0.Z, T1.Y, T1.Z, ++; EG-NEXT: ADD_INT * T0.Y, T2.Y, T4.W, ++; EG-NEXT: LSHR T1.X, T1.X, literal.x, ++; EG-NEXT: MULLO_INT * T0.X, T0.X, T2.X, ++; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ++entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.a = getelementptr inbounds i128, ptr addrspace(1) %aptr, i32 %tid + %gep.b = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid +-- +2.31.1 + diff --git a/var/spack/repos/builtin/packages/llvm-amdgpu/package.py b/var/spack/repos/builtin/packages/llvm-amdgpu/package.py index 977db83012..d69575d933 100644 --- a/var/spack/repos/builtin/packages/llvm-amdgpu/package.py +++ b/var/spack/repos/builtin/packages/llvm-amdgpu/package.py @@ -24,6 +24,8 @@ class LlvmAmdgpu(CMakePackage): maintainers("srekolam", "renjithravindrankannath", "haampie") version("master", branch="amd-stg-open") + version("5.6.1", sha256="045e43c0c4a3f4f2f1db9fb603a4f1ea3d56e128147e19ba17909eb57d7f08e5") + version("5.6.0", sha256="e922bd492b54d99e56ed88c81e2009ed6472059a180b10cc56ce1f9bd2d7b6ed") version("5.5.1", sha256="7d7181f20f89cb0715191aa32914186c67a34258c13457055570d47e15296553") version("5.5.0", sha256="5dc6c99f612b69ff73145bee17524e3712990100e16445b71634106acf7927cf") version("5.4.3", sha256="a844d3cc01613f6284a75d44db67c495ac1e9b600eacbb1eb13d2649f5d5404d") @@ -160,7 +162,13 @@ class LlvmAmdgpu(CMakePackage): # as per 5.2.0 llvm code. It used to be llvm/bin/../lib/libdevice. # Below patch is to look in the old path. patch("adjust-openmp-bitcode-directory-for-llvm-link.patch", when="@5.2.0:") - patch("patch-llvm-5.5.0.patch", when="@5.5") + patch("patch-llvm-5.5.0.patch", when="@5.5:") + + # i1 muls can sometimes happen after SCEV. + # They resulted in ISel failures because we were missing the patterns for them. + # This fix is targeting 6.1 rocm release. + # Need patch until https://github.com/llvm/llvm-project/pull/67291 is merged. + patch("001-Add-i1-mul-patterns.patch", when="@5.6:") conflicts("^cmake@3.19.0") @@ -169,6 +177,8 @@ class LlvmAmdgpu(CMakePackage): # Add device libs sources so they can be an external LLVM project for d_version, d_shasum in [ + ("5.6.1", "f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c"), + ("5.6.0", "efb5dcdca9b3a9fbe408d494fb4a23e0b78417eb5fa8eebd4a5d226088f28921"), ("5.5.1", "3b5f6dd85f0e3371f6078da7b59bf77d5b210e30f1cc66ef1e2de6bbcb775833"), ("5.5.0", "5ab95aeb9c8bed0514f96f7847e21e165ed901ed826cdc9382c14d199cbadbd3"), ("5.4.3", "f4f7281f2cea6d268fcc3662b37410957d4f0bc23e0df9f60b12eb0fcdf9e26e"), diff --git a/var/spack/repos/builtin/packages/llvm/package.py b/var/spack/repos/builtin/packages/llvm/package.py index dc5a8ed5fe..df0e762fc5 100644 --- a/var/spack/repos/builtin/packages/llvm/package.py +++ b/var/spack/repos/builtin/packages/llvm/package.py @@ -243,6 +243,8 @@ class Llvm(CMakePackage, CudaPackage): description="Enable zstd support for static analyzer / lld", ) + provides("libllvm@16", when="@16.0.0:16") + provides("libllvm@15", when="@15.0.0:15") provides("libllvm@14", when="@14.0.0:14") provides("libllvm@13", when="@13.0.0:13") provides("libllvm@12", when="@12.0.0:12") diff --git a/var/spack/repos/builtin/packages/migraphx/0005-Adding-half-include-directory-path-migraphx.patch b/var/spack/repos/builtin/packages/migraphx/0005-Adding-half-include-directory-path-migraphx.patch new file mode 100644 index 0000000000..b11445bdca --- /dev/null +++ b/var/spack/repos/builtin/packages/migraphx/0005-Adding-half-include-directory-path-migraphx.patch @@ -0,0 +1,48 @@ +From 612664789657444daa88f8f28a183928e01595d0 Mon Sep 17 00:00:00 2001 +From: Renjith Ravindran <Renjith.RavindranKannath@amd.com> +Date: Mon, 25 Sep 2023 19:30:19 +0000 +Subject: [PATCH] Adding-half-include-directory-path + +--- + CMakeLists.txt | 4 +++- + cmake/PythonModules.cmake | 2 +- + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 4601cdd..9cd48ad 100755 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -51,7 +51,7 @@ set(CMAKE_BUILD_RPATH "${CMAKE_BINARY_DIR}/lib") + project(migraphx) + find_package(ROCM REQUIRED) + +-find_path(HALF_INCLUDE_DIR half.hpp PATH_SUFFIXES half) ++find_path(HALF_INCLUDE_DIR half.hpp) + if (NOT HALF_INCLUDE_DIR) + message(FATAL_ERROR "Could not find half.hpp - Please check that the install path of half.hpp has been added to CMAKE_PREFIX_PATH") + else() +@@ -272,6 +272,8 @@ add_subdirectory(docs) + add_subdirectory(test) + add_subdirectory(tools) + ++target_include_directories(migraphx PUBLIC "${NLOHMANN_JSON_INCLUDE} ${HALF_INCLUDE_DIR}") ++ + set(DEST_DIR ${CMAKE_BINARY_DIR}) + file(GLOB backend_files ${CMAKE_SOURCE_DIR}/src/py/backend/*.py) + file(MAKE_DIRECTORY ${DEST_DIR}/lib/onnx_migraphx) +diff --git a/cmake/PythonModules.cmake b/cmake/PythonModules.cmake +index b5818ce..b4bfbb3 100755 +--- a/cmake/PythonModules.cmake ++++ b/cmake/PythonModules.cmake +@@ -76,7 +76,7 @@ function(py_add_module NAME) + ) + + endfunction() +-set(PYTHON_SEARCH_VERSIONS 2.7 3.5 3.6 3.7 3.8 3.9 3.10) ++set(PYTHON_SEARCH_VERSIONS 3.5 3.6 3.7 3.8 3.9 3.10) + set(PYTHON_DISABLE_VERSIONS "" CACHE STRING "") + foreach(PYTHON_DISABLE_VERSION ${PYTHON_DISABLE_VERSIONS}) + list(REMOVE_ITEM PYTHON_SEARCH_VERSIONS ${PYTHON_DISABLE_VERSION}) +-- +2.31.1 + diff --git a/var/spack/repos/builtin/packages/migraphx/package.py b/var/spack/repos/builtin/packages/migraphx/package.py index a0179de5ad..81bf1bff2b 100644 --- a/var/spack/repos/builtin/packages/migraphx/package.py +++ b/var/spack/repos/builtin/packages/migraphx/package.py @@ -19,6 +19,8 @@ class Migraphx(CMakePackage): maintainers("srekolam", "renjithravindrankannath") libraries = ["libmigraphx"] + version("5.6.1", sha256="b108c33f07572ffd880b20f6de06f1934ab2a1b41ae69095612322ac412fa91c") + version("5.6.0", sha256="eaec90535d62002fd5bb264677ad4a7e30c55f18d2a287680d0495c7e60432b2") version("5.5.1", sha256="e71c4744f8ef6a1a99c179bbad94b8fe9bd7686eaa9397f376b70988c3341f0c") version("5.5.0", sha256="6084eb596b170f5e38f22b5fa37e66aa43a8cbc626712c9f03cde48c8fecfc8f") version("5.4.3", sha256="f83e7bbe5d6d0951fb2cf0abf7e8b3530e9a5e45f7cec6d760da055d6905d568") @@ -110,19 +112,21 @@ class Migraphx(CMakePackage): return url - patch("0001-Adding-nlohmann-json-include-directory.patch", when="@3.9.0:") + patch("0001-Adding-nlohmann-json-include-directory.patch", when="@3.9.0:5.5") # Restrict Python 2.7 usage to fix the issue below # https://github.com/spack/spack/issues/24429 patch("0002-restrict-python-2.7-usage.patch", when="@3.9.0:5.1.3") patch("0003-restrict-python-2.7-usage.patch", when="@5.2.0:5.4") - patch("0004-restrict-python2.7-usage-for-5.5.0.patch", when="@5.5.0:") + patch("0004-restrict-python2.7-usage-for-5.5.0.patch", when="@5.5.0") + patch("0005-Adding-half-include-directory-path-migraphx.patch", when="@5.6.0:") depends_on("cmake@3.5:", type="build") depends_on("protobuf", type="link") depends_on("blaze", type="build") depends_on("nlohmann-json", type="link") depends_on("msgpack-c", type="link") - depends_on("half@1.12.0", type="link") + depends_on("half@1.12.0", type="link", when="@:5.5") + depends_on("half@2:", when="@5.6:") depends_on("python@3.5:", type="build") depends_on("py-pybind11", type="build", when="@:4.0.0") depends_on("py-pybind11@2.6:", type="build", when="@4.1.0:") @@ -154,6 +158,8 @@ class Migraphx(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) depends_on("hip@" + ver, when="@" + ver) @@ -193,3 +199,11 @@ class Migraphx(CMakePackage): if "@5.5.0:" in self.spec: args.append(self.define("CMAKE_CXX_FLAGS", "-I{0}".format(abspath))) return args + + def test(self): + if self.spec.satisfies("@:5.5.0"): + print("Skipping: stand-alone tests") + return + test_dir = join_path(self.spec["migraphx"].prefix, "bin") + with working_dir(test_dir, create=True): + self.run_test("UnitTests") diff --git a/var/spack/repos/builtin/packages/miopen-hip/package.py b/var/spack/repos/builtin/packages/miopen-hip/package.py index 79ed4c27d3..4843ae1173 100644 --- a/var/spack/repos/builtin/packages/miopen-hip/package.py +++ b/var/spack/repos/builtin/packages/miopen-hip/package.py @@ -19,7 +19,8 @@ class MiopenHip(CMakePackage): maintainers("srekolam", "renjithravindrankannath") libraries = ["libMIOpen"] - + version("5.6.1", sha256="ff627d68ed9e52433a3c808b5d3ff179a398b77ce81b00cfea7b2c4da5162c6c") + version("5.6.0", sha256="d620ddab5b488bdf81242654fefa337c6b71dc410c2ff26d30a4ee86a8d22d11") version("5.5.1", sha256="2cd75071b8ee876c69a94f028b6c8a9346d6d2fde7d4b64e6d635f3b6c994262") version("5.5.0", sha256="791087242551669e546225e36123c21663f0dad14dbcfd6d0ce0e7bad0ab0de1") version("5.4.3", sha256="37ffe2ed3d7942da8ea2f6bdb85c7a2f58e3ccd31767db158a322769d3604efd") @@ -144,6 +145,8 @@ class MiopenHip(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) depends_on("hip@" + ver, when="@" + ver) @@ -153,12 +156,11 @@ class MiopenHip(CMakePackage): for ver in ["5.1.0", "5.1.3", "5.2.0", "5.2.1", "5.2.3", "5.3.0", "5.3.3"]: depends_on("mlirmiopen@" + ver, when="@" + ver) - for ver in ["5.4.0", "5.4.3", "5.5.0", "5.5.1"]: + for ver in ["5.5.1", "5.6.0", "5.6.1"]: depends_on("nlohmann-json", type="link") + depends_on("composable-kernel@" + ver, when="@" + ver) for ver in ["5.4.0", "5.4.3", "5.5.0"]: depends_on("rocmlir@" + ver, when="@" + ver) - for ver in ["5.5.1"]: - depends_on("composable-kernel@" + ver, when="@" + ver) def setup_build_environment(self, env): if "@3.9.0:" in self.spec: @@ -209,7 +211,12 @@ class MiopenHip(CMakePackage): ) if self.spec.satisfies("@5.4.0:5.5.0"): args.append(self.define("MIOPEN_USE_COMPOSABLEKERNEL", "OFF")) + args.append(self.define("MIOPEN_USE_MLIR", "ON")) + args.append(self.define("MIOPEN_ENABLE_AI_KERNEL_TUNING", "OFF")) if self.spec.satisfies("@5.5.1:"): args.append(self.define("MIOPEN_USE_COMPOSABLEKERNEL", "ON")) - args.append(self.define("MIOPEN_USE_MLIR", "OFF")) + args.append(self.define("MIOPEN_ENABLE_AI_KERNEL_TUNING", "OFF")) + args.append( + "-DNLOHMANN_JSON_INCLUDE={0}".format(self.spec["nlohmann-json"].prefix.include) + ) return args diff --git a/var/spack/repos/builtin/packages/mivisionx/package.py b/var/spack/repos/builtin/packages/mivisionx/package.py index 9d3a16959b..bd1a40a872 100644 --- a/var/spack/repos/builtin/packages/mivisionx/package.py +++ b/var/spack/repos/builtin/packages/mivisionx/package.py @@ -25,6 +25,8 @@ class Mivisionx(CMakePackage): url = "https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/archive/rocm-{0}.tar.gz" return url.format(version) + version("5.6.1", sha256="b2ff95c1488e244f379482631dae4f9ab92d94a513d180e03607aa1e184b5b0a") + version("5.6.0", sha256="34c184e202b1a6da2398b66e33c384d5bafd8f8291089c18539715c5cb73eb1f") version("5.5.1", sha256="e8209f87a57c4222003a936240e7152bbfa496862113358f29d4c3e80d4cdf56") version("5.5.0", sha256="af266550ecccad80f08954f23e47e8264eb338b0928a5314bd6efca349fc5a14") version("5.4.3", sha256="4da82974962a70c326ce2427c664517b1efdff436efe222e6bc28817c222a082") @@ -115,6 +117,8 @@ class Mivisionx(CMakePackage): variant("opencl", default=False, description="Use OPENCL as the backend") variant("hip", default=True, description="Use HIP as backend") + conflicts("+opencl", when="@5.6.0:") + def patch(self): if self.spec.satisfies("@4.2.0"): filter_file( @@ -255,13 +259,16 @@ class Mivisionx(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("miopen-hip@" + ver, when="@" + ver) - for ver in ["5.3.3", "5.4.0", "5.4.3", "5.5.0", "5.5.1"]: + for ver in ["5.3.3", "5.4.0", "5.4.3", "5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("migraphx@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) + depends_on("python@3.5:", type="build") def flag_handler(self, name, flags): spec = self.spec diff --git a/var/spack/repos/builtin/packages/rccl/package.py b/var/spack/repos/builtin/packages/rccl/package.py index 6545452cf3..677b077b4b 100644 --- a/var/spack/repos/builtin/packages/rccl/package.py +++ b/var/spack/repos/builtin/packages/rccl/package.py @@ -21,6 +21,8 @@ class Rccl(CMakePackage): maintainers("srekolam", "renjithravindrankannath") libraries = ["librccl"] + version("5.6.1", sha256="27ec6b86a1a329684d808f728c1fce134517ac8e6e7047689f95dbf8386c077e") + version("5.6.0", sha256="cce13c8a9e233e7ddf91a67b1626b7aaeaf818fefe61af8de6b6b6ff47cb358c") version("5.5.1", sha256="f6b9dc6dafeb49d95c085825876b09317d8252771c746ccf5aa19a9204a404b2") version("5.5.0", sha256="be2964b408741d046bcd606d339a233d1d1deac7b841647ec53d6d62d71452ba") version("5.4.3", sha256="a2524f602bd7b3b6afeb8ba9aff660216ee807fa836e46442d068b5ed5f51a4d") @@ -143,6 +145,8 @@ class Rccl(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) depends_on("hip@" + ver, when="@" + ver) @@ -174,6 +178,8 @@ class Rccl(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("numactl@2:", when="@" + ver) for ver in [ @@ -190,12 +196,15 @@ class Rccl(CMakePackage): "5.3.3", "5.4.0", "5.4.3", + "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("rocm-smi-lib@" + ver, when="@" + ver) depends_on("chrpath", when="@5.3.0:") - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) depends_on("googletest@1.11.0:", when="@5.3:") diff --git a/var/spack/repos/builtin/packages/rdc/package.py b/var/spack/repos/builtin/packages/rdc/package.py index bb6314b420..8f88417ebf 100644 --- a/var/spack/repos/builtin/packages/rdc/package.py +++ b/var/spack/repos/builtin/packages/rdc/package.py @@ -26,6 +26,8 @@ class Rdc(CMakePackage): url = "https://github.com/RadeonOpenCompute/rdc/archive/rocm-{0}.tar.gz" return url.format(version) + version("5.6.1", sha256="9e9f57cebbc5ae386a405957ed2c17344cdb42db5e1a71285f2c9bc09eea6519") + version("5.6.0", sha256="5213cd89215463862f6a1e9480ebe017944a6bb6b0db1722628afaa34af57991") version("5.5.1", sha256="a58a319ee702cf61cf71a4eba647c231392f68449b35419d941079c6de944844") version("5.5.0", sha256="56e85e77581963fbcfcc43e091a91773de470152347808ae730bcaf92c9f5ee8") version("5.4.3", sha256="c44f0b070b5650bc78e2eb968aae57a8ac1e1fd160e897055b79f3026c4fbad3") @@ -130,6 +132,8 @@ class Rdc(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("rocm-smi-lib@" + ver, type=("build", "link"), when="@" + ver) @@ -147,10 +151,12 @@ class Rdc(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hsa-rocr-dev@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) def patch(self): diff --git a/var/spack/repos/builtin/packages/rocalution/package.py b/var/spack/repos/builtin/packages/rocalution/package.py index c8afbc226a..b0ba2021ba 100644 --- a/var/spack/repos/builtin/packages/rocalution/package.py +++ b/var/spack/repos/builtin/packages/rocalution/package.py @@ -24,7 +24,8 @@ class Rocalution(CMakePackage): maintainers("cgmb", "srekolam", "renjithravindrankannath") libraries = ["librocalution_hip"] - + version("5.6.1", sha256="7197b3617a0c91e90adaa32003c04d247a5f585d216e77493d20984ba215addb") + version("5.6.0", sha256="7397a2039e9615c0cf6776c33c4083c00b185b5d5c4149c89fea25a8976a3097") version("5.5.1", sha256="4612e30a0290b1732c8862eea655122abc2d22ce4345b8498fe4127697e880b4") version("5.5.0", sha256="626e966b67b83a1ef79f9bf27aba998c49cf65c4208092516aa1e32a6cbd8c36") version("5.4.3", sha256="39d00951a9b3cbdc4205a7e3ce75c026d9428c71c784815288c445f84a7f8a0e") @@ -155,6 +156,8 @@ class Rocalution(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("rocprim@" + ver, when="@" + ver) diff --git a/var/spack/repos/builtin/packages/rocblas/package.py b/var/spack/repos/builtin/packages/rocblas/package.py index 9727be5ab3..0b59eadd7e 100644 --- a/var/spack/repos/builtin/packages/rocblas/package.py +++ b/var/spack/repos/builtin/packages/rocblas/package.py @@ -21,6 +21,8 @@ class Rocblas(CMakePackage): version("develop", branch="develop") version("master", branch="master") + version("5.6.1", sha256="73896ebd445162a69af97f9fd462684609b4e0cf617eab450cd4558b4a23941e") + version("5.6.0", sha256="6a70b27eede02c45f46095a6ce8421af9a774a565e39f5e1074783ecf00c1ea7") version("5.5.1", sha256="7916a8d238d51cc239949d799f0b61c9d5cd63c6ccaed0e16749489b89ca8ff3") version("5.5.0", sha256="b5260517f199e806ae18f2c4495f163884e0d7a0a7c67af0770f7428ea50f898") version("5.4.3", sha256="d82cd334b7a9b40d16ec4f4bb1fb5662382dcbfc86ee5e262413ed63d9e6a701") @@ -174,6 +176,8 @@ class Rocblas(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("llvm-amdgpu@" + ver, type="build", when="@" + ver) @@ -191,6 +195,8 @@ class Rocblas(CMakePackage): depends_on("py-wheel", type="build") depends_on("py-msgpack", type="build") depends_on("py-pip", type="build") + depends_on("py-joblib", type="build", when="@5.6:") + depends_on("procps", type="build", when="@5.6:") for t_version, t_commit in [ ("@3.5.0", "f842a1a4427624eff6cbddb2405c36dec9a210cd"), @@ -218,6 +224,8 @@ class Rocblas(CMakePackage): ("@5.4.3", "5aec08937473b27865fa969bb38a83bcf9463c2b"), ("@5.5.0", "38d444a9f2b6cddfeaeedcb39a5688150fa27093"), ("@5.5.1", "38d444a9f2b6cddfeaeedcb39a5688150fa27093"), + ("@5.6.0", "7d0a9d040c3bbae893df7ecef6a19d9cd1c304aa"), + ("@5.6.1", "7d0a9d040c3bbae893df7ecef6a19d9cd1c304aa"), ]: resource( name="Tensile", diff --git a/var/spack/repos/builtin/packages/rocfft/package.py b/var/spack/repos/builtin/packages/rocfft/package.py index bc5251a6c0..63c0548ce3 100644 --- a/var/spack/repos/builtin/packages/rocfft/package.py +++ b/var/spack/repos/builtin/packages/rocfft/package.py @@ -18,6 +18,8 @@ class Rocfft(CMakePackage): maintainers("cgmb", "srekolam", "renjithravindrankannath", "haampie") libraries = ["librocfft"] + version("5.6.1", sha256="a65861e453587c3e6393da75b0b1976508c61f968aecda77fbec920fea48489e") + version("5.6.0", sha256="e3d4a6c1bdac78f9a22033f57011af783d560308103f73542f9e0e4dd133d38a") version("5.5.1", sha256="57423a64f5cdb1c37ff0891b6c17b59f73198d46be42db4ae23781ef2c0cd49d") version("5.5.0", sha256="9288152e66504b06082e4eed8cdb791b4f9ae2836b3defbeb4d2b54901b96485") version("5.4.3", sha256="ed9664adc9825c237327497bc4b23f020d50be7645647f14a45f4d943dd506e7") @@ -155,6 +157,8 @@ class Rocfft(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) diff --git a/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py b/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py index 3c1f1d9e82..af829cf7ad 100644 --- a/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py +++ b/var/spack/repos/builtin/packages/rocm-bandwidth-test/package.py @@ -18,6 +18,8 @@ class RocmBandwidthTest(CMakePackage): maintainers("srekolam", "renjithravindrankannath") version("master", branch="master") + version("5.6.1", sha256="849af715d08dfd89e7aa5e4453b624151db1cafaa567ab5fa36a77948b90bf0d") + version("5.6.0", sha256="ae2f7263a21a3a650068f43e3112b2b765eea80a5af2297572f850c77f83c85e") version("5.5.1", sha256="768b3da49fe7d4bb4e6536a8ee15be9f5e865d961e813ed4a407f32402685e1f") version("5.5.0", sha256="1070ce14d45f34c2c6b2fb003184f3ae735ccfd640e9df1c228988b2a5a82949") version("5.4.3", sha256="a2f5a75bf47db1e39a4626a9f5cd2d120bcafe56b1baf2455d794f7a4734993e") @@ -128,12 +130,14 @@ class RocmBandwidthTest(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", "master", ]: depends_on("hsa-rocr-dev@" + ver, when="@" + ver) depends_on("hsakmt-roct@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) build_targets = ["package"] diff --git a/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py b/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py index 702c627d34..941b1900f0 100644 --- a/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py +++ b/var/spack/repos/builtin/packages/rocm-clang-ocl/package.py @@ -16,6 +16,8 @@ class RocmClangOcl(CMakePackage): maintainers("srekolam", "renjithravindrankannath") version("master", branch="master") + version("5.6.1", sha256="c41deb1b564d939fc897b2bbdb13570b2234fa4c052a39783f5ad2dd1052f901") + version("5.6.0", sha256="1afc47dee02d73c10de422f254067f4ef3ff921c4a1204d54ecc40e61fc63497") version("5.5.1", sha256="bfa62ad14830e2bd5afbc346685216c69f8cbef0eb449954f793178e10b19a38") version("5.5.0", sha256="43a5459165693301ba2ebcc41b2b0705df9a3a47571d43bdc2cc49cfdd0833a7") version("5.4.3", sha256="689e0354ea685bd488116de8eb902b902492e9ace184c3109b97b9a43f8b2d59") @@ -126,6 +128,8 @@ class RocmClangOcl(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", "master", ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) @@ -135,7 +139,7 @@ class RocmClangOcl(CMakePackage): depends_on( "rocm-device-libs@" + ver, when="@{0} ^llvm-amdgpu ~rocm-device-libs".format(ver) ) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) test_src_dir = "test" diff --git a/var/spack/repos/builtin/packages/rocm-cmake/package.py b/var/spack/repos/builtin/packages/rocm-cmake/package.py index 30cbfc397f..c833db6755 100644 --- a/var/spack/repos/builtin/packages/rocm-cmake/package.py +++ b/var/spack/repos/builtin/packages/rocm-cmake/package.py @@ -13,12 +13,14 @@ class RocmCmake(CMakePackage): homepage = "https://github.com/RadeonOpenCompute/rocm-cmake" git = "https://github.com/RadeonOpenCompute/rocm-cmake.git" - url = "https://github.com/RadeonOpenCompute/rocm-cmake/archive/rocm-5.5.0.tar.gz" + url = "https://github.com/RadeonOpenCompute/rocm-cmake/archive/rocm-5.6.0.tar.gz" tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") version("master", branch="master") + version("5.6.1", sha256="98bf5fe2e6e12f55d122807d0060f1bb19c80d63d2c2f6fee579c40bfd244fa6") + version("5.6.0", sha256="a118ca937856a4d0039955a8aef2466ef1fd1f08f7f7221cda53e1b5d02e476a") version("5.5.1", sha256="60113412b35d94e20e8100ed3db688c35801991b4b8fa282fdc6fd6fd413fb6e") version("5.5.0", sha256="b7884c346737eba70ae11044e41598b2482a92e21f3e0719b1ca11619f02a20b") version("5.4.3", sha256="c185b3a10d191d73b76770ca0f9d6bdc355ee91fe0c9016a3779c9cfe042ba0f") @@ -104,7 +106,7 @@ class RocmCmake(CMakePackage): depends_on("cmake@3:", type="build") depends_on("cmake@3.6:", type="build", when="@4.1.0:") - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) test_src_dir = "test" diff --git a/var/spack/repos/builtin/packages/rocm-core/package.py b/var/spack/repos/builtin/packages/rocm-core/package.py index fe2f3bfbeb..45d947ce0e 100644 --- a/var/spack/repos/builtin/packages/rocm-core/package.py +++ b/var/spack/repos/builtin/packages/rocm-core/package.py @@ -19,6 +19,8 @@ class RocmCore(CMakePackage): maintainers("srekolam", "renjithravindrankannath") libraries = ["librocm-core"] + version("5.6.1", sha256="eeef75e16e05380ccbc8df17a02dc141a66dddaadb444a97f7278f78067c498c") + version("5.6.0", sha256="3c3d47c8b774968d768d42810a3fed42d058b7d6da248d5295df2a7ffb262568") version("5.5.1", sha256="bc73060432ffdc2e210394835d383890b9652476074ef4708d447473f273ce76") version("5.5.0", sha256="684d3312bb14f05dc280cf136f5eddff38ba340cd85c383d6a217d8e27d3d57d") diff --git a/var/spack/repos/builtin/packages/rocm-dbgapi/package.py b/var/spack/repos/builtin/packages/rocm-dbgapi/package.py index 046ca8913c..6b28adb40c 100644 --- a/var/spack/repos/builtin/packages/rocm-dbgapi/package.py +++ b/var/spack/repos/builtin/packages/rocm-dbgapi/package.py @@ -23,6 +23,8 @@ class RocmDbgapi(CMakePackage): libraries = ["librocm-dbgapi"] version("master", branch="amd-master") + version("5.6.1", sha256="c7241bf94bdb97a4cf1befbf25b8c35720797710da6f6b5b9d6a4094c1bc9c8b") + version("5.6.0", sha256="9b66e47f4eccb3c8bbc324aade92aac6139539dda449427b7823d0c45341afc8") version("5.5.1", sha256="c41dfc62591bcf42003fe744d8bd03a51311d54e4b012f946ca0ede0c14dd977") version("5.5.0", sha256="ce572340a3fe99e4f1538eb614933153456003f8dfe9306a5735cdd25b451e25") version("5.4.3", sha256="d647c9121a50f2c54367c567d8f39a145cb135e1ceed931581659f57f49f61e5") @@ -134,12 +136,14 @@ class RocmDbgapi(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", "master", ]: depends_on("hsa-rocr-dev@" + ver, type="build", when="@" + ver) depends_on("comgr@" + ver, type=("build", "link"), when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) @classmethod diff --git a/var/spack/repos/builtin/packages/rocm-debug-agent/package.py b/var/spack/repos/builtin/packages/rocm-debug-agent/package.py index 13f631e9e6..4b5850f0d0 100644 --- a/var/spack/repos/builtin/packages/rocm-debug-agent/package.py +++ b/var/spack/repos/builtin/packages/rocm-debug-agent/package.py @@ -18,6 +18,8 @@ class RocmDebugAgent(CMakePackage): maintainers("srekolam", "renjithravindrankannath") libraries = ["librocm-debug-agent"] + version("5.6.1", sha256="d3b1d5d757489ed3cc66d351cec56b7b850aaa7ecf6a55b0350b89c3dee3153a") + version("5.6.0", sha256="0bed788f07906afeb9092d0bec184a7963233ac9d8ccd20b4afeb624a1d20698") version("5.5.1", sha256="1bb66734f11bb57df6efa507f0217651446653bf28b3ca36acfcf94511a7c2bc") version("5.5.0", sha256="4f2431a395a77a06dc417ed1e9188731b031a0c680e62c6eee19d60965317f5a") version("5.4.3", sha256="b2c9ac198ea3cbf35e7e80f57c5d81c461de78b821d07b637ea4037a65cdf49f") @@ -138,6 +140,8 @@ class RocmDebugAgent(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hsa-rocr-dev@" + ver, when="@" + ver) depends_on("hsakmt-roct@" + ver, when="@" + ver) @@ -167,11 +171,13 @@ class RocmDebugAgent(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("rocm-dbgapi@" + ver, when="@" + ver) depends_on("hip@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) # https://github.com/ROCm-Developer-Tools/rocr_debug_agent/pull/4 diff --git a/var/spack/repos/builtin/packages/rocm-device-libs/package.py b/var/spack/repos/builtin/packages/rocm-device-libs/package.py index 24d9c1e826..cb784a050f 100644 --- a/var/spack/repos/builtin/packages/rocm-device-libs/package.py +++ b/var/spack/repos/builtin/packages/rocm-device-libs/package.py @@ -18,6 +18,8 @@ class RocmDeviceLibs(CMakePackage): maintainers("srekolam", "renjithravindrankannath", "haampie") version("master", branch="amd-stg-open") + version("5.6.1", sha256="f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c") + version("5.6.0", sha256="efb5dcdca9b3a9fbe408d494fb4a23e0b78417eb5fa8eebd4a5d226088f28921") version("5.5.1", sha256="3b5f6dd85f0e3371f6078da7b59bf77d5b210e30f1cc66ef1e2de6bbcb775833") version("5.5.0", sha256="5ab95aeb9c8bed0514f96f7847e21e165ed901ed826cdc9382c14d199cbadbd3") version("5.4.3", sha256="f4f7281f2cea6d268fcc3662b37410957d4f0bc23e0df9f60b12eb0fcdf9e26e") @@ -138,11 +140,13 @@ class RocmDeviceLibs(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", "master", ]: depends_on("llvm-amdgpu@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) def cmake_args(self): diff --git a/var/spack/repos/builtin/packages/rocm-gdb/package.py b/var/spack/repos/builtin/packages/rocm-gdb/package.py index b7c58074af..a752f0c4d7 100644 --- a/var/spack/repos/builtin/packages/rocm-gdb/package.py +++ b/var/spack/repos/builtin/packages/rocm-gdb/package.py @@ -16,6 +16,8 @@ class RocmGdb(AutotoolsPackage): tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") + version("5.6.1", sha256="d2b40d4c5aa41a6ce2a84307627b30d16a458672e03e13f9d27c12f2dc3f21d6") + version("5.6.0", sha256="997ef1883aac2769552bc7082c70b837f4e98b57d24c133cea52b9c92fb0dee1") version("5.5.1", sha256="359258548bc7e6abff16bb13c301339fb96560b2b961433c9e0712e4aaf2d9e1") version("5.5.0", sha256="d3b100e332facd9635e328f5efd9f0565250edbe05be986baa2e0470a19bcd79") version("5.4.3", sha256="28c1ce39fb1fabe61f86f6e3c6940c10f9a8b8de77f7bb4fdd73b04e172f85f6") @@ -135,11 +137,13 @@ class RocmGdb(AutotoolsPackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("rocm-dbgapi@" + ver, type="link", when="@" + ver) depends_on("comgr@" + ver, type="link", when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) build_directory = "spack-build" diff --git a/var/spack/repos/builtin/packages/rocm-opencl/package.py b/var/spack/repos/builtin/packages/rocm-opencl/package.py index ddea6c7a8c..f79496e91d 100644 --- a/var/spack/repos/builtin/packages/rocm-opencl/package.py +++ b/var/spack/repos/builtin/packages/rocm-opencl/package.py @@ -29,6 +29,8 @@ class RocmOpencl(CMakePackage): return url.format(version) version("master", branch="main") + version("5.6.1", sha256="ec26049f7d93c95050c27ba65472736665ec7a40f25920a868616b2970f6b845") + version("5.6.0", sha256="52ab260d00d279c2a86c353901ffd88ee61b934ad89e9eb480f210656705f04e") version("5.5.1", sha256="a8a62a7c6fc5398406d2203b8cb75621a24944688e545d917033d87de2724498") version("5.5.0", sha256="0df9fa0b8aa0c8e6711d34eec0fdf1ed356adcd9625bc8f1ce9b3e72090f3e4f") version("5.4.3", sha256="b0f8339c844a2e62773bd85cd1e7c5ecddfe71d7c8e8d604e1a1d60900c30873") @@ -116,6 +118,8 @@ class RocmOpencl(CMakePackage): depends_on("numactl", type="link", when="@3.7.0:") for d_version, d_shasum in [ + ("5.6.1", "cc9a99c7e4de3d9360c0a471b27d626e84a39c9e60e0aff1e8e1500d82391819"), + ("5.6.0", "864f87323e793e60b16905284fba381a7182b960dd4a37fb67420c174442c03c"), ("5.5.1", "1375fc7723cfaa0ae22a78682186d4804188b0a54990bfd9c0b8eb421b85e37e"), ("5.5.0", "efbae9a1ef2ab3de5ca44091e9bb78522e76759c43524c1349114f9596cc61d1"), ("5.4.3", "71d9668619ab57ec8a4564d11860438c5aad5bd161a3e58fbc49555fbd59182d"), @@ -186,12 +190,14 @@ class RocmOpencl(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", "master", ]: depends_on("comgr@" + ver, type="build", when="@" + ver) depends_on("hsa-rocr-dev@" + ver, type="link", when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) @classmethod diff --git a/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py b/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py index d918f3d5f0..dedba382c5 100644 --- a/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py +++ b/var/spack/repos/builtin/packages/rocm-openmp-extras/package.py @@ -37,6 +37,8 @@ aomp = [ "7f90634fb621169b21bcbd920c2e299acc88ba0eeb1a33fd40ae26e13201b652", "23cc7d1c82e35c74f48285a0a1c27e7b3cae1767568bb7b9367ea21f53dd6598", "9ec03a69cc462ada43e1fd4ca905a765b08c10e0911fb7a202c893cc577855e6", + "0673820a81986c9e2f28f15bbb45ad18934bca56a9d08aae6c49ec3895b38487", + "6c051bf7625f682ba3d2ea80b46a38ca2cbcd20f5d89ae3433602d3e7ef0403a", ] devlib = [ @@ -62,6 +64,8 @@ devlib = [ "f4f7281f2cea6d268fcc3662b37410957d4f0bc23e0df9f60b12eb0fcdf9e26e", "5ab95aeb9c8bed0514f96f7847e21e165ed901ed826cdc9382c14d199cbadbd3", "3b5f6dd85f0e3371f6078da7b59bf77d5b210e30f1cc66ef1e2de6bbcb775833", + "efb5dcdca9b3a9fbe408d494fb4a23e0b78417eb5fa8eebd4a5d226088f28921", + "f0dfab272ff936225bfa1e9dabeb3c5d12ce08b812bf53ffbddd2ddfac49761c", ] llvm = [ @@ -87,6 +91,8 @@ llvm = [ "a844d3cc01613f6284a75d44db67c495ac1e9b600eacbb1eb13d2649f5d5404d", "5dc6c99f612b69ff73145bee17524e3712990100e16445b71634106acf7927cf", "7d7181f20f89cb0715191aa32914186c67a34258c13457055570d47e15296553", + "e922bd492b54d99e56ed88c81e2009ed6472059a180b10cc56ce1f9bd2d7b6ed", + "045e43c0c4a3f4f2f1db9fb603a4f1ea3d56e128147e19ba17909eb57d7f08e5", ] flang = [ @@ -112,6 +118,8 @@ flang = [ "b283d76244d19ab16c9d087ee7de0d340036e9c842007aa9d288aa4e6bf3749f", "a18522588686672150c7862f2b23048a429baa4a66010c4196e969cc77bd152c", "7c3b4eb3e95b9e2f91234f202a76034628d230a92e57b7c5ee9dcca1097bec46", + "fcefebddca0b373da81ff84f0f5469a1ef77a05430a5195d0f2e6399d3af31c3", + "5ebcbca2e03bd0686e677f44ea551e97bd9395c6b119f832fa784818733aa652", ] extras = [ @@ -137,6 +145,8 @@ extras = [ "d393f27a85c9229433b50daee8154e11517160beb1049c1de9c55fc31dd11fac", "8f49026a80eb8685cbfb6d3d3b9898dd083df4d71893984ae5330d4804c685fb", "8955aa9d039fd6c1ff2e26d7298f0bf09bbcf03f09c6df92c91a9ab2510df9da", + "017bfed52fbe08185d8dbde79377918454215683562519a9e47acf403d9a1c29", + "437e2017cfe2ab73b15ada0fc1ea88f794f0b108cc5410f457268ae7e4e8985a", ] versions = [ @@ -162,6 +172,8 @@ versions = [ "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ] versions_dict = dict() # type: Dict[str,Dict[str,str]] components = ["aomp", "devlib", "llvm", "flang", "extras"] @@ -183,6 +195,8 @@ class RocmOpenmpExtras(Package): tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath", "estewart08") + version("5.6.1", sha256=versions_dict["5.6.1"]["aomp"]) + version("5.6.0", sha256=versions_dict["5.6.0"]["aomp"]) version("5.5.1", sha256=versions_dict["5.5.1"]["aomp"]) version("5.5.0", sha256=versions_dict["5.5.0"]["aomp"]) version("5.4.3", sha256=versions_dict["5.4.3"]["aomp"]) @@ -237,13 +251,15 @@ class RocmOpenmpExtras(Package): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hsakmt-roct@" + ver, when="@" + ver) depends_on("comgr@" + ver, when="@" + ver) depends_on("hsa-rocr-dev@" + ver, when="@" + ver) depends_on("llvm-amdgpu@{0} ~openmp".format(ver), when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) # tag changed to 'rocm-' in 4.0.0 diff --git a/var/spack/repos/builtin/packages/rocm-smi-lib/package.py b/var/spack/repos/builtin/packages/rocm-smi-lib/package.py index 80ced92254..11ad3aa2ab 100644 --- a/var/spack/repos/builtin/packages/rocm-smi-lib/package.py +++ b/var/spack/repos/builtin/packages/rocm-smi-lib/package.py @@ -25,6 +25,8 @@ class RocmSmiLib(CMakePackage): libraries = ["librocm_smi64"] version("master", branch="master") + version("5.6.1", sha256="9e94f9a941202c3d7ce917fd1cd78c4e0f06f48d6c929f3aa916378ccef1e02c") + version("5.6.0", sha256="88be875948a29454b8aacced8bb8ad967502a7a074ecbc579ed673c1650a2f7e") version("5.5.1", sha256="37f32350bfaf6c697312628696d1b1d5fd9165f183882759bc6cb9a5d65b9430") version("5.5.0", sha256="0703f49b1c2924cc1d3f613258eabdff1925cb5bcf7cf22bb6b955dd065e4ce8") version("5.4.3", sha256="34d550272e420684230ceb7845aefcef79b155e51cf9ec55e31fdba2a4ed177b") @@ -112,14 +114,14 @@ class RocmSmiLib(CMakePackage): depends_on("cmake@3:", type="build") depends_on("python@3:", type=("build", "run"), when="@3.9.0:") - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) patch("disable_pdf_generation_with_doxygen_and_latex.patch", when="@4.5.2:") def cmake_args(self): args = [ self.define_from_variant("BUILD_SHARED_LIBS", "shared"), - self.define("CMAKE_INSTALL_LIBDIR", self.prefix.lib), + self.define("CMAKE_INSTALL_LIBDIR", "lib"), ] return args diff --git a/var/spack/repos/builtin/packages/rocm-tensile/package.py b/var/spack/repos/builtin/packages/rocm-tensile/package.py index 19d3b3d63d..29a6e82e09 100644 --- a/var/spack/repos/builtin/packages/rocm-tensile/package.py +++ b/var/spack/repos/builtin/packages/rocm-tensile/package.py @@ -18,6 +18,8 @@ class RocmTensile(CMakePackage): maintainers("srekolam", "renjithravindrankannath", "haampie") + version("5.6.1", sha256="3e78c933563fade8781a1dca2079bff135af2f5d2c6eb0147797d2c1f24d006c") + version("5.6.0", sha256="383728ecf49def59ab9a7f8a1d1e2eaf8b528e36b461e27030a2aab1a1ed80cb") version("5.5.1", sha256="b65cb7335abe51ba33be9d46a5ede992b4e5932fa33797397899a6bf33a770e9") version("5.5.0", sha256="70fd736d40bb4c3461f07c77ad3ae6c485e3e842671ce9b223d023d836884ae2") version("5.4.3", sha256="a4c5e62edd33ea6b8528eb3f017a14c28eaa67c540f5c9023f6a245340198b0f") @@ -157,6 +159,8 @@ class RocmTensile(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("rocm-cmake@" + ver, type="build", when="@" + ver) depends_on("hip@" + ver, when="@" + ver) @@ -174,6 +178,8 @@ class RocmTensile(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("rocm-openmp-extras@" + ver, when="@" + ver) @@ -201,6 +207,8 @@ class RocmTensile(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("rocm-smi-lib@" + ver, type="build", when="@" + ver) diff --git a/var/spack/repos/builtin/packages/rocm-validation-suite/007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.6.patch b/var/spack/repos/builtin/packages/rocm-validation-suite/007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.6.patch new file mode 100644 index 0000000000..7acd960614 --- /dev/null +++ b/var/spack/repos/builtin/packages/rocm-validation-suite/007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.6.patch @@ -0,0 +1,532 @@ +From 795e7474acf23eb2f7815fd54ffdd3fd41ff8c35 Mon Sep 17 00:00:00 2001 +From: Renjith Ravindran <Renjith.RavindranKannath@amd.com> +Date: Tue, 12 Sep 2023 07:00:31 +0000 +Subject: [PATCH] 5.6 Patch to add rocm-smi library and include path + +--- + CMakeLists.txt | 105 ++++----------------------------- + babel.so/CMakeLists.txt | 16 ++--- + cmake_modules/tests_unit.cmake | 2 +- + edp.so/CMakeLists.txt | 3 +- + gm.so/CMakeLists.txt | 4 +- + gpup.so/CMakeLists.txt | 2 +- + gst.so/CMakeLists.txt | 4 +- + iet.so/CMakeLists.txt | 6 +- + mem.so/CMakeLists.txt | 4 +- + pbqt.so/CMakeLists.txt | 2 +- + pebb.so/CMakeLists.txt | 2 +- + peqt.so/CMakeLists.txt | 4 +- + perf.so/CMakeLists.txt | 4 +- + pesm.so/CMakeLists.txt | 2 +- + rcqt.so/CMakeLists.txt | 2 +- + rvs/CMakeLists.txt | 2 +- + rvs/tests.cmake | 2 +- + rvslib/CMakeLists.txt | 2 +- + smqt.so/CMakeLists.txt | 2 +- + testif.so/CMakeLists.txt | 2 +- + 20 files changed, 45 insertions(+), 127 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index a12eb41..900657a 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -68,13 +68,12 @@ endif(rocblas_FOUND) + # variables since we will pass them as cmake params appropriately, and + # all find_packages relevant to this build will be in ROCM path hence appending it to CMAKE_PREFIX_PATH + set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCM install path") +-set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "CMAKE installation directory") +-set(CMAKE_PACKAGING_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Prefix used in built packages") ++set (CMAKE_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" ) + list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}") +-set(ROCR_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Runtime" FORCE) +-set(ROCR_LIB_DIR "${ROCM_PATH}/lib" CACHE PATH "Contains library files exported by ROC Runtime" FORCE) +-set(HIP_INC_DIR "${ROCM_PATH}" CACHE PATH "Contains header files exported by ROC Runtime") +-set(ROCT_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Trunk" FORCE) ++set(ROCR_INC_DIR "${HSA_PATH}/include" CACHE PATH "Contains header files exported by ROC Runtime") ++set(ROCR_LIB_DIR "${HSA_PATH}/lib" CACHE PATH "Contains library files exported by ROC Runtime") ++set(HIP_INC_DIR "${HIP_PATH}" CACHE PATH "Contains header files exported by ROC Runtime") ++set(ROCT_INC_DIR "${ROCM_PATH}/include" CACHE PATH "Contains header files exported by ROC Trunk") + + + # +@@ -193,8 +192,6 @@ set(RVS_ROCBLAS "0" CACHE STRING "1 = use local rocBLAS") + set(RVS_ROCMSMI "0" CACHE STRING "1 = use local rocm_smi_lib") + + set(RVS_LIB_DIR "${CMAKE_BINARY_DIR}/rvslib" CACHE PATH "Contains RVS library") +-set(YAML_INC_DIR "${CMAKE_BINARY_DIR}/yaml-src/include" CACHE PATH "Contains header files exported by yaml-cpp") +-set(YAML_LIB_DIR "${CMAKE_BINARY_DIR}/yaml-build" CACHE PATH "Contains library files exported by yaml-cpp") + + if (${RVS_OS_TYPE} STREQUAL "centos") + set(ROCT_LIB_DIR "${ROCM_PATH}/lib64" CACHE PATH "Contains library files exported by ROC Trunk") +@@ -238,86 +235,6 @@ if (NOT DEFINED CPACK_GENERATOR ) + endif() + message (STATUS "CPACK_GENERATOR ${CPACK_GENERATOR}" ) + +- +-################################################################################ +-# Download and unpack yaml-cpp at configure time +-configure_file(CMakeYamlDownload.cmake yaml-download/CMakeLists.txt) +-execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . +- RESULT_VARIABLE result +- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/yaml-download ) +-if(result) +- message(FATAL_ERROR "CMake step for yaml-download failed: ${result}") +-endif() +-execute_process(COMMAND ${CMAKE_COMMAND} --build . +- RESULT_VARIABLE result +- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/yaml-download ) +-if(result) +- message(FATAL_ERROR "Build step for yaml-download failed: ${result}") +-endif() +-execute_process(COMMAND ${CMAKE_COMMAND} ${CMAKE_BINARY_DIR}/yaml-src -B${CMAKE_BINARY_DIR}/yaml-build +- RESULT_VARIABLE result +- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/yaml-src ) +-if(result) +- message(FATAL_ERROR "Config step for yaml-src failed: ${result}") +-endif() +- +-add_custom_target(rvs_yaml_target +- DEPENDS ${CMAKE_BINARY_DIR}/yaml-build/libyaml-cpp.a +-) +- +-add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/yaml-build/libyaml-cpp.a +- COMMAND make -C ${CMAKE_BINARY_DIR}/yaml-build +- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/yaml-src +- COMMENT "Generating yaml-cpp targets" +- VERBATIM) +- +-################################################################################ +-## GOOGLE TEST +-if(RVS_BUILD_TESTS) +- # Download and unpack googletest at configure time +- configure_file(CMakeGtestDownload.cmake googletest-download/CMakeLists.txt) +- execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . +- RESULT_VARIABLE result +- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) +- if(result) +- message(FATAL_ERROR "CMake step for googletest failed: ${result}") +- endif() +- execute_process(COMMAND ${CMAKE_COMMAND} --build . +- RESULT_VARIABLE result +- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) +- if(result) +- message(FATAL_ERROR "Build step for googletest failed: ${result}") +- endif() +- execute_process(COMMAND ${CMAKE_COMMAND} ${CMAKE_BINARY_DIR}/googletest-src -B${CMAKE_BINARY_DIR}/googletest-build +- RESULT_VARIABLE result +- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-src ) +- if(result) +- message(FATAL_ERROR "Config step for googletest-src failed: ${result}") +- endif() +- +- add_custom_target(rvs_gtest_target +- DEPENDS ${CMAKE_BINARY_DIR}/googletest-build/lib/libgtest_main.a +- ) +- +- add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/googletest-build/lib/libgtest_main.a +- COMMAND make -C ${CMAKE_BINARY_DIR}/googletest-build +- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-src +- COMMENT "Generating googletest targets" +- VERBATIM) +- +- ## Set default unit test framework include path +- if (NOT DEFINED UT_INC) +- set (UT_INC "${CMAKE_BINARY_DIR}/googletest-src/googletest/include") +- message ("UT_INC ${UT_INC}") +- endif () +- +- ## Set default unit test framework include path +- if (NOT DEFINED UT_LIB) +- set (UT_LIB "${CMAKE_BINARY_DIR}/googletest-build/lib") +- message ("UT_LIB ${UT_LIB}") +- endif() +- +-endif() + ################################################################################ + ## rocBLAS + +@@ -441,8 +358,8 @@ if (RVS_ROCBLAS EQUAL 1) + set(ROCBLAS_INC_DIR "${CMAKE_BINARY_DIR}/rvs_rblas-src/build/release/rocblas-install") + set(ROCBLAS_LIB_DIR "${CMAKE_BINARY_DIR}/rvs_rblas-src/build/release/rocblas-install/lib/") + else() +- set(ROCBLAS_INC_DIR "${ROCM_PATH}/include") +- set(ROCBLAS_LIB_DIR "${ROCM_PATH}/lib") ++ set(ROCBLAS_INC_DIR "${ROCBLAS_DIR}/include") ++ set(ROCBLAS_LIB_DIR "${ROCBLAS_DIR}/lib") + endif() + + if (RVS_ROCMSMI EQUAL 1) +@@ -457,8 +374,8 @@ else() + set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib") + else() + message( STATUS "ROCBLAS REORG Enabled Version: ${RVS_ROCBLAS_VERSION_FLAT}" ) +- set(ROCM_SMI_INC_DIR "${ROCM_PATH}/include") +- set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/lib") ++ set(ROCM_SMI_INC_DIR "${ROCM_SMI_DIR}/include") ++ set(ROCM_SMI_LIB_DIR "${ROCM_SMI_DIR}/lib") + endif() + endif() + set(ROCM_SMI_LIB "rocm_smi64" CACHE STRING "rocm_smi library name") +@@ -502,7 +419,7 @@ if (RVS_BUILD_TESTS) + add_subdirectory(testif.so) + endif() + +-add_dependencies(rvshelper rvs_bin_folder rvs_doc rvs_yaml_target) ++add_dependencies(rvshelper rvs_bin_folder rvs_doc) + + + add_dependencies(pesm rvslib rvslibrt) +@@ -537,7 +454,7 @@ if (RVS_BUILD_TESTS) + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Create the bintest directory" + VERBATIM) +- add_dependencies(rvshelper rvs_bintest_folder rvs_gtest_target) ++ add_dependencies(rvshelper rvs_bintest_folder) + endif() + + add_custom_target(rvs_doc ALL +diff --git a/babel.so/CMakeLists.txt b/babel.so/CMakeLists.txt +index 7290cef..ebd55ad 100644 +--- a/babel.so/CMakeLists.txt ++++ b/babel.so/CMakeLists.txt +@@ -107,13 +107,13 @@ set(HIP_HCC_LIB "amdhip64") + add_compile_options(-DRVS_ROCBLAS_VERSION_FLAT=${RVS_ROCBLAS_VERSION_FLAT}) + + # Determine Roc Runtime header files are accessible +-if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime.h) +- message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR}) ++if(NOT EXISTS ${HIP_PATH}/include/hip/hip_runtime.h) ++ message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH}) + RETURN() + endif() + +-if(NOT EXISTS ${HIP_INC_DIR}/include/hip/hip_runtime_api.h) +- message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_INC_DIR path. Current value is : " ${HIP_INC_DIR}) ++if(NOT EXISTS ${HIP_PATH}/include/hip/hip_runtime_api.h) ++ message("ERROR: ROC Runtime headers can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH}) + RETURN() + endif() + +@@ -133,16 +133,16 @@ if(DEFINED RVS_ROCMSMI) + endif() + + +-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") +- message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR}) ++if(NOT EXISTS "${HIP_PATH}/lib/lib${HIP_HCC_LIB}.so") ++ message("ERROR: ROC Runtime libraries can't be found under specified path. Please set HIP_PATH path. Current value is : " ${HIP_PATH}) + RETURN() + endif() + + ## define include directories +-include_directories(./ ../ ${ROCR_INC_DIR} ${HIP_INC_DIR}) ++include_directories(./ ../ ${HIP_PATH}) + + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${HIP_PATH}/lib/ ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so) + +diff --git a/cmake_modules/tests_unit.cmake b/cmake_modules/tests_unit.cmake +index 586f453..c8b6560 100644 +--- a/cmake_modules/tests_unit.cmake ++++ b/cmake_modules/tests_unit.cmake +@@ -27,7 +27,7 @@ + ## define additional unit testing include directories + include_directories(${UT_INC}) + ## define additional unit testing lib directories +-link_directories(${UT_LIB} ${RVS_LIB_DIR}) ++link_directories(${UT_LIB} ${RVS_LIB_DIR} ${ROCM_SMI_LIB_DIR}) + + file(GLOB TESTSOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} test/test*.cpp ) + #message ( "TESTSOURCES: ${TESTSOURCES}" ) +diff --git a/edp.so/CMakeLists.txt b/edp.so/CMakeLists.txt +index a933061..d117e03 100644 +--- a/edp.so/CMakeLists.txt ++++ b/edp.so/CMakeLists.txt +@@ -129,6 +129,7 @@ endif() + + + if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") ++ message("${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so not found") + message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR}) + RETURN() + endif() +@@ -136,7 +137,7 @@ endif() + ## define include directories + include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR}) ++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpciaccess.so libpci.so libm.so) + +diff --git a/gm.so/CMakeLists.txt b/gm.so/CMakeLists.txt +index afaafcb..7c0cd79 100644 +--- a/gm.so/CMakeLists.txt ++++ b/gm.so/CMakeLists.txt +@@ -122,7 +122,7 @@ include_directories(./ ../ ${ROCM_SMI_INC_DIR}) + # Add directories to look for library files to link + link_directories(${RVS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH}) + ## additional libraries +-set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so) ++set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so librocm_smi64.so) + + ## define source files + set(SOURCES src/rvs_module.cpp src/action.cpp src/worker.cpp) +@@ -133,7 +133,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES}) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${ROCM_SMI_LIB}) ++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS}) + add_dependencies(${RVS_TARGET} rvslibrt rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +diff --git a/gpup.so/CMakeLists.txt b/gpup.so/CMakeLists.txt +index ca1674b..a9e4d16 100644 +--- a/gpup.so/CMakeLists.txt ++++ b/gpup.so/CMakeLists.txt +@@ -111,7 +111,7 @@ endif() + ## define include directories + include_directories(./ ../ include ../include) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS rvslibrt rvslib libpci.so libm.so) + +diff --git a/gst.so/CMakeLists.txt b/gst.so/CMakeLists.txt +index d85eadb..ca7fff4 100644 +--- a/gst.so/CMakeLists.txt ++++ b/gst.so/CMakeLists.txt +@@ -137,7 +137,7 @@ if(DEFINED RVS_ROCMSMI) + endif() + + +-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") ++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so") + message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR}) + RETURN() + endif() +@@ -145,7 +145,7 @@ endif() + ## define include directories + include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib/ ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so) + +diff --git a/iet.so/CMakeLists.txt b/iet.so/CMakeLists.txt +index 3263d12..62f4318 100644 +--- a/iet.so/CMakeLists.txt ++++ b/iet.so/CMakeLists.txt +@@ -140,7 +140,7 @@ if(DEFINED RVS_ROCMSMI) + endif() + endif() + +-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") ++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so") + message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR}) + RETURN() + endif() +@@ -159,7 +159,7 @@ include_directories(./ ../ ${ROCM_SMI_INC_DIR} ${ROCBLAS_INC_DIR} ${ROCR_INC_DIR + # Add directories to look for library files to link + link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH}) + ## additional libraries +-set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so) ++set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so librocm_smi64.so) + + set(SOURCES src/rvs_module.cpp src/action.cpp src/iet_worker.cpp ) + +@@ -168,7 +168,7 @@ add_library( ${RVS_TARGET} SHARED ${SOURCES}) + set_target_properties(${RVS_TARGET} PROPERTIES + SUFFIX .so.${LIB_VERSION_STRING} + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +-target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_HCC_LIB} ${ROCBLAS_LIB} ${ROCM_SMI_LIB}) ++target_link_libraries(${RVS_TARGET} ${PROJECT_LINK_LIBS} ${HIP_INC_DIR}/lib/ ${HIP_HCC_LIB} ${ROCBLAS_LIB} ${ROCM_SMI_LIB_DIR}) + add_dependencies(${RVS_TARGET} rvslibrt rvslib) + + add_custom_command(TARGET ${RVS_TARGET} POST_BUILD +diff --git a/mem.so/CMakeLists.txt b/mem.so/CMakeLists.txt +index 5a0f401..3fc4f51 100644 +--- a/mem.so/CMakeLists.txt ++++ b/mem.so/CMakeLists.txt +@@ -134,7 +134,7 @@ if(DEFINED RVS_ROCMSMI) + endif() + + +-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") ++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so") + message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR}) + RETURN() + endif() +@@ -143,7 +143,7 @@ endif() + include_directories(./ ../ ${ROCR_INC_DIR} ${HIP_INC_DIR}) + + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so) + +diff --git a/pbqt.so/CMakeLists.txt b/pbqt.so/CMakeLists.txt +index d75211d..80abe22 100644 +--- a/pbqt.so/CMakeLists.txt ++++ b/pbqt.so/CMakeLists.txt +@@ -138,7 +138,7 @@ endif() + ## define include directories + include_directories(./ ../ pci ${ROCR_INC_DIR}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HSAKMT_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so) + +diff --git a/pebb.so/CMakeLists.txt b/pebb.so/CMakeLists.txt +index 7ba031c..e64be8e 100644 +--- a/pebb.so/CMakeLists.txt ++++ b/pebb.so/CMakeLists.txt +@@ -139,7 +139,7 @@ endif() + ## define include directories + include_directories(./ ../ pci ${ROCR_INC_DIR}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HSAKMT_LIB_DIR} ${ROCT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR} ) + ## additional libraries + set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so) + +diff --git a/peqt.so/CMakeLists.txt b/peqt.so/CMakeLists.txt +index 2248d91..7f5912d 100644 +--- a/peqt.so/CMakeLists.txt ++++ b/peqt.so/CMakeLists.txt +@@ -107,9 +107,9 @@ else() + endif() + + ## define include directories +-include_directories(./ ../) ++include_directories(./ ../ ${HSA_PATH}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${HSA_PATH}/lib/ ${HSAKMT_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS rvslibrt rvslib libpci.so libm.so) + +diff --git a/perf.so/CMakeLists.txt b/perf.so/CMakeLists.txt +index b319396..b9abe15 100644 +--- a/perf.so/CMakeLists.txt ++++ b/perf.so/CMakeLists.txt +@@ -137,7 +137,7 @@ if(DEFINED RVS_ROCMSMI) + endif() + + +-if(NOT EXISTS "${ROCR_LIB_DIR}/lib${HIP_HCC_LIB}.so") ++if(NOT EXISTS "${HIP_INC_DIR}/lib/lib${HIP_HCC_LIB}.so") + message("ERROR: ROC Runtime libraries can't be found under specified path. Please set ROCR_LIB_DIR path. Current value is : " ${ROCR_LIB_DIR}) + RETURN() + endif() +@@ -145,7 +145,7 @@ endif() + ## define include directories + include_directories(./ ../ ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR}) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${HIP_INC_DIR}/lib ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS rvslibrt rvslib libpthread.so libpci.so libm.so) + +diff --git a/pesm.so/CMakeLists.txt b/pesm.so/CMakeLists.txt +index ff60729..e7a2402 100644 +--- a/pesm.so/CMakeLists.txt ++++ b/pesm.so/CMakeLists.txt +@@ -109,7 +109,7 @@ endif() + ## define include directories + include_directories(./ ../ pci) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so) + +diff --git a/rcqt.so/CMakeLists.txt b/rcqt.so/CMakeLists.txt +index 32e1004..ac826ea 100644 +--- a/rcqt.so/CMakeLists.txt ++++ b/rcqt.so/CMakeLists.txt +@@ -110,7 +110,7 @@ endif() + ## define include directories + include_directories(./ ../) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ASAN_LIB_PATH} ${HSAKMT_LIB_DIR} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS rvslibrt rvslib) + +diff --git a/rvs/CMakeLists.txt b/rvs/CMakeLists.txt +index b350429..c855a32 100644 +--- a/rvs/CMakeLists.txt ++++ b/rvs/CMakeLists.txt +@@ -115,7 +115,7 @@ endif() + ## define include directories + include_directories(./ ../ ${YAML_INC_DIR} ${YAML_LIB_DIR}/include) + ## define lib directories +-link_directories(${CMAKE_CURRENT_BINARY_DIR} ${RVS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${CMAKE_CURRENT_BINARY_DIR} ${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS libdl.so "${YAML_LIB_DIR}/libyaml-cpp.a" libpthread.so) + +diff --git a/rvs/tests.cmake b/rvs/tests.cmake +index 32301c8..a058749 100644 +--- a/rvs/tests.cmake ++++ b/rvs/tests.cmake +@@ -179,7 +179,7 @@ add_test(NAME unit.ttf.rvs.config.noconfig + ## define include directories + include_directories(${UT_INC}) + ## define lib directories +-link_directories(${UT_LIB}) ++link_directories(${UT_LIB} ${ROCM_SMI_LIB_DIR}) + ## additional libraries for unit tests + set (PROJECT_TEST_LINK_LIBS ${PROJECT_LINK_LIBS} libpci.so) + +diff --git a/rvslib/CMakeLists.txt b/rvslib/CMakeLists.txt +index 31e6143..4ffed0f 100644 +--- a/rvslib/CMakeLists.txt ++++ b/rvslib/CMakeLists.txt +@@ -115,7 +115,7 @@ endif() + + ## define include directories + include_directories(./ ../ +- ${ROCM_SMI_INC_DIR} ${ROCR_INC_DIR} ${ROCBLAS_INC_DIR} ${HIP_INC_DIR} ++ ${ROCM_SMI_INC_DIR} ${HIP_PATH} ${ROCBLAS_INC_DIR} + ) + link_directories(${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + +diff --git a/smqt.so/CMakeLists.txt b/smqt.so/CMakeLists.txt +index e6b8ec4..722f329 100644 +--- a/smqt.so/CMakeLists.txt ++++ b/smqt.so/CMakeLists.txt +@@ -108,7 +108,7 @@ endif() + ## define include directories + include_directories(./ ../ pci) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS rvslibrt rvslib libpci.so libm.so) + +diff --git a/testif.so/CMakeLists.txt b/testif.so/CMakeLists.txt +index ed7d3d3..f09951e 100644 +--- a/testif.so/CMakeLists.txt ++++ b/testif.so/CMakeLists.txt +@@ -110,7 +110,7 @@ endif() + ## define include directories + include_directories(./ ../ pci) + # Add directories to look for library files to link +-link_directories(${RVS_LIB_DIR} ${ROCR_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH}) ++link_directories(${RVS_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ASAN_LIB_PATH} ${ROCM_SMI_LIB_DIR}) + ## additional libraries + set (PROJECT_LINK_LIBS libpthread.so libpci.so libm.so) + +-- +2.39.3 + diff --git a/var/spack/repos/builtin/packages/rocm-validation-suite/package.py b/var/spack/repos/builtin/packages/rocm-validation-suite/package.py index c3f5c88c9a..dfefd8ef75 100644 --- a/var/spack/repos/builtin/packages/rocm-validation-suite/package.py +++ b/var/spack/repos/builtin/packages/rocm-validation-suite/package.py @@ -21,6 +21,8 @@ class RocmValidationSuite(CMakePackage): maintainers("srekolam", "renjithravindrankannath") + version("5.6.1", sha256="d5e4100e2d07311dfa101563c15d026a8130442cdee8af9ef861832cd7866c0d") + version("5.6.0", sha256="54cc5167055870570c97ee7114f48d24d5415f984e0c9d7b58b83467e0cf18fb") version("5.5.1", sha256="0fbfaa9f68642b590ef04f9778013925bbf3f17bdcd35d4c85a8ffd091169a6e") version("5.5.0", sha256="296add772171db67ab8838d2db1ea56df21e895c0348c038768e40146e4fe86a") version("5.4.3", sha256="1f0888e559104a4b8c2f5322f7463e425f2baaf12aeb1a8982a5974516e7b667") @@ -111,7 +113,11 @@ class RocmValidationSuite(CMakePackage): patch("006-library-path.patch", when="@4.5.0:5.2") patch( "007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.3.patch", - when="@5.3.0:", + when="@5.3.0:5.5", + ) + patch( + "007-cleanup-path-reference-donot-download-googletest-yaml-library-path_5.6.patch", + when="@5.6:", ) depends_on("cmake@3.5:", type="build") @@ -150,6 +156,8 @@ class RocmValidationSuite(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("rocminfo@" + ver, when="@" + ver) diff --git a/var/spack/repos/builtin/packages/rocminfo/package.py b/var/spack/repos/builtin/packages/rocminfo/package.py index 1d45ed1f86..92fcd8c826 100644 --- a/var/spack/repos/builtin/packages/rocminfo/package.py +++ b/var/spack/repos/builtin/packages/rocminfo/package.py @@ -18,6 +18,8 @@ class Rocminfo(CMakePackage): maintainers("srekolam", "renjithravindrankannath", "haampie") version("master", branch="master") + version("5.6.1", sha256="780b186ac7410a503eca1060f4bbc35db1b7b4d1d714d15c7534cd26d8af7b54") + version("5.6.0", sha256="87d98a736e4f7510d1475d35717842068d826096a0af7c15a395bcf9d36d7fa0") version("5.5.1", sha256="bcab27bb3595d5a4c981e2416458d169e85c27e603c22e743d9240473bfbe98a") version("5.5.0", sha256="b6107d362b70e20a10911741eb44247139b4eb43489f7fa648daff880b6de37f") version("5.4.3", sha256="72159eed31f8deee0df9228b9e306a18fe9efdd4d6c0eead871cad4617874170") @@ -128,12 +130,14 @@ class Rocminfo(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", "master", ]: depends_on("hsakmt-roct@" + ver, when="@" + ver) depends_on("hsa-rocr-dev@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) def cmake_args(self): diff --git a/var/spack/repos/builtin/packages/rocprim/package.py b/var/spack/repos/builtin/packages/rocprim/package.py index daa6154482..5394f73958 100644 --- a/var/spack/repos/builtin/packages/rocprim/package.py +++ b/var/spack/repos/builtin/packages/rocprim/package.py @@ -16,6 +16,8 @@ class Rocprim(CMakePackage): maintainers("cgmb", "srekolam", "renjithravindrankannath") + version("5.6.1", sha256="e9ec1b0039c07cf3096653a04224fe5fe755afc6ba000f6838b3a8bc84df27de") + version("5.6.0", sha256="360d6ece3c4a3c289dd88043432026fb989e982ae4d05230d8cdc858bcd50466") version("5.5.1", sha256="63cdc682afb39efd18f097faf695ce64c851c4a550a8ad96fa89d694451b6a42") version("5.5.0", sha256="968d9059f93d3f0f8a602f7b989e54e36cff2f9136486b6869e4534a5bf8c7d9") version("5.4.3", sha256="7be6314a46195912d3203e7e59cb8880a46ed7c1fd221e92fadedd20532e0e48") @@ -138,6 +140,8 @@ class Rocprim(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("comgr@" + ver, when="@" + ver) diff --git a/var/spack/repos/builtin/packages/rocrand/package.py b/var/spack/repos/builtin/packages/rocrand/package.py index 893e3e4851..eb6496d338 100644 --- a/var/spack/repos/builtin/packages/rocrand/package.py +++ b/var/spack/repos/builtin/packages/rocrand/package.py @@ -25,6 +25,8 @@ class Rocrand(CMakePackage): version("develop", branch="develop") version("master", branch="master") + version("5.6.1", sha256="6bf71e687ffa0fcc1b00e3567dd43da4147a82390f1b2db5e6f1f594dee6066d") + version("5.6.0", sha256="cc894d2f1af55e16b62c179062063946609c656043556189c656a115fd7d6f5f") version("5.5.1", sha256="e8bed3741b19e296bd698fc55b43686206f42f4deea6ace71513e0c48258cc6e") version("5.5.0", sha256="0481e7ef74c181026487a532d1c17e62dd468e508106edde0279ca1adeee6f9a") version("5.4.3", sha256="463aa760e9f74e45b326765040bb8a8a4fa27aaeaa5e5df16f8289125f88a619") @@ -193,6 +195,8 @@ class Rocrand(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) diff --git a/var/spack/repos/builtin/packages/rocsolver/package.py b/var/spack/repos/builtin/packages/rocsolver/package.py index f8c081299f..3b1cfcb511 100644 --- a/var/spack/repos/builtin/packages/rocsolver/package.py +++ b/var/spack/repos/builtin/packages/rocsolver/package.py @@ -39,6 +39,8 @@ class Rocsolver(CMakePackage): version("develop", branch="develop") version("master", branch="master") + version("5.6.1", sha256="6a8f366218aee599a0e56755030f94ee690b34f30e6d602748632226c5dc21bb") + version("5.6.0", sha256="54baa7f35f3c53da9005054e6f7aeecece5526dafcb277af32cbcb3996b0cbbc") version("5.5.1", sha256="8bf843e42d2e89203ea5fdb6e6082cea90da8d02920ab4c09bcc2b6f69909760") version("5.5.0", sha256="6775aa5b96731208c12c5b450cf218d4c262a80b7ea20c2c3034c448bb2ca4d2") version("5.4.3", sha256="5308b68ea72f465239a4bb2ed1a0507f0df7c98d3df3fd1f392e6d9ed7975232") @@ -132,7 +134,7 @@ class Rocsolver(CMakePackage): # Backport https://github.com/ROCmSoftwarePlatform/rocSOLVER/commit/2bbfb8976f6e4d667499c77e41a6433850063e88 patch("fmt-8.1-compatibility.patch", when="@4.5.0:5.1.3") # Maximize compatibility with other libraries that are using fmt. - patch("fmt-9-compatibility.patch", when="@5.2.0:") + patch("fmt-9-compatibility.patch", when="@5.2.0:5.5") def check(self): exe = join_path(self.build_directory, "clients", "staging", "rocsolver-test") @@ -173,9 +175,13 @@ class Rocsolver(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("rocblas@" + ver, when="@" + ver) + for ver in ["5.6.0", "5.6.1"]: + depends_on("rocsparse@5.2:", when="@5.6:") for tgt in itertools.chain(["auto"], amdgpu_targets): depends_on("rocblas amdgpu_target={0}".format(tgt), when="amdgpu_target={0}".format(tgt)) diff --git a/var/spack/repos/builtin/packages/rocsparse/package.py b/var/spack/repos/builtin/packages/rocsparse/package.py index 8f3693b469..4fb8fb1646 100644 --- a/var/spack/repos/builtin/packages/rocsparse/package.py +++ b/var/spack/repos/builtin/packages/rocsparse/package.py @@ -32,6 +32,9 @@ class Rocsparse(CMakePackage): sticky=True, ) variant("test", default=False, description="Build rocsparse-test client") + + version("5.6.1", sha256="6a50a64354507f1374e1a86aa7f5c07d1aaa96ac193ac292c279153087bb5d54") + version("5.6.0", sha256="5797db3deb4a532e691447e3e8c923b93bd9fe4c468f3a88f00cecd80bebcae4") version("5.5.1", sha256="1dd2d18898dfebdf898e8fe7d1c1198e8f8451fd70ff12a1990ec1419cf359e1") version("5.5.0", sha256="cbee79b637691bc710c1c83fbaa91db7498d38d4df873be23e28ed5617acde72") version("5.4.3", sha256="9fb633f235eb0567cc54fae6bdc779f16bf0bb4e6f5bdddb40312c6d11ca8478") @@ -142,6 +145,8 @@ class Rocsparse(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("rocprim@" + ver, when="@" + ver) diff --git a/var/spack/repos/builtin/packages/rocthrust/package.py b/var/spack/repos/builtin/packages/rocthrust/package.py index 62bf4fb3da..196bd7eaa1 100644 --- a/var/spack/repos/builtin/packages/rocthrust/package.py +++ b/var/spack/repos/builtin/packages/rocthrust/package.py @@ -19,6 +19,8 @@ class Rocthrust(CMakePackage): maintainers("cgmb", "srekolam", "renjithravindrankannath") + version("5.6.1", sha256="63df61d5ab46d4cfda6066d748274bacecc77151692e372e6f7df5e91852bdc2") + version("5.6.0", sha256="e52a27bcb4add38a5f0f3a5c7e409c230bf4ba9afae19bd2e06c2be00d39db59") version("5.5.1", sha256="66f126e5ea46ca761533411f81e83402773f95d3184cb7645ca73df227413023") version("5.5.0", sha256="c031f71cd4b6eaf98664fd2ad50fc18f7ccbfa67be415dca425169d2d1c81e9e") version("5.4.3", sha256="d133e14ea6d27d358d1bd4d31b79fb1562d1aea7c400e5a2d28d0f159cb6c8a8") @@ -142,6 +144,8 @@ class Rocthrust(CMakePackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hip@" + ver, when="@" + ver) depends_on("rocprim@" + ver, when="@" + ver) diff --git a/var/spack/repos/builtin/packages/roctracer-dev-api/package.py b/var/spack/repos/builtin/packages/roctracer-dev-api/package.py index 505dc254c1..c7a80816c4 100644 --- a/var/spack/repos/builtin/packages/roctracer-dev-api/package.py +++ b/var/spack/repos/builtin/packages/roctracer-dev-api/package.py @@ -17,6 +17,8 @@ class RoctracerDevApi(Package): tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") + version("5.6.1", sha256="007c498be25b067ad9a7631a2b0892f9129150ee9714e471a921225875d45e69") + version("5.6.0", sha256="cbcfe4fa2e8b627006b320a93992fb3078696d8ef2ef049b4b880b6b7d57e13e") version("5.5.1", sha256="3afc31ebfdb14b0365185ca6b9326a83b1503a94a51d910f5ce7ced192d8c133") version("5.5.0", sha256="fe9ad95628fa96639db6fc33f78d334c814c7161b4a754598f5a4a7852625777") version("5.4.3", sha256="6b5111be5efd4d7fd6935ca99b06fab19b43d97a58d26fc1fe6e783c4de9a926") diff --git a/var/spack/repos/builtin/packages/roctracer-dev/package.py b/var/spack/repos/builtin/packages/roctracer-dev/package.py index b50574667e..328aa0844b 100644 --- a/var/spack/repos/builtin/packages/roctracer-dev/package.py +++ b/var/spack/repos/builtin/packages/roctracer-dev/package.py @@ -20,6 +20,8 @@ class RoctracerDev(CMakePackage, ROCmPackage): maintainers("srekolam", "renjithravindrankannath") libraries = ["libroctracer64"] + version("5.6.1", sha256="007c498be25b067ad9a7631a2b0892f9129150ee9714e471a921225875d45e69") + version("5.6.0", sha256="cbcfe4fa2e8b627006b320a93992fb3078696d8ef2ef049b4b880b6b7d57e13e") version("5.5.1", sha256="3afc31ebfdb14b0365185ca6b9326a83b1503a94a51d910f5ce7ced192d8c133") version("5.5.0", sha256="fe9ad95628fa96639db6fc33f78d334c814c7161b4a754598f5a4a7852625777") version("5.4.3", sha256="6b5111be5efd4d7fd6935ca99b06fab19b43d97a58d26fc1fe6e783c4de9a926") @@ -72,6 +74,8 @@ class RoctracerDev(CMakePackage, ROCmPackage): "5.4.3", "5.5.0", "5.5.1", + "5.6.0", + "5.6.1", ]: depends_on("hsakmt-roct@" + ver, when="@" + ver) depends_on("hsa-rocr-dev@" + ver, when="@" + ver) @@ -94,7 +98,7 @@ class RoctracerDev(CMakePackage, ROCmPackage): ]: depends_on("rocprofiler-dev@" + ver, when="@" + ver) - for ver in ["5.5.0", "5.5.1"]: + for ver in ["5.5.0", "5.5.1", "5.6.0", "5.6.1"]: depends_on("rocm-core@" + ver, when="@" + ver) patch("0001-include-rocprofiler-dev-path.patch", when="@5.3:5.4") diff --git a/var/spack/repos/builtin/packages/rocwmma/0001-add-rocm-smi-lib-path-for-building-tests.patch b/var/spack/repos/builtin/packages/rocwmma/0001-add-rocm-smi-lib-path-for-building-tests.patch new file mode 100644 index 0000000000..cfa3cb4180 --- /dev/null +++ b/var/spack/repos/builtin/packages/rocwmma/0001-add-rocm-smi-lib-path-for-building-tests.patch @@ -0,0 +1,31 @@ +From 099ac638f41d9224f649fe23a64783bb408a2b09 Mon Sep 17 00:00:00 2001 +From: Sreenivasa Murthy Kolam <sreenivasamurthy.kolam@amd.com> +Date: Wed, 30 Aug 2023 09:41:15 +0000 +Subject: [PATCH] add rocm-smi-lib path for building tests + +--- + test/CMakeLists.txt | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt +index 85f98d0..269f517 100644 +--- a/test/CMakeLists.txt ++++ b/test/CMakeLists.txt +@@ -69,11 +69,12 @@ function(add_rocwmma_test TEST_TARGET TEST_SOURCE) + + list(APPEND TEST_SOURCE ${ARGN}) + add_executable(${TEST_TARGET} ${TEST_SOURCE}) +- target_link_libraries(${TEST_TARGET} rocwmma gtest) ++ target_link_libraries(${TEST_TARGET} rocwmma gtest ${ROCM_SMI_DIR}/lib) + target_link_libraries(${TEST_TARGET} OpenMP::OpenMP_CXX "-L${HIP_CLANG_ROOT}/lib" "-Wl,-rpath=${HIP_CLANG_ROOT}/lib") + target_include_directories(${TEST_TARGET} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} +- ${ROCWMMA_TEST_INCLUDE_DIRS}) ++ ${ROCWMMA_TEST_INCLUDE_DIRS} ++ ${ROCM_SMI_DIR}/include) + + # Add support to include extended test coverage + if(ROCWMMA_BUILD_EXTENDED_TESTS) +-- +2.39.3 + diff --git a/var/spack/repos/builtin/packages/rocwmma/package.py b/var/spack/repos/builtin/packages/rocwmma/package.py index 774bfe6b26..96978f7862 100644 --- a/var/spack/repos/builtin/packages/rocwmma/package.py +++ b/var/spack/repos/builtin/packages/rocwmma/package.py @@ -25,7 +25,8 @@ class Rocwmma(CMakePackage): tags = ["rocm"] maintainers("srekolam", "renjithravindrankannath") - + version("5.6.1", sha256="41a5159ee1ad5fc411fe6220f37bd754e26d3883c24c0f2378f50ef628bc1b8f") + version("5.6.0", sha256="78b6ab10fce71d10a9d762b2eaab3390eb13b05c764f47a3b0a303ec3d37acf8") version("5.5.1", sha256="ada30d5e52df5da0d3f4e212a25efb492dbedc129628f4db4ef4ed77667da228") version("5.5.0", sha256="b9e1938cba111eeea295414c42de34d54a878f0d41a26e433809d60c12d31dbf") version("5.4.3", sha256="0968366c83b78a9d058d483be536aba03e79b300ccb6890d3da43298be54c288") @@ -59,16 +60,33 @@ class Rocwmma(CMakePackage): depends_on("googletest@1.10.0:", type="test") - for ver in ["5.2.0", "5.2.1", "5.2.3", "5.3.0", "5.3.3", "5.4.0", "5.4.3", "5.5.0", "5.5.1"]: + for ver in [ + "5.2.0", + "5.2.1", + "5.2.3", + "5.3.0", + "5.3.3", + "5.4.0", + "5.4.3", + "5.5.0", + "5.5.1", + "5.6.0", + "5.6.1", + ]: depends_on("rocm-cmake@%s:" % ver, type="build", when="@" + ver) depends_on("llvm-amdgpu@" + ver, type="build", when="@" + ver) depends_on("hip@" + ver, when="@" + ver) depends_on("rocblas@" + ver, type="build", when="@" + ver) depends_on("rocm-openmp-extras@" + ver, type="build", when="@" + ver) + for ver in ["5.6.0", "5.6.1"]: + depends_on("rocm-smi-lib@" + ver, when="@" + ver) + for tgt in itertools.chain(["auto"], amdgpu_targets): depends_on("rocblas amdgpu_target={0}".format(tgt), when="amdgpu_target={0}".format(tgt)) + patch("0001-add-rocm-smi-lib-path-for-building-tests.patch", when="@5.6:") + def setup_build_environment(self, env): env.set("CXX", self.spec["hip"].hipcc) @@ -93,5 +111,7 @@ class Rocwmma(CMakePackage): tgt = self.spec.variants["amdgpu_target"] if "auto" not in tgt: args.append(self.define_from_variant("AMDGPU_TARGETS", "amdgpu_target")) + if self.spec.satisfies("@5.6.0:"): + args.append(self.define("ROCM_SMI_DIR", self.spec["rocm-smi-lib"].prefix)) return args |