summaryrefslogtreecommitdiff
path: root/var
diff options
context:
space:
mode:
authorBrian Van Essen <vanessen1@llnl.gov>2022-10-17 13:07:27 -0700
committerGitHub <noreply@github.com>2022-10-17 13:07:27 -0700
commit47bfc60845b71830ee54a04c597419c7eedd2a42 (patch)
treefce39cdd49957ef268845816f65c9ebece2449ac /var
parent9b87b4c8cd5ea1aeeedf673a352cf28838f01412 (diff)
downloadspack-47bfc60845b71830ee54a04c597419c7eedd2a42.tar.gz
spack-47bfc60845b71830ee54a04c597419c7eedd2a42.tar.bz2
spack-47bfc60845b71830ee54a04c597419c7eedd2a42.tar.xz
spack-47bfc60845b71830ee54a04c597419c7eedd2a42.zip
Bugfix HIP and aluminum rocm build (#33344)
* Fixed two bugs in the HIP package recipe. The first is that HIP_PATH was being set to the actual spec, and not the spec prefix. The second bug is that HIP is expected to be in /opt/rocm-x.y.z/hip, but its libraries can exist at both /opt/rocm-x.y.z/hip/lib and /opt/rocm-x.y.z/lib. This means that the external detection logic may find it in either location, and it turns out that some modules only expose one of those two locations. Logic is added to ensure that the internal HIP_PATH and associated ROCM_PATH are correctly set in both cases. * Added support for Aluminum to use the libfabric plugin with either RCCL or NCCL.
Diffstat (limited to 'var')
-rw-r--r--var/spack/repos/builtin/packages/aluminum/package.py17
-rw-r--r--var/spack/repos/builtin/packages/hip/package.py21
2 files changed, 31 insertions, 7 deletions
diff --git a/var/spack/repos/builtin/packages/aluminum/package.py b/var/spack/repos/builtin/packages/aluminum/package.py
index 235aa0b996..3d903dfaf2 100644
--- a/var/spack/repos/builtin/packages/aluminum/package.py
+++ b/var/spack/repos/builtin/packages/aluminum/package.py
@@ -52,9 +52,16 @@ class Aluminum(CMakePackage, CudaPackage, ROCmPackage):
)
variant("rccl", default=False, description="Builds with support for RCCL communication lib")
variant(
- "ofi_rccl_plugin",
- default=False,
- description="Builds with support for OFI libfabric enhanced RCCL communication lib",
+ "ofi_libfabric_plugin",
+ default=True,
+ when="+rccl platform=cray",
+ description="Builds with support for OFI libfabric enhanced RCCL/NCCL communication lib",
+ )
+ variant(
+ "ofi_libfabric_plugin",
+ default=True,
+ when="+nccl platform=cray",
+ description="Builds with support for OFI libfabric enhanced RCCL/NCCL communication lib",
)
depends_on("cmake@3.21.0:", type="build", when="@1.0.1:")
@@ -68,12 +75,12 @@ class Aluminum(CMakePackage, CudaPackage, ROCmPackage):
depends_on("hipcub", when="@:0.1,0.6.0: +rocm")
depends_on("rccl", when="+rccl")
- depends_on("aws-ofi-rccl", when="+ofi_rccl_plugin platform=cray")
+ depends_on("aws-ofi-rccl", when="+rccl +ofi_libfabric_plugin platform=cray")
+ depends_on("aws-ofi-nccl", when="+nccl +ofi_libfabric_plugin platform=cray")
conflicts("~cuda", when="+cuda_rma", msg="CUDA RMA support requires CUDA")
conflicts("+cuda", when="+rocm", msg="CUDA and ROCm support are mutually exclusive")
conflicts("+nccl", when="+rccl", msg="NCCL and RCCL support are mutually exclusive")
- conflicts("~rccl", when="+ofi_rccl_plugin", msg="libfabric enhancements require RCCL support")
generator = "Ninja"
depends_on("ninja", type="build")
diff --git a/var/spack/repos/builtin/packages/hip/package.py b/var/spack/repos/builtin/packages/hip/package.py
index 41c2d2d29f..9dce218904 100644
--- a/var/spack/repos/builtin/packages/hip/package.py
+++ b/var/spack/repos/builtin/packages/hip/package.py
@@ -288,12 +288,22 @@ class Hip(CMakePackage):
if self.spec.external:
# For external packages we only assume the `hip` prefix is known,
# because spack does not set prefixes of dependencies of externals.
+ hip_libs_at_top = os.path.basename(self.spec.prefix) != "hip"
# We assume self.spec.prefix is /opt/rocm-x.y.z for rocm-5.2.0 and newer
# and /opt/rocm-x.y.z/hip for older versions
if self.spec.satisfies("@5.2.0:"):
rocm_prefix = Prefix(self.spec.prefix)
else:
- rocm_prefix = Prefix(os.path.dirname(self.spec.prefix))
+ # We assume self.spec.prefix is /opt/rocm-x.y.z/hip and rocm has a
+ # default installation with everything installed under
+ # /opt/rocm-x.y.z
+ # Note that since the key hip library can also exist at the top of the
+ # /opt/rocm-x.y.z/lib tree, it is possible that the package is detected
+ # without the correct prefix. Work around it.
+ if hip_libs_at_top:
+ rocm_prefix = Prefix(self.spec.prefix)
+ else:
+ rocm_prefix = Prefix(os.path.dirname(self.spec.prefix))
if not os.path.isdir(rocm_prefix):
msg = "Could not determine prefix for other rocm components\n"
@@ -302,7 +312,13 @@ class Hip(CMakePackage):
msg += "a workaround."
raise RuntimeError(msg)
+ if hip_libs_at_top:
+ hip_path = "{0}/hip".format(self.spec.prefix)
+ else:
+ hip_path = self.spec.prefix
+
paths = {
+ "hip-path": hip_path,
"rocm-path": rocm_prefix,
"llvm-amdgpu": rocm_prefix.llvm,
"hsa-rocr-dev": rocm_prefix.hsa,
@@ -311,6 +327,7 @@ class Hip(CMakePackage):
}
else:
paths = {
+ "hip-path": self.spec.prefix,
"rocm-path": self.spec.prefix,
"llvm-amdgpu": self.spec["llvm-amdgpu"].prefix,
"hsa-rocr-dev": self.spec["hsa-rocr-dev"].prefix,
@@ -374,7 +391,7 @@ class Hip(CMakePackage):
env.set("HIP_DEVICE_LIB_PATH", paths["bitcode"])
# Just the prefix of hip (used in hipcc)
- env.set("HIP_PATH", paths["rocm-path"])
+ env.set("HIP_PATH", paths["hip-path"])
# Used in comgr and seems necessary when using the JIT compiler, e.g.
# hiprtcCreateProgram: