summaryrefslogtreecommitdiff
path: root/var
diff options
context:
space:
mode:
author: Brian Van Essen <vanessen1@llnl.gov> 2022-09-29 11:46:27 -0600
committer: GitHub <noreply@github.com> 2022-09-29 10:46:27 -0700
commit: 400a9f3df7e0f81aa52527718957964603a41d54 (patch)
tree: c2519f628037cf6733dde5107df3e1923cffe065 /var
parent: 699f575976c2467cabbd79596181384d6d887517 (diff)
downloadspack-400a9f3df7e0f81aa52527718957964603a41d54.tar.gz
spack-400a9f3df7e0f81aa52527718957964603a41d54.tar.bz2
spack-400a9f3df7e0f81aa52527718957964603a41d54.tar.xz
spack-400a9f3df7e0f81aa52527718957964603a41d54.zip
Add aws ofi rccl (#32773)
* Added a package for the aws-ofi-rccl plug-in from the ROCm software stack. It allows RCCL to use the libfabric communication library. Added support for using libfabric in Aluminum.
* Updated the run environment so that the plug-in would get loaded.
* Added support for setting up the LD_LIBRARY_PATH for dependent packages.
* Added a package for RCCL tests to assess the impact of the OFI libfabric RCCL plug-in.
Diffstat (limited to 'var')
-rw-r--r-- var/spack/repos/builtin/packages/aluminum/package.py | 12
-rw-r--r-- var/spack/repos/builtin/packages/aws-ofi-rccl/package.py | 64
-rw-r--r-- var/spack/repos/builtin/packages/libfabric/package.py | 1
-rw-r--r-- var/spack/repos/builtin/packages/rccl-tests/package.py | 40
4 files changed, 115 insertions, 2 deletions
diff --git a/var/spack/repos/builtin/packages/aluminum/package.py b/var/spack/repos/builtin/packages/aluminum/package.py
index fbe4d3d37f..235aa0b996 100644
--- a/var/spack/repos/builtin/packages/aluminum/package.py
+++ b/var/spack/repos/builtin/packages/aluminum/package.py
@@ -50,7 +50,12 @@ class Aluminum(CMakePackage, CudaPackage, ROCmPackage):
description="Builds with support for CUDA intra-node "
" Put/Get and IPC RMA functionality",
)
- variant("rccl", default=False, description="Builds with support for NCCL communication lib")
+ variant("rccl", default=False, description="Builds with support for RCCL communication lib")
+ variant(
+ "ofi_rccl_plugin",
+ default=False,
+ description="Builds with support for OFI libfabric enhanced RCCL communication lib",
+ )
depends_on("cmake@3.21.0:", type="build", when="@1.0.1:")
depends_on("cmake@3.17.0:", type="build", when="@:1.0.0")
@@ -62,8 +67,13 @@ class Aluminum(CMakePackage, CudaPackage, ROCmPackage):
depends_on("cub", when="@:0.1,0.6.0: +cuda ^cuda@:10")
depends_on("hipcub", when="@:0.1,0.6.0: +rocm")
+ depends_on("rccl", when="+rccl")
+ depends_on("aws-ofi-rccl", when="+ofi_rccl_plugin platform=cray")
+
conflicts("~cuda", when="+cuda_rma", msg="CUDA RMA support requires CUDA")
conflicts("+cuda", when="+rocm", msg="CUDA and ROCm support are mutually exclusive")
+ conflicts("+nccl", when="+rccl", msg="NCCL and RCCL support are mutually exclusive")
+ conflicts("~rccl", when="+ofi_rccl_plugin", msg="libfabric enhancements require RCCL support")
generator = "Ninja"
depends_on("ninja", type="build")
diff --git a/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py b/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py
new file mode 100644
index 0000000000..b08f384b77
--- /dev/null
+++ b/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py
@@ -0,0 +1,64 @@
+# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+from spack.package import *
+
+
+class AwsOfiRccl(AutotoolsPackage):
+    """AWS OFI RCCL is a plug-in which enables EC2 developers to use
+    libfabric as a network provider while running AMD's RCCL based
+    applications."""
+
+    homepage = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl"
+    git = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git"
+    url = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git"
+    tags = ["rocm"]
+
+    maintainers = ["bvanessen"]
+
+    # `preferred=True` is the keyword the version() directive documents for
+    # marking the version picked by default; `default=` is not one of its
+    # documented options.
+    version("cxi", branch="cxi", preferred=True)
+    version("master", branch="master")
+
+    variant("enable-trace", default=False, description="Enable printing trace messages")
+    variant("disable-tests", default=False, description="Disable build of tests")
+
+    depends_on("libfabric")
+    depends_on("hip")
+    depends_on("rccl")
+    depends_on("mpi")
+    depends_on("autoconf", type="build")
+    depends_on("automake", type="build")
+    depends_on("libtool", type="build")
+
+    def setup_run_environment(self, env):
+        """Append this package's lib directory to LD_LIBRARY_PATH.
+
+        To enable this plug-in to work with RCCL it has to be on the
+        LD_LIBRARY_PATH.
+        """
+        env.append_path("LD_LIBRARY_PATH", self.prefix.lib)
+
+    def setup_dependent_run_environment(self, env, dependent_spec):
+        """Append this package's lib directory to LD_LIBRARY_PATH in the run
+        environment of dependent packages.
+
+        ``dependent_spec`` is accepted for the hook signature but not used.
+        """
+        env.append_path("LD_LIBRARY_PATH", self.prefix.lib)
+
+    def configure_args(self):
+        """Return the list of arguments passed to ``configure``.
+
+        The list always names the libfabric, HIP, RCCL and MPI prefixes;
+        ``--enable-trace`` and ``--disable-tests`` are appended when the
+        variants of the same name are enabled.
+        """
+        spec = self.spec
+        args = [
+            "--with-libfabric={0}".format(spec["libfabric"].prefix),
+            "--with-hip={0}".format(spec["hip"].prefix),
+            "--with-rccl={0}".format(spec["rccl"].prefix),
+            "--with-mpi={0}".format(spec["mpi"].prefix),
+        ]
+
+        if "+enable-trace" in spec:
+            args.append("--enable-trace")
+
+        if "+disable-tests" in spec:
+            args.append("--disable-tests")
+
+        return args
diff --git a/var/spack/repos/builtin/packages/libfabric/package.py b/var/spack/repos/builtin/packages/libfabric/package.py
index 949fdef9d2..aa05b5c074 100644
--- a/var/spack/repos/builtin/packages/libfabric/package.py
+++ b/var/spack/repos/builtin/packages/libfabric/package.py
@@ -13,7 +13,6 @@ class Libfabric(AutotoolsPackage):
homepage = "https://libfabric.org/"
url = "https://github.com/ofiwg/libfabric/releases/download/v1.8.0/libfabric-1.8.0.tar.bz2"
git = "https://github.com/ofiwg/libfabric.git"
-
maintainers = ["rajachan"]
version("main", branch="main")
diff --git a/var/spack/repos/builtin/packages/rccl-tests/package.py b/var/spack/repos/builtin/packages/rccl-tests/package.py
new file mode 100644
index 0000000000..8c8f67cd71
--- /dev/null
+++ b/var/spack/repos/builtin/packages/rccl-tests/package.py
@@ -0,0 +1,40 @@
+# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+from spack.package import *
+
+
+class RcclTests(MakefilePackage):
+    """These tests check both the performance and the correctness of RCCL
+    operations. They can be compiled against RCCL."""
+
+    homepage = "https://github.com/ROCmSoftwarePlatform/rccl-tests"
+    git = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git"
+    url = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git"
+    tags = ["rocm"]
+
+    maintainers = ["bvanessen"]
+
+    # `preferred=True` is the keyword the version() directive documents for
+    # marking the version picked by default; `default=` is not one of its
+    # documented options.
+    version("develop", branch="develop", preferred=True)
+    version("master", branch="master")
+
+    variant("mpi", default=True, description="with MPI support")
+
+    depends_on("hip")
+    depends_on("rccl")
+    depends_on("mpi", when="+mpi")
+
+    # NOTE(review): MakefilePackage is expected to read `build_targets` as an
+    # attribute and unpack it into the make invocation, so the override is
+    # exposed as a property rather than a plain method - confirm against the
+    # build-system base class.
+    @property
+    def build_targets(self):
+        """Return the arguments handed to ``make``.
+
+        Always sets ``HIP_HOME`` and ``RCCL_HOME`` to the dependency
+        prefixes; with ``+mpi`` also sets ``MPI_HOME`` and ``MPI=1``.
+        """
+        spec = self.spec
+        targets = [
+            "HIP_HOME={0}".format(spec["hip"].prefix),
+            "RCCL_HOME={0}".format(spec["rccl"].prefix),
+        ]
+        if "+mpi" in spec:
+            targets.append("MPI_HOME={0}".format(spec["mpi"].prefix))
+            targets.append("MPI=1")
+        return targets
+
+    def install(self, spec, prefix):
+        """Create ``prefix.bin`` and copy the contents of ``./build`` into it."""
+        mkdirp(prefix.bin)
+        install_tree("./build", prefix.bin)