From 400a9f3df7e0f81aa52527718957964603a41d54 Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Thu, 29 Sep 2022 11:46:27 -0600 Subject: Add aws ofi rccl (#32773) * Added a package for the aws-ofi-rccl plug-in from the ROCm software stack. It allows RCCL to use the libfabric communication library. Added support for using libfabric in Aluminum. * Updated the run environment so that the plugin would get loaded. * Added support for setting up the LD_LIBRARY_PATH for dependent packages. * Added package for RCCL tests to assess the impact of OFI libfabric RCCL plug-in. --- .../repos/builtin/packages/aluminum/package.py | 12 +++- .../repos/builtin/packages/aws-ofi-rccl/package.py | 64 ++++++++++++++++++++++ .../repos/builtin/packages/libfabric/package.py | 1 - .../repos/builtin/packages/rccl-tests/package.py | 40 ++++++++++++++ 4 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 var/spack/repos/builtin/packages/aws-ofi-rccl/package.py create mode 100644 var/spack/repos/builtin/packages/rccl-tests/package.py (limited to 'var') diff --git a/var/spack/repos/builtin/packages/aluminum/package.py b/var/spack/repos/builtin/packages/aluminum/package.py index fbe4d3d37f..235aa0b996 100644 --- a/var/spack/repos/builtin/packages/aluminum/package.py +++ b/var/spack/repos/builtin/packages/aluminum/package.py @@ -50,7 +50,12 @@ class Aluminum(CMakePackage, CudaPackage, ROCmPackage): description="Builds with support for CUDA intra-node " " Put/Get and IPC RMA functionality", ) - variant("rccl", default=False, description="Builds with support for NCCL communication lib") + variant("rccl", default=False, description="Builds with support for RCCL communication lib") + variant( + "ofi_rccl_plugin", + default=False, + description="Builds with support for OFI libfabric enhanced RCCL communication lib", + ) depends_on("cmake@3.21.0:", type="build", when="@1.0.1:") depends_on("cmake@3.17.0:", type="build", when="@:1.0.0") @@ -62,8 +67,13 @@ class Aluminum(CMakePackage, 
CudaPackage, ROCmPackage): depends_on("cub", when="@:0.1,0.6.0: +cuda ^cuda@:10") depends_on("hipcub", when="@:0.1,0.6.0: +rocm") + depends_on("rccl", when="+rccl") + depends_on("aws-ofi-rccl", when="+ofi_rccl_plugin platform=cray") + conflicts("~cuda", when="+cuda_rma", msg="CUDA RMA support requires CUDA") conflicts("+cuda", when="+rocm", msg="CUDA and ROCm support are mutually exclusive") + conflicts("+nccl", when="+rccl", msg="NCCL and RCCL support are mutually exclusive") + conflicts("~rccl", when="+ofi_rccl_plugin", msg="libfabric enhancements require RCCL support") generator = "Ninja" depends_on("ninja", type="build") diff --git a/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py b/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py new file mode 100644 index 0000000000..b08f384b77 --- /dev/null +++ b/var/spack/repos/builtin/packages/aws-ofi-rccl/package.py @@ -0,0 +1,64 @@ +# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + +from spack.package import * + + +class AwsOfiRccl(AutotoolsPackage): + """AWS OFI RCCL is a plug-in which enables EC2 developers to use + libfabric as a network provider while running AMD's RCCL based + applications.""" + + homepage = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl" + git = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git" + url = "https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl.git" + tags = ["rocm"] + + maintainers = ["bvanessen"] + + version("cxi", branch="cxi", default=True) + version("master", branch="master") + + variant("enable-trace", default=False, description="Enable printing trace messages") + variant("disable-tests", default=False, description="Disable build of tests") + + depends_on("libfabric") + depends_on("hip") + depends_on("rccl") + depends_on("mpi") + depends_on("autoconf", type="build") + depends_on("automake", type="build") + depends_on("libtool", type="build") + + # To enable this plug-in to work with RCCL add it to the LD_LIBRARY_PATH + def setup_run_environment(self, env): + aws_ofi_rccl_home = self.spec["aws-ofi-rccl"].prefix + env.append_path("LD_LIBRARY_PATH", aws_ofi_rccl_home.lib) + + # To enable this plug-in to work with RCCL add it to the LD_LIBRARY_PATH + def setup_dependent_run_environment(self, env, dependent_spec): + aws_ofi_rccl_home = self.spec["aws-ofi-rccl"].prefix + env.append_path("LD_LIBRARY_PATH", aws_ofi_rccl_home.lib) + + def configure_args(self): + spec = self.spec + args = [] + + args.extend( + [ + "--with-libfabric={0}".format(spec["libfabric"].prefix), + "--with-hip={0}".format(spec["hip"].prefix), + "--with-rccl={0}".format(spec["rccl"].prefix), + "--with-mpi={0}".format(spec["mpi"].prefix), + ] + ) + + if "+enable-trace" in self.spec: + args.append("--enable-trace") + + if "+disable-tests" in self.spec: + args.append("--disable-tests") + + return args diff --git a/var/spack/repos/builtin/packages/libfabric/package.py 
b/var/spack/repos/builtin/packages/libfabric/package.py index 949fdef9d2..aa05b5c074 100644 --- a/var/spack/repos/builtin/packages/libfabric/package.py +++ b/var/spack/repos/builtin/packages/libfabric/package.py @@ -13,7 +13,6 @@ class Libfabric(AutotoolsPackage): homepage = "https://libfabric.org/" url = "https://github.com/ofiwg/libfabric/releases/download/v1.8.0/libfabric-1.8.0.tar.bz2" git = "https://github.com/ofiwg/libfabric.git" - maintainers = ["rajachan"] version("main", branch="main") diff --git a/var/spack/repos/builtin/packages/rccl-tests/package.py b/var/spack/repos/builtin/packages/rccl-tests/package.py new file mode 100644 index 0000000000..8c8f67cd71 --- /dev/null +++ b/var/spack/repos/builtin/packages/rccl-tests/package.py @@ -0,0 +1,40 @@ +# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + +from spack.package import * + + +class RcclTests(MakefilePackage): + """These tests check both the performance and the correctness of RCCL + operations. 
They can be compiled against RCCL.""" + + homepage = "https://github.com/ROCmSoftwarePlatform/rccl-tests" + git = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git" + url = "https://github.com/ROCmSoftwarePlatform/rccl-tests.git" + tags = ["rocm"] + + maintainers = ["bvanessen"] + + version("develop", branch="develop", default=True) + version("master", branch="master") + + variant("mpi", default=True, description="with MPI support") + + depends_on("hip") + depends_on("rccl") + depends_on("mpi", when="+mpi") + + def build_targets(self): + targets = [] + targets.append("HIP_HOME={0}".format(self.spec["hip"].prefix)) + targets.append("RCCL_HOME={0}".format(self.spec["rccl"].prefix)) + if "+mpi" in self.spec: + targets.append("MPI_HOME={0}".format(self.spec["mpi"].prefix)) + targets.append("MPI=1") + return targets + + def install(self, spec, prefix): + mkdirp(prefix.bin) + install_tree("./build", prefix.bin) -- cgit v1.2.3-70-g09d2