summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHoward Pritchard <howardp@lanl.gov>2024-01-08 13:43:05 -0700
committerGitHub <noreply@github.com>2024-01-08 13:43:05 -0700
commitcdb8fd68e2dc78cc2b0a4e59202d29390df9e96f (patch)
treea62bff423310dd236fb6e80431ffe6de9b6973ad
parent0e9c88385b25362dae556cf9913e1ab407ca3253 (diff)
downloadspack-cdb8fd68e2dc78cc2b0a4e59202d29390df9e96f.tar.gz
spack-cdb8fd68e2dc78cc2b0a4e59202d29390df9e96f.tar.bz2
spack-cdb8fd68e2dc78cc2b0a4e59202d29390df9e96f.tar.xz
spack-cdb8fd68e2dc78cc2b0a4e59202d29390df9e96f.zip
OpenMPI: add v5.0.0/5.0.1, patches, and workarounds (#40913)
* Add OpenMPI 5.0.0/5.0.1 release * Fix a problem with dlopen syms with 5.0.0 * Crank up lex buffer to 1MB so that Open MPI's compiler wrapper can parse the enormously long lines present in, for example, mpicc-wrapper-data.txt when the spack install is utilizing Spack's path padding feature. * Disable romio by default for 5.0.0 and beyond owing to problems compiling the romio package when using the Intel OneAPI compiler. * Patch for adding cuda lib location in case of non-standard location of libcuda.so * build accel components as DSOs. It appears from looking at some of the spack CI that it implicitly assumes that Open MPI is built with components as DSOs. The default behavior for Open MPI was changed between the 4.1.x release stream and the 5.0.x release stream and this premise is now incorrect. Turns out that starting with Open MPI 5.0.0 building static does not work when using a now very important variant, namely cuda. In older versions of Open MPI the libcuda.so was dlopened at run time when needed, but now libcuda is linked into the cuda components of openmpi directly. This works when using Open MPI's dynamically loadable component option, but doesn't work now for a lot of the Spack CI pipelines because they don't include libcuda.so in LD_LIBRARY_PATH of packages that don't think they are using cuda themselves. Signed-off-by: Howard Pritchard <howardp@lanl.gov> Co-authored-by: Jack Morrison <jack.morrison@cornelisnetworks.com> Co-authored-by: Harmen Stoppels <me@harmenstoppels.nl>
-rw-r--r--var/spack/repos/builtin/packages/openmpi/accelerator-build-components-as-dso-s-by-default.patch81
-rw-r--r--var/spack/repos/builtin/packages/openmpi/accelerator-cuda-fix-bug-in-makefile.patch33
-rw-r--r--var/spack/repos/builtin/packages/openmpi/btlsmcuda-fix-problem-with-makefile.patch73
-rw-r--r--var/spack/repos/builtin/packages/openmpi/fix-for-dlopen-missing-symbol-problem.patch32
-rw-r--r--var/spack/repos/builtin/packages/openmpi/package.py54
5 files changed, 265 insertions, 8 deletions
diff --git a/var/spack/repos/builtin/packages/openmpi/accelerator-build-components-as-dso-s-by-default.patch b/var/spack/repos/builtin/packages/openmpi/accelerator-build-components-as-dso-s-by-default.patch
new file mode 100644
index 0000000000..a4b5bf7342
--- /dev/null
+++ b/var/spack/repos/builtin/packages/openmpi/accelerator-build-components-as-dso-s-by-default.patch
@@ -0,0 +1,81 @@
+From 7e2e390e468db44c8540d2510841a22d146fa6ed Mon Sep 17 00:00:00 2001
+From: Howard Pritchard <howardp@lanl.gov>
+Date: Tue, 7 Nov 2023 10:06:47 -0500
+Subject: [PATCH] accelerator: build components as dso's by default
+
+also need to switch rcache/gpusm and rcache/rgpusm
+
+to DSO by default.
+
+Fix a problem in opal_mca.m4 where the enable-mca-dso list wasn't being
+processed correctly starting with 5.0.0.
+
+related to #12036
+
+Signed-off-by: Howard Pritchard <howardp@lanl.gov>
+
+diff --git a/config/opal_mca.m4 b/config/opal_mca.m4
+index 935b8c65..b425fe63 100644
+--- a/config/opal_mca.m4
++++ b/config/opal_mca.m4
+@@ -13,7 +13,7 @@ dnl All rights reserved.
+ dnl Copyright (c) 2010-2021 Cisco Systems, Inc. All rights reserved
+ dnl Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
+ dnl Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
+-dnl Copyright (c) 2021 Triad National Security, LLC. All rights
++dnl Copyright (c) 2021-2023 Triad National Security, LLC. All rights
+ dnl reserved.
+ dnl $COPYRIGHT$
+ dnl
+@@ -167,6 +167,9 @@ of type-component pairs. For example, --enable-mca-no-build=pml-ob1])
+ # Second, set the DSO_all and STATIC_all variables. conflict
+ # resolution (prefer static) is done in the big loop below
+ #
++ # Exception here is the components of the accelerator framework,
++ # which by default are built to be dynamic, except for null.
++ #
+ AC_MSG_CHECKING([which components should be run-time loadable])
+ if test "$enable_static" != "no"; then
+ DSO_all=0
+@@ -174,9 +177,6 @@ of type-component pairs. For example, --enable-mca-no-build=pml-ob1])
+ elif test "$OPAL_ENABLE_DLOPEN_SUPPORT" = 0; then
+ DSO_all=0
+ msg="none (dlopen disabled)"
+- elif test -z "$enable_mca_dso"; then
+- DSO_all=0
+- msg=default
+ elif test "$enable_mca_dso" = "no"; then
+ DSO_all=0
+ msg=none
+@@ -184,15 +184,19 @@ of type-component pairs. For example, --enable-mca-no-build=pml-ob1])
+ DSO_all=1
+ msg=all
+ else
+- DSO_all=0
+- ifs_save="$IFS"
+- IFS="${IFS}$PATH_SEPARATOR,"
+- msg=
+- for item in $enable_mca_dso; do
+- AS_VAR_SET([AS_TR_SH([DSO_$item])], [1])
+- msg="$item $msg"
+- done
+- IFS="$ifs_save"
++ msg=
++ if test -z "$enable_mca_dso"; then
++ enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze,btl-smcuda,rcache-gpusm,rcache-rgpusm"
++ msg="(default)"
++ fi
++ DSO_all=0
++ ifs_save="$IFS"
++ IFS="${IFS}$PATH_SEPARATOR,"
++ for item in $enable_mca_dso; do
++ AS_VAR_SET([AS_TR_SH([DSO_$item])], [1])
++ msg="$item $msg"
++ done
++ IFS="$ifs_save"
+ fi
+ AC_MSG_RESULT([$msg])
+ unset msg
+--
+2.35.3
+
diff --git a/var/spack/repos/builtin/packages/openmpi/accelerator-cuda-fix-bug-in-makefile.patch b/var/spack/repos/builtin/packages/openmpi/accelerator-cuda-fix-bug-in-makefile.patch
new file mode 100644
index 0000000000..f0681b6ad5
--- /dev/null
+++ b/var/spack/repos/builtin/packages/openmpi/accelerator-cuda-fix-bug-in-makefile.patch
@@ -0,0 +1,33 @@
+From be28fa6421094fcd0c544a6d457c6d748670959a Mon Sep 17 00:00:00 2001
+From: Howard Pritchard <howardp@lanl.gov>
+Date: Mon, 13 Nov 2023 08:12:28 -0700
+Subject: [PATCH] accelerator/cuda: fix bug in makefile.am
+
+that prevents correct linkage of libcuda.so if it is in
+a non standard location.
+
+Related to https://github.com/spack/spack/pull/40913
+
+Signed-off-by: Howard Pritchard <howardp@lanl.gov>
+
+diff --git a/opal/mca/accelerator/cuda/Makefile.am b/opal/mca/accelerator/cuda/Makefile.am
+index 5646890b..2c533ece 100644
+--- a/opal/mca/accelerator/cuda/Makefile.am
++++ b/opal/mca/accelerator/cuda/Makefile.am
+@@ -34,11 +34,11 @@ mcacomponentdir = $(opallibdir)
+ mcacomponent_LTLIBRARIES = $(component_install)
+
+ mca_accelerator_cuda_la_SOURCES = $(sources)
+-mca_accelerator_cuda_la_LDFLAGS = -module -avoid-version
++mca_accelerator_cuda_la_LDFLAGS = -module -avoid-version $(accelerator_cuda_LDFLAGS)
+ mca_accelerator_cuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \
+ $(accelerator_cuda_LIBS)
+
+ noinst_LTLIBRARIES = $(component_noinst)
+ libmca_accelerator_cuda_la_SOURCES =$(sources)
+-libmca_accelerator_cuda_la_LDFLAGS = -module -avoid-version
++libmca_accelerator_cuda_la_LDFLAGS = -module -avoid-version $(accelerator_cuda_LDFLAGS)
+ libmca_accelerator_cuda_la_LIBADD = $(accelerator_cuda_LIBS)
+--
+2.35.3
+
diff --git a/var/spack/repos/builtin/packages/openmpi/btlsmcuda-fix-problem-with-makefile.patch b/var/spack/repos/builtin/packages/openmpi/btlsmcuda-fix-problem-with-makefile.patch
new file mode 100644
index 0000000000..44b4d2766d
--- /dev/null
+++ b/var/spack/repos/builtin/packages/openmpi/btlsmcuda-fix-problem-with-makefile.patch
@@ -0,0 +1,73 @@
+From 27672784304d4c944e2e3c7d526dfd77f021a113 Mon Sep 17 00:00:00 2001
+From: Howard Pritchard <howardp@lanl.gov>
+Date: Thu, 16 Nov 2023 07:05:01 -0700
+Subject: [PATCH] btlsmcuda: fix problem with makefile
+
+when libcuda.so is in a non-standard location.
+
+also fix rcache/gpusm and rcache/rgpusm
+
+Similar fix to that in #12065
+
+Signed-off-by: Howard Pritchard <howardp@lanl.gov>
+
+diff --git a/opal/mca/btl/smcuda/Makefile.am b/opal/mca/btl/smcuda/Makefile.am
+index f1a89df8..8ee37add 100644
+--- a/opal/mca/btl/smcuda/Makefile.am
++++ b/opal/mca/btl/smcuda/Makefile.am
+@@ -51,7 +51,7 @@ endif
+ mcacomponentdir = $(opallibdir)
+ mcacomponent_LTLIBRARIES = $(component_install)
+ mca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources)
+-mca_btl_smcuda_la_LDFLAGS = -module -avoid-version
++mca_btl_smcuda_la_LDFLAGS = -module -avoid-version $(btl_smcuda_LDFLAGS)
+ mca_btl_smcuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \
+ $(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/lib@OPAL_LIB_NAME@mca_common_sm.la \
+ $(btl_smcuda_LIBS)
+@@ -59,6 +59,6 @@ mca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)
+
+ noinst_LTLIBRARIES = $(component_noinst)
+ libmca_btl_smcuda_la_SOURCES = $(libmca_btl_smcuda_la_sources)
+-libmca_btl_smcuda_la_LDFLAGS = -module -avoid-version
++libmca_btl_smcuda_la_LDFLAGS = -module -avoid-version $(btl_smcuda_LDFLAGS)
+ libmca_btl_smcuda_la_CPPFLAGS = $(btl_smcuda_CPPFLAGS)
+ libmca_btl_smcuda_la_LIBADD = $(btl_smcuda_LIBS)
+diff --git a/opal/mca/rcache/gpusm/Makefile.am b/opal/mca/rcache/gpusm/Makefile.am
+index 5645e5ea..1ff63b35 100644
+--- a/opal/mca/rcache/gpusm/Makefile.am
++++ b/opal/mca/rcache/gpusm/Makefile.am
+@@ -48,11 +48,11 @@ endif
+ mcacomponentdir = $(opallibdir)
+ mcacomponent_LTLIBRARIES = $(component_install)
+ mca_rcache_gpusm_la_SOURCES = $(sources)
+-mca_rcache_gpusm_la_LDFLAGS = -module -avoid-version
++mca_rcache_gpusm_la_LDFLAGS = -module -avoid-version $(rcache_gpusm_LDFLAGS)
+ mca_rcache_gpusm_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \
+ $(rcache_gpusm_LIBS)
+
+ noinst_LTLIBRARIES = $(component_noinst)
+ libmca_rcache_gpusm_la_SOURCES = $(sources)
+-libmca_rcache_gpusm_la_LDFLAGS = -module -avoid-version
++libmca_rcache_gpusm_la_LDFLAGS = -module -avoid-version $(rcache_gpusm_LDFLAGS)
+ libmca_rcache_gpusm_la_LIBADD = $(rcache_gpusm_LIBS)
+diff --git a/opal/mca/rcache/rgpusm/Makefile.am b/opal/mca/rcache/rgpusm/Makefile.am
+index 6d2fdbc3..dde81411 100644
+--- a/opal/mca/rcache/rgpusm/Makefile.am
++++ b/opal/mca/rcache/rgpusm/Makefile.am
+@@ -46,11 +46,11 @@ endif
+ mcacomponentdir = $(opallibdir)
+ mcacomponent_LTLIBRARIES = $(component_install)
+ mca_rcache_rgpusm_la_SOURCES = $(sources)
+-mca_rcache_rgpusm_la_LDFLAGS = -module -avoid-version
++mca_rcache_rgpusm_la_LDFLAGS = -module -avoid-version $(rcache_rgpusm_LDFLAGS)
+ mca_rcache_rgpusm_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \
+ $(rcache_rgpusm_LIBS)
+
+ noinst_LTLIBRARIES = $(component_noinst)
+ libmca_rcache_rgpusm_la_SOURCES = $(sources)
+-libmca_rcache_rgpusm_la_LDFLAGS = -module -avoid-version
++libmca_rcache_rgpusm_la_LDFLAGS = -module -avoid-version $(rcache_rgpusm_LDFLAGS)
+ libmca_rcache_rgpusm_la_LIBADD = $(rcache_rgpusm_LIBS)
+--
+2.35.3
+
diff --git a/var/spack/repos/builtin/packages/openmpi/fix-for-dlopen-missing-symbol-problem.patch b/var/spack/repos/builtin/packages/openmpi/fix-for-dlopen-missing-symbol-problem.patch
new file mode 100644
index 0000000000..0a846b0326
--- /dev/null
+++ b/var/spack/repos/builtin/packages/openmpi/fix-for-dlopen-missing-symbol-problem.patch
@@ -0,0 +1,32 @@
+From 50731f03c1ae9d375bfc2771fc402d54fd22e276 Mon Sep 17 00:00:00 2001
+From: Howard Pritchard <howardp@lanl.gov>
+Date: Sat, 4 Nov 2023 13:24:15 -0600
+Subject: [PATCH] spack:fix for dlopen missing symbol problem
+
+related to https://github.com/spack/spack/pull/40725
+
+Signed-off-by: Howard Pritchard <howardp@lanl.gov>
+
+diff --git a/opal/mca/dl/dlopen/configure.m4 b/opal/mca/dl/dlopen/configure.m4
+index 07fda82001..4ae625b1fb 100644
+--- a/opal/mca/dl/dlopen/configure.m4
++++ b/opal/mca/dl/dlopen/configure.m4
+@@ -27,7 +27,7 @@ AC_DEFUN([MCA_opal_dl_dlopen_CONFIG],[
+ AC_CONFIG_FILES([opal/mca/dl/dlopen/Makefile])
+
+ OAC_CHECK_PACKAGE([dlopen],
+- [dl_dlopen],
++ [opal_dl_dlopen],
+ [dlfcn.h],
+ [dl],
+ [dlopen],
+@@ -38,5 +38,5 @@ AC_DEFUN([MCA_opal_dl_dlopen_CONFIG],[
+ [$1],
+ [$2])
+
+- AC_SUBST(dl_dlopen_LIBS)
++ AC_SUBST(opal_dl_dlopen_LIBS)
+ ])
+--
+2.39.3
+
diff --git a/var/spack/repos/builtin/packages/openmpi/package.py b/var/spack/repos/builtin/packages/openmpi/package.py
index 38bee519c8..38a64ac27d 100644
--- a/var/spack/repos/builtin/packages/openmpi/package.py
+++ b/var/spack/repos/builtin/packages/openmpi/package.py
@@ -44,11 +44,17 @@ class Openmpi(AutotoolsPackage, CudaPackage):
# Current
version(
- "4.1.6", sha256="f740994485516deb63b5311af122c265179f5328a0d857a567b85db00b11e415"
- ) # libmpi.so.40.30.6
+ "5.0.1", sha256="e357043e65fd1b956a47d0dae6156a90cf0e378df759364936c1781f1a25ef80"
+ ) # libmpi.so.40.40.1
# Still supported
version(
+ "5.0.0", sha256="9d845ca94bc1aeb445f83d98d238cd08f6ec7ad0f73b0f79ec1668dbfdacd613"
+ ) # libmpi.so.40.40.0
+ version(
+ "4.1.6", sha256="f740994485516deb63b5311af122c265179f5328a0d857a567b85db00b11e415"
+ ) # libmpi.so.40.30.6
+ version(
"4.1.5", sha256="a640986bc257389dd379886fdae6264c8cfa56bc98b71ce3ae3dfbd8ce61dbe3"
) # libmpi.so.40.30.5
version(
@@ -405,6 +411,14 @@ class Openmpi(AutotoolsPackage, CudaPackage):
# To fix performance regressions introduced while fixing a bug in older
# gcc versions on x86_64, Refs. open-mpi/ompi#8603
patch("opal_assembly_arch.patch", when="@4.0.0:4.0.5,4.1.0")
+ # To fix an error in Open MPI configury related to finding dl lib.
+ # This is specific to the 5.0.0 release.
+ patch("fix-for-dlopen-missing-symbol-problem.patch", when="@5.0.0")
+ # Patches to accelerator CUDA component to link in libcuda
+ # when in non-standard location
+ patch("accelerator-cuda-fix-bug-in-makefile.patch", when="@5.0.0")
+ patch("btlsmcuda-fix-problem-with-makefile.patch", when="@5.0.0")
+ patch("accelerator-build-components-as-dso-s-by-default.patch", when="@5.0.0:5.0.1")
variant(
"fabrics",
@@ -439,7 +453,7 @@ class Openmpi(AutotoolsPackage, CudaPackage):
# Additional support options
variant("atomics", default=False, description="Enable built-in atomics")
variant("java", default=False, when="@1.7.4:", description="Build Java support")
- variant("static", default=True, description="Build static libraries")
+ variant("static", default=False, description="Build static libraries")
variant("sqlite3", default=False, when="@1.7.3:1", description="Build SQLite3 support")
variant("vt", default=True, description="Build VampirTrace support")
variant(
@@ -472,7 +486,8 @@ class Openmpi(AutotoolsPackage, CudaPackage):
description="Build deprecated support for the Singularity container",
)
variant("lustre", default=False, description="Lustre filesystem library support")
- variant("romio", default=True, description="Enable ROMIO support")
+ variant("romio", default=True, when="@:4", description="Enable ROMIO support")  # "@:5" would also match 5.x
+ variant("romio", default=False, when="@5:", description="Enable ROMIO support")  # off by default from 5.0.0 on
variant("rsh", default=True, description="Enable rsh (openssh) process lifecycle management")
variant(
"orterunprefix",
@@ -511,10 +526,9 @@ class Openmpi(AutotoolsPackage, CudaPackage):
if sys.platform != "darwin":
depends_on("numactl")
- depends_on("autoconf @2.69:", type="build", when="@main")
- depends_on("automake @1.13.4:", type="build", when="@main")
- depends_on("libtool @2.4.2:", type="build", when="@main")
- depends_on("m4", type="build", when="@main")
+ depends_on("autoconf @2.69:", type="build", when="@5.0.0:,main")
+ depends_on("automake @1.13.4:", type="build", when="@5.0.0:,main")
+ depends_on("libtool @2.4.2:", type="build", when="@5.0.0:,main")
depends_on("perl", type="build")
depends_on("pkgconfig", type="build")
@@ -572,6 +586,8 @@ class Openmpi(AutotoolsPackage, CudaPackage):
depends_on("openssh", type="run", when="+rsh")
+ depends_on("cuda", type=("build", "link", "run"), when="@5: +cuda")
+
conflicts("+cxx_exceptions", when="%nvhpc", msg="nvc does not ignore -fexceptions, but errors")
# CUDA support was added in 1.7, and since the variant is part of the
@@ -914,6 +930,11 @@ class Openmpi(AutotoolsPackage, CudaPackage):
perl = which("perl")
perl("autogen.pl")
+ @when("@5.0.0:5.0.1")  # same range as accelerator-build-components-as-dso-s-by-default.patch
+ def autoreconf(self, spec, prefix):
+ perl = which("perl")
+ perl("autogen.pl", "--force")  # presumably regenerates configure after the .m4/Makefile.am patches -- confirm
+
def configure_args(self):
spec = self.spec
config_args = ["--enable-shared", "--disable-silent-rules", "--disable-sphinx"]
@@ -1084,6 +1105,23 @@ class Openmpi(AutotoolsPackage, CudaPackage):
if wrapper_ldflags:
config_args.append("--with-wrapper-ldflags={0}".format(" ".join(wrapper_ldflags)))
+ #
+ # the Spack path padding feature causes issues with Open MPI's lex based parsing system
+ # used by the compiler wrappers. Crank up lex buffer to 1MB to handle this.
+ # see https://spack.readthedocs.io/en/latest/binary_caches.html#relocation
+ #
+
+ if spec.satisfies("@5.0.0:"):
+ config_args.append("CFLAGS=-DYY_BUF_SIZE=1048576")
+
+ #
+ # disable romio for 5.0.0 or newer if using Intel OneAPI owing to a problem
+ # building ZE related components of the romio packaged with this release
+ #
+
+ # if spec.satisfies("@5.0.0:") and spec.satisfies("%oneapi"):
+ # config_args.append("--disable-io-romio")
+
return config_args
@run_after("install", when="+wrapper-rpath")