From ccc7c9b86de790dd268b530e1cfa7336b596fd04 Mon Sep 17 00:00:00 2001 From: "Adam J. Stewart" Date: Mon, 30 Mar 2020 16:09:17 -0500 Subject: py-horovod: fix compilation of ~cuda (#15719) * py-horovod: fix compilation of ~cuda * Rewrite py-horovod with only 3 variants * Add upstream patch to workaround compilation issue --- .../repos/builtin/packages/py-horovod/fma.patch | 52 ++++++++ .../repos/builtin/packages/py-horovod/package.py | 140 +++++++++++---------- 2 files changed, 127 insertions(+), 65 deletions(-) create mode 100644 var/spack/repos/builtin/packages/py-horovod/fma.patch (limited to 'var') diff --git a/var/spack/repos/builtin/packages/py-horovod/fma.patch b/var/spack/repos/builtin/packages/py-horovod/fma.patch new file mode 100644 index 0000000000..13ccb8c9c5 --- /dev/null +++ b/var/spack/repos/builtin/packages/py-horovod/fma.patch @@ -0,0 +1,52 @@ +From 717e72f91f02d1dc3c859719ef1d804b10f88017 Mon Sep 17 00:00:00 2001 +From: Nicolas V Castet +Date: Mon, 30 Mar 2020 12:47:50 -0500 +Subject: [PATCH] Add extra preprocessor guard for FMA optimization + +Fixes #1832 + +Signed-off-by: Nicolas V Castet +--- + horovod/common/ops/adasum/adasum.h | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/horovod/common/ops/adasum/adasum.h b/horovod/common/ops/adasum/adasum.h +index 0330f5850..876f7f12b 100644 +--- a/horovod/common/ops/adasum/adasum.h ++++ b/horovod/common/ops/adasum/adasum.h +@@ -19,7 +19,7 @@ + #include + #include + +-#if __AVX__ && __F16C__ ++#if __AVX__ && __F16C__ && __FMA__ + #include + #include + #endif +@@ -104,7 +104,7 @@ template class Adasum { + int count, double& dotProduct, + double& anormsq, double& bnormsq, + int layerid) { +-#if __AVX__ && __F16C__ ++#if __AVX__ && __F16C__ && __FMA__ + if (horovod_datatype == DataType::HOROVOD_FLOAT16) { + ComputeDotAndNormSqrdsfp16((uint16_t*)a, (uint16_t*)b, count, dotProduct, + anormsq, bnormsq, layerid); +@@ -125,7 +125,7 @@ template class Adasum { + double acoeff, void* __restrict__ a, + double bcoeff, void* __restrict__ b, + int layerid) { +-#if __AVX__ && __F16C__ ++#if __AVX__ && __F16C__ && __FMA__ + if (horovod_datatype == DataType::HOROVOD_FLOAT16) { + ScaledAddfp16(count, acoeff, (uint16_t*)a, bcoeff, (uint16_t*)b, layerid); + } else +@@ -425,7 +425,7 @@ template class Adasum { + } + + +-#if __AVX__ && __F16C__ ++#if __AVX__ && __F16C__ && __FMA__ + inline void ComputeDotAndNormSqrdsfp16(const uint16_t* __restrict__ a, + const uint16_t* __restrict__ b, + int len, double& dotProduct, diff --git a/var/spack/repos/builtin/packages/py-horovod/package.py b/var/spack/repos/builtin/packages/py-horovod/package.py index 598e01c6f9..c740486853 100644 --- a/var/spack/repos/builtin/packages/py-horovod/package.py +++ b/var/spack/repos/builtin/packages/py-horovod/package.py @@ -25,96 +25,106 @@ class PyHorovod(PythonPackage): version('0.16.3', tag='v0.16.3', submodules=True) version('0.16.2', tag='v0.16.2', submodules=True) - # Deep learning frameworks - variant('pytorch', default=True, description='Enables PyTorch') - variant('tensorflow', default=False, description='Enables TensorFlow') - variant('mxnet', default=False, description='Enables Apache MXNet') - - # Distributed support - variant('gloo', default=False, description='Enables features related to distributed support') - variant('mpi', default=True, description='Enables MPI build') - - # GPU support - variant('cuda', default=True, description='Enables CUDA build') - variant('gpu_allreduce', default='mpi', - description='Backend to use for GPU_ALLREDUCE', - values=('mpi', 'nccl'), multi=False) # DDL support is deprecated - variant('gpu_allgather', default='mpi', - description='Backend to use for GPU_ALLGATHER', - values=('mpi',), multi=False) - variant('gpu_broadcast', default='mpi', - description='Backend to use for GPU_BROADCAST', - values=('mpi', 'nccl'), multi=False) + # https://github.com/horovod/horovod/blob/master/docs/install.rst + variant('frameworks', default='pytorch', + description='Deep learning frameworks to build support for', + values=('tensorflow', 'pytorch', 'mxnet', 'keras', 'spark'), + multi=True) + variant('controllers', default='mpi', + description='Controllers to coordinate work between processes', + values=('mpi', 'gloo'), multi=True) + variant('tensor_ops', default='nccl', + description='Framework to use for GPU/CPU operations', + values=('nccl', 'mpi', 'gloo', 'ccl'), multi=False) # Required dependencies - depends_on('py-setuptools', type='build') + depends_on('py-setuptools', type='build') depends_on('py-cloudpickle', type=('build', 'run')) - depends_on('py-psutil', type=('build', 'run')) - depends_on('py-pyyaml', type=('build', 'run')) - depends_on('py-six', type=('build', 'run')) - - # Deep learning frameworks - depends_on('py-torch@0.4.0:', type=('build', 'run'), when='+pytorch') - depends_on('py-torch+cuda', type=('build', 'run'), when='+pytorch+cuda') - depends_on('py-cffi@1.4.0:', type=('build', 'run'), when='+pytorch') - depends_on('py-tensorflow@1.1.0:', type=('build', 'link', 'run'), when='+tensorflow') - depends_on('mxnet@1.4.0:+python', type=('build', 'link', 'run'), when='+mxnet') - depends_on('mxnet+cuda', type=('build', 'link', 'run'), when='+mxnet+cuda') - - # Distributed support + depends_on('py-psutil', type=('build', 'run')) + depends_on('py-pyyaml', type=('build', 'run')) + depends_on('py-six', type=('build', 'run')) + + # Framework dependencies + depends_on('py-tensorflow@1.1.0:', type=('build', 'link', 'run'), when='frameworks=tensorflow') + depends_on('py-torch@0.4.0:', type=('build', 'run'), when='frameworks=pytorch') + depends_on('py-torchvision', type=('build', 'run'), when='frameworks=pytorch') + depends_on('py-cffi@1.4.0:', type=('build', 'run'), when='frameworks=pytorch') + depends_on('mxnet@1.4.1:+python', type=('build', 'link', 'run'), when='frameworks=mxnet') + depends_on('py-keras@2.0.8,2.1.2:', type=('build', 'run'), when='frameworks=keras') + depends_on('py-h5py@2.9:', type=('build', 'run'), when='frameworks=spark') + depends_on('py-numpy', type=('build', 'run'), when='frameworks=spark') + depends_on('py-petastorm@0.8.2', type=('build', 'run'), when='frameworks=spark') + depends_on('py-pyarrow@0.15.0:', type=('build', 'run'), when='frameworks=spark') + depends_on('py-pyspark@2.3.2:', type=('build', 'run'), when='frameworks=spark') + + # Controller dependencies + depends_on('mpi', when='controllers=mpi') # There does not appear to be a way to use an external Gloo installation - depends_on('cmake', type='build', when='+gloo') - depends_on('mpi', when='+mpi') - depends_on('mpi', when='gpu_allreduce=mpi') - depends_on('mpi', when='gpu_allgather=mpi') - depends_on('mpi', when='gpu_broadcast=mpi') + depends_on('cmake', type='build', when='controllers=gloo') - # GPU support - depends_on('cuda', when='+cuda') - depends_on('nccl@2.0:', when='gpu_allreduce=nccl') - depends_on('nccl@2.0:', when='gpu_broadcast=nccl') + # Tensor Operations dependencies + depends_on('nccl', when='tensor_ops=nccl') + depends_on('mpi', when='tensor_ops=mpi') + # There does not appear to be a way to use an external Gloo installation + depends_on('cmake', type='build', when='tensor_ops=gloo') # Test dependencies depends_on('py-mock', type='test') depends_on('py-pytest', type='test') depends_on('py-pytest-forked', type='test') - conflicts('+gloo', when='platform=darwin', msg='Gloo cannot be compiled on MacOS') - conflicts('~gloo~mpi', msg='One of Gloo or MPI are required for Horovod to run') - conflicts('~pytorch~tensorflow~mxnet', msg='At least one deep learning backend is required') + conflicts('controllers=gloo', when='platform=darwin', msg='Gloo cannot be compiled on MacOS') + + # https://github.com/horovod/horovod/pull/1835 + patch('fma.patch', when='@0.19.0:0.19.1') def setup_build_environment(self, env): - # Deep learning frameworks - if '~pytorch' in self.spec: - env.set('HOROVOD_WITHOUT_PYTORCH', 1) - if '~tensorflow' in self.spec: + # Frameworks + if 'frameworks=tensorflow' in self.spec: + env.set('HOROVOD_WITH_TENSORFLOW', 1) + else: env.set('HOROVOD_WITHOUT_TENSORFLOW', 1) - if '~mxnet' in self.spec: + if 'frameworks=pytorch' in self.spec: + env.set('HOROVOD_WITH_PYTORCH', 1) + else: + env.set('HOROVOD_WITHOUT_PYTORCH', 1) + if 'frameworks=mxnet' in self.spec: + env.set('HOROVOD_WITH_MXNET', 1) + else: env.set('HOROVOD_WITHOUT_MXNET', 1) - # Distributed support - if '~gloo' in self.spec: - env.set('HOROVOD_WITHOUT_GLOO', 1) - if '+mpi' in self.spec: + # Controllers + if 'controllers=mpi' in self.spec: env.set('HOROVOD_WITH_MPI', 1) else: env.set('HOROVOD_WITHOUT_MPI', 1) + if 'controllers=gloo' in self.spec: + env.set('HOROVOD_WITH_GLOO', 1) + else: + env.set('HOROVOD_WITHOUT_GLOO', 1) + + # Tensor Operations + if 'tensor_ops=nccl' in self.spec: + env.set('HOROVOD_GPU', 'CUDA') - # GPU support - if '+cuda' in self.spec: env.set('HOROVOD_CUDA_HOME', self.spec['cuda'].prefix) env.set('HOROVOD_CUDA_INCLUDE', self.spec['cuda'].headers.directories[0]) env.set('HOROVOD_CUDA_LIB', self.spec['cuda'].libs.directories[0]) - if '^nccl' in self.spec: + env.set('HOROVOD_NCCL_HOME', self.spec['nccl'].prefix) env.set('HOROVOD_NCCL_INCLUDE', self.spec['nccl'].headers.directories[0]) env.set('HOROVOD_NCCL_LIB', self.spec['nccl'].libs.directories[0]) - env.set('HOROVOD_GPU_ALLREDUCE', - self.spec.variants['gpu_allreduce'].value.upper()) - env.set('HOROVOD_GPU_ALLGATHER', - self.spec.variants['gpu_allgather'].value.upper()) - env.set('HOROVOD_GPU_BROADCAST', - self.spec.variants['gpu_broadcast'].value.upper()) - env.set('HOROVOD_ALLOW_MIXED_GPU_IMPL', 1) + + env.set('HOROVOD_GPU_ALLREDUCE', 'NCCL') + env.set('HOROVOD_GPU_BROADCAST', 'NCCL') + else: + env.set('HOROVOD_CPU_OPERATIONS', + self.spec.variants['tensor_ops'].value.upper()) + + @run_after('install') + @on_package_attributes(run_tests=True) + def install_test(self): + horovodrun = Executable(self.prefix.bin.horovodrun) + horovodrun('--check-build') -- cgit v1.2.3-70-g09d2