summaryrefslogtreecommitdiff
path: root/var/spack/repos/builtin/packages/py-horovod/package.py
blob: 598e01c6f916471eb0c2be2abfe8aa4d7c32a468 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Copyright 2013-2019 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)


class PyHorovod(PythonPackage):
    """Horovod is a distributed deep learning training framework for
    TensorFlow, Keras, PyTorch, and Apache MXNet."""

    homepage = "https://github.com/horovod"
    git      = "https://github.com/horovod/horovod.git"

    maintainers = ['adamjstewart']

    # Every version is fetched from git (branch or tag) with submodules
    # checked out.
    version('master', branch='master', submodules=True)
    version('0.19.1', tag='v0.19.1', submodules=True)
    version('0.19.0', tag='v0.19.0', submodules=True)
    version('0.18.2', tag='v0.18.2', submodules=True)
    version('0.18.1', tag='v0.18.1', submodules=True)
    version('0.18.0', tag='v0.18.0', submodules=True)
    version('0.17.1', tag='v0.17.1', submodules=True)
    version('0.17.0', tag='v0.17.0', submodules=True)
    version('0.16.4', tag='v0.16.4', submodules=True)
    version('0.16.3', tag='v0.16.3', submodules=True)
    version('0.16.2', tag='v0.16.2', submodules=True)

    # Deep learning frameworks
    variant('pytorch',    default=True,  description='Enables PyTorch')
    variant('tensorflow', default=False, description='Enables TensorFlow')
    variant('mxnet',      default=False, description='Enables Apache MXNet')

    # Distributed support
    variant('gloo', default=False, description='Enables features related to distributed support')
    variant('mpi',  default=True,  description='Enables MPI build')

    # GPU support
    # The gpu_* variants select the backend of each GPU collective; they
    # only have an effect on +cuda builds (see the dependencies and
    # setup_build_environment below).
    variant('cuda', default=True, description='Enables CUDA build')
    variant('gpu_allreduce', default='mpi',
            description='Backend to use for GPU_ALLREDUCE',
            values=('mpi', 'nccl'), multi=False)  # DDL support is deprecated
    variant('gpu_allgather', default='mpi',
            description='Backend to use for GPU_ALLGATHER',
            values=('mpi',), multi=False)
    variant('gpu_broadcast', default='mpi',
            description='Backend to use for GPU_BROADCAST',
            values=('mpi', 'nccl'), multi=False)

    # Required dependencies
    depends_on('py-setuptools', type='build')
    depends_on('py-cloudpickle', type=('build', 'run'))
    depends_on('py-psutil', type=('build', 'run'))
    depends_on('py-pyyaml', type=('build', 'run'))
    depends_on('py-six', type=('build', 'run'))

    # Deep learning frameworks
    depends_on('py-torch@0.4.0:', type=('build', 'run'), when='+pytorch')
    depends_on('py-torch+cuda', type=('build', 'run'), when='+pytorch+cuda')
    depends_on('py-cffi@1.4.0:', type=('build', 'run'), when='+pytorch')
    depends_on('py-tensorflow@1.1.0:', type=('build', 'link', 'run'), when='+tensorflow')
    depends_on('mxnet@1.4.0:+python', type=('build', 'link', 'run'), when='+mxnet')
    depends_on('mxnet+cuda', type=('build', 'link', 'run'), when='+mxnet+cuda')

    # Distributed support
    # There does not appear to be a way to use an external Gloo installation
    depends_on('cmake', type='build', when='+gloo')
    depends_on('mpi', when='+mpi')
    # The gpu_* variants default to 'mpi', so without the +cuda guard a
    # CPU-only, Gloo-only build (~cuda~mpi+gloo) would still drag in MPI.
    depends_on('mpi', when='+cuda gpu_allreduce=mpi')
    depends_on('mpi', when='+cuda gpu_allgather=mpi')
    depends_on('mpi', when='+cuda gpu_broadcast=mpi')

    # GPU support
    depends_on('cuda', when='+cuda')
    depends_on('nccl@2.0:', when='+cuda gpu_allreduce=nccl')
    depends_on('nccl@2.0:', when='+cuda gpu_broadcast=nccl')

    # Test dependencies
    depends_on('py-mock', type='test')
    depends_on('py-pytest', type='test')
    depends_on('py-pytest-forked', type='test')

    conflicts('+gloo', when='platform=darwin', msg='Gloo cannot be compiled on MacOS')
    conflicts('~gloo~mpi', msg='One of Gloo or MPI are required for Horovod to run')
    conflicts('~pytorch~tensorflow~mxnet', msg='At least one deep learning backend is required')

    def setup_build_environment(self, env):
        """Translate the selected variants into Horovod's ``HOROVOD_*``
        build-time environment variables.

        Args:
            env: environment-modification object supplied by Spack; only
                its ``set(name, value)`` method is used here.
        """
        spec = self.spec

        # Deep learning frameworks and distributed support.
        # Every feature is either explicitly requested or explicitly
        # disabled.  According to Horovod's installation docs,
        # HOROVOD_WITH_<X>=1 makes the build fail if <X> cannot be built,
        # instead of silently producing an install without it.
        toggles = (
            ('+pytorch', 'HOROVOD_WITH_PYTORCH', 'HOROVOD_WITHOUT_PYTORCH'),
            ('+tensorflow',
             'HOROVOD_WITH_TENSORFLOW', 'HOROVOD_WITHOUT_TENSORFLOW'),
            ('+mxnet', 'HOROVOD_WITH_MXNET', 'HOROVOD_WITHOUT_MXNET'),
            ('+gloo', 'HOROVOD_WITH_GLOO', 'HOROVOD_WITHOUT_GLOO'),
            ('+mpi', 'HOROVOD_WITH_MPI', 'HOROVOD_WITHOUT_MPI'),
        )
        for flag, with_var, without_var in toggles:
            env.set(with_var if flag in spec else without_var, 1)

        # GPU support
        if '^nccl' in spec:
            nccl = spec['nccl']
            env.set('HOROVOD_NCCL_HOME', nccl.prefix)
            env.set('HOROVOD_NCCL_INCLUDE', nccl.headers.directories[0])
            env.set('HOROVOD_NCCL_LIB', nccl.libs.directories[0])

        if '+cuda' in spec:
            cuda = spec['cuda']
            env.set('HOROVOD_CUDA_HOME', cuda.prefix)
            env.set('HOROVOD_CUDA_INCLUDE', cuda.headers.directories[0])
            env.set('HOROVOD_CUDA_LIB', cuda.libs.directories[0])

            # Only export the GPU collective backends for CUDA builds:
            # Horovod's setup.py is understood to treat any HOROVOD_GPU_*
            # setting as a request for a CUDA build, which would break
            # ~cuda installs.
            gpu_ops = (
                ('HOROVOD_GPU_ALLREDUCE', 'gpu_allreduce'),
                ('HOROVOD_GPU_ALLGATHER', 'gpu_allgather'),
                ('HOROVOD_GPU_BROADCAST', 'gpu_broadcast'),
            )
            for env_var, variant_name in gpu_ops:
                env.set(env_var, spec.variants[variant_name].value.upper())

            # The variants allow different backends per collective
            # (e.g. NCCL allreduce with MPI allgather).
            env.set('HOROVOD_ALLOW_MIXED_GPU_IMPL', 1)