summaryrefslogtreecommitdiff
path: root/share
diff options
context:
space:
mode:
authorAdam J. Stewart <ajstewart426@gmail.com>2022-10-09 17:39:47 -0500
committerGitHub <noreply@github.com>2022-10-09 15:39:47 -0700
commit01ede3c595e8c02ecd791b97a0a1cae3ee30255f (patch)
tree9bbdd85946ee7f2bcb95c2781d8afd35db2cc161 /share
parent4a6aff8bd1b3e1cacbdfcf7b1f4f95dbd6b8e3f9 (diff)
downloadspack-01ede3c595e8c02ecd791b97a0a1cae3ee30255f.tar.gz
spack-01ede3c595e8c02ecd791b97a0a1cae3ee30255f.tar.bz2
spack-01ede3c595e8c02ecd791b97a0a1cae3ee30255f.tar.xz
spack-01ede3c595e8c02ecd791b97a0a1cae3ee30255f.zip
Add CI stack for ML packages (#31592)
Basic stack of ML packages we would like to test and generate binaries for in CI. Spack now has a large CI framework in GitLab for PR testing and public binary generation. We should take advantage of this to test and distribute optimized binaries for popular ML frameworks. This is a pretty extensive initial set, including CPU, ROCm, and CUDA versions of a core `x86_64_v4` stack. ### Core ML frameworks These are all popular core ML frameworks already available in Spack. - [x] PyTorch - [x] TensorFlow - [x] Scikit-learn - [x] MXNet - [x] CNTK - [x] Caffe - [x] Chainer - [x] XGBoost - [x] Theano ### ML extensions These are domain libraries and wrappers that build on top of core ML libraries - [x] Keras - [x] TensorBoard - [x] torchvision - [x] torchtext - [x] torchaudio - [x] TorchGeo - [x] PyTorch Lightning - [x] torchmetrics - [x] GPyTorch - [x] Horovod ### ML-adjacent libraries These are libraries that aren't specific to ML but are still core libraries used in ML pipelines - [x] numpy - [x] scipy - [x] pandas - [x] ONNX - [x] bazel Co-authored-by: Jonathon Anderson <17242663+blue42u@users.noreply.github.com>
Diffstat (limited to 'share')
-rw-r--r--share/spack/gitlab/cloud_pipelines/.gitlab-ci.yml120
-rw-r--r--share/spack/gitlab/cloud_pipelines/stacks/ml-cpu/spack.yaml142
-rw-r--r--share/spack/gitlab/cloud_pipelines/stacks/ml-cuda/spack.yaml144
-rw-r--r--share/spack/gitlab/cloud_pipelines/stacks/ml-rocm/spack.yaml147
4 files changed, 553 insertions, 0 deletions
diff --git a/share/spack/gitlab/cloud_pipelines/.gitlab-ci.yml b/share/spack/gitlab/cloud_pipelines/.gitlab-ci.yml
index fcfa925549..17c16e16d6 100644
--- a/share/spack/gitlab/cloud_pipelines/.gitlab-ci.yml
+++ b/share/spack/gitlab/cloud_pipelines/.gitlab-ci.yml
@@ -749,3 +749,123 @@ tutorial-protected-build:
needs:
- artifacts: True
job: tutorial-protected-generate
+
+########################################
+# Machine Learning (CPU)
+########################################
+.ml-cpu:
+ variables:
+ SPACK_CI_STACK_NAME: ml-cpu
+
+.ml-cpu-generate:
+ extends: .ml-cpu
+ image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21
+ tags: ["spack", "aws", "public", "medium", "x86_64_v4"]
+
+ml-cpu-pr-generate:
+ extends: [ ".ml-cpu-generate", ".pr-generate"]
+
+ml-cpu-protected-generate:
+ extends: [ ".ml-cpu-generate", ".protected-generate"]
+
+ml-cpu-pr-build:
+ extends: [ ".ml-cpu", ".pr-build" ]
+ trigger:
+ include:
+ - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
+ job: ml-cpu-pr-generate
+ strategy: depend
+ needs:
+ - artifacts: True
+ job: ml-cpu-pr-generate
+
+ml-cpu-protected-build:
+ extends: [ ".ml-cpu", ".protected-build" ]
+ trigger:
+ include:
+ - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
+ job: ml-cpu-protected-generate
+ strategy: depend
+ needs:
+ - artifacts: True
+ job: ml-cpu-protected-generate
+
+########################################
+# Machine Learning (CUDA)
+########################################
+.ml-cuda:
+ variables:
+ SPACK_CI_STACK_NAME: ml-cuda
+
+.ml-cuda-generate:
+ extends: .ml-cuda
+ image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21
+ tags: ["spack", "aws", "public", "medium", "x86_64_v4"]
+
+ml-cuda-pr-generate:
+ extends: [ ".ml-cuda-generate", ".pr-generate"]
+
+ml-cuda-protected-generate:
+ extends: [ ".ml-cuda-generate", ".protected-generate"]
+
+ml-cuda-pr-build:
+ extends: [ ".ml-cuda", ".pr-build" ]
+ trigger:
+ include:
+ - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
+ job: ml-cuda-pr-generate
+ strategy: depend
+ needs:
+ - artifacts: True
+ job: ml-cuda-pr-generate
+
+ml-cuda-protected-build:
+ extends: [ ".ml-cuda", ".protected-build" ]
+ trigger:
+ include:
+ - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
+ job: ml-cuda-protected-generate
+ strategy: depend
+ needs:
+ - artifacts: True
+ job: ml-cuda-protected-generate
+
+########################################
+# Machine Learning (ROCm)
+########################################
+.ml-rocm:
+ variables:
+ SPACK_CI_STACK_NAME: ml-rocm
+
+.ml-rocm-generate:
+ extends: .ml-rocm
+ image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21
+ tags: ["spack", "aws", "public", "medium", "x86_64_v4"]
+
+ml-rocm-pr-generate:
+ extends: [ ".ml-rocm-generate", ".pr-generate"]
+
+ml-rocm-protected-generate:
+ extends: [ ".ml-rocm-generate", ".protected-generate"]
+
+ml-rocm-pr-build:
+ extends: [ ".ml-rocm", ".pr-build" ]
+ trigger:
+ include:
+ - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
+ job: ml-rocm-pr-generate
+ strategy: depend
+ needs:
+ - artifacts: True
+ job: ml-rocm-pr-generate
+
+ml-rocm-protected-build:
+ extends: [ ".ml-rocm", ".protected-build" ]
+ trigger:
+ include:
+ - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml
+ job: ml-rocm-protected-generate
+ strategy: depend
+ needs:
+ - artifacts: True
+ job: ml-rocm-protected-generate
diff --git a/share/spack/gitlab/cloud_pipelines/stacks/ml-cpu/spack.yaml b/share/spack/gitlab/cloud_pipelines/stacks/ml-cpu/spack.yaml
new file mode 100644
index 0000000000..a687a6928e
--- /dev/null
+++ b/share/spack/gitlab/cloud_pipelines/stacks/ml-cpu/spack.yaml
@@ -0,0 +1,142 @@
+spack:
+ view: false
+
+ concretizer:
+ reuse: false
+ unify: false
+
+ config:
+ concretizer: clingo
+ install_tree:
+ root: /home/software/spack
+ padded_length: 384
+ projections:
+ all: "{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}"
+
+ packages:
+ all:
+ compiler: [gcc@11.2.0]
+ target: [x86_64_v4]
+ variants: ~cuda~rocm
+
+ specs:
+ # Horovod
+ - py-horovod
+
+ # JAX
+ # https://github.com/google/jax/issues/12614
+ # - py-jax
+ # - py-jaxlib
+
+ # Keras
+ - py-keras
+ - py-keras-applications
+ - py-keras-preprocessing
+ - py-keras2onnx
+
+ # PyTorch
+ - py-botorch
+ - py-efficientnet-pytorch
+ - py-gpytorch
+ - py-kornia
+ - py-pytorch-gradual-warmup-lr
+ - py-pytorch-lightning
+ - py-segmentation-models-pytorch
+ - py-timm
+ - py-torch
+ - py-torch-cluster
+ - py-torch-geometric
+ # https://github.com/NVIDIA/apex/issues/1498
+ # - py-torch-nvidia-apex
+ - py-torch-scatter
+ - py-torch-sparse
+ - py-torch-spline-conv
+ - py-torchaudio
+ - py-torchdata
+ - py-torchfile
+ - py-torchgeo
+ - py-torchmeta
+ - py-torchmetrics
+ - py-torchtext
+ - py-torchvision
+ - py-vector-quantize-pytorch
+
+ # scikit-learn
+ - py-scikit-learn
+ - py-scikit-learn-extra
+
+ # TensorBoard
+ - py-tensorboard
+ - py-tensorboard-data-server
+ - py-tensorboard-plugin-wit
+ - py-tensorboardx
+
+ # TensorFlow
+ - py-tensorflow
+ - py-tensorflow-datasets
+ - py-tensorflow-estimator
+ - py-tensorflow-hub
+ - py-tensorflow-metadata
+ - py-tensorflow-probability
+
+ # XGBoost
+ - py-xgboost
+ # - r-xgboost
+ - xgboost
+
+ mirrors: { "mirror": "s3://spack-binaries/develop/ml-cpu" }
+
+ gitlab-ci:
+ script:
+ - . "./share/spack/setup-env.sh"
+ - spack --version
+ - cd ${SPACK_CONCRETE_ENV_DIR}
+ - spack env activate --without-view .
+ - spack config add "config:install_tree:projections:${SPACK_JOB_SPEC_PKG_NAME}:'morepadding/{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}'"
+ - mkdir -p ${SPACK_ARTIFACTS_ROOT}/user_data
+ - if [[ -r /mnt/key/intermediate_ci_signing_key.gpg ]]; then spack gpg trust /mnt/key/intermediate_ci_signing_key.gpg; fi
+ - if [[ -r /mnt/key/spack_public_key.gpg ]]; then spack gpg trust /mnt/key/spack_public_key.gpg; fi
+ - spack -d ci rebuild > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2)
+
+ mappings:
+ - match:
+ - llvm
+ runner-attributes:
+ tags: [ "spack", "huge", "x86_64_v4" ]
+ variables:
+ CI_JOB_SIZE: huge
+ KUBERNETES_CPU_REQUEST: 11000m
+ KUBERNETES_MEMORY_REQUEST: 42G
+ - match:
+ - "@:"
+ runner-attributes:
+ tags: [ "spack", "large", "x86_64_v4" ]
+ variables:
+ CI_JOB_SIZE: large
+ KUBERNETES_CPU_REQUEST: 8000m
+ KUBERNETES_MEMORY_REQUEST: 12G
+
+ image: { "name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""] }
+
+ broken-specs-url: "s3://spack-binaries/broken-specs"
+
+ service-job-attributes:
+ before_script:
+ - . "./share/spack/setup-env.sh"
+ - spack --version
+ image: { "name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""] }
+ tags: ["spack", "public", "x86_64_v4"]
+
+ signing-job-attributes:
+ image: { "name": "ghcr.io/spack/notary:latest", "entrypoint": [""] }
+ tags: ["spack", "aws"]
+ script:
+ - aws s3 sync --exclude "*" --include "*spec.json*" ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache /tmp
+ - /sign.sh
+ - aws s3 sync --exclude "*" --include "*spec.json.sig*" /tmp ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache
+
+ cdash:
+ build-group: Machine Learning
+ url: https://cdash.spack.io
+ project: Spack Testing
+ site: Cloud Gitlab Infrastructure
diff --git a/share/spack/gitlab/cloud_pipelines/stacks/ml-cuda/spack.yaml b/share/spack/gitlab/cloud_pipelines/stacks/ml-cuda/spack.yaml
new file mode 100644
index 0000000000..eb37168665
--- /dev/null
+++ b/share/spack/gitlab/cloud_pipelines/stacks/ml-cuda/spack.yaml
@@ -0,0 +1,144 @@
+spack:
+ view: false
+
+ concretizer:
+ reuse: false
+ unify: false
+
+ config:
+ concretizer: clingo
+ install_tree:
+ root: /home/software/spack
+ padded_length: 384
+ projections:
+ all: "{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}"
+
+ packages:
+ all:
+ compiler: [gcc@11.2.0]
+ target: [x86_64_v4]
+ variants: ~rocm+cuda cuda_arch=80
+ llvm:
+ # https://github.com/spack/spack/issues/27999
+ require: ~cuda
+
+ specs:
+ # Horovod
+ - py-horovod
+
+ # JAX
+ # https://github.com/google/jax/issues/12614
+ # - py-jax
+ # - py-jaxlib
+
+ # Keras
+ - py-keras
+ - py-keras-applications
+ - py-keras-preprocessing
+ - py-keras2onnx
+
+ # PyTorch
+ - py-botorch
+ - py-efficientnet-pytorch
+ - py-gpytorch
+ - py-kornia
+ - py-pytorch-gradual-warmup-lr
+ - py-pytorch-lightning
+ - py-segmentation-models-pytorch
+ - py-timm
+ - py-torch
+ - py-torch-cluster
+ - py-torch-geometric
+ - py-torch-nvidia-apex
+ - py-torch-scatter
+ - py-torch-sparse
+ - py-torch-spline-conv
+ - py-torchaudio
+ - py-torchdata
+ - py-torchfile
+ - py-torchgeo
+ - py-torchmeta
+ - py-torchmetrics
+ - py-torchtext
+ - py-torchvision
+ - py-vector-quantize-pytorch
+
+ # scikit-learn
+ - py-scikit-learn
+ - py-scikit-learn-extra
+
+ # TensorBoard
+ - py-tensorboard
+ - py-tensorboard-data-server
+ - py-tensorboard-plugin-wit
+ - py-tensorboardx
+
+ # TensorFlow
+ - py-tensorflow
+ - py-tensorflow-datasets
+ - py-tensorflow-estimator
+ - py-tensorflow-hub
+ - py-tensorflow-metadata
+ - py-tensorflow-probability
+
+ # XGBoost
+ - py-xgboost
+ # - r-xgboost
+ - xgboost
+
+ mirrors: { "mirror": "s3://spack-binaries/develop/ml-cuda" }
+
+ gitlab-ci:
+ script:
+ - . "./share/spack/setup-env.sh"
+ - spack --version
+ - cd ${SPACK_CONCRETE_ENV_DIR}
+ - spack env activate --without-view .
+ - spack config add "config:install_tree:projections:${SPACK_JOB_SPEC_PKG_NAME}:'morepadding/{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}'"
+ - mkdir -p ${SPACK_ARTIFACTS_ROOT}/user_data
+ - if [[ -r /mnt/key/intermediate_ci_signing_key.gpg ]]; then spack gpg trust /mnt/key/intermediate_ci_signing_key.gpg; fi
+ - if [[ -r /mnt/key/spack_public_key.gpg ]]; then spack gpg trust /mnt/key/spack_public_key.gpg; fi
+ - spack -d ci rebuild > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2)
+
+ mappings:
+ - match:
+ - llvm
+ runner-attributes:
+ tags: [ "spack", "huge", "x86_64_v4" ]
+ variables:
+ CI_JOB_SIZE: huge
+ KUBERNETES_CPU_REQUEST: 11000m
+ KUBERNETES_MEMORY_REQUEST: 42G
+ - match:
+ - "@:"
+ runner-attributes:
+ tags: [ "spack", "large", "x86_64_v4" ]
+ variables:
+ CI_JOB_SIZE: large
+ KUBERNETES_CPU_REQUEST: 8000m
+ KUBERNETES_MEMORY_REQUEST: 12G
+
+ image: { "name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""] }
+
+ broken-specs-url: "s3://spack-binaries/broken-specs"
+
+ service-job-attributes:
+ before_script:
+ - . "./share/spack/setup-env.sh"
+ - spack --version
+ image: { "name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""] }
+ tags: ["spack", "public", "x86_64_v4"]
+
+ signing-job-attributes:
+ image: { "name": "ghcr.io/spack/notary:latest", "entrypoint": [""] }
+ tags: ["spack", "aws"]
+ script:
+ - aws s3 sync --exclude "*" --include "*spec.json*" ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache /tmp
+ - /sign.sh
+ - aws s3 sync --exclude "*" --include "*spec.json.sig*" /tmp ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache
+
+ cdash:
+ build-group: Machine Learning
+ url: https://cdash.spack.io
+ project: Spack Testing
+ site: Cloud Gitlab Infrastructure
diff --git a/share/spack/gitlab/cloud_pipelines/stacks/ml-rocm/spack.yaml b/share/spack/gitlab/cloud_pipelines/stacks/ml-rocm/spack.yaml
new file mode 100644
index 0000000000..c437b170e4
--- /dev/null
+++ b/share/spack/gitlab/cloud_pipelines/stacks/ml-rocm/spack.yaml
@@ -0,0 +1,147 @@
+spack:
+ view: false
+
+ concretizer:
+ reuse: false
+ unify: false
+
+ config:
+ concretizer: clingo
+ install_tree:
+ root: /home/software/spack
+ padded_length: 384
+ projections:
+ all: "{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}"
+
+ packages:
+ all:
+ compiler: [gcc@11.2.0]
+ target: [x86_64_v4]
+ variants: ~cuda+rocm amdgpu_target=gfx90a
+ gl:
+ require: "osmesa"
+ py-torch:
+ # Does not yet support Spack-installed ROCm
+ require: ~rocm
+
+ specs:
+ # Horovod
+ - py-horovod
+
+ # JAX
+ # https://github.com/google/jax/issues/12614
+ # - py-jax
+ # - py-jaxlib
+
+ # Keras
+ - py-keras
+ - py-keras-applications
+ - py-keras-preprocessing
+ - py-keras2onnx
+
+ # PyTorch
+ # Does not yet support Spack-installed ROCm
+ # - py-botorch
+ # - py-efficientnet-pytorch
+ # - py-gpytorch
+ # - py-kornia
+ # - py-pytorch-gradual-warmup-lr
+ # - py-pytorch-lightning
+ # - py-segmentation-models-pytorch
+ # - py-timm
+ # - py-torch
+ # - py-torch-cluster
+ # - py-torch-geometric
+ # - py-torch-nvidia-apex
+ # - py-torch-scatter
+ # - py-torch-sparse
+ # - py-torch-spline-conv
+ # - py-torchaudio
+ # - py-torchdata
+ # - py-torchfile
+ # - py-torchgeo
+ # - py-torchmeta
+ # - py-torchmetrics
+ # - py-torchtext
+ # - py-torchvision
+ # - py-vector-quantize-pytorch
+
+ # scikit-learn
+ - py-scikit-learn
+ - py-scikit-learn-extra
+
+ # TensorBoard
+ - py-tensorboard
+ - py-tensorboard-data-server
+ - py-tensorboard-plugin-wit
+ - py-tensorboardx
+
+ # TensorFlow
+ - py-tensorflow
+ - py-tensorflow-datasets
+ - py-tensorflow-estimator
+ - py-tensorflow-hub
+ - py-tensorflow-metadata
+ - py-tensorflow-probability
+
+ # XGBoost
+ - py-xgboost
+ # - r-xgboost
+ - xgboost
+
+ mirrors: { "mirror": "s3://spack-binaries/develop/ml-rocm" }
+
+ gitlab-ci:
+ script:
+ - . "./share/spack/setup-env.sh"
+ - spack --version
+ - cd ${SPACK_CONCRETE_ENV_DIR}
+ - spack env activate --without-view .
+ - spack config add "config:install_tree:projections:${SPACK_JOB_SPEC_PKG_NAME}:'morepadding/{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}'"
+ - mkdir -p ${SPACK_ARTIFACTS_ROOT}/user_data
+ - if [[ -r /mnt/key/intermediate_ci_signing_key.gpg ]]; then spack gpg trust /mnt/key/intermediate_ci_signing_key.gpg; fi
+ - if [[ -r /mnt/key/spack_public_key.gpg ]]; then spack gpg trust /mnt/key/spack_public_key.gpg; fi
+ - spack -d ci rebuild > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2)
+
+ mappings:
+ - match:
+ - llvm
+ runner-attributes:
+ tags: [ "spack", "huge", "x86_64_v4" ]
+ variables:
+ CI_JOB_SIZE: huge
+ KUBERNETES_CPU_REQUEST: 11000m
+ KUBERNETES_MEMORY_REQUEST: 42G
+ - match:
+ - "@:"
+ runner-attributes:
+ tags: [ "spack", "large", "x86_64_v4" ]
+ variables:
+ CI_JOB_SIZE: large
+ KUBERNETES_CPU_REQUEST: 8000m
+ KUBERNETES_MEMORY_REQUEST: 12G
+
+ image: { "name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""] }
+
+ broken-specs-url: "s3://spack-binaries/broken-specs"
+
+ service-job-attributes:
+ before_script:
+ - . "./share/spack/setup-env.sh"
+ - spack --version
+ image: { "name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""] }
+ tags: ["spack", "public", "x86_64_v4"]
+
+ signing-job-attributes:
+ image: { "name": "ghcr.io/spack/notary:latest", "entrypoint": [""] }
+ tags: ["spack", "aws"]
+ script:
+ - aws s3 sync --exclude "*" --include "*spec.json*" ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache /tmp
+ - /sign.sh
+ - aws s3 sync --exclude "*" --include "*spec.json.sig*" /tmp ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache
+
+ cdash:
+ build-group: Machine Learning
+ url: https://cdash.spack.io
+ project: Spack Testing
+ site: Cloud Gitlab Infrastructure