diff options
author | Adam J. Stewart <ajstewart426@gmail.com> | 2022-10-09 17:39:47 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-10-09 15:39:47 -0700 |
commit | 01ede3c595e8c02ecd791b97a0a1cae3ee30255f (patch) | |
tree | 9bbdd85946ee7f2bcb95c2781d8afd35db2cc161 /share | |
parent | 4a6aff8bd1b3e1cacbdfcf7b1f4f95dbd6b8e3f9 (diff) | |
download | spack-01ede3c595e8c02ecd791b97a0a1cae3ee30255f.tar.gz spack-01ede3c595e8c02ecd791b97a0a1cae3ee30255f.tar.bz2 spack-01ede3c595e8c02ecd791b97a0a1cae3ee30255f.tar.xz spack-01ede3c595e8c02ecd791b97a0a1cae3ee30255f.zip |
Add CI stack for ML packages (#31592)
Basic stack of ML packages we would like to test and generate binaries for in CI.
Spack now has a large CI framework in GitLab for PR testing and public binary generation.
We should take advantage of this to test and distribute optimized binaries for popular ML
frameworks.
This is a pretty extensive initial set, including CPU, ROCm, and CUDA versions of a core
`x86_64_v4` stack.
### Core ML frameworks
These are all popular core ML frameworks already available in Spack.
- [x] PyTorch
- [x] TensorFlow
- [x] Scikit-learn
- [x] MXNet
- [x] CNTK
- [x] Caffe
- [x] Chainer
- [x] XGBoost
- [x] Theano
### ML extensions
These are domain libraries and wrappers that build on top of core ML libraries.
- [x] Keras
- [x] TensorBoard
- [x] torchvision
- [x] torchtext
- [x] torchaudio
- [x] TorchGeo
- [x] PyTorch Lightning
- [x] torchmetrics
- [x] GPyTorch
- [x] Horovod
### ML-adjacent libraries
These are libraries that aren't specific to ML but are still core libraries used in ML pipelines.
- [x] numpy
- [x] scipy
- [x] pandas
- [x] ONNX
- [x] bazel
Co-authored-by: Jonathon Anderson <17242663+blue42u@users.noreply.github.com>
Diffstat (limited to 'share')
4 files changed, 553 insertions, 0 deletions
diff --git a/share/spack/gitlab/cloud_pipelines/.gitlab-ci.yml b/share/spack/gitlab/cloud_pipelines/.gitlab-ci.yml index fcfa925549..17c16e16d6 100644 --- a/share/spack/gitlab/cloud_pipelines/.gitlab-ci.yml +++ b/share/spack/gitlab/cloud_pipelines/.gitlab-ci.yml @@ -749,3 +749,123 @@ tutorial-protected-build: needs: - artifacts: True job: tutorial-protected-generate + +######################################## +# Machine Learning (CPU) +######################################## +.ml-cpu: + variables: + SPACK_CI_STACK_NAME: ml-cpu + +.ml-cpu-generate: + extends: .ml-cpu + image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21 + tags: ["spack", "aws", "public", "medium", "x86_64_v4"] + +ml-cpu-pr-generate: + extends: [ ".ml-cpu-generate", ".pr-generate"] + +ml-cpu-protected-generate: + extends: [ ".ml-cpu-generate", ".protected-generate"] + +ml-cpu-pr-build: + extends: [ ".ml-cpu", ".pr-build" ] + trigger: + include: + - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml + job: ml-cpu-pr-generate + strategy: depend + needs: + - artifacts: True + job: ml-cpu-pr-generate + +ml-cpu-protected-build: + extends: [ ".ml-cpu", ".protected-build" ] + trigger: + include: + - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml + job: ml-cpu-protected-generate + strategy: depend + needs: + - artifacts: True + job: ml-cpu-protected-generate + +######################################## +# Machine Learning (CUDA) +######################################## +.ml-cuda: + variables: + SPACK_CI_STACK_NAME: ml-cuda + +.ml-cuda-generate: + extends: .ml-cuda + image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21 + tags: ["spack", "aws", "public", "medium", "x86_64_v4"] + +ml-cuda-pr-generate: + extends: [ ".ml-cuda-generate", ".pr-generate"] + +ml-cuda-protected-generate: + extends: [ ".ml-cuda-generate", ".protected-generate"] + +ml-cuda-pr-build: + extends: [ ".ml-cuda", ".pr-build" ] + trigger: + include: + - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml + job: ml-cuda-pr-generate + strategy: 
depend + needs: + - artifacts: True + job: ml-cuda-pr-generate + +ml-cuda-protected-build: + extends: [ ".ml-cuda", ".protected-build" ] + trigger: + include: + - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml + job: ml-cuda-protected-generate + strategy: depend + needs: + - artifacts: True + job: ml-cuda-protected-generate + +######################################## +# Machine Learning (ROCm) +######################################## +.ml-rocm: + variables: + SPACK_CI_STACK_NAME: ml-rocm + +.ml-rocm-generate: + extends: .ml-rocm + image: ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21 + tags: ["spack", "aws", "public", "medium", "x86_64_v4"] + +ml-rocm-pr-generate: + extends: [ ".ml-rocm-generate", ".pr-generate"] + +ml-rocm-protected-generate: + extends: [ ".ml-rocm-generate", ".protected-generate"] + +ml-rocm-pr-build: + extends: [ ".ml-rocm", ".pr-build" ] + trigger: + include: + - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml + job: ml-rocm-pr-generate + strategy: depend + needs: + - artifacts: True + job: ml-rocm-pr-generate + +ml-rocm-protected-build: + extends: [ ".ml-rocm", ".protected-build" ] + trigger: + include: + - artifact: jobs_scratch_dir/cloud-ci-pipeline.yml + job: ml-rocm-protected-generate + strategy: depend + needs: + - artifacts: True + job: ml-rocm-protected-generate diff --git a/share/spack/gitlab/cloud_pipelines/stacks/ml-cpu/spack.yaml b/share/spack/gitlab/cloud_pipelines/stacks/ml-cpu/spack.yaml new file mode 100644 index 0000000000..a687a6928e --- /dev/null +++ b/share/spack/gitlab/cloud_pipelines/stacks/ml-cpu/spack.yaml @@ -0,0 +1,142 @@ +spack: + view: false + + concretizer: + reuse: false + unify: false + + config: + concretizer: clingo + install_tree: + root: /home/software/spack + padded_length: 384 + projections: + all: "{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}" + + packages: + all: + compiler: [gcc@11.2.0] + target: [x86_64_v4] + variants: ~cuda~rocm + + specs: + # Horovod + - py-horovod + + # 
JAX + # https://github.com/google/jax/issues/12614 + # - py-jax + # - py-jaxlib + + # Keras + - py-keras + - py-keras-applications + - py-keras-preprocessing + - py-keras2onnx + + # PyTorch + - py-botorch + - py-efficientnet-pytorch + - py-gpytorch + - py-kornia + - py-pytorch-gradual-warmup-lr + - py-pytorch-lightning + - py-segmentation-models-pytorch + - py-timm + - py-torch + - py-torch-cluster + - py-torch-geometric + # https://github.com/NVIDIA/apex/issues/1498 + # - py-torch-nvidia-apex + - py-torch-scatter + - py-torch-sparse + - py-torch-spline-conv + - py-torchaudio + - py-torchdata + - py-torchfile + - py-torchgeo + - py-torchmeta + - py-torchmetrics + - py-torchtext + - py-torchvision + - py-vector-quantize-pytorch + + # scikit-learn + - py-scikit-learn + - py-scikit-learn-extra + + # TensorBoard + - py-tensorboard + - py-tensorboard-data-server + - py-tensorboard-plugin-wit + - py-tensorboardx + + # TensorFlow + - py-tensorflow + - py-tensorflow-datasets + - py-tensorflow-estimator + - py-tensorflow-hub + - py-tensorflow-metadata + - py-tensorflow-probability + + # XGBoost + - py-xgboost + # - r-xgboost + - xgboost + + mirrors: { "mirror": "s3://spack-binaries/develop/ml-cpu" } + + gitlab-ci: + script: + - . "./share/spack/setup-env.sh" + - spack --version + - cd ${SPACK_CONCRETE_ENV_DIR} + - spack env activate --without-view . 
+ - spack config add "config:install_tree:projections:${SPACK_JOB_SPEC_PKG_NAME}:'morepadding/{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}'" + - mkdir -p ${SPACK_ARTIFACTS_ROOT}/user_data + - if [[ -r /mnt/key/intermediate_ci_signing_key.gpg ]]; then spack gpg trust /mnt/key/intermediate_ci_signing_key.gpg; fi + - if [[ -r /mnt/key/spack_public_key.gpg ]]; then spack gpg trust /mnt/key/spack_public_key.gpg; fi + - spack -d ci rebuild > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2) + + mappings: + - match: + - llvm + runner-attributes: + tags: [ "spack", "huge", "x86_64_v4" ] + variables: + CI_JOB_SIZE: huge + KUBERNETES_CPU_REQUEST: 11000m + KUBERNETES_MEMORY_REQUEST: 42G + - match: + - "@:" + runner-attributes: + tags: [ "spack", "large", "x86_64_v4" ] + variables: + CI_JOB_SIZE: large + KUBERNETES_CPU_REQUEST: 8000m + KUBERNETES_MEMORY_REQUEST: 12G + + image: { "name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""] } + + broken-specs-url: "s3://spack-binaries/broken-specs" + + service-job-attributes: + before_script: + - . 
"./share/spack/setup-env.sh" + - spack --version + image: { "name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""] } + tags: ["spack", "public", "x86_64_v4"] + + signing-job-attributes: + image: { "name": "ghcr.io/spack/notary:latest", "entrypoint": [""] } + tags: ["spack", "aws"] + script: + - aws s3 sync --exclude "*" --include "*spec.json*" ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache /tmp + - /sign.sh + - aws s3 sync --exclude "*" --include "*spec.json.sig*" /tmp ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache + + cdash: + build-group: Machine Learning + url: https://cdash.spack.io + project: Spack Testing + site: Cloud Gitlab Infrastructure diff --git a/share/spack/gitlab/cloud_pipelines/stacks/ml-cuda/spack.yaml b/share/spack/gitlab/cloud_pipelines/stacks/ml-cuda/spack.yaml new file mode 100644 index 0000000000..eb37168665 --- /dev/null +++ b/share/spack/gitlab/cloud_pipelines/stacks/ml-cuda/spack.yaml @@ -0,0 +1,144 @@ +spack: + view: false + + concretizer: + reuse: false + unify: false + + config: + concretizer: clingo + install_tree: + root: /home/software/spack + padded_length: 384 + projections: + all: "{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}" + + packages: + all: + compiler: [gcc@11.2.0] + target: [x86_64_v4] + variants: ~rocm+cuda cuda_arch=80 + llvm: + # https://github.com/spack/spack/issues/27999 + require: ~cuda + + specs: + # Horovod + - py-horovod + + # JAX + # https://github.com/google/jax/issues/12614 + # - py-jax + # - py-jaxlib + + # Keras + - py-keras + - py-keras-applications + - py-keras-preprocessing + - py-keras2onnx + + # PyTorch + - py-botorch + - py-efficientnet-pytorch + - py-gpytorch + - py-kornia + - py-pytorch-gradual-warmup-lr + - py-pytorch-lightning + - py-segmentation-models-pytorch + - py-timm + - py-torch + - py-torch-cluster + - py-torch-geometric + - py-torch-nvidia-apex + - py-torch-scatter + - py-torch-sparse + - py-torch-spline-conv + - py-torchaudio + - py-torchdata + - 
py-torchfile + - py-torchgeo + - py-torchmeta + - py-torchmetrics + - py-torchtext + - py-torchvision + - py-vector-quantize-pytorch + + # scikit-learn + - py-scikit-learn + - py-scikit-learn-extra + + # TensorBoard + - py-tensorboard + - py-tensorboard-data-server + - py-tensorboard-plugin-wit + - py-tensorboardx + + # TensorFlow + - py-tensorflow + - py-tensorflow-datasets + - py-tensorflow-estimator + - py-tensorflow-hub + - py-tensorflow-metadata + - py-tensorflow-probability + + # XGBoost + - py-xgboost + # - r-xgboost + - xgboost + + mirrors: { "mirror": "s3://spack-binaries/develop/ml-cuda" } + + gitlab-ci: + script: + - . "./share/spack/setup-env.sh" + - spack --version + - cd ${SPACK_CONCRETE_ENV_DIR} + - spack env activate --without-view . + - spack config add "config:install_tree:projections:${SPACK_JOB_SPEC_PKG_NAME}:'morepadding/{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}'" + - mkdir -p ${SPACK_ARTIFACTS_ROOT}/user_data + - if [[ -r /mnt/key/intermediate_ci_signing_key.gpg ]]; then spack gpg trust /mnt/key/intermediate_ci_signing_key.gpg; fi + - if [[ -r /mnt/key/spack_public_key.gpg ]]; then spack gpg trust /mnt/key/spack_public_key.gpg; fi + - spack -d ci rebuild > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2) + + mappings: + - match: + - llvm + runner-attributes: + tags: [ "spack", "huge", "x86_64_v4" ] + variables: + CI_JOB_SIZE: huge + KUBERNETES_CPU_REQUEST: 11000m + KUBERNETES_MEMORY_REQUEST: 42G + - match: + - "@:" + runner-attributes: + tags: [ "spack", "large", "x86_64_v4" ] + variables: + CI_JOB_SIZE: large + KUBERNETES_CPU_REQUEST: 8000m + KUBERNETES_MEMORY_REQUEST: 12G + + image: { "name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""] } + + broken-specs-url: "s3://spack-binaries/broken-specs" + + service-job-attributes: + before_script: + - . 
"./share/spack/setup-env.sh" + - spack --version + image: { "name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""] } + tags: ["spack", "public", "x86_64_v4"] + + signing-job-attributes: + image: { "name": "ghcr.io/spack/notary:latest", "entrypoint": [""] } + tags: ["spack", "aws"] + script: + - aws s3 sync --exclude "*" --include "*spec.json*" ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache /tmp + - /sign.sh + - aws s3 sync --exclude "*" --include "*spec.json.sig*" /tmp ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache + + cdash: + build-group: Machine Learning + url: https://cdash.spack.io + project: Spack Testing + site: Cloud Gitlab Infrastructure diff --git a/share/spack/gitlab/cloud_pipelines/stacks/ml-rocm/spack.yaml b/share/spack/gitlab/cloud_pipelines/stacks/ml-rocm/spack.yaml new file mode 100644 index 0000000000..c437b170e4 --- /dev/null +++ b/share/spack/gitlab/cloud_pipelines/stacks/ml-rocm/spack.yaml @@ -0,0 +1,147 @@ +spack: + view: false + + concretizer: + reuse: false + unify: false + + config: + concretizer: clingo + install_tree: + root: /home/software/spack + padded_length: 384 + projections: + all: "{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}" + + packages: + all: + compiler: [gcc@11.2.0] + target: [x86_64_v4] + variants: ~cuda+rocm amdgpu_target=gfx90a + gl: + require: "osmesa" + py-torch: + # Does not yet support Spack-installed ROCm + require: ~rocm + + specs: + # Horovod + - py-horovod + + # JAX + # https://github.com/google/jax/issues/12614 + # - py-jax + # - py-jaxlib + + # Keras + - py-keras + - py-keras-applications + - py-keras-preprocessing + - py-keras2onnx + + # PyTorch + # Does not yet support Spack-install ROCm + # - py-botorch + # - py-efficientnet-pytorch + # - py-gpytorch + # - py-kornia + # - py-pytorch-gradual-warmup-lr + # - py-pytorch-lightning + # - py-segmentation-models-pytorch + # - py-timm + # - py-torch + # - py-torch-cluster + # - py-torch-geometric + # - py-torch-nvidia-apex + # 
- py-torch-scatter + # - py-torch-sparse + # - py-torch-spline-conv + # - py-torchaudio + # - py-torchdata + # - py-torchfile + # - py-torchgeo + # - py-torchmeta + # - py-torchmetrics + # - py-torchtext + # - py-torchvision + # - py-vector-quantize-pytorch + + # scikit-learn + - py-scikit-learn + - py-scikit-learn-extra + + # TensorBoard + - py-tensorboard + - py-tensorboard-data-server + - py-tensorboard-plugin-wit + - py-tensorboardx + + # TensorFlow + - py-tensorflow + - py-tensorflow-datasets + - py-tensorflow-estimator + - py-tensorflow-hub + - py-tensorflow-metadata + - py-tensorflow-probability + + # XGBoost + - py-xgboost + # - r-xgboost + - xgboost + + mirrors: { "mirror": "s3://spack-binaries/develop/ml-rocm" } + + gitlab-ci: + script: + - . "./share/spack/setup-env.sh" + - spack --version + - cd ${SPACK_CONCRETE_ENV_DIR} + - spack env activate --without-view . + - spack config add "config:install_tree:projections:${SPACK_JOB_SPEC_PKG_NAME}:'morepadding/{architecture}/{compiler.name}-{compiler.version}/{name}-{version}-{hash}'" + - mkdir -p ${SPACK_ARTIFACTS_ROOT}/user_data + - if [[ -r /mnt/key/intermediate_ci_signing_key.gpg ]]; then spack gpg trust /mnt/key/intermediate_ci_signing_key.gpg; fi + - if [[ -r /mnt/key/spack_public_key.gpg ]]; then spack gpg trust /mnt/key/spack_public_key.gpg; fi + - spack -d ci rebuild > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2) + + mappings: + - match: + - llvm + runner-attributes: + tags: [ "spack", "huge", "x86_64_v4" ] + variables: + CI_JOB_SIZE: huge + KUBERNETES_CPU_REQUEST: 11000m + KUBERNETES_MEMORY_REQUEST: 42G + - match: + - "@:" + runner-attributes: + tags: [ "spack", "large", "x86_64_v4" ] + variables: + CI_JOB_SIZE: large + KUBERNETES_CPU_REQUEST: 8000m + KUBERNETES_MEMORY_REQUEST: 12G + + image: { "name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""] } + + broken-specs-url: "s3://spack-binaries/broken-specs" 
+ + service-job-attributes: + before_script: + - . "./share/spack/setup-env.sh" + - spack --version + image: { "name": "ghcr.io/spack/e4s-amazonlinux-2:v2022-03-21", "entrypoint": [""] } + tags: ["spack", "public", "x86_64_v4"] + + signing-job-attributes: + image: { "name": "ghcr.io/spack/notary:latest", "entrypoint": [""] } + tags: ["spack", "aws"] + script: + - aws s3 sync --exclude "*" --include "*spec.json*" ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache /tmp + - /sign.sh + - aws s3 sync --exclude "*" --include "*spec.json.sig*" /tmp ${SPACK_REMOTE_MIRROR_OVERRIDE}/build_cache + + cdash: + build-group: Machine Learning + url: https://cdash.spack.io + project: Spack Testing + site: Cloud Gitlab Infrastructure |