From 1baed0d833e75a427467ef5686ea73a6c7700069 Mon Sep 17 00:00:00 2001
From: Scott Wittenburg
Date: Thu, 9 Nov 2023 06:30:41 -0700
Subject: buildcache: skip unrecognized metadata files (#40941)

This commit improves forward compatibility of Spack with newer build cache
metadata formats. Before this commit, invalid or unrecognized metadata was a
fatal error; now it just causes the mirror to be skipped.

Co-authored-by: Harmen Stoppels
---
 lib/spack/spack/binary_distribution.py | 118 ++++++++++++++++++++++++---------
 lib/spack/spack/test/bindist.py        |  76 +++++++++++++++++++++
 2 files changed, 163 insertions(+), 31 deletions(-)
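Notes: the heart of the change is a single validation helper whose callers wrap
it in try/except to decide whether a mirror's metadata is usable at all. The
standalone sketch below (placed after the "---" fold, where git am ignores it)
restates that flow outside of Spack; the names validate_spec_file and
MAX_SUPPORTED_LAYOUT are illustrative rather than Spack's API, and the clearsig
branch for .json.sig files is omitted:

    import json

    MAX_SUPPORTED_LAYOUT = 1  # counterpart of CURRENT_BUILD_CACHE_LAYOUT_VERSION


    class InvalidMetadataFile(Exception):
        """Any unusable metadata: unreadable, compressed, unparseable, or too new."""


    def validate_spec_file(path: str, max_supported_layout: int):
        """Return (spec_dict, layout_version) or raise InvalidMetadataFile."""
        try:
            with open(path, "rb") as f:
                binary_content = f.read()
        except OSError as e:
            raise InvalidMetadataFile(f"No such file: {path}") from e

        # gzip streams start with the two magic bytes 0x1f 0x8b.
        if binary_content[:2] == b"\x1f\x8b":
            raise InvalidMetadataFile("Compressed spec files are not supported")

        try:
            spec_dict = json.loads(binary_content.decode("utf-8"))
        except Exception as e:
            raise InvalidMetadataFile(f"Could not parse {path} due to: {e}") from e

        # A missing buildcache_layout_version key means the oldest layout, 0.
        layout_version = int(spec_dict.get("buildcache_layout_version", 0))
        if layout_version > max_supported_layout:
            raise InvalidMetadataFile(f"Layout version {layout_version} is too new")

        return spec_dict, layout_version

Funneling every failure mode into one exception type is what keeps the call
sites in the diff simple: they warn and move on to the next mirror rather than
distinguishing cases.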
diff --git a/lib/spack/spack/binary_distribution.py b/lib/spack/spack/binary_distribution.py
index 6a49ab445e..8cfb891640 100644
--- a/lib/spack/spack/binary_distribution.py
+++ b/lib/spack/spack/binary_distribution.py
@@ -66,8 +66,9 @@ from spack.spec import Spec
 from spack.stage import Stage
 from spack.util.executable import which
 
-_build_cache_relative_path = "build_cache"
-_build_cache_keys_relative_path = "_pgp"
+BUILD_CACHE_RELATIVE_PATH = "build_cache"
+BUILD_CACHE_KEYS_RELATIVE_PATH = "_pgp"
+CURRENT_BUILD_CACHE_LAYOUT_VERSION = 1
 
 
 class BuildCacheDatabase(spack_db.Database):
@@ -481,7 +482,7 @@ class BinaryCacheIndex:
         scheme = urllib.parse.urlparse(mirror_url).scheme
 
         if scheme != "oci" and not web_util.url_exists(
-            url_util.join(mirror_url, _build_cache_relative_path, "index.json")
+            url_util.join(mirror_url, BUILD_CACHE_RELATIVE_PATH, "index.json")
         ):
             return False
 
@@ -600,6 +601,10 @@ class NewLayoutException(spack.error.SpackError):
         super().__init__(msg)
 
 
+class InvalidMetadataFile(spack.error.SpackError):
+    pass
+
+
 class UnsignedPackageException(spack.error.SpackError):
     """
     Raised if installation of unsigned package is attempted without
@@ -614,11 +619,11 @@ def compute_hash(data):
 
 
 def build_cache_relative_path():
-    return _build_cache_relative_path
+    return BUILD_CACHE_RELATIVE_PATH
 
 
 def build_cache_keys_relative_path():
-    return _build_cache_keys_relative_path
+    return BUILD_CACHE_KEYS_RELATIVE_PATH
 
 
 def build_cache_prefix(prefix):
@@ -1401,7 +1406,7 @@ def _build_tarball_in_stage_dir(spec: Spec, out_url: str, stage_dir: str, option
             spec_dict = sjson.load(content)
         else:
             raise ValueError("{0} not a valid spec file type".format(spec_file))
-    spec_dict["buildcache_layout_version"] = 1
+    spec_dict["buildcache_layout_version"] = CURRENT_BUILD_CACHE_LAYOUT_VERSION
     spec_dict["binary_cache_checksum"] = {"hash_algorithm": "sha256", "hash": checksum}
 
     with open(specfile_path, "w") as outfile:
@@ -1560,6 +1565,42 @@ def _delete_staged_downloads(download_result):
         download_result["specfile_stage"].destroy()
 
 
+def _get_valid_spec_file(path: str, max_supported_layout: int) -> Tuple[Dict, int]:
+    """Read and validate a spec file, returning the spec dict with its layout version, or raising
+    InvalidMetadataFile if invalid."""
+    try:
+        with open(path, "rb") as f:
+            binary_content = f.read()
+    except OSError:
+        raise InvalidMetadataFile(f"No such file: {path}")
+
+    # In the future we may support transparently decompressing compressed spec files.
+    if binary_content[:2] == b"\x1f\x8b":
+        raise InvalidMetadataFile("Compressed spec files are not supported")
+
+    try:
+        as_string = binary_content.decode("utf-8")
+        if path.endswith(".json.sig"):
+            spec_dict = Spec.extract_json_from_clearsig(as_string)
+        else:
+            spec_dict = json.loads(as_string)
+    except Exception as e:
+        raise InvalidMetadataFile(f"Could not parse {path} due to: {e}") from e
+
+    # Ensure this version is not too new.
+    try:
+        layout_version = int(spec_dict.get("buildcache_layout_version", 0))
+    except ValueError as e:
+        raise InvalidMetadataFile("Could not parse layout version") from e
+
+    if layout_version > max_supported_layout:
+        raise InvalidMetadataFile(
+            f"Layout version {layout_version} is too new for this version of Spack"
+        )
+
+    return spec_dict, layout_version
+
+
 def download_tarball(spec, unsigned=False, mirrors_for_spec=None):
     """
     Download binary tarball for given package into stage area, returning
@@ -1652,6 +1693,18 @@ def download_tarball(spec, unsigned=False, mirrors_for_spec=None):
             try:
                 local_specfile_stage.fetch()
                 local_specfile_stage.check()
+                try:
+                    _get_valid_spec_file(
+                        local_specfile_stage.save_filename,
+                        CURRENT_BUILD_CACHE_LAYOUT_VERSION,
+                    )
+                except InvalidMetadataFile as e:
+                    tty.warn(
+                        f"Ignoring binary package for {spec.name}/{spec.dag_hash()[:7]} "
+                        f"from {mirror} due to invalid metadata file: {e}"
+                    )
+                    local_specfile_stage.destroy()
+                    continue
             except Exception:
                 continue
             local_specfile_stage.cache_local()
@@ -1674,14 +1727,26 @@ def download_tarball(spec, unsigned=False, mirrors_for_spec=None):
             else:
                 ext = "json.sig" if try_signed else "json"
 
-            specfile_path = url_util.join(mirror, _build_cache_relative_path, specfile_prefix)
+            specfile_path = url_util.join(mirror, BUILD_CACHE_RELATIVE_PATH, specfile_prefix)
             specfile_url = f"{specfile_path}.{ext}"
-            spackfile_url = url_util.join(mirror, _build_cache_relative_path, tarball)
+            spackfile_url = url_util.join(mirror, BUILD_CACHE_RELATIVE_PATH, tarball)
             local_specfile_stage = try_fetch(specfile_url)
             if local_specfile_stage:
                 local_specfile_path = local_specfile_stage.save_filename
                 signature_verified = False
+                try:
+                    _get_valid_spec_file(
+                        local_specfile_path, CURRENT_BUILD_CACHE_LAYOUT_VERSION
+                    )
+                except InvalidMetadataFile as e:
+                    tty.warn(
+                        f"Ignoring binary package for {spec.name}/{spec.dag_hash()[:7]} "
+                        f"from {mirror} due to invalid metadata file: {e}"
+                    )
+                    local_specfile_stage.destroy()
+                    continue
+
                 if try_signed and not unsigned:
                     # If we found a signed specfile at the root, try to verify
                     # the signature immediately.  We will not download the
@@ -2001,24 +2066,16 @@ def extract_tarball(spec, download_result, unsigned=False, force=False, timer=ti
     )
 
     specfile_path = download_result["specfile_stage"].save_filename
-
-    with open(specfile_path, "r") as inputfile:
-        content = inputfile.read()
-        if specfile_path.endswith(".json.sig"):
-            spec_dict = Spec.extract_json_from_clearsig(content)
-        else:
-            spec_dict = sjson.load(content)
-
+    spec_dict, layout_version = _get_valid_spec_file(
+        specfile_path, CURRENT_BUILD_CACHE_LAYOUT_VERSION
+    )
     bchecksum = spec_dict["binary_cache_checksum"]
 
     filename = download_result["tarball_stage"].save_filename
     signature_verified = download_result["signature_verified"]
     tmpdir = None
 
-    if (
-        "buildcache_layout_version" not in spec_dict
-        or int(spec_dict["buildcache_layout_version"]) < 1
-    ):
+    if layout_version == 0:
         # Handle the older buildcache layout where the .spack file
         # contains a spec json, maybe an .asc file (signature),
         # and another tarball containing the actual install tree.
@@ -2029,7 +2086,7 @@ def extract_tarball(spec, download_result, unsigned=False, force=False, timer=ti
             _delete_staged_downloads(download_result)
             shutil.rmtree(tmpdir)
             raise e
-    else:
+    elif layout_version == 1:
         # Newer buildcache layout: the .spack file contains just
         # the install tree; the signature, if it exists, is
         # wrapped around the spec.json at the root.  If sig verify
@@ -2053,7 +2110,6 @@ def extract_tarball(spec, download_result, unsigned=False, force=False, timer=ti
             raise NoChecksumException(
                 tarfile_path, size, contents, "sha256", expected, local_checksum
             )
-
     try:
         with closing(tarfile.open(tarfile_path, "r")) as tar:
             # Remove install prefix from tarfile to extract directly into spec.prefix
@@ -2184,10 +2240,10 @@ def try_direct_fetch(spec, mirrors=None):
 
     for mirror in binary_mirrors:
         buildcache_fetch_url_json = url_util.join(
-            mirror.fetch_url, _build_cache_relative_path, specfile_name
+            mirror.fetch_url, BUILD_CACHE_RELATIVE_PATH, specfile_name
         )
         buildcache_fetch_url_signed_json = url_util.join(
-            mirror.fetch_url, _build_cache_relative_path, signed_specfile_name
+            mirror.fetch_url, BUILD_CACHE_RELATIVE_PATH, signed_specfile_name
         )
         try:
             _, _, fs = web_util.read_from_url(buildcache_fetch_url_signed_json)
@@ -2292,7 +2348,7 @@ def get_keys(install=False, trust=False, force=False, mirrors=None):
     for mirror in mirror_collection.values():
         fetch_url = mirror.fetch_url
         keys_url = url_util.join(
-            fetch_url, _build_cache_relative_path, _build_cache_keys_relative_path
+            fetch_url, BUILD_CACHE_RELATIVE_PATH, BUILD_CACHE_KEYS_RELATIVE_PATH
         )
         keys_index = url_util.join(keys_url, "index.json")
@@ -2357,7 +2413,7 @@ def push_keys(*mirrors, **kwargs):
     for mirror in mirrors:
         push_url = getattr(mirror, "push_url", mirror)
         keys_url = url_util.join(
-            push_url, _build_cache_relative_path, _build_cache_keys_relative_path
+            push_url, BUILD_CACHE_RELATIVE_PATH, BUILD_CACHE_KEYS_RELATIVE_PATH
         )
         keys_local = url_util.local_file_path(keys_url)
@@ -2495,11 +2551,11 @@ def download_buildcache_entry(file_descriptions, mirror_url=None):
     )
 
     if mirror_url:
-        mirror_root = os.path.join(mirror_url, _build_cache_relative_path)
+        mirror_root = os.path.join(mirror_url, BUILD_CACHE_RELATIVE_PATH)
         return _download_buildcache_entry(mirror_root, file_descriptions)
 
     for mirror in spack.mirror.MirrorCollection(binary=True).values():
-        mirror_root = os.path.join(mirror.fetch_url, _build_cache_relative_path)
+        mirror_root = os.path.join(mirror.fetch_url, BUILD_CACHE_RELATIVE_PATH)
         if _download_buildcache_entry(mirror_root, file_descriptions):
             return True
 
@@ -2590,7 +2646,7 @@ class DefaultIndexFetcher:
 
     def get_remote_hash(self):
         # Failure to fetch index.json.hash is not fatal
-        url_index_hash = url_util.join(self.url, _build_cache_relative_path, "index.json.hash")
+        url_index_hash = url_util.join(self.url, BUILD_CACHE_RELATIVE_PATH, "index.json.hash")
         try:
             response = self.urlopen(urllib.request.Request(url_index_hash, headers=self.headers))
         except urllib.error.URLError:
@@ -2611,7 +2667,7 @@ class DefaultIndexFetcher:
             return FetchIndexResult(etag=None, hash=None, data=None, fresh=True)
 
         # Otherwise, download index.json
-        url_index = url_util.join(self.url, _build_cache_relative_path, "index.json")
+        url_index = url_util.join(self.url, BUILD_CACHE_RELATIVE_PATH, "index.json")
 
         try:
             response = self.urlopen(urllib.request.Request(url_index, headers=self.headers))
@@ -2655,7 +2711,7 @@ class EtagIndexFetcher:
 
     def conditional_fetch(self) -> FetchIndexResult:
         # Just do a conditional fetch immediately
-        url = url_util.join(self.url, _build_cache_relative_path, "index.json")
+        url = url_util.join(self.url, BUILD_CACHE_RELATIVE_PATH, "index.json")
         headers = {
             "User-Agent": web_util.SPACK_USER_AGENT,
             "If-None-Match": '"{}"'.format(self.etag),
diff --git a/lib/spack/spack/test/bindist.py b/lib/spack/spack/test/bindist.py
index 20802bbdd8..ea9caf7fc0 100644
--- a/lib/spack/spack/test/bindist.py
+++ b/lib/spack/spack/test/bindist.py
@@ -4,7 +4,9 @@
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 import filecmp
 import glob
+import gzip
 import io
+import json
 import os
 import platform
 import sys
@@ -1112,3 +1114,77 @@ def test_tarfile_of_spec_prefix(tmpdir):
     assert tar.getmember(f"{expected_prefix}/b_directory/file").isreg()
     assert tar.getmember(f"{expected_prefix}/c_directory").isdir()
     assert tar.getmember(f"{expected_prefix}/c_directory/file").isreg()
+
+
+@pytest.mark.parametrize("layout,expect_success", [(None, True), (1, True), (2, False)])
+def test_get_valid_spec_file(tmp_path, layout, expect_success):
+    # Test reading a spec.json file with a missing, supported, or too-new layout version.
+ spec_dict = Spec("example").to_dict() + path = tmp_path / "spec.json" + effective_layout = layout or 0 # If not specified it should be 0 + + # Add a layout version + if layout is not None: + spec_dict["buildcache_layout_version"] = layout + + # Save to file + with open(path, "w") as f: + json.dump(spec_dict, f) + + try: + spec_dict_disk, layout_disk = bindist._get_valid_spec_file( + str(path), max_supported_layout=1 + ) + assert expect_success + assert spec_dict_disk == spec_dict + assert layout_disk == effective_layout + except bindist.InvalidMetadataFile: + assert not expect_success + + +def test_get_valid_spec_file_doesnt_exist(tmp_path): + with pytest.raises(bindist.InvalidMetadataFile, match="No such file"): + bindist._get_valid_spec_file(str(tmp_path / "no-such-file"), max_supported_layout=1) + + +def test_get_valid_spec_file_gzipped(tmp_path): + # Create a gzipped file, contents don't matter + path = tmp_path / "spec.json.gz" + with gzip.open(path, "wb") as f: + f.write(b"hello") + with pytest.raises( + bindist.InvalidMetadataFile, match="Compressed spec files are not supported" + ): + bindist._get_valid_spec_file(str(path), max_supported_layout=1) + + +@pytest.mark.parametrize("filename", ["spec.json", "spec.json.sig"]) +def test_get_valid_spec_file_no_json(tmp_path, filename): + tmp_path.joinpath(filename).write_text("not json") + with pytest.raises(bindist.InvalidMetadataFile): + bindist._get_valid_spec_file(str(tmp_path / filename), max_supported_layout=1) + + +def test_download_tarball_with_unsupported_layout_fails(tmp_path, mutable_config, capsys): + layout_version = bindist.CURRENT_BUILD_CACHE_LAYOUT_VERSION + 1 + spec = Spec("gmake@4.4.1%gcc@13.1.0 arch=linux-ubuntu23.04-zen2") + spec._mark_concrete() + spec_dict = spec.to_dict() + spec_dict["buildcache_layout_version"] = layout_version + + # Setup a basic local build cache structure + path = ( + tmp_path / bindist.build_cache_relative_path() / bindist.tarball_name(spec, ".spec.json") + ) + path.parent.mkdir(parents=True) + with open(path, "w") as f: + json.dump(spec_dict, f) + + # Configure as a mirror. + mirror_cmd("add", "test-mirror", str(tmp_path)) + + # Shouldn't be able "download" this. + assert bindist.download_tarball(spec, unsigned=True) is None + + # And there should be a warning about an unsupported layout version. + assert f"Layout version {layout_version} is too new" in capsys.readouterr().err -- cgit v1.2.3-60-g2f50