From 8a1b81797834dd04de883c767867e2f85d45c9c4 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Sat, 7 Jan 2023 12:22:40 +0100 Subject: Allow spec.json files to be clearsigned, use transparent compression for *.json (#34490) This commit allows (remote) spec.json files to be clearsigned and gzipped. The idea is to reduce the number of requests and number of bytes transferred --- lib/spack/spack/binary_distribution.py | 252 +++++++++++++++++++-------------- lib/spack/spack/spec.py | 31 ---- lib/spack/spack/test/bindist.py | 105 ++++++++++++++ lib/spack/spack/test/cmd/ci.py | 4 +- lib/spack/spack/test/spec_yaml.py | 21 --- 5 files changed, 255 insertions(+), 158 deletions(-) (limited to 'lib') diff --git a/lib/spack/spack/binary_distribution.py b/lib/spack/spack/binary_distribution.py index 8ae7207768..f629ebbae2 100644 --- a/lib/spack/spack/binary_distribution.py +++ b/lib/spack/spack/binary_distribution.py @@ -5,7 +5,9 @@ import codecs import collections +import gzip import hashlib +import io import json import multiprocessing.pool import os @@ -613,6 +615,28 @@ def read_buildinfo_file(prefix): return buildinfo +def transparently_decompress_bytes(binary_stream): + """Wrap stream in a decompress wrapper if gzip compressed""" + # Get magic bytes + if isinstance(binary_stream, io.BytesIO): + # Not peekable... Alternatively io.BufferedReader(io.BytesIO(...)) + # but to add yet another wrapper just to read two bytes that are + # already in memory... sigh. + magic = binary_stream.read(2) + binary_stream.seek(0) + else: + magic = binary_stream.peek(2) + + # Verify magic + if magic.startswith(b"\x1f\x8b"): + return gzip.GzipFile(fileobj=binary_stream) + return binary_stream + + +def transparently_decompress_bytes_to_string(binary_stream, encoding="utf-8"): + return codecs.getreader(encoding)(transparently_decompress_bytes(binary_stream)) + + class BuildManifestVisitor(BaseDirectoryVisitor): """Visitor that collects a list of files and symlinks that can be checked for need of relocation. It knows how @@ -845,6 +869,42 @@ def sign_specfile(key, force, specfile_path): spack.util.gpg.sign(key, specfile_path, signed_specfile_path, clearsign=True) +def _load_clearsigned_json(stream): + # Skip the PGP header + stream.readline() + stream.readline() + json = stream.read() + footer_index = json.rfind("-----BEGIN PGP SIGNATURE-----") + if footer_index == -1 or not json[footer_index - 1].isspace(): + raise ValueError("Could not find PGP signature in clearsigned json file.") + return sjson.load(json[:footer_index]) + + +def _load_possibly_clearsigned_json(stream): + if _is_clearsigned_stream(stream): + return _load_clearsigned_json(stream) + return sjson.load(stream) + + +def _is_clearsigned_stream(stream): + curr = stream.tell() + header = stream.read(34) + stream.seek(curr) + return header == "-----BEGIN PGP SIGNED MESSAGE-----" + + +def is_clearsigned_file(path): + with open(path, "r") as f: + return _is_clearsigned_stream(f) + + +def load_possibly_clearsigned_json(s): + """Deserialize JSON from a string or stream s, removing any clearsign + header/footer.""" + s = io.StringIO(s) if isinstance(s, str) else s + return _load_possibly_clearsigned_json(s) + + def _read_specs_and_push_index(file_list, read_method, cache_prefix, db, temp_dir, concurrency): """Read all the specs listed in the provided list, using thread given thread parallelism, generate the index, and push it to the mirror. @@ -867,11 +927,7 @@ def _read_specs_and_push_index(file_list, read_method, cache_prefix, db, temp_di if spec_file_contents: # Need full spec.json name or this gets confused with index.json. - if spec_url.endswith(".json.sig"): - specfile_json = Spec.extract_json_from_clearsig(spec_file_contents) - return Spec.from_dict(specfile_json) - if spec_url.endswith(".json"): - return Spec.from_json(spec_file_contents) + return Spec.from_dict(load_possibly_clearsigned_json(spec_file_contents)) tp = multiprocessing.pool.ThreadPool(processes=concurrency) try: @@ -933,8 +989,8 @@ def _specs_from_cache_aws_cli(cache_prefix): aws = which("aws") def file_read_method(file_path): - with open(file_path) as fd: - return fd.read() + with open(file_path, "rb") as f: + return transparently_decompress_bytes_to_string(f).read() tmpspecsdir = tempfile.mkdtemp() sync_command_args = [ @@ -981,7 +1037,7 @@ def _specs_from_cache_fallback(cache_prefix): contents = None try: _, _, spec_file = web_util.read_from_url(url) - contents = codecs.getreader("utf-8")(spec_file).read() + contents = transparently_decompress_bytes_to_string(spec_file).read() except (URLError, web_util.SpackWebError) as url_err: tty.error("Error reading specfile: {0}".format(url)) tty.error(url_err) @@ -1379,7 +1435,7 @@ def try_verify(specfile_path): return True -def try_fetch(url_to_fetch): +def try_fetch(url_to_fetch, try_decompress=False): """Utility function to try and fetch a file from a url, stage it locally, and return the path to the staged file. @@ -1398,6 +1454,21 @@ def try_fetch(url_to_fetch): stage.destroy() return None + if not try_decompress: + return stage + + # Stage has some logic for automatically expanding + # archives, but it is based on file extensions. So instead, + # we more or less repeat the logic. + try: + tmp = stage.save_filename + ".tmp" + with gzip.open(stage.save_filename, "rb") as compressed: + with open(tmp, "wb") as decompressed: + shutil.copyfileobj(compressed, decompressed) + os.rename(tmp, stage.save_filename) + except OSError: + pass + return stage @@ -1467,61 +1538,45 @@ def download_tarball(spec, unsigned=False, mirrors_for_spec=None): } ) - tried_to_verify_sigs = [] - - # Assumes we care more about finding a spec file by preferred ext - # than by mirrory priority. This can be made less complicated as - # we remove support for deprecated spec formats and buildcache layouts. + verification_failure = False for ext in ["json.sig", "json"]: - for mirror_to_try in mirrors_to_try: - specfile_url = "{0}.{1}".format(mirror_to_try["specfile"], ext) - spackfile_url = mirror_to_try["spackfile"] - local_specfile_stage = try_fetch(specfile_url) - if local_specfile_stage: - local_specfile_path = local_specfile_stage.save_filename - signature_verified = False - - if ext.endswith(".sig") and not unsigned: - # If we found a signed specfile at the root, try to verify - # the signature immediately. We will not download the - # tarball if we could not verify the signature. - tried_to_verify_sigs.append(specfile_url) - signature_verified = try_verify(local_specfile_path) - if not signature_verified: - tty.warn("Failed to verify: {0}".format(specfile_url)) - - if unsigned or signature_verified or not ext.endswith(".sig"): - # We will download the tarball in one of three cases: - # 1. user asked for --no-check-signature - # 2. user didn't ask for --no-check-signature, but we - # found a spec.json.sig and verified the signature already - # 3. neither of the first two cases are true, but this file - # is *not* a signed json (not a spec.json.sig file). That - # means we already looked at all the mirrors and either didn't - # find any .sig files or couldn't verify any of them. But it - # is still possible to find an old style binary package where - # the signature is a detached .asc file in the outer archive - # of the tarball, and in that case, the only way to know is to - # download the tarball. This is a deprecated use case, so if - # something goes wrong during the extraction process (can't - # verify signature, checksum doesn't match) we will fail at - # that point instead of trying to download more tarballs from - # the remaining mirrors, looking for one we can use. - tarball_stage = try_fetch(spackfile_url) - if tarball_stage: - return { - "tarball_stage": tarball_stage, - "specfile_stage": local_specfile_stage, - "signature_verified": signature_verified, - } + for mirror in mirrors_to_try: + # Try to download the specfile. For any legacy version of Spack's buildcache + # we definitely require this file. + specfile_url = "{0}.{1}".format(mirror["specfile"], ext) + specfile_stage = try_fetch(specfile_url, try_decompress=True) + if not specfile_stage: + continue - local_specfile_stage.destroy() + specfile_path = specfile_stage.save_filename + + # If it is a clearsign file, we must verify it (unless disabled) + should_verify = not unsigned and is_clearsigned_file(specfile_path) + if should_verify and not try_verify(specfile_path): + verification_failure = True + tty.warn("Failed to verify: {0}".format(specfile_url)) + specfile_stage.destroy() + continue + + # In case the spec.json is not clearsigned, it means it's a legacy + # format, where either the signature is in the tarball with binaries, or + # the package is unsigned. Verification + # is then postponed. + spackfile_url = mirror["spackfile"] + tarball_stage = try_fetch(spackfile_url) + if tarball_stage: + return { + "tarball_stage": tarball_stage, + "specfile_stage": specfile_stage, + "signature_verified": should_verify, # should_verify implies it was verified + } + specfile_stage.destroy() # Falling through the nested loops meeans we exhaustively searched # for all known kinds of spec files on all mirrors and did not find # an acceptable one for which we could download a tarball. - if tried_to_verify_sigs: + if verification_failure: raise NoVerifyException( ( "Spack found new style signed binary packages, " @@ -1802,11 +1857,7 @@ def extract_tarball(spec, download_result, allow_root=False, unsigned=False, for specfile_path = download_result["specfile_stage"].save_filename with open(specfile_path, "r") as inputfile: - content = inputfile.read() - if specfile_path.endswith(".json.sig"): - spec_dict = Spec.extract_json_from_clearsig(content) - else: - spec_dict = sjson.load(content) + spec_dict = load_possibly_clearsigned_json(inputfile) bchecksum = spec_dict["binary_cache_checksum"] filename = download_result["tarball_stage"].save_filename @@ -1971,54 +2022,35 @@ def try_direct_fetch(spec, mirrors=None): """ specfile_name = tarball_name(spec, ".spec.json") signed_specfile_name = tarball_name(spec, ".spec.json.sig") - specfile_is_signed = False found_specs = [] for mirror in spack.mirror.MirrorCollection(mirrors=mirrors).values(): - buildcache_fetch_url_json = url_util.join( - mirror.fetch_url, _build_cache_relative_path, specfile_name - ) - buildcache_fetch_url_signed_json = url_util.join( - mirror.fetch_url, _build_cache_relative_path, signed_specfile_name - ) - try: - _, _, fs = web_util.read_from_url(buildcache_fetch_url_signed_json) - specfile_is_signed = True - except (URLError, web_util.SpackWebError, HTTPError) as url_err: + for file in (specfile_name, signed_specfile_name): + url = url_util.join(mirror.fetch_url, _build_cache_relative_path, file) try: - _, _, fs = web_util.read_from_url(buildcache_fetch_url_json) - except (URLError, web_util.SpackWebError, HTTPError) as url_err_x: + _, _, fs = web_util.read_from_url(url) + except (URLError, web_util.SpackWebError, HTTPError) as url_err: tty.debug( - "Did not find {0} on {1}".format( - specfile_name, buildcache_fetch_url_signed_json - ), + "Did not find {0} on {1}".format(specfile_name, url), url_err, level=2, ) - tty.debug( - "Did not find {0} on {1}".format(specfile_name, buildcache_fetch_url_json), - url_err_x, - level=2, - ) continue - specfile_contents = codecs.getreader("utf-8")(fs).read() - - # read the spec from the build cache file. All specs in build caches - # are concrete (as they are built) so we need to mark this spec - # concrete on read-in. - if specfile_is_signed: - specfile_json = Spec.extract_json_from_clearsig(specfile_contents) - fetched_spec = Spec.from_dict(specfile_json) - else: - fetched_spec = Spec.from_json(specfile_contents) - fetched_spec._mark_concrete() - found_specs.append( - { - "mirror_url": mirror.fetch_url, - "spec": fetched_spec, - } - ) + # read the spec from the build cache file. All specs in build caches + # are concrete (as they are built) so we need to mark this spec + # concrete on read-in. + stream = transparently_decompress_bytes_to_string(fs) + fetched_spec = Spec.from_dict(load_possibly_clearsigned_json(stream)) + fetched_spec._mark_concrete() + + found_specs.append( + { + "mirror_url": mirror.fetch_url, + "spec": fetched_spec, + } + ) + break return found_specs @@ -2097,7 +2129,7 @@ def get_keys(install=False, trust=False, force=False, mirrors=None): try: _, _, json_file = web_util.read_from_url(keys_index) - json_index = sjson.load(codecs.getreader("utf-8")(json_file)) + json_index = sjson.load(transparently_decompress_bytes_to_string(json_file)) except (URLError, web_util.SpackWebError) as url_err: if web_util.url_exists(keys_index): err_msg = [ @@ -2422,11 +2454,15 @@ class DefaultIndexFetcher: raise FetchIndexError("Could not fetch index from {}".format(url_index), e) try: - result = codecs.getreader("utf-8")(response).read() + binary_result = response.read() except ValueError as e: return FetchCacheError("Remote index {} is invalid".format(url_index), e) - computed_hash = compute_hash(result) + # The hash is computed on the raw bytes + computed_hash = compute_hash(binary_result) + + # Only then decode as string, possibly decompress + result = transparently_decompress_bytes_to_string(io.BytesIO(binary_result)).read() # We don't handle computed_hash != remote_hash here, which can happen # when remote index.json and index.json.hash are out of sync, or if @@ -2480,15 +2516,21 @@ class EtagIndexFetcher: raise FetchIndexError("Could not fetch index {}".format(url), e) from e try: - result = codecs.getreader("utf-8")(response).read() + binary_result = response.read() except ValueError as e: raise FetchIndexError("Remote index {} is invalid".format(url), e) from e + # The hash is computed on the raw bytes + computed_hash = compute_hash(binary_result) + + # Only then decode as string, possibly decompress + result = transparently_decompress_bytes_to_string(io.BytesIO(binary_result)).read() + headers = response.headers etag_header_value = headers.get("Etag", None) or headers.get("etag", None) return FetchIndexResult( etag=web_util.parse_etag(etag_header_value), - hash=compute_hash(result), + hash=computed_hash, data=result, fresh=False, ) diff --git a/lib/spack/spack/spec.py b/lib/spack/spack/spec.py index e8306ae0b7..95f2951f2b 100644 --- a/lib/spack/spack/spec.py +++ b/lib/spack/spack/spec.py @@ -156,16 +156,6 @@ default_format = "{name}{@version}" default_format += "{%compiler.name}{@compiler.version}{compiler_flags}" default_format += "{variants}{arch=architecture}" -#: Regular expression to pull spec contents out of clearsigned signature -#: file. -CLEARSIGN_FILE_REGEX = re.compile( - ( - r"^-----BEGIN PGP SIGNED MESSAGE-----" - r"\s+Hash:\s+[^\s]+\s+(.+)-----BEGIN PGP SIGNATURE-----" - ), - re.MULTILINE | re.DOTALL, -) - #: specfile format version. Must increase monotonically specfile_format_version = 3 @@ -2453,27 +2443,6 @@ class Spec(object): except Exception as e: raise sjson.SpackJSONError("error parsing JSON spec:", str(e)) from e - @staticmethod - def extract_json_from_clearsig(data): - m = CLEARSIGN_FILE_REGEX.search(data) - if m: - return sjson.load(m.group(1)) - return sjson.load(data) - - @staticmethod - def from_signed_json(stream): - """Construct a spec from clearsigned json spec file. - - Args: - stream: string or file object to read from. - """ - data = stream - if hasattr(stream, "read"): - data = stream.read() - - extracted_json = Spec.extract_json_from_clearsig(data) - return Spec.from_dict(extracted_json) - @staticmethod def from_detection(spec_str, extra_attributes=None): """Construct a spec from a spec string determined during external diff --git a/lib/spack/spack/test/bindist.py b/lib/spack/spack/test/bindist.py index dc6cb35177..9d801c6d5e 100644 --- a/lib/spack/spack/test/bindist.py +++ b/lib/spack/spack/test/bindist.py @@ -3,7 +3,9 @@ # # SPDX-License-Identifier: (Apache-2.0 OR MIT) import glob +import gzip import io +import json import os import platform import sys @@ -889,3 +891,106 @@ def test_default_index_json_404(): with pytest.raises(bindist.FetchIndexError, match="Could not fetch index"): fetcher.conditional_fetch() + + +def test_read_spec_from_signed_json(): + spec_dir = os.path.join(test_path, "data", "mirrors", "signed_json") + file_name = ( + "linux-ubuntu18.04-haswell-gcc-8.4.0-" + "zlib-1.2.12-g7otk5dra3hifqxej36m5qzm7uyghqgb.spec.json.sig" + ) + spec_path = os.path.join(spec_dir, file_name) + + def check_spec(spec_to_check): + assert spec_to_check.name == "zlib" + assert spec_to_check._hash == "g7otk5dra3hifqxej36m5qzm7uyghqgb" + + with open(spec_path) as fd: + s = Spec.from_dict(bindist.load_possibly_clearsigned_json(fd)) + check_spec(s) + + +def test_load_clearsigned_json(): + obj = {"hello": "world"} + clearsigned = """\ +-----BEGIN PGP SIGNED MESSAGE----- +Hash: SHA512 + +{} +-----BEGIN PGP SIGNATURE----- +xyz +-----END PGP SIGNATURE-----""".format( + json.dumps(obj) + ) + + # Should accept strings and streams + assert bindist.load_possibly_clearsigned_json(clearsigned) == obj + assert bindist.load_possibly_clearsigned_json(io.StringIO(clearsigned)) == obj + + +def test_load_without_clearsigned_json(): + obj = {"hello": "world"} + not_clearsigned = json.dumps(obj) + + # Should accept strings and streams + assert bindist.load_possibly_clearsigned_json(not_clearsigned) == obj + assert bindist.load_possibly_clearsigned_json(io.StringIO(not_clearsigned)) == obj + + +def test_json_containing_clearsigned_message_is_json(): + # Test that we don't interpret json with a PGP signed message as a string somewhere + # as a clearsigned message. It should just deserialize the json contents. + val = """\ +"-----BEGIN PGP SIGNED MESSAGE----- +Hash: SHA512 + +{} +-----BEGIN PGP SIGNATURE----- +signature +-----END PGP SIGNATURE----- +""" + input = json.dumps({"key": val}) + assert bindist.load_possibly_clearsigned_json(input)["key"] == val + + +def test_clearsign_signature_part_of_json_string(): + # Check if we can deal with a string in json containing the string that is used + # at the start of a PGP signature. + obj = {"-----BEGIN PGP SIGNATURE-----": "-----BEGIN PGP SIGNATURE-----"} + input = """\ +-----BEGIN PGP SIGNED MESSAGE----- +Hash: SHA512 + +{} +-----BEGIN PGP SIGNATURE----- +signature +-----END PGP SIGNATURE----- +""".format( + json.dumps(obj) + ) + assert bindist.load_possibly_clearsigned_json(input) == obj + + +def test_broken_clearsign_signature(): + # In this test there is no PGP signature. + obj = {"-----BEGIN PGP SIGNATURE-----": "-----BEGIN PGP SIGNATURE-----"} + input = """\ +-----BEGIN PGP SIGNED MESSAGE----- +Hash: SHA512 + +{} +""".format( + json.dumps(obj) + ) + with pytest.raises(ValueError, match="Could not find PGP signature"): + bindist.load_possibly_clearsigned_json(input) + + +def test_transparent_decompression(): + """Test a roundtrip of string -> gzip -> string.""" + text = "this is utf8 that should be compressed" + gzipped = io.BytesIO() + with gzip.GzipFile(fileobj=gzipped, mode="w", compresslevel=1, mtime=0) as f: + f.write(text.encode("utf-8")) + gzipped.seek(0) + assert bindist.transparently_decompress_bytes_to_string(gzipped).read() == text diff --git a/lib/spack/spack/test/cmd/ci.py b/lib/spack/spack/test/cmd/ci.py index 3033862c9b..e27bd48a13 100644 --- a/lib/spack/spack/test/cmd/ci.py +++ b/lib/spack/spack/test/cmd/ci.py @@ -1324,7 +1324,9 @@ spack: if file_name.endswith(".spec.json.sig"): spec_json_path = os.path.join(buildcache_path, file_name) with open(spec_json_path) as json_fd: - json_object = Spec.extract_json_from_clearsig(json_fd.read()) + json_object = spack.binary_distribution.load_possibly_clearsigned_json( + json_fd + ) jsonschema.validate(json_object, specfile_schema) logs_dir = working_dir.join("logs_dir") diff --git a/lib/spack/spack/test/spec_yaml.py b/lib/spack/spack/test/spec_yaml.py index f1adf1cad9..1bc2e4c7be 100644 --- a/lib/spack/spack/test/spec_yaml.py +++ b/lib/spack/spack/test/spec_yaml.py @@ -47,27 +47,6 @@ def test_simple_spec(): check_json_round_trip(spec) -def test_read_spec_from_signed_json(): - spec_dir = os.path.join(spack.paths.test_path, "data", "mirrors", "signed_json") - file_name = ( - "linux-ubuntu18.04-haswell-gcc-8.4.0-" - "zlib-1.2.12-g7otk5dra3hifqxej36m5qzm7uyghqgb.spec.json.sig" - ) - spec_path = os.path.join(spec_dir, file_name) - - def check_spec(spec_to_check): - assert spec_to_check.name == "zlib" - assert spec_to_check._hash == "g7otk5dra3hifqxej36m5qzm7uyghqgb" - - with open(spec_path) as fd: - s = Spec.from_signed_json(fd) - check_spec(s) - - with open(spec_path) as fd: - s = Spec.from_signed_json(fd.read()) - check_spec(s) - - def test_normal_spec(mock_packages): spec = Spec("mpileaks+debug~opt") spec.normalize() -- cgit v1.2.3-60-g2f50