diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/spack/spack/binary_distribution.py | 271 | ||||
-rw-r--r-- | lib/spack/spack/test/bindist.py | 224 | ||||
-rw-r--r-- | lib/spack/spack/test/web.py | 14 | ||||
-rw-r--r-- | lib/spack/spack/util/web.py | 30 |
4 files changed, 442 insertions, 97 deletions
diff --git a/lib/spack/spack/binary_distribution.py b/lib/spack/spack/binary_distribution.py index 39b42f6d08..6ab71e3965 100644 --- a/lib/spack/spack/binary_distribution.py +++ b/lib/spack/spack/binary_distribution.py @@ -9,12 +9,16 @@ import hashlib import json import multiprocessing.pool import os +import re import shutil import sys import tarfile import tempfile import time import traceback +import urllib.error +import urllib.parse +import urllib.request import warnings from contextlib import closing from urllib.error import HTTPError, URLError @@ -342,7 +346,6 @@ class BinaryCacheIndex(object): for cached_mirror_url in self._local_index_cache: cache_entry = self._local_index_cache[cached_mirror_url] - cached_index_hash = cache_entry["index_hash"] cached_index_path = cache_entry["index_path"] if cached_mirror_url in configured_mirror_urls: # Only do a fetch if the last fetch was longer than TTL ago @@ -361,13 +364,14 @@ class BinaryCacheIndex(object): # May need to fetch the index and update the local caches try: needs_regen = self._fetch_and_cache_index( - cached_mirror_url, expect_hash=cached_index_hash + cached_mirror_url, + cache_entry=cache_entry, ) self._last_fetch_times[cached_mirror_url] = (now, True) all_methods_failed = False - except FetchCacheError as fetch_error: + except FetchIndexError as e: needs_regen = False - fetch_errors.extend(fetch_error.errors) + fetch_errors.append(e) self._last_fetch_times[cached_mirror_url] = (now, False) # The need to regenerate implies a need to clear as well. spec_cache_clear_needed |= needs_regen @@ -396,20 +400,22 @@ class BinaryCacheIndex(object): # already have in our cache must be fetched, stored, and represented # locally. 
for mirror_url in configured_mirror_urls: - if mirror_url not in self._local_index_cache: - # Need to fetch the index and update the local caches - try: - needs_regen = self._fetch_and_cache_index(mirror_url) - self._last_fetch_times[mirror_url] = (now, True) - all_methods_failed = False - except FetchCacheError as fetch_error: - fetch_errors.extend(fetch_error.errors) - needs_regen = False - self._last_fetch_times[mirror_url] = (now, False) - # Generally speaking, a new mirror wouldn't imply the need to - # clear the spec cache, so leave it as is. - if needs_regen: - spec_cache_regenerate_needed = True + if mirror_url in self._local_index_cache: + continue + + # Need to fetch the index and update the local caches + try: + needs_regen = self._fetch_and_cache_index(mirror_url) + self._last_fetch_times[mirror_url] = (now, True) + all_methods_failed = False + except FetchIndexError as e: + fetch_errors.append(e) + needs_regen = False + self._last_fetch_times[mirror_url] = (now, False) + # Generally speaking, a new mirror wouldn't imply the need to + # clear the spec cache, so leave it as is. + if needs_regen: + spec_cache_regenerate_needed = True self._write_local_index_cache() @@ -423,7 +429,7 @@ class BinaryCacheIndex(object): if spec_cache_regenerate_needed: self.regenerate_spec_cache(clear_existing=spec_cache_clear_needed) - def _fetch_and_cache_index(self, mirror_url, expect_hash=None): + def _fetch_and_cache_index(self, mirror_url, cache_entry={}): """Fetch a buildcache index file from a remote mirror and cache it. If we already have a cached index from this mirror, then we first @@ -431,102 +437,50 @@ class BinaryCacheIndex(object): Args: mirror_url (str): Base url of mirror - expect_hash (str): If provided, this hash will be compared against - the index hash we retrieve from the mirror, to determine if we - need to fetch the index or not. 
+ cache_entry (dict): Old cache metadata with keys ``index_hash``, ``index_path``, + ``etag`` Returns: - True if this function thinks the concrete spec cache, - ``_mirrors_for_spec``, should be regenerated. Returns False - otherwise. + True if the local index.json was updated. + Throws: - FetchCacheError: a composite exception. + FetchIndexError """ - index_fetch_url = url_util.join(mirror_url, _build_cache_relative_path, "index.json") - hash_fetch_url = url_util.join(mirror_url, _build_cache_relative_path, "index.json.hash") - - if not web_util.url_exists(index_fetch_url): - # A binary mirror is not required to have an index, so avoid - # raising FetchCacheError in that case. + # TODO: get rid of this request, handle 404 better + if not web_util.url_exists( + url_util.join(mirror_url, _build_cache_relative_path, "index.json") + ): return False - old_cache_key = None - fetched_hash = None - - errors = [] - - # Fetch the hash first so we can check if we actually need to fetch - # the index itself. - try: - _, _, fs = web_util.read_from_url(hash_fetch_url) - fetched_hash = codecs.getreader("utf-8")(fs).read() - except (URLError, web_util.SpackWebError) as url_err: - errors.append( - RuntimeError( - "Unable to read index hash {0} due to {1}: {2}".format( - hash_fetch_url, url_err.__class__.__name__, str(url_err) - ) - ) - ) - - # The only case where we'll skip attempting to fetch the buildcache - # index from the mirror is when we already have a hash for this - # mirror, we were able to retrieve one from the mirror, and - # the two hashes are the same. - if expect_hash and fetched_hash: - if fetched_hash == expect_hash: - tty.debug("Cached index for {0} already up to date".format(mirror_url)) - return False - else: - # We expected a hash, we fetched a hash, and they were not the - # same. 
If we end up fetching an index successfully and - # replacing our entry for this mirror, we should clean up the - # existing cache file - if mirror_url in self._local_index_cache: - existing_entry = self._local_index_cache[mirror_url] - old_cache_key = existing_entry["index_path"] - - tty.debug("Fetching index from {0}".format(index_fetch_url)) - - # Fetch index itself - try: - _, _, fs = web_util.read_from_url(index_fetch_url) - index_object_str = codecs.getreader("utf-8")(fs).read() - except (URLError, web_util.SpackWebError) as url_err: - errors.append( - RuntimeError( - "Unable to read index {0} due to {1}: {2}".format( - index_fetch_url, url_err.__class__.__name__, str(url_err) - ) - ) + etag = cache_entry.get("etag", None) + if etag: + fetcher = EtagIndexFetcher(mirror_url, etag) + else: + fetcher = DefaultIndexFetcher( + mirror_url, local_hash=cache_entry.get("index_hash", None) ) - raise FetchCacheError(errors) - locally_computed_hash = compute_hash(index_object_str) + result = fetcher.conditional_fetch() - if fetched_hash is not None and locally_computed_hash != fetched_hash: - msg = ( - "Computed index hash [{0}] did not match remote [{1}, url:{2}] " - "indicating error in index transmission" - ).format(locally_computed_hash, fetched_hash, hash_fetch_url) - errors.append(RuntimeError(msg)) - # We somehow got an index that doesn't match the remote one, maybe - # the next time we try we'll be successful. 
- raise FetchCacheError(errors) + # Nothing to do + if result.fresh: + return False + # Persist new index.json url_hash = compute_hash(mirror_url) - - cache_key = "{0}_{1}.json".format(url_hash[:10], locally_computed_hash[:10]) + cache_key = "{}_{}.json".format(url_hash[:10], result.hash[:10]) self._index_file_cache.init_entry(cache_key) with self._index_file_cache.write_transaction(cache_key) as (old, new): - new.write(index_object_str) + new.write(result.data) self._local_index_cache[mirror_url] = { - "index_hash": locally_computed_hash, + "index_hash": result.hash, "index_path": cache_key, + "etag": result.etag, } # clean up the old cache_key if necessary + old_cache_key = cache_entry.get("index_path", None) if old_cache_key: self._index_file_cache.remove(old_cache_key) @@ -623,7 +577,9 @@ class UnsignedPackageException(spack.error.SpackError): def compute_hash(data): - return hashlib.sha256(data.encode("utf-8")).hexdigest() + if isinstance(data, str): + data = data.encode("utf-8") + return hashlib.sha256(data).hexdigest() def build_cache_relative_path(): @@ -2413,3 +2369,124 @@ class BinaryCacheQuery(object): # Matching a spec constraint matches = [s for s in self.possible_specs if s.satisfies(spec)] return matches + + +class FetchIndexError(Exception): + def __str__(self): + if len(self.args) == 1: + return str(self.args[0]) + else: + return "{}, due to: {}".format(self.args[0], self.args[1]) + + +FetchIndexResult = collections.namedtuple("FetchIndexResult", "etag hash data fresh") + + +class DefaultIndexFetcher: + """Fetcher for index.json, using separate index.json.hash as cache invalidation strategy""" + + def __init__(self, url, local_hash, urlopen=web_util.urlopen): + self.url = url + self.local_hash = local_hash + self.urlopen = urlopen + + def conditional_fetch(self): + # Do an intermediate fetch for the hash + # and a conditional fetch for the contents + if self.local_hash: + url_index_hash = url_util.join(self.url, _build_cache_relative_path, 
"index.json.hash") + + try: + response = self.urlopen(urllib.request.Request(url_index_hash)) + except urllib.error.URLError as e: + raise FetchIndexError("Could not fetch {}".format(url_index_hash), e) from e + + # Validate the hash + remote_hash = response.read(64) + if not re.match(rb"[a-f\d]{64}$", remote_hash): + raise FetchIndexError("Invalid hash format in {}".format(url_index_hash)) + remote_hash = remote_hash.decode("utf-8") + + # No need to update further + if remote_hash == self.local_hash: + return FetchIndexResult(etag=None, hash=None, data=None, fresh=True) + + # Otherwise, download index.json + url_index = url_util.join(self.url, _build_cache_relative_path, "index.json") + + try: + response = self.urlopen(urllib.request.Request(url_index)) + except urllib.error.URLError as e: + raise FetchIndexError("Could not fetch index from {}".format(url_index), e) from e + + try: + result = codecs.getreader("utf-8")(response).read() + except ValueError as e: + raise FetchIndexError("Remote index {} is invalid".format(url_index), e) from e + + computed_hash = compute_hash(result) + + # We don't handle computed_hash != remote_hash here, which can happen + # when remote index.json and index.json.hash are out of sync, or if + # the hash algorithm changed. + # The most likely scenario is that index.json got updated + # while we fetched index.json.hash. Warning about an issue thus feels + # wrong, as it's more of an issue with race conditions in the cache + # invalidation strategy. + + # For now we only handle etags on http(s), since 304 error handling + # in s3:// is not there yet. 
+ if urllib.parse.urlparse(self.url).scheme not in ("http", "https"): + etag = None + else: + etag = web_util.parse_etag( + response.headers.get("Etag", None) or response.headers.get("etag", None) + ) + + return FetchIndexResult( + etag=etag, + hash=computed_hash, + data=result, + fresh=False, + ) + + +class EtagIndexFetcher: + """Fetcher for index.json, using ETags headers as cache invalidation strategy""" + + def __init__(self, url, etag, urlopen=web_util.urlopen): + self.url = url + self.etag = etag + self.urlopen = urlopen + + def conditional_fetch(self): + # Just do a conditional fetch immediately + url = url_util.join(self.url, _build_cache_relative_path, "index.json") + headers = { + "User-Agent": web_util.SPACK_USER_AGENT, + "If-None-Match": '"{}"'.format(self.etag), + } + + try: + response = self.urlopen(urllib.request.Request(url, headers=headers)) + except urllib.error.HTTPError as e: + if e.getcode() == 304: + # Not modified; that means fresh. + return FetchIndexResult(etag=None, hash=None, data=None, fresh=True) + raise FetchIndexError("Could not fetch index {}".format(url), e) from e + except urllib.error.URLError as e: + raise FetchIndexError("Could not fetch index {}".format(url), e) from e + + try: + result = codecs.getreader("utf-8")(response).read() + except ValueError as e: + raise FetchIndexError("Remote index {} is invalid".format(url), e) from e + + headers = response.headers + etag_header_value = headers.get("Etag", None) or headers.get("etag", None) + return FetchIndexResult( + etag=web_util.parse_etag(etag_header_value), + hash=compute_hash(result), + data=result, + fresh=False, + ) diff --git a/lib/spack/spack/test/bindist.py b/lib/spack/spack/test/bindist.py index ef80b2bae3..653d783969 100644 --- a/lib/spack/spack/test/bindist.py +++ b/lib/spack/spack/test/bindist.py @@ -3,9 +3,13 @@ # # SPDX-License-Identifier: (Apache-2.0 OR MIT) import glob +import io import os import platform import sys +import urllib.error +import urllib.request 
+import urllib.response import py import pytest @@ -666,3 +670,223 @@ def test_text_relocate_if_needed(install_mockery, mock_fetch, monkeypatch, capfd assert join_path("bin", "exe") in manifest["text_to_relocate"] assert join_path("bin", "otherexe") not in manifest["text_to_relocate"] assert join_path("bin", "secretexe") not in manifest["text_to_relocate"] + + +def test_etag_fetching_304(): + # Test conditional fetch with etags. If the remote hasn't modified the file + # it returns 304, which is an HTTPError in urllib-land. That should be + # handled as success, since it means the local cache is up-to-date. + def response_304(request: urllib.request.Request): + url = request.get_full_url() + if url == "https://www.example.com/build_cache/index.json": + assert request.get_header("If-none-match") == '"112a8bbc1b3f7f185621c1ee335f0502"' + raise urllib.error.HTTPError( + url, 304, "Not Modified", hdrs={}, fp=None # type: ignore[arg-type] + ) + assert False, "Should not fetch {}".format(url) + + fetcher = bindist.EtagIndexFetcher( + url="https://www.example.com", + etag="112a8bbc1b3f7f185621c1ee335f0502", + urlopen=response_304, + ) + + result = fetcher.conditional_fetch() + assert isinstance(result, bindist.FetchIndexResult) + assert result.fresh + + +def test_etag_fetching_200(): + # Test conditional fetch with etags. The remote has modified the file. 
+ def response_200(request: urllib.request.Request): + url = request.get_full_url() + if url == "https://www.example.com/build_cache/index.json": + assert request.get_header("If-none-match") == '"112a8bbc1b3f7f185621c1ee335f0502"' + return urllib.response.addinfourl( + io.BytesIO(b"Result"), + headers={"Etag": '"59bcc3ad6775562f845953cf01624225"'}, # type: ignore[arg-type] + url=url, + code=200, + ) + assert False, "Should not fetch {}".format(url) + + fetcher = bindist.EtagIndexFetcher( + url="https://www.example.com", + etag="112a8bbc1b3f7f185621c1ee335f0502", + urlopen=response_200, + ) + + result = fetcher.conditional_fetch() + assert isinstance(result, bindist.FetchIndexResult) + assert not result.fresh + assert result.etag == "59bcc3ad6775562f845953cf01624225" + assert result.data == "Result" # decoded utf-8. + assert result.hash == bindist.compute_hash("Result") + + +def test_etag_fetching_404(): + # Test conditional fetch with etags. The remote has modified the file. + def response_404(request: urllib.request.Request): + raise urllib.error.HTTPError( + request.get_full_url(), + 404, + "Not found", + hdrs={"Etag": '"59bcc3ad6775562f845953cf01624225"'}, # type: ignore[arg-type] + fp=None, + ) + + fetcher = bindist.EtagIndexFetcher( + url="https://www.example.com", + etag="112a8bbc1b3f7f185621c1ee335f0502", + urlopen=response_404, + ) + + with pytest.raises(bindist.FetchIndexError): + fetcher.conditional_fetch() + + +def test_default_index_fetch_200(): + index_json = '{"Hello": "World"}' + index_json_hash = bindist.compute_hash(index_json) + + def urlopen(request: urllib.request.Request): + url = request.get_full_url() + if url.endswith("index.json.hash"): + return urllib.response.addinfourl( # type: ignore[arg-type] + io.BytesIO(index_json_hash.encode()), + headers={}, # type: ignore[arg-type] + url=url, + code=200, + ) + + elif url.endswith("index.json"): + return urllib.response.addinfourl( + io.BytesIO(index_json.encode()), + headers={"Etag": 
'"59bcc3ad6775562f845953cf01624225"'}, # type: ignore[arg-type] + url=url, + code=200, + ) + + assert False, "Unexpected request {}".format(url) + + fetcher = bindist.DefaultIndexFetcher( + url="https://www.example.com", local_hash="outdated", urlopen=urlopen + ) + + result = fetcher.conditional_fetch() + + assert isinstance(result, bindist.FetchIndexResult) + assert not result.fresh + assert result.etag == "59bcc3ad6775562f845953cf01624225" + assert result.data == index_json + assert result.hash == index_json_hash + + +def test_default_index_dont_fetch_index_json_hash_if_no_local_hash(): + # When we don't have local hash, we should not be fetching the + # remote index.json.hash file, but only index.json. + index_json = '{"Hello": "World"}' + index_json_hash = bindist.compute_hash(index_json) + + def urlopen(request: urllib.request.Request): + url = request.get_full_url() + if url.endswith("index.json"): + return urllib.response.addinfourl( + io.BytesIO(index_json.encode()), + headers={"Etag": '"59bcc3ad6775562f845953cf01624225"'}, # type: ignore[arg-type] + url=url, + code=200, + ) + + assert False, "Unexpected request {}".format(url) + + fetcher = bindist.DefaultIndexFetcher( + url="https://www.example.com", local_hash=None, urlopen=urlopen + ) + + result = fetcher.conditional_fetch() + + assert isinstance(result, bindist.FetchIndexResult) + assert result.data == index_json + assert result.hash == index_json_hash + assert result.etag == "59bcc3ad6775562f845953cf01624225" + assert not result.fresh + + +def test_default_index_not_modified(): + index_json = '{"Hello": "World"}' + index_json_hash = bindist.compute_hash(index_json) + + def urlopen(request: urllib.request.Request): + url = request.get_full_url() + if url.endswith("index.json.hash"): + return urllib.response.addinfourl( + io.BytesIO(index_json_hash.encode()), + headers={}, # type: ignore[arg-type] + url=url, + code=200, + ) + + # No request to index.json should be made. 
+ assert False, "Unexpected request {}".format(url) + + fetcher = bindist.DefaultIndexFetcher( + url="https://www.example.com", local_hash=index_json_hash, urlopen=urlopen + ) + + assert fetcher.conditional_fetch().fresh + + +@pytest.mark.parametrize("index_json", [b"\xa9", b"!#%^"]) +def test_default_index_invalid_hash_file(index_json): + # Test invalid unicode / invalid hash type + index_json_hash = bindist.compute_hash(index_json) + + def urlopen(request: urllib.request.Request): + return urllib.response.addinfourl( + io.BytesIO(), + headers={}, # type: ignore[arg-type] + url=request.get_full_url(), + code=200, + ) + + fetcher = bindist.DefaultIndexFetcher( + url="https://www.example.com", local_hash=index_json_hash, urlopen=urlopen + ) + + with pytest.raises(bindist.FetchIndexError, match="Invalid hash format"): + fetcher.conditional_fetch() + + +def test_default_index_json_404(): + # The remote index.json itself is missing (404); the fetcher should raise FetchIndexError. + index_json = '{"Hello": "World"}' + index_json_hash = bindist.compute_hash(index_json) + + def urlopen(request: urllib.request.Request): + url = request.get_full_url() + if url.endswith("index.json.hash"): + return urllib.response.addinfourl( + io.BytesIO(index_json_hash.encode()), + headers={}, # type: ignore[arg-type] + url=url, + code=200, + ) + + elif url.endswith("index.json"): + raise urllib.error.HTTPError( + url, + code=404, + msg="Not Found", + hdrs={"Etag": '"59bcc3ad6775562f845953cf01624225"'}, # type: ignore[arg-type] + fp=None, + ) + + assert False, "Unexpected fetch {}".format(url) + + fetcher = bindist.DefaultIndexFetcher( + url="https://www.example.com", local_hash="invalid", urlopen=urlopen + ) + + with pytest.raises(bindist.FetchIndexError, match="Could not fetch index"): + fetcher.conditional_fetch() diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py index 166a577c89..c7785ab330 100644 --- a/lib/spack/spack/test/web.py +++ b/lib/spack/spack/test/web.py @@ -182,6 +182,20 @@ def test_get_header(): 
spack.util.web.get_header(headers, "ContentLength") +def test_etag_parser(): + # This follows rfc7232 to some extent, relaxing the quote requirement. + assert spack.util.web.parse_etag('"abcdef"') == "abcdef" + assert spack.util.web.parse_etag("abcdef") == "abcdef" + + # No empty tags + assert spack.util.web.parse_etag("") is None + + # No quotes or spaces allowed + assert spack.util.web.parse_etag('"abcdef"ghi"') is None + assert spack.util.web.parse_etag('"abc def"') is None + assert spack.util.web.parse_etag("abc def") is None + + @pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)") def test_list_url(tmpdir): testpath = str(tmpdir) diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py index 9398a12dd8..90232f7fe1 100644 --- a/lib/spack/spack/util/web.py +++ b/lib/spack/spack/util/web.py @@ -783,6 +783,36 @@ def get_header(headers, header_name): raise +def parse_etag(header_value): + """Parse a strong etag from an ETag: <value> header value. + We don't allow for weakness indicators because it's unclear + what that means for cache invalidation.""" + if header_value is None: + return None + + # First follow rfc7232 section 2.3 mostly: + # ETag = entity-tag + # entity-tag = [ weak ] opaque-tag + # weak = %x57.2F ; "W/", case-sensitive + # opaque-tag = DQUOTE *etagc DQUOTE + # etagc = %x21 / %x23-7E / obs-text + # ; VCHAR except double quotes, plus obs-text + # obs-text = %x80-FF + + # That means quotes are required. + valid = re.match(r'"([\x21\x23-\x7e\x80-\xFF]+)"$', header_value) + if valid: + return valid.group(1) + + # However, not everybody adheres to the RFC (some servers send + # wrong etags, but also s3:// is simply a different standard). + # In that case, it's common that quotes are omitted, everything + # else stays the same. 
+ valid = re.match(r"([\x21\x23-\x7e\x80-\xFF]+)$", header_value) + + return valid.group(1) if valid else None + + class FetchError(spack.error.SpackError): """Superclass for fetch-related errors.""" |