-rw-r--r--  lib/spack/spack/binary_distribution.py  271
-rw-r--r--  lib/spack/spack/test/bindist.py          224
-rw-r--r--  lib/spack/spack/test/web.py               14
-rw-r--r--  lib/spack/spack/util/web.py               30
4 files changed, 442 insertions(+), 97 deletions(-)
diff --git a/lib/spack/spack/binary_distribution.py b/lib/spack/spack/binary_distribution.py
index 39b42f6d08..6ab71e3965 100644
--- a/lib/spack/spack/binary_distribution.py
+++ b/lib/spack/spack/binary_distribution.py
@@ -9,12 +9,16 @@ import hashlib
import json
import multiprocessing.pool
import os
+import re
import shutil
import sys
import tarfile
import tempfile
import time
import traceback
+import urllib.error
+import urllib.parse
+import urllib.request
import warnings
from contextlib import closing
from urllib.error import HTTPError, URLError
@@ -342,7 +346,6 @@ class BinaryCacheIndex(object):
for cached_mirror_url in self._local_index_cache:
cache_entry = self._local_index_cache[cached_mirror_url]
- cached_index_hash = cache_entry["index_hash"]
cached_index_path = cache_entry["index_path"]
if cached_mirror_url in configured_mirror_urls:
# Only do a fetch if the last fetch was longer than TTL ago
@@ -361,13 +364,14 @@ class BinaryCacheIndex(object):
# May need to fetch the index and update the local caches
try:
needs_regen = self._fetch_and_cache_index(
- cached_mirror_url, expect_hash=cached_index_hash
+ cached_mirror_url,
+ cache_entry=cache_entry,
)
self._last_fetch_times[cached_mirror_url] = (now, True)
all_methods_failed = False
- except FetchCacheError as fetch_error:
+ except FetchIndexError as e:
needs_regen = False
- fetch_errors.extend(fetch_error.errors)
+ fetch_errors.append(e)
self._last_fetch_times[cached_mirror_url] = (now, False)
# The need to regenerate implies a need to clear as well.
spec_cache_clear_needed |= needs_regen
@@ -396,20 +400,22 @@ class BinaryCacheIndex(object):
# already have in our cache must be fetched, stored, and represented
# locally.
for mirror_url in configured_mirror_urls:
- if mirror_url not in self._local_index_cache:
- # Need to fetch the index and update the local caches
- try:
- needs_regen = self._fetch_and_cache_index(mirror_url)
- self._last_fetch_times[mirror_url] = (now, True)
- all_methods_failed = False
- except FetchCacheError as fetch_error:
- fetch_errors.extend(fetch_error.errors)
- needs_regen = False
- self._last_fetch_times[mirror_url] = (now, False)
- # Generally speaking, a new mirror wouldn't imply the need to
- # clear the spec cache, so leave it as is.
- if needs_regen:
- spec_cache_regenerate_needed = True
+ if mirror_url in self._local_index_cache:
+ continue
+
+ # Need to fetch the index and update the local caches
+ try:
+ needs_regen = self._fetch_and_cache_index(mirror_url)
+ self._last_fetch_times[mirror_url] = (now, True)
+ all_methods_failed = False
+ except FetchIndexError as e:
+ fetch_errors.append(e)
+ needs_regen = False
+ self._last_fetch_times[mirror_url] = (now, False)
+ # Generally speaking, a new mirror wouldn't imply the need to
+ # clear the spec cache, so leave it as is.
+ if needs_regen:
+ spec_cache_regenerate_needed = True
self._write_local_index_cache()
@@ -423,7 +429,7 @@ class BinaryCacheIndex(object):
if spec_cache_regenerate_needed:
self.regenerate_spec_cache(clear_existing=spec_cache_clear_needed)
- def _fetch_and_cache_index(self, mirror_url, expect_hash=None):
+ def _fetch_and_cache_index(self, mirror_url, cache_entry={}):
"""Fetch a buildcache index file from a remote mirror and cache it.
If we already have a cached index from this mirror, then we first
@@ -431,102 +437,50 @@ class BinaryCacheIndex(object):
Args:
mirror_url (str): Base url of mirror
- expect_hash (str): If provided, this hash will be compared against
- the index hash we retrieve from the mirror, to determine if we
- need to fetch the index or not.
+ cache_entry (dict): Old cache metadata with keys ``index_hash``, ``index_path``,
+ ``etag``
Returns:
- True if this function thinks the concrete spec cache,
- ``_mirrors_for_spec``, should be regenerated. Returns False
- otherwise.
+ True if the local index.json was updated.
+
Throws:
- FetchCacheError: a composite exception.
+ FetchIndexError
"""
- index_fetch_url = url_util.join(mirror_url, _build_cache_relative_path, "index.json")
- hash_fetch_url = url_util.join(mirror_url, _build_cache_relative_path, "index.json.hash")
-
- if not web_util.url_exists(index_fetch_url):
- # A binary mirror is not required to have an index, so avoid
- # raising FetchCacheError in that case.
+ # TODO: get rid of this request, handle 404 better
+ if not web_util.url_exists(
+ url_util.join(mirror_url, _build_cache_relative_path, "index.json")
+ ):
return False
- old_cache_key = None
- fetched_hash = None
-
- errors = []
-
- # Fetch the hash first so we can check if we actually need to fetch
- # the index itself.
- try:
- _, _, fs = web_util.read_from_url(hash_fetch_url)
- fetched_hash = codecs.getreader("utf-8")(fs).read()
- except (URLError, web_util.SpackWebError) as url_err:
- errors.append(
- RuntimeError(
- "Unable to read index hash {0} due to {1}: {2}".format(
- hash_fetch_url, url_err.__class__.__name__, str(url_err)
- )
- )
- )
-
- # The only case where we'll skip attempting to fetch the buildcache
- # index from the mirror is when we already have a hash for this
- # mirror, we were able to retrieve one from the mirror, and
- # the two hashes are the same.
- if expect_hash and fetched_hash:
- if fetched_hash == expect_hash:
- tty.debug("Cached index for {0} already up to date".format(mirror_url))
- return False
- else:
- # We expected a hash, we fetched a hash, and they were not the
- # same. If we end up fetching an index successfully and
- # replacing our entry for this mirror, we should clean up the
- # existing cache file
- if mirror_url in self._local_index_cache:
- existing_entry = self._local_index_cache[mirror_url]
- old_cache_key = existing_entry["index_path"]
-
- tty.debug("Fetching index from {0}".format(index_fetch_url))
-
- # Fetch index itself
- try:
- _, _, fs = web_util.read_from_url(index_fetch_url)
- index_object_str = codecs.getreader("utf-8")(fs).read()
- except (URLError, web_util.SpackWebError) as url_err:
- errors.append(
- RuntimeError(
- "Unable to read index {0} due to {1}: {2}".format(
- index_fetch_url, url_err.__class__.__name__, str(url_err)
- )
- )
+ etag = cache_entry.get("etag", None)
+ if etag:
+ fetcher = EtagIndexFetcher(mirror_url, etag)
+ else:
+ fetcher = DefaultIndexFetcher(
+ mirror_url, local_hash=cache_entry.get("index_hash", None)
+ )
- raise FetchCacheError(errors)
- locally_computed_hash = compute_hash(index_object_str)
+ result = fetcher.conditional_fetch()
- if fetched_hash is not None and locally_computed_hash != fetched_hash:
- msg = (
- "Computed index hash [{0}] did not match remote [{1}, url:{2}] "
- "indicating error in index transmission"
- ).format(locally_computed_hash, fetched_hash, hash_fetch_url)
- errors.append(RuntimeError(msg))
- # We somehow got an index that doesn't match the remote one, maybe
- # the next time we try we'll be successful.
- raise FetchCacheError(errors)
+ # Nothing to do
+ if result.fresh:
+ return False
+ # Persist new index.json
url_hash = compute_hash(mirror_url)
-
- cache_key = "{0}_{1}.json".format(url_hash[:10], locally_computed_hash[:10])
+ cache_key = "{}_{}.json".format(url_hash[:10], result.hash[:10])
self._index_file_cache.init_entry(cache_key)
with self._index_file_cache.write_transaction(cache_key) as (old, new):
- new.write(index_object_str)
+ new.write(result.data)
self._local_index_cache[mirror_url] = {
- "index_hash": locally_computed_hash,
+ "index_hash": result.hash,
"index_path": cache_key,
+ "etag": result.etag,
}
# clean up the old cache_key if necessary
+ old_cache_key = cache_entry.get("index_path", None)
if old_cache_key:
self._index_file_cache.remove(old_cache_key)
@@ -623,7 +577,9 @@ class UnsignedPackageException(spack.error.SpackError):
def compute_hash(data):
- return hashlib.sha256(data.encode("utf-8")).hexdigest()
+ if isinstance(data, str):
+ data = data.encode("utf-8")
+ return hashlib.sha256(data).hexdigest()
def build_cache_relative_path():
@@ -2413,3 +2369,124 @@ class BinaryCacheQuery(object):
# Matching a spec constraint
matches = [s for s in self.possible_specs if s.satisfies(spec)]
return matches
+
+
+class FetchIndexError(Exception):
+ def __str__(self):
+ if len(self.args) == 1:
+ return str(self.args[0])
+ else:
+ return "{}, due to: {}".format(self.args[0], self.args[1])
+
+
+FetchIndexResult = collections.namedtuple("FetchIndexResult", "etag hash data fresh")
+
+
+class DefaultIndexFetcher:
+ """Fetcher for index.json, using separate index.json.hash as cache invalidation strategy"""
+
+ def __init__(self, url, local_hash, urlopen=web_util.urlopen):
+ self.url = url
+ self.local_hash = local_hash
+ self.urlopen = urlopen
+
+ def conditional_fetch(self):
+ # Do an intermediate fetch for the hash
+ # and a conditional fetch for the contents
+ if self.local_hash:
+ url_index_hash = url_util.join(self.url, _build_cache_relative_path, "index.json.hash")
+
+ try:
+ response = self.urlopen(urllib.request.Request(url_index_hash))
+ except urllib.error.URLError as e:
+ raise FetchIndexError("Could not fetch {}".format(url_index_hash), e) from e
+
+ # Validate the hash
+ remote_hash = response.read(64)
+ if not re.match(rb"[a-f\d]{64}$", remote_hash):
+ raise FetchIndexError("Invalid hash format in {}".format(url_index_hash))
+ remote_hash = remote_hash.decode("utf-8")
+
+ # No need to update further
+ if remote_hash == self.local_hash:
+ return FetchIndexResult(etag=None, hash=None, data=None, fresh=True)
+
+ # Otherwise, download index.json
+ url_index = url_util.join(self.url, _build_cache_relative_path, "index.json")
+
+ try:
+ response = self.urlopen(urllib.request.Request(url_index))
+ except urllib.error.URLError as e:
+ raise FetchIndexError("Could not fetch index from {}".format(url_index), e) from e
+
+ try:
+ result = codecs.getreader("utf-8")(response).read()
+ except ValueError as e:
+ raise FetchIndexError("Remote index {} is invalid".format(url_index), e) from e
+
+ computed_hash = compute_hash(result)
+
+ # We don't handle computed_hash != remote_hash here, which can happen
+ # when remote index.json and index.json.hash are out of sync, or if
+ # the hash algorithm changed.
+ # The most likely scenario is that index.json was updated while we
+ # fetched index.json.hash. Warning about it would be misleading, since
+ # it is a race in the cache invalidation strategy rather than an error
+ # in the index itself.
+
+ # For now we only handle etags on http(s), since 304 error handling
+ # in s3:// is not there yet.
+ if urllib.parse.urlparse(self.url).scheme not in ("http", "https"):
+ etag = None
+ else:
+ etag = web_util.parse_etag(
+ response.headers.get("Etag", None) or response.headers.get("etag", None)
+ )
+
+ return FetchIndexResult(
+ etag=etag,
+ hash=computed_hash,
+ data=result,
+ fresh=False,
+ )
+
+
+class EtagIndexFetcher:
+ """Fetcher for index.json, using ETags headers as cache invalidation strategy"""
+
+ def __init__(self, url, etag, urlopen=web_util.urlopen):
+ self.url = url
+ self.etag = etag
+ self.urlopen = urlopen
+
+ def conditional_fetch(self):
+ # Just do a conditional fetch immediately
+ url = url_util.join(self.url, _build_cache_relative_path, "index.json")
+ headers = {
+ "User-Agent": web_util.SPACK_USER_AGENT,
+ "If-None-Match": '"{}"'.format(self.etag),
+ }
+
+ try:
+ response = self.urlopen(urllib.request.Request(url, headers=headers))
+ except urllib.error.HTTPError as e:
+ if e.getcode() == 304:
+ # Not modified; that means fresh.
+ return FetchIndexResult(etag=None, hash=None, data=None, fresh=True)
+ raise FetchIndexError("Could not fetch index {}".format(url), e) from e
+ except urllib.error.URLError as e:
+ raise FetchIndexError("Could not fetch index {}".format(url), e) from e
+
+ try:
+ result = codecs.getreader("utf-8")(response).read()
+ except ValueError as e:
+ raise FetchIndexError("Remote index {} is invalid".format(url), e) from e
+
+ headers = response.headers
+ etag_header_value = headers.get("Etag", None) or headers.get("etag", None)
+ return FetchIndexResult(
+ etag=web_util.parse_etag(etag_header_value),
+ hash=compute_hash(result),
+ data=result,
+ fresh=False,
+ )
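
Taken together, the two fetchers implement one conditional-fetch protocol with two
invalidation strategies: reuse a stored ETag when we have one, otherwise compare
against index.json.hash. A minimal sketch of the dispatch performed by
_fetch_and_cache_index above; the mirror URL and cached metadata are illustrative:

    import spack.binary_distribution as bindist

    # Hypothetical cached metadata; a first fetch has neither an etag nor a hash.
    cache_entry = {"etag": None, "index_hash": None}

    if cache_entry.get("etag"):
        fetcher = bindist.EtagIndexFetcher("https://mirror.example.com", cache_entry["etag"])
    else:
        fetcher = bindist.DefaultIndexFetcher(
            "https://mirror.example.com", local_hash=cache_entry.get("index_hash")
        )

    result = fetcher.conditional_fetch()  # at most two small requests
    if not result.fresh:
        # result.data is the decoded index.json; persist result.hash and
        # result.etag so the next call can revalidate instead of re-downloading.
        cache_entry = {"etag": result.etag, "index_hash": result.hash}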
diff --git a/lib/spack/spack/test/bindist.py b/lib/spack/spack/test/bindist.py
index ef80b2bae3..653d783969 100644
--- a/lib/spack/spack/test/bindist.py
+++ b/lib/spack/spack/test/bindist.py
@@ -3,9 +3,13 @@
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import glob
+import io
import os
import platform
import sys
+import urllib.error
+import urllib.request
+import urllib.response
import py
import pytest
@@ -666,3 +670,223 @@ def test_text_relocate_if_needed(install_mockery, mock_fetch, monkeypatch, capfd
assert join_path("bin", "exe") in manifest["text_to_relocate"]
assert join_path("bin", "otherexe") not in manifest["text_to_relocate"]
assert join_path("bin", "secretexe") not in manifest["text_to_relocate"]
+
+
+def test_etag_fetching_304():
+ # Test conditional fetch with etags. If the remote hasn't modified the file
+ # it returns 304, which is an HTTPError in urllib-land. That should be
+ # handled as success, since it means the local cache is up-to-date.
+ def response_304(request: urllib.request.Request):
+ url = request.get_full_url()
+ if url == "https://www.example.com/build_cache/index.json":
+ assert request.get_header("If-none-match") == '"112a8bbc1b3f7f185621c1ee335f0502"'
+ raise urllib.error.HTTPError(
+ url, 304, "Not Modified", hdrs={}, fp=None # type: ignore[arg-type]
+ )
+ assert False, "Should not fetch {}".format(url)
+
+ fetcher = bindist.EtagIndexFetcher(
+ url="https://www.example.com",
+ etag="112a8bbc1b3f7f185621c1ee335f0502",
+ urlopen=response_304,
+ )
+
+ result = fetcher.conditional_fetch()
+ assert isinstance(result, bindist.FetchIndexResult)
+ assert result.fresh
+
+
+def test_etag_fetching_200():
+ # Test conditional fetch with etags. The remote has modified the file.
+ def response_200(request: urllib.request.Request):
+ url = request.get_full_url()
+ if url == "https://www.example.com/build_cache/index.json":
+ assert request.get_header("If-none-match") == '"112a8bbc1b3f7f185621c1ee335f0502"'
+ return urllib.response.addinfourl(
+ io.BytesIO(b"Result"),
+ headers={"Etag": '"59bcc3ad6775562f845953cf01624225"'}, # type: ignore[arg-type]
+ url=url,
+ code=200,
+ )
+ assert False, "Should not fetch {}".format(url)
+
+ fetcher = bindist.EtagIndexFetcher(
+ url="https://www.example.com",
+ etag="112a8bbc1b3f7f185621c1ee335f0502",
+ urlopen=response_200,
+ )
+
+ result = fetcher.conditional_fetch()
+ assert isinstance(result, bindist.FetchIndexResult)
+ assert not result.fresh
+ assert result.etag == "59bcc3ad6775562f845953cf01624225"
+ assert result.data == "Result" # decoded utf-8.
+ assert result.hash == bindist.compute_hash("Result")
+
+
+def test_etag_fetching_404():
+ # Test conditional fetch with etags. The remote responds 404; expect FetchIndexError.
+ def response_404(request: urllib.request.Request):
+ raise urllib.error.HTTPError(
+ request.get_full_url(),
+ 404,
+ "Not found",
+ hdrs={"Etag": '"59bcc3ad6775562f845953cf01624225"'}, # type: ignore[arg-type]
+ fp=None,
+ )
+
+ fetcher = bindist.EtagIndexFetcher(
+ url="https://www.example.com",
+ etag="112a8bbc1b3f7f185621c1ee335f0502",
+ urlopen=response_404,
+ )
+
+ with pytest.raises(bindist.FetchIndexError):
+ fetcher.conditional_fetch()
+
+
+def test_default_index_fetch_200():
+ index_json = '{"Hello": "World"}'
+ index_json_hash = bindist.compute_hash(index_json)
+
+ def urlopen(request: urllib.request.Request):
+ url = request.get_full_url()
+ if url.endswith("index.json.hash"):
+ return urllib.response.addinfourl( # type: ignore[arg-type]
+ io.BytesIO(index_json_hash.encode()),
+ headers={}, # type: ignore[arg-type]
+ url=url,
+ code=200,
+ )
+
+ elif url.endswith("index.json"):
+ return urllib.response.addinfourl(
+ io.BytesIO(index_json.encode()),
+ headers={"Etag": '"59bcc3ad6775562f845953cf01624225"'}, # type: ignore[arg-type]
+ url=url,
+ code=200,
+ )
+
+ assert False, "Unexpected request {}".format(url)
+
+ fetcher = bindist.DefaultIndexFetcher(
+ url="https://www.example.com", local_hash="outdated", urlopen=urlopen
+ )
+
+ result = fetcher.conditional_fetch()
+
+ assert isinstance(result, bindist.FetchIndexResult)
+ assert not result.fresh
+ assert result.etag == "59bcc3ad6775562f845953cf01624225"
+ assert result.data == index_json
+ assert result.hash == index_json_hash
+
+
+def test_default_index_dont_fetch_index_json_hash_if_no_local_hash():
+ # When we don't have a local hash, we should not fetch the
+ # remote index.json.hash file, but only index.json.
+ index_json = '{"Hello": "World"}'
+ index_json_hash = bindist.compute_hash(index_json)
+
+ def urlopen(request: urllib.request.Request):
+ url = request.get_full_url()
+ if url.endswith("index.json"):
+ return urllib.response.addinfourl(
+ io.BytesIO(index_json.encode()),
+ headers={"Etag": '"59bcc3ad6775562f845953cf01624225"'}, # type: ignore[arg-type]
+ url=url,
+ code=200,
+ )
+
+ assert False, "Unexpected request {}".format(url)
+
+ fetcher = bindist.DefaultIndexFetcher(
+ url="https://www.example.com", local_hash=None, urlopen=urlopen
+ )
+
+ result = fetcher.conditional_fetch()
+
+ assert isinstance(result, bindist.FetchIndexResult)
+ assert result.data == index_json
+ assert result.hash == index_json_hash
+ assert result.etag == "59bcc3ad6775562f845953cf01624225"
+ assert not result.fresh
+
+
+def test_default_index_not_modified():
+ index_json = '{"Hello": "World"}'
+ index_json_hash = bindist.compute_hash(index_json)
+
+ def urlopen(request: urllib.request.Request):
+ url = request.get_full_url()
+ if url.endswith("index.json.hash"):
+ return urllib.response.addinfourl(
+ io.BytesIO(index_json_hash.encode()),
+ headers={}, # type: ignore[arg-type]
+ url=url,
+ code=200,
+ )
+
+ # No request to index.json should be made.
+ assert False, "Unexpected request {}".format(url)
+
+ fetcher = bindist.DefaultIndexFetcher(
+ url="https://www.example.com", local_hash=index_json_hash, urlopen=urlopen
+ )
+
+ assert fetcher.conditional_fetch().fresh
+
+
+@pytest.mark.parametrize("index_json", [b"\xa9", b"!#%^"])
+def test_default_index_invalid_hash_file(index_json):
+ # An index.json.hash response with invalid unicode or a non-sha256 value should raise.
+ index_json_hash = bindist.compute_hash(index_json)
+
+ def urlopen(request: urllib.request.Request):
+ return urllib.response.addinfourl(
+ io.BytesIO(index_json),
+ headers={}, # type: ignore[arg-type]
+ url=request.get_full_url(),
+ code=200,
+ )
+
+ fetcher = bindist.DefaultIndexFetcher(
+ url="https://www.example.com", local_hash=index_json_hash, urlopen=urlopen
+ )
+
+ with pytest.raises(bindist.FetchIndexError, match="Invalid hash format"):
+ fetcher.conditional_fetch()
+
+
+def test_default_index_json_404():
+ # index.json.hash is fetched fine, but index.json itself 404s: expect FetchIndexError.
+ index_json = '{"Hello": "World"}'
+ index_json_hash = bindist.compute_hash(index_json)
+
+ def urlopen(request: urllib.request.Request):
+ url = request.get_full_url()
+ if url.endswith("index.json.hash"):
+ return urllib.response.addinfourl(
+ io.BytesIO(index_json_hash.encode()),
+ headers={}, # type: ignore[arg-type]
+ url=url,
+ code=200,
+ )
+
+ elif url.endswith("index.json"):
+ raise urllib.error.HTTPError(
+ url,
+ code=404,
+ msg="Not Found",
+ hdrs={"Etag": '"59bcc3ad6775562f845953cf01624225"'}, # type: ignore[arg-type]
+ fp=None,
+ )
+
+ assert False, "Unexpected fetch {}".format(url)
+
+ fetcher = bindist.DefaultIndexFetcher(
+ url="https://www.example.com", local_hash="invalid", urlopen=urlopen
+ )
+
+ with pytest.raises(bindist.FetchIndexError, match="Could not fetch index"):
+ fetcher.conditional_fetch()
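
The tests above never touch the network: each injects a fake urlopen returning a
urllib.response.addinfourl, the same response type the real urlopen produces. The
pattern also works for ad-hoc experiments; a minimal stand-alone stub, with body
and ETag made up:

    import io
    import urllib.response

    import spack.binary_distribution as bindist

    def fake_urlopen(request):
        # Answer every request with a tiny index.json body and a fixed ETag.
        return urllib.response.addinfourl(
            io.BytesIO(b'{"database": {"installs": {}}}'),
            headers={"Etag": '"abc123"'},  # a plain dict suffices for .get()
            url=request.get_full_url(),
            code=200,
        )

    fetcher = bindist.DefaultIndexFetcher(
        "https://mirror.example.com", local_hash=None, urlopen=fake_urlopen
    )
    assert fetcher.conditional_fetch().etag == "abc123"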
diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py
index 166a577c89..c7785ab330 100644
--- a/lib/spack/spack/test/web.py
+++ b/lib/spack/spack/test/web.py
@@ -182,6 +182,20 @@ def test_get_header():
spack.util.web.get_header(headers, "ContentLength")
+def test_etag_parser():
+ # This follows RFC 7232 to some extent, relaxing the quote requirement.
+ assert spack.util.web.parse_etag('"abcdef"') == "abcdef"
+ assert spack.util.web.parse_etag("abcdef") == "abcdef"
+
+ # No empty tags
+ assert spack.util.web.parse_etag("") is None
+
+ # No quotes or spaces allowed
+ assert spack.util.web.parse_etag('"abcdef"ghi"') is None
+ assert spack.util.web.parse_etag('"abc def"') is None
+ assert spack.util.web.parse_etag("abc def") is None
+
+
@pytest.mark.skipif(sys.platform == "win32", reason="Not supported on Windows (yet)")
def test_list_url(tmpdir):
testpath = str(tmpdir)
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index 9398a12dd8..90232f7fe1 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -783,6 +783,36 @@ def get_header(headers, header_name):
raise
+def parse_etag(header_value):
+ """Parse a strong etag from an ETag: <value> header value.
+ We don't allow for weakness indicators because it's unclear
+ what that means for cache invalidation."""
+ if header_value is None:
+ return None
+
+ # First follow RFC 7232 section 2.3 mostly:
+ # ETag = entity-tag
+ # entity-tag = [ weak ] opaque-tag
+ # weak = %x57.2F ; "W/", case-sensitive
+ # opaque-tag = DQUOTE *etagc DQUOTE
+ # etagc = %x21 / %x23-7E / obs-text
+ # ; VCHAR except double quotes, plus obs-text
+ # obs-text = %x80-FF
+
+ # That means quotes are required.
+ valid = re.match(r'"([\x21\x23-\x7e\x80-\xFF]+)"$', header_value)
+ if valid:
+ return valid.group(1)
+
+ # However, not everybody adheres to the RFC: some servers send
+ # malformed etags, and s3:// follows a different convention entirely.
+ # In those cases quotes are commonly omitted, while everything else
+ # stays the same.
+ valid = re.match(r"([\x21\x23-\x7e\x80-\xFF]+)$", header_value)
+
+ return valid.group(1) if valid else None
+
+
class FetchError(spack.error.SpackError):
"""Superclass for fetch-related errors."""