author     Harmen Stoppels <harmenstoppels@gmail.com>  2022-11-23 20:26:24 +0100
committer  GitHub <noreply@github.com>  2022-11-23 19:26:24 +0000
commit     d06fd26c9ac8dd525fc129096188e2ea9fd2d0d7 (patch)
tree       bc1c8f62c44fd26457bb60d2bb79b7c9d2eea560 /lib
parent     5d2c9636ffd37b197fc04989371eeb7d0fa7af74 (diff)
`url_exists` related improvements (#34095)
For reasons beyond me, Python thinks it's a great idea to upgrade HEAD requests to GET requests when following redirects. So, this PR adds a better `HTTPRedirectHandler`, and also moves around some ad-hoc logic for disabling SSL certificate verification.

Also, I'm stumped by the fact that Spack's `url_exists` does not use HEAD requests at all, so in certain cases Spack awkwardly downloads something first to see if it can download it, and then downloads it again because it knows it can download it. So, this PR ensures that both urllib and botocore use HEAD requests.

Finally, it also removes some things that were there to support currently unsupported Python versions.

Notice that the HTTP spec [section 10.3.2](https://datatracker.ietf.org/doc/html/rfc2616.html#section-10.3.2) only talks about how to deal with POST requests on redirect (whether to follow the redirect or not):

> If the 301 status code is received in response to a request other
> than GET or HEAD, the user agent MUST NOT automatically redirect the
> request unless it can be confirmed by the user, since this might
> change the conditions under which the request was issued.
>
> Note: When automatically redirecting a POST request after
> receiving a 301 status code, some existing HTTP/1.0 user agents
> will erroneously change it into a GET request.

Python has a comment about this and chooses to go with the "erroneous change". But it then messes up the HEAD request while following the redirect, probably because everyone was too busy discussing how to deal with POST. See https://github.com/python/cpython/pull/99731
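As a standalone illustration of the workaround (outside the diff below), a HEAD-preserving redirect handler can be sketched roughly as follows. The class name `HeadPreservingRedirectHandler` is made up for this sketch; the class this PR actually adds is `BetterHTTPRedirectHandler` in `lib/spack/spack/util/web.py`:

```python
from urllib.request import HTTPRedirectHandler, Request, build_opener


class HeadPreservingRedirectHandler(HTTPRedirectHandler):
    """Keep HEAD requests as HEAD when following redirects.

    urllib's default handler rebuilds the redirected request without
    carrying over the HTTP method, so a HEAD silently turns into a GET
    on the next hop.  Delegating to super() keeps urllib's own status
    code checks; only the method is patched back afterwards.
    """

    def redirect_request(self, old_request, fp, code, msg, headers, newurl):
        new_request = super().redirect_request(old_request, fp, code, msg, headers, newurl)
        if old_request.get_method() == "HEAD":
            new_request.method = "HEAD"
        return new_request


# Exercising redirect_request directly, much like the new unit test below:
original = Request("http://example.com", method="HEAD")
redirected = HeadPreservingRedirectHandler().redirect_request(
    original, None, 301, "Moved Permanently", {}, "http://www.example.com/"
)
assert redirected.get_method() == "HEAD"

# In real use the handler is installed into an opener:
opener = build_opener(HeadPreservingRedirectHandler)
```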
Diffstat (limited to 'lib')
-rw-r--r--  lib/spack/spack/test/web.py   23
-rw-r--r--  lib/spack/spack/util/web.py  119
2 files changed, 78 insertions, 64 deletions
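Before the diff itself, here is roughly what the HEAD-based existence checks described above look like for plain HTTP(S) and for S3. This is a simplified sketch, not Spack's actual `url_exists`: the helper names `http_url_exists` and `s3_object_exists` are invented for illustration, and it assumes `botocore` is installed.

```python
from urllib.error import URLError
from urllib.request import Request, build_opener

from botocore.exceptions import ClientError


def http_url_exists(url, timeout=10):
    # A HEAD request answers "does this exist?" without pulling the body;
    # only a 200 counts as "yes".  Spack's real opener additionally wires in
    # the redirect handler above and its config:verify_ssl handling.
    try:
        response = build_opener().open(Request(url, method="HEAD"), timeout=timeout)
        return response.status == 200
    except URLError:
        return False


def s3_object_exists(s3_client, bucket, key):
    # head_object fetches only metadata, unlike get_object, which would start
    # downloading the body just to prove the key exists.
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
        return True
    except ClientError as err:
        # A missing key surfaces as an error with HTTP status 404; anything
        # else is a real failure and should propagate.
        if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
            return False
        raise
```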
diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py
index 21c00e652c..491c155e51 100644
--- a/lib/spack/spack/test/web.py
+++ b/lib/spack/spack/test/web.py
@@ -6,6 +6,7 @@ import collections
 import os
 import posixpath
 import sys
+from urllib.request import Request
 
 import pytest
 
@@ -222,7 +223,10 @@ class MockPaginator(object):
 
 class MockClientError(Exception):
     def __init__(self):
-        self.response = {"Error": {"Code": "NoSuchKey"}}
+        self.response = {
+            "Error": {"Code": "NoSuchKey"},
+            "ResponseMetadata": {"HTTPStatusCode": 404},
+        }
 
 
 class MockS3Client(object):
@@ -244,6 +248,12 @@ class MockS3Client(object):
             return True
         raise self.ClientError
 
+    def head_object(self, Bucket=None, Key=None):
+        self.ClientError = MockClientError
+        if Bucket == "my-bucket" and Key == "subdirectory/my-file":
+            return True
+        raise self.ClientError
+
 
 def test_gather_s3_information(monkeypatch, capfd):
     mock_connection_data = {
@@ -307,3 +317,14 @@ def test_s3_url_exists(monkeypatch, capfd):
 def test_s3_url_parsing():
     assert spack.util.s3._parse_s3_endpoint_url("example.com") == "https://example.com"
     assert spack.util.s3._parse_s3_endpoint_url("http://example.com") == "http://example.com"
+
+
+def test_head_requests_are_head_requests_after_redirection():
+    # Test whether our workaround for an issue in Python where HEAD requests get
+    # upgraded to GET requests upon redirect works.
+    handler = spack.util.web.BetterHTTPRedirectHandler()
+    initial_request = Request("http://example.com", method="HEAD")
+    redirected_request = handler.redirect_request(
+        initial_request, {}, 302, "Moved Permanently", {}, "http://www.example.com"
+    )
+    assert redirected_request.get_method() == "HEAD"
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index 543bb43c5c..da9c3a7125 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -17,7 +17,7 @@ import sys
 import traceback
 from html.parser import HTMLParser
 from urllib.error import URLError
-from urllib.request import Request, urlopen
+from urllib.request import HTTPRedirectHandler, HTTPSHandler, Request, build_opener
 
 import llnl.util.lang
 import llnl.util.tty as tty
@@ -35,6 +35,44 @@ from spack.util.compression import ALLOWED_ARCHIVE_TYPES
 from spack.util.executable import CommandNotFoundError, which
 from spack.util.path import convert_to_posix_path
 
+
+class BetterHTTPRedirectHandler(HTTPRedirectHandler):
+    """The same as HTTPRedirectHandler, except that it sticks to a HEAD
+    request on redirect. Somehow Python upgrades HEAD requests to GET
+    requests when following redirects, which makes no sense. This
+    handler makes Python's urllib compatible with ``curl -LI``"""
+
+    def redirect_request(self, old_request, fp, code, msg, headers, newurl):
+        new_request = super().redirect_request(old_request, fp, code, msg, headers, newurl)
+        if old_request.get_method() == "HEAD":
+            new_request.method = "HEAD"
+        return new_request
+
+
+def _urlopen():
+    # One opener when SSL is enabled
+    with_ssl = build_opener(
+        BetterHTTPRedirectHandler,
+        HTTPSHandler(context=ssl.create_default_context()),
+    )
+
+    # One opener when SSL is disabled
+    without_ssl = build_opener(
+        BetterHTTPRedirectHandler,
+        HTTPSHandler(context=ssl._create_unverified_context()),
+    )
+
+    # And dynamically dispatch based on the config:verify_ssl.
+    def dispatch_open(*args, **kwargs):
+        opener = with_ssl if spack.config.get("config:verify_ssl", True) else without_ssl
+        return opener.open(*args, **kwargs)
+
+    return dispatch_open
+
+
+#: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
+urlopen = llnl.util.lang.Singleton(_urlopen)
+
 
 #: User-Agent used in Request objects
 SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)
@@ -78,36 +116,12 @@ def uses_ssl(parsed_url):
     return False
 
 
-__UNABLE_TO_VERIFY_SSL = (lambda pyver: ((pyver < (2, 7, 9)) or ((3,) < pyver < (3, 4, 3))))(
-    sys.version_info
-)
-
-
 def read_from_url(url, accept_content_type=None):
     url = url_util.parse(url)
-    context = None
-
-    verify_ssl = spack.config.get("config:verify_ssl")
 
     # Timeout in seconds for web requests
     timeout = spack.config.get("config:connect_timeout", 10)
 
-    # Don't even bother with a context unless the URL scheme is one that uses
-    # SSL certs.
-    if uses_ssl(url):
-        if verify_ssl:
-            if __UNABLE_TO_VERIFY_SSL:
-                # User wants SSL verification, but it cannot be provided.
-                warn_no_ssl_cert_checking()
-            else:
-                # User wants SSL verification, and it *can* be provided.
-                context = ssl.create_default_context()  # novm
-        else:
-            # User has explicitly indicated that they do not want SSL
-            # verification.
-            if not __UNABLE_TO_VERIFY_SSL:
-                context = ssl._create_unverified_context()
-
     url_scheme = url.scheme
     url = url_util.format(url)
     if sys.platform == "win32" and url_scheme == "file":
@@ -123,7 +137,7 @@ def read_from_url(url, accept_content_type=None):
         # one round-trip. However, most servers seem to ignore the header
         # if you ask for a tarball with Accept: text/html.
         req.get_method = lambda: "HEAD"
-        resp = _urlopen(req, timeout=timeout, context=context)
+        resp = urlopen(req, timeout=timeout)
 
         content_type = get_header(resp.headers, "Content-type")
 
@@ -131,7 +145,7 @@ def read_from_url(url, accept_content_type=None):
     req.get_method = lambda: "GET"
 
     try:
-        response = _urlopen(req, timeout=timeout, context=context)
+        response = urlopen(req, timeout=timeout)
     except URLError as err:
         raise SpackWebError("Download failed: {ERROR}".format(ERROR=str(err)))
 
@@ -154,22 +168,11 @@ def read_from_url(url, accept_content_type=None):
     return response.geturl(), response.headers, response
 
 
-def warn_no_ssl_cert_checking():
-    tty.warn(
-        "Spack will not check SSL certificates. You need to update "
-        "your Python to enable certificate verification."
-    )
-
-
 def push_to_url(local_file_path, remote_path, keep_original=True, extra_args=None):
     if sys.platform == "win32":
         if remote_path[1] == ":":
             remote_path = "file://" + remote_path
     remote_url = url_util.parse(remote_path)
-    verify_ssl = spack.config.get("config:verify_ssl")
-
-    if __UNABLE_TO_VERIFY_SSL and verify_ssl and uses_ssl(remote_url):
-        warn_no_ssl_cert_checking()
 
     remote_file_path = url_util.local_file_path(remote_url)
     if remote_file_path is not None:
@@ -405,12 +408,12 @@ def url_exists(url, curl=None):
         )  # noqa: E501
 
         try:
-            s3.get_object(Bucket=url_result.netloc, Key=url_result.path.lstrip("/"))
+            s3.head_object(Bucket=url_result.netloc, Key=url_result.path.lstrip("/"))
             return True
         except s3.ClientError as err:
-            if err.response["Error"]["Code"] == "NoSuchKey":
+            if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
                 return False
-            raise err
+            raise
 
     # Check if Google Storage .. urllib-based fetch
     if url_result.scheme == "gs":
@@ -432,12 +435,14 @@ def url_exists(url, curl=None):
         return curl_exe.returncode == 0
 
     # If we get here, then the only other fetch method option is urllib.
-    # So try to "read" from the URL and assume that *any* non-throwing
-    # response contains the resource represented by the URL.
+    # We try a HEAD request and expect a 200 return code.
     try:
-        read_from_url(url)
-        return True
-    except (SpackWebError, URLError) as e:
+        response = urlopen(
+            Request(url, method="HEAD", headers={"User-Agent": SPACK_USER_AGENT}),
+            timeout=spack.config.get("config:connect_timeout", 10),
+        )
+        return response.status == 200
+    except URLError as e:
         tty.debug("Failure reading URL: " + str(e))
         return False
 
@@ -720,36 +725,24 @@ def spider(root_urls, depth=0, concurrency=32):
     return pages, links
 
 
-def _urlopen(req, *args, **kwargs):
-    """Wrapper for compatibility with old versions of Python."""
+def _open(req, *args, **kwargs):
+    global open
     url = req
     try:
         url = url.get_full_url()
     except AttributeError:
         pass
 
-    # Note: 'context' parameter was only introduced starting
-    # with versions 2.7.9 and 3.4.3 of Python.
-    if __UNABLE_TO_VERIFY_SSL:
-        del kwargs["context"]
-
-    opener = urlopen
     if url_util.parse(url).scheme == "s3":
         import spack.s3_handler
 
-        opener = spack.s3_handler.open
+        return spack.s3_handler.open(req, *args, **kwargs)
     elif url_util.parse(url).scheme == "gs":
         import spack.gcs_handler
 
-        opener = spack.gcs_handler.gcs_open
+        return spack.gcs_handler.gcs_open(req, *args, **kwargs)
 
-    try:
-        return opener(req, *args, **kwargs)
-    except TypeError as err:
-        # If the above fails because of 'context', call without 'context'.
-        if "context" in kwargs and "context" in str(err):
-            del kwargs["context"]
-            return opener(req, *args, **kwargs)
+    return open(req, *args, **kwargs)
 
 
def find_versions_of_archive(