author     Harmen Stoppels <harmenstoppels@gmail.com>  2022-11-23 20:26:24 +0100
committer  GitHub <noreply@github.com>  2022-11-23 19:26:24 +0000
commit     d06fd26c9ac8dd525fc129096188e2ea9fd2d0d7 (patch)
tree       bc1c8f62c44fd26457bb60d2bb79b7c9d2eea560 /lib
parent     5d2c9636ffd37b197fc04989371eeb7d0fa7af74 (diff)
`url_exists` related improvements (#34095)
For reasons beyond me, Python thinks it's a great idea to upgrade HEAD requests to GET requests when following redirects. So, this PR adds a better `HTTPRedirectHandler`, and also moves around some ad-hoc logic for disabling SSL certificate verification.

Also, I'm stumped by the fact that Spack's `url_exists` does not use HEAD requests at all, so in certain cases Spack awkwardly downloads something first to see if it can download it, and then downloads it again because it knows it can download it. So, this PR ensures that both urllib and botocore use HEAD requests.

Finally, it also removes some things that were there to support currently unsupported Python versions.

Notice that the HTTP spec [section 10.3.2](https://datatracker.ietf.org/doc/html/rfc2616.html#section-10.3.2) only talks about how to deal with POST requests on redirect (whether to follow the redirect or not):

> If the 301 status code is received in response to a request other
> than GET or HEAD, the user agent MUST NOT automatically redirect the
> request unless it can be confirmed by the user, since this might
> change the conditions under which the request was issued.
>
> Note: When automatically redirecting a POST request after
> receiving a 301 status code, some existing HTTP/1.0 user agents
> will erroneously change it into a GET request.

Python has a comment about this and chooses to go with the "erroneous change". But it then messes up the HEAD request while following the redirect, probably because everyone was too busy discussing how to deal with POST. See https://github.com/python/cpython/pull/99731
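As a standalone illustration of the workaround (outside the diff below), a HEAD-preserving redirect handler can be sketched roughly as follows. The class name `HeadPreservingRedirectHandler` is made up for this sketch; the class this PR actually adds is `BetterHTTPRedirectHandler` in `lib/spack/spack/util/web.py`:

```python
from urllib.request import HTTPRedirectHandler, Request, build_opener


class HeadPreservingRedirectHandler(HTTPRedirectHandler):
    """Keep HEAD requests as HEAD when following redirects.

    urllib's default handler rebuilds the redirected request without
    carrying over the HTTP method, so a HEAD silently turns into a GET
    on the next hop.  Delegating to super() keeps urllib's own status
    code checks; only the method is patched back afterwards.
    """

    def redirect_request(self, old_request, fp, code, msg, headers, newurl):
        new_request = super().redirect_request(old_request, fp, code, msg, headers, newurl)
        if old_request.get_method() == "HEAD":
            new_request.method = "HEAD"
        return new_request


# Exercising redirect_request directly, much like the new unit test below:
original = Request("http://example.com", method="HEAD")
redirected = HeadPreservingRedirectHandler().redirect_request(
    original, None, 301, "Moved Permanently", {}, "http://www.example.com/"
)
assert redirected.get_method() == "HEAD"

# In real use the handler is installed into an opener:
opener = build_opener(HeadPreservingRedirectHandler)
```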
Diffstat (limited to 'lib')
-rw-r--r--  lib/spack/spack/test/web.py   23
-rw-r--r--  lib/spack/spack/util/web.py  119
2 files changed, 78 insertions, 64 deletions
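Before the diff itself, here is roughly what the HEAD-based existence checks described above look like for plain HTTP(S) and for S3. This is a simplified sketch, not Spack's actual `url_exists`: the helper names `http_url_exists` and `s3_object_exists` are invented for illustration, and it assumes `botocore` is installed.

```python
from urllib.error import URLError
from urllib.request import Request, build_opener

from botocore.exceptions import ClientError


def http_url_exists(url, timeout=10):
    # A HEAD request answers "does this exist?" without pulling the body;
    # only a 200 counts as "yes".  Spack's real opener additionally wires in
    # the redirect handler above and its config:verify_ssl handling.
    try:
        response = build_opener().open(Request(url, method="HEAD"), timeout=timeout)
        return response.status == 200
    except URLError:
        return False


def s3_object_exists(s3_client, bucket, key):
    # head_object fetches only metadata, unlike get_object, which would start
    # downloading the body just to prove the key exists.
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
        return True
    except ClientError as err:
        # A missing key surfaces as an error with HTTP status 404; anything
        # else is a real failure and should propagate.
        if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
            return False
        raise
```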
diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py
index 21c00e652c..491c155e51 100644
--- a/lib/spack/spack/test/web.py
+++ b/lib/spack/spack/test/web.py
@@ -6,6 +6,7 @@ import collections
 import os
 import posixpath
 import sys
+from urllib.request import Request
 
 import pytest
 
@@ -222,7 +223,10 @@ class MockPaginator(object):
 
 class MockClientError(Exception):
     def __init__(self):
-        self.response = {"Error": {"Code": "NoSuchKey"}}
+        self.response = {
+            "Error": {"Code": "NoSuchKey"},
+            "ResponseMetadata": {"HTTPStatusCode": 404},
+        }
 
 
 class MockS3Client(object):
@@ -244,6 +248,12 @@ class MockS3Client(object):
             return True
         raise self.ClientError
 
+    def head_object(self, Bucket=None, Key=None):
+        self.ClientError = MockClientError
+        if Bucket == "my-bucket" and Key == "subdirectory/my-file":
+            return True
+        raise self.ClientError
+
 
 def test_gather_s3_information(monkeypatch, capfd):
     mock_connection_data = {
@@ -307,3 +317,14 @@ def test_s3_url_exists(monkeypatch, capfd):
 def test_s3_url_parsing():
     assert spack.util.s3._parse_s3_endpoint_url("example.com") == "https://example.com"
     assert spack.util.s3._parse_s3_endpoint_url("http://example.com") == "http://example.com"
+
+
+def test_head_requests_are_head_requests_after_redirection():
+    # Test whether our workaround for an issue in Python where HEAD requests get
+    # upgraded to GET requests upon redirect works.
+    handler = spack.util.web.BetterHTTPRedirectHandler()
+    initial_request = Request("http://example.com", method="HEAD")
+    redirected_request = handler.redirect_request(
+        initial_request, {}, 302, "Moved Permanently", {}, "http://www.example.com"
+    )
+    assert redirected_request.get_method() == "HEAD"
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index 543bb43c5c..da9c3a7125 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -17,7 +17,7 @@ import sys
 import traceback
 from html.parser import HTMLParser
 from urllib.error import URLError
-from urllib.request import Request, urlopen
+from urllib.request import HTTPRedirectHandler, HTTPSHandler, Request, build_opener
 
 import llnl.util.lang
 import llnl.util.tty as tty
@@ -35,6 +35,44 @@ from spack.util.compression import ALLOWED_ARCHIVE_TYPES
 from spack.util.executable import CommandNotFoundError, which
 from spack.util.path import convert_to_posix_path
 
+
+class BetterHTTPRedirectHandler(HTTPRedirectHandler):
+    """The same as HTTPRedirectHandler, except that it sticks to a HEAD
+    request on redirect. Somehow Python upgrades HEAD requests to GET
+    requests when following redirects, which makes no sense. This
+    handler makes Python's urllib compatible with ``curl -LI``"""
+
+    def redirect_request(self, old_request, fp, code, msg, headers, newurl):
+        new_request = super().redirect_request(old_request, fp, code, msg, headers, newurl)
+        if old_request.get_method() == "HEAD":
+            new_request.method = "HEAD"
+        return new_request
+
+
+def _urlopen():
+    # One opener when SSL is enabled
+    with_ssl = build_opener(
+        BetterHTTPRedirectHandler,
+        HTTPSHandler(context=ssl.create_default_context()),
+    )
+
+    # One opener when SSL is disabled
+    without_ssl = build_opener(
+        BetterHTTPRedirectHandler,
+        HTTPSHandler(context=ssl._create_unverified_context()),
+    )
+
+    # And dynamically dispatch based on the config:verify_ssl.
+    def dispatch_open(*args, **kwargs):
+        opener = with_ssl if spack.config.get("config:verify_ssl", True) else without_ssl
+        return opener.open(*args, **kwargs)
+
+    return dispatch_open
+
+
+#: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
+urlopen = llnl.util.lang.Singleton(_urlopen)
+
 
 #: User-Agent used in Request objects
 SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)
@@ -78,36 +116,12 @@ def uses_ssl(parsed_url):
     return False
 
 
-__UNABLE_TO_VERIFY_SSL = (lambda pyver: ((pyver < (2, 7, 9)) or ((3,) < pyver < (3, 4, 3))))(
-    sys.version_info
-)
-
-
 def read_from_url(url, accept_content_type=None):
     url = url_util.parse(url)
-    context = None
-
-    verify_ssl = spack.config.get("config:verify_ssl")
 
     # Timeout in seconds for web requests
     timeout = spack.config.get("config:connect_timeout", 10)
 
-    # Don't even bother with a context unless the URL scheme is one that uses
-    # SSL certs.
-    if uses_ssl(url):
-        if verify_ssl:
-            if __UNABLE_TO_VERIFY_SSL:
-                # User wants SSL verification, but it cannot be provided.
-                warn_no_ssl_cert_checking()
-            else:
-                # User wants SSL verification, and it *can* be provided.
-                context = ssl.create_default_context()  # novm
-        else:
-            # User has explicitly indicated that they do not want SSL
-            # verification.
-            if not __UNABLE_TO_VERIFY_SSL:
-                context = ssl._create_unverified_context()
-
     url_scheme = url.scheme
     url = url_util.format(url)
     if sys.platform == "win32" and url_scheme == "file":
@@ -123,7 +137,7 @@ def read_from_url(url, accept_content_type=None):
         # one round-trip. However, most servers seem to ignore the header
         # if you ask for a tarball with Accept: text/html.
         req.get_method = lambda: "HEAD"
-        resp = _urlopen(req, timeout=timeout, context=context)
+        resp = urlopen(req, timeout=timeout)
 
         content_type = get_header(resp.headers, "Content-type")
 
@@ -131,7 +145,7 @@ def read_from_url(url, accept_content_type=None):
     req.get_method = lambda: "GET"
 
     try:
-        response = _urlopen(req, timeout=timeout, context=context)
+        response = urlopen(req, timeout=timeout)
     except URLError as err:
         raise SpackWebError("Download failed: {ERROR}".format(ERROR=str(err)))
 
@@ -154,22 +168,11 @@ def read_from_url(url, accept_content_type=None):
     return response.geturl(), response.headers, response
 
 
-def warn_no_ssl_cert_checking():
-    tty.warn(
-        "Spack will not check SSL certificates. You need to update "
-        "your Python to enable certificate verification."
-    )
-
-
 def push_to_url(local_file_path, remote_path, keep_original=True, extra_args=None):
     if sys.platform == "win32":
         if remote_path[1] == ":":
             remote_path = "file://" + remote_path
     remote_url = url_util.parse(remote_path)
-    verify_ssl = spack.config.get("config:verify_ssl")
-
-    if __UNABLE_TO_VERIFY_SSL and verify_ssl and uses_ssl(remote_url):
-        warn_no_ssl_cert_checking()
 
     remote_file_path = url_util.local_file_path(remote_url)
     if remote_file_path is not None:
@@ -405,12 +408,12 @@ def url_exists(url, curl=None):
         )  # noqa: E501
 
         try:
-            s3.get_object(Bucket=url_result.netloc, Key=url_result.path.lstrip("/"))
+            s3.head_object(Bucket=url_result.netloc, Key=url_result.path.lstrip("/"))
             return True
         except s3.ClientError as err:
-            if err.response["Error"]["Code"] == "NoSuchKey":
+            if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
                 return False
-            raise err
+            raise
 
     # Check if Google Storage .. urllib-based fetch
     if url_result.scheme == "gs":
@@ -432,12 +435,14 @@ def url_exists(url, curl=None):
         return curl_exe.returncode == 0
 
     # If we get here, then the only other fetch method option is urllib.
-    # So try to "read" from the URL and assume that *any* non-throwing
-    # response contains the resource represented by the URL.
+    # We try a HEAD request and expect a 200 return code.
     try:
-        read_from_url(url)
-        return True
-    except (SpackWebError, URLError) as e:
+        response = urlopen(
+            Request(url, method="HEAD", headers={"User-Agent": SPACK_USER_AGENT}),
+            timeout=spack.config.get("config:connect_timeout", 10),
+        )
+        return response.status == 200
+    except URLError as e:
         tty.debug("Failure reading URL: " + str(e))
         return False
 
@@ -720,36 +725,24 @@ def spider(root_urls, depth=0, concurrency=32):
     return pages, links
 
 
-def _urlopen(req, *args, **kwargs):
-    """Wrapper for compatibility with old versions of Python."""
+def _open(req, *args, **kwargs):
+    global open
     url = req
     try:
         url = url.get_full_url()
     except AttributeError:
         pass
 
-    # Note: 'context' parameter was only introduced starting
-    # with versions 2.7.9 and 3.4.3 of Python.
-    if __UNABLE_TO_VERIFY_SSL:
-        del kwargs["context"]
-
-    opener = urlopen
     if url_util.parse(url).scheme == "s3":
         import spack.s3_handler
 
-        opener = spack.s3_handler.open
+        return spack.s3_handler.open(req, *args, **kwargs)
     elif url_util.parse(url).scheme == "gs":
         import spack.gcs_handler
 
-        opener = spack.gcs_handler.gcs_open
+        return spack.gcs_handler.gcs_open(req, *args, **kwargs)
 
-    try:
-        return opener(req, *args, **kwargs)
-    except TypeError as err:
-        # If the above fails because of 'context', call without 'context'.
-        if "context" in kwargs and "context" in str(err):
-            del kwargs["context"]
-            return opener(req, *args, **kwargs)
+    return open(req, *args, **kwargs)
 
 
def find_versions_of_archive(