author     Harmen Stoppels <harmenstoppels@gmail.com>   2022-12-14 23:47:11 +0100
committer  GitHub <noreply@github.com>                  2022-12-14 23:47:11 +0100
commit     ea029442e6e775b0542699bdc6d8f34e1ac5b248 (patch)
tree       555485261f77fec249eac6c1dddaab9e49dedc6b /lib
parent     43e38d0d12b10ef4e4a9bf820fa9ad79e2c1c5f8 (diff)
Revert "Revert "Use `urllib` handler for `s3://` and `gs://`, improve `url_exists` through HEAD requests (#34324)"" (#34498)
This reverts commit 8035eeb36d5068fcbae613e51dd13cb1ae9f4888. It also removes the logic around an additional HEAD request that was meant to prevent a more expensive GET request on the wrong content type. Since large files are typically attachments and only downloaded when the stream is read, that optimization did not help much, and the logic was in fact broken: the GET request was issued unconditionally.
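Background for readers of this diff (not part of the commit message): urllib's OpenerDirector dispatches each request to a `<scheme>_open` method on the handlers installed via build_opener, which is why a plain BaseHandler subclass is enough to serve custom schemes such as s3:// and gs://. A minimal sketch of that protocol, using a made-up demo:// scheme:

import io
import urllib.request
import urllib.response


class DemoHandler(urllib.request.BaseHandler):
    # OpenerDirector looks for a method named "<scheme>_open" on each handler;
    # returning an addinfourl makes the result behave like an HTTP response.
    def demo_open(self, req):
        body = io.BytesIO(b"hello from " + req.get_full_url().encode())
        return urllib.response.addinfourl(body, headers={}, url=req.get_full_url())


opener = urllib.request.build_opener(DemoHandler())
with opener.open("demo://example/resource") as response:
    print(response.read())  # b'hello from demo://example/resource'

The handlers introduced below (UrllibS3Handler.s3_open, GCSHandler.gs_open) follow exactly this pattern.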
Diffstat (limited to 'lib')
-rw-r--r--  lib/spack/spack/gcs_handler.py |  11
-rw-r--r--  lib/spack/spack/s3_handler.py  |  38
-rw-r--r--  lib/spack/spack/test/web.py    |  13
-rw-r--r--  lib/spack/spack/util/web.py    | 189
4 files changed, 92 insertions, 159 deletions
diff --git a/lib/spack/spack/gcs_handler.py b/lib/spack/spack/gcs_handler.py
index 441eea6f80..4ee9f08896 100644
--- a/lib/spack/spack/gcs_handler.py
+++ b/lib/spack/spack/gcs_handler.py
@@ -4,8 +4,8 @@
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import urllib.parse
import urllib.response
-
-import spack.util.web as web_util
+from urllib.error import URLError
+from urllib.request import BaseHandler
def gcs_open(req, *args, **kwargs):
@@ -16,8 +16,13 @@ def gcs_open(req, *args, **kwargs):
    gcsblob = gcs_util.GCSBlob(url)
    if not gcsblob.exists():
-        raise web_util.SpackWebError("GCS blob {0} does not exist".format(gcsblob.blob_path))
+        raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
    stream = gcsblob.get_blob_byte_stream()
    headers = gcsblob.get_blob_headers()
    return urllib.response.addinfourl(stream, headers, url)
+
+
+class GCSHandler(BaseHandler):
+    def gs_open(self, req):
+        return gcs_open(req)
diff --git a/lib/spack/spack/s3_handler.py b/lib/spack/spack/s3_handler.py
index a3e0aa991b..77a8a2f7cc 100644
--- a/lib/spack/spack/s3_handler.py
+++ b/lib/spack/spack/s3_handler.py
@@ -7,7 +7,7 @@ import urllib.error
import urllib.parse
import urllib.request
import urllib.response
-from io import BufferedReader, IOBase
+from io import BufferedReader, BytesIO, IOBase
import spack.util.s3 as s3_util
@@ -42,7 +42,7 @@ class WrapStream(BufferedReader):
        return getattr(self.raw, key)
-def _s3_open(url):
+def _s3_open(url, method="GET"):
    parsed = urllib.parse.urlparse(url)
    s3 = s3_util.get_s3_session(url, method="fetch")
@@ -52,27 +52,29 @@ def _s3_open(url):
    if key.startswith("/"):
        key = key[1:]
-    obj = s3.get_object(Bucket=bucket, Key=key)
+    if method not in ("GET", "HEAD"):
+        raise urllib.error.URLError(
+            "Only GET and HEAD verbs are currently supported for the s3:// scheme"
+        )
+
+    try:
+        if method == "GET":
+            obj = s3.get_object(Bucket=bucket, Key=key)
+            # NOTE(opadron): Apply workaround here (see above)
+            stream = WrapStream(obj["Body"])
+        elif method == "HEAD":
+            obj = s3.head_object(Bucket=bucket, Key=key)
+            stream = BytesIO()
+    except s3.ClientError as e:
+        raise urllib.error.URLError(e) from e
-    # NOTE(opadron): Apply workaround here (see above)
-    stream = WrapStream(obj["Body"])
    headers = obj["ResponseMetadata"]["HTTPHeaders"]
    return url, headers, stream
-class UrllibS3Handler(urllib.request.HTTPSHandler):
+class UrllibS3Handler(urllib.request.BaseHandler):
    def s3_open(self, req):
        orig_url = req.get_full_url()
-        from botocore.exceptions import ClientError  # type: ignore[import]
-
-        try:
-            url, headers, stream = _s3_open(orig_url)
-            return urllib.response.addinfourl(stream, headers, url)
-        except ClientError as err:
-            raise urllib.error.URLError(err) from err
-
-
-S3OpenerDirector = urllib.request.build_opener(UrllibS3Handler())
-
-open = S3OpenerDirector.open
+        url, headers, stream = _s3_open(orig_url, method=req.get_method())
+        return urllib.response.addinfourl(stream, headers, url)
diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py
index 476ea01019..166a577c89 100644
--- a/lib/spack/spack/test/web.py
+++ b/lib/spack/spack/test/web.py
@@ -224,7 +224,10 @@ class MockPaginator(object):
class MockClientError(Exception):
    def __init__(self):
-        self.response = {"Error": {"Code": "NoSuchKey"}}
+        self.response = {
+            "Error": {"Code": "NoSuchKey"},
+            "ResponseMetadata": {"HTTPStatusCode": 404},
+        }
class MockS3Client(object):
@@ -243,7 +246,13 @@ class MockS3Client(object):
    def get_object(self, Bucket=None, Key=None):
        self.ClientError = MockClientError
        if Bucket == "my-bucket" and Key == "subdirectory/my-file":
-            return True
+            return {"ResponseMetadata": {"HTTPHeaders": {}}}
        raise self.ClientError
+
+    def head_object(self, Bucket=None, Key=None):
+        self.ClientError = MockClientError
+        if Bucket == "my-bucket" and Key == "subdirectory/my-file":
+            return {"ResponseMetadata": {"HTTPHeaders": {}}}
        raise self.ClientError
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index 7c8964b3c9..9398a12dd8 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -18,7 +18,7 @@ import traceback
import urllib.parse
from html.parser import HTMLParser
from urllib.error import URLError
-from urllib.request import Request, urlopen
+from urllib.request import HTTPSHandler, Request, build_opener
import llnl.util.lang
import llnl.util.tty as tty
@@ -27,6 +27,8 @@ from llnl.util.filesystem import mkdirp, rename, working_dir
import spack
import spack.config
import spack.error
+import spack.gcs_handler
+import spack.s3_handler
import spack.url
import spack.util.crypto
import spack.util.gcs as gcs_util
@@ -36,6 +38,28 @@ from spack.util.compression import ALLOWED_ARCHIVE_TYPES
from spack.util.executable import CommandNotFoundError, which
from spack.util.path import convert_to_posix_path
+
+def _urlopen():
+    s3 = spack.s3_handler.UrllibS3Handler()
+    gcs = spack.gcs_handler.GCSHandler()
+
+    # One opener with HTTPS ssl enabled
+    with_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl.create_default_context()))
+
+    # One opener with HTTPS ssl disabled
+    without_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl._create_unverified_context()))
+
+    # And dynamically dispatch based on the config:verify_ssl.
+    def dispatch_open(*args, **kwargs):
+        opener = with_ssl if spack.config.get("config:verify_ssl", True) else without_ssl
+        return opener.open(*args, **kwargs)
+
+    return dispatch_open
+
+
+#: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
+urlopen = llnl.util.lang.Singleton(_urlopen)
+
#: User-Agent used in Request objects
SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)
@@ -60,86 +84,33 @@ class LinkParser(HTMLParser):
                    self.links.append(val)
-def uses_ssl(parsed_url):
-    if parsed_url.scheme == "https":
-        return True
-
-    if parsed_url.scheme == "s3":
-        endpoint_url = os.environ.get("S3_ENDPOINT_URL")
-        if not endpoint_url:
-            return True
-
-        if urllib.parse.urlparse(endpoint_url).scheme == "https":
-            return True
-
-    elif parsed_url.scheme == "gs":
-        tty.debug("(uses_ssl) GCS Blob is https")
-        return True
-
-    return False
-
-
def read_from_url(url, accept_content_type=None):
    if isinstance(url, str):
        url = urllib.parse.urlparse(url)
-    context = None
    # Timeout in seconds for web requests
    timeout = spack.config.get("config:connect_timeout", 10)
-
-    # Don't even bother with a context unless the URL scheme is one that uses
-    # SSL certs.
-    if uses_ssl(url):
-        if spack.config.get("config:verify_ssl"):
-            # User wants SSL verification, and it *can* be provided.
-            context = ssl.create_default_context()
-        else:
-            # User has explicitly indicated that they do not want SSL
-            # verification.
-            context = ssl._create_unverified_context()
-
-    url_scheme = url.scheme
-    url = url_util.format(url)
-    if sys.platform == "win32" and url_scheme == "file":
-        url = convert_to_posix_path(url)
-    req = Request(url, headers={"User-Agent": SPACK_USER_AGENT})
-
-    content_type = None
-    is_web_url = url_scheme in ("http", "https")
-    if accept_content_type and is_web_url:
-        # Make a HEAD request first to check the content type. This lets
-        # us ignore tarballs and gigantic files.
-        # It would be nice to do this with the HTTP Accept header to avoid
-        # one round-trip. However, most servers seem to ignore the header
-        # if you ask for a tarball with Accept: text/html.
-        req.get_method = lambda: "HEAD"
-        resp = _urlopen(req, timeout=timeout, context=context)
-
-        content_type = get_header(resp.headers, "Content-type")
-
-        # Do the real GET request when we know it's just HTML.
-        req.get_method = lambda: "GET"
+    request = Request(url.geturl(), headers={"User-Agent": SPACK_USER_AGENT})
    try:
-        response = _urlopen(req, timeout=timeout, context=context)
+        response = urlopen(request, timeout=timeout)
    except URLError as err:
-        raise SpackWebError("Download failed: {ERROR}".format(ERROR=str(err)))
-
-    if accept_content_type and not is_web_url:
-        content_type = get_header(response.headers, "Content-type")
+        raise SpackWebError("Download failed: {}".format(str(err)))
-    reject_content_type = accept_content_type and (
-        content_type is None or not content_type.startswith(accept_content_type)
-    )
-
-    if reject_content_type:
-        tty.debug(
-            "ignoring page {0}{1}{2}".format(
-                url, " with content type " if content_type is not None else "", content_type or ""
-            )
-        )
-
-        return None, None, None
+    if accept_content_type:
+        try:
+            content_type = get_header(response.headers, "Content-type")
+            reject_content_type = not content_type.startswith(accept_content_type)
+        except KeyError:
+            content_type = None
+            reject_content_type = True
+
+        if reject_content_type:
+            msg = "ignoring page {}".format(url.geturl())
+            if content_type:
+                msg += " with content type {}".format(content_type)
+            tty.debug(msg)
+            return None, None, None
    return response.geturl(), response.headers, response
@@ -349,12 +320,6 @@ def url_exists(url, curl=None):
    Simple Storage Service (`s3`) URLs; otherwise, the configured fetch
    method defined by `config:url_fetch_method` is used.
-    If the method is `curl`, it also uses the following configuration option:
-
-    * config:verify_ssl (str): Perform SSL verification
-
-    Otherwise, `urllib` will be used.
-
    Arguments:
        url (str): URL whose existence is being checked
        curl (spack.util.executable.Executable or None): (optional) curl
@@ -365,31 +330,11 @@ def url_exists(url, curl=None):
tty.debug("Checking existence of {0}".format(url))
url_result = urllib.parse.urlparse(url)
- # Check if a local file
- local_path = url_util.local_file_path(url_result)
- if local_path:
- return os.path.exists(local_path)
-
- # Check if Amazon Simple Storage Service (S3) .. urllib-based fetch
- if url_result.scheme == "s3":
- # Check for URL-specific connection information
- s3 = s3_util.get_s3_session(url_result, method="fetch")
-
- try:
- s3.get_object(Bucket=url_result.netloc, Key=url_result.path.lstrip("/"))
- return True
- except s3.ClientError as err:
- if err.response["Error"]["Code"] == "NoSuchKey":
- return False
- raise err
-
- # Check if Google Storage .. urllib-based fetch
- if url_result.scheme == "gs":
- gcs = gcs_util.GCSBlob(url_result)
- return gcs.exists()
-
- # Otherwise, use the configured fetch method
- if spack.config.get("config:url_fetch_method") == "curl":
+ # Use curl if configured to do so
+ use_curl = spack.config.get(
+ "config:url_fetch_method", "urllib"
+ ) == "curl" and url_result.scheme not in ("gs", "s3")
+ if use_curl:
curl_exe = _curl(curl)
if not curl_exe:
return False
@@ -402,13 +347,14 @@ def url_exists(url, curl=None):
        _ = curl_exe(*curl_args, fail_on_error=False, output=os.devnull)
        return curl_exe.returncode == 0
-    # If we get here, then the only other fetch method option is urllib.
-    # So try to "read" from the URL and assume that *any* non-throwing
-    # response contains the resource represented by the URL.
+    # Otherwise use urllib.
    try:
-        read_from_url(url)
+        urlopen(
+            Request(url, method="HEAD", headers={"User-Agent": SPACK_USER_AGENT}),
+            timeout=spack.config.get("config:connect_timeout", 10),
+        )
        return True
-    except (SpackWebError, URLError) as e:
+    except URLError as e:
        tty.debug("Failure reading URL: " + str(e))
        return False
@@ -691,35 +637,6 @@ def spider(root_urls, depth=0, concurrency=32):
    return pages, links
-def _urlopen(req, *args, **kwargs):
-    """Wrapper for compatibility with old versions of Python."""
-    url = req
-    try:
-        url = url.get_full_url()
-    except AttributeError:
-        pass
-
-    del kwargs["context"]
-
-    opener = urlopen
-    if urllib.parse.urlparse(url).scheme == "s3":
-        import spack.s3_handler
-
-        opener = spack.s3_handler.open
-    elif urllib.parse.urlparse(url).scheme == "gs":
-        import spack.gcs_handler
-
-        opener = spack.gcs_handler.gcs_open
-
-    try:
-        return opener(req, *args, **kwargs)
-    except TypeError as err:
-        # If the above fails because of 'context', call without 'context'.
-        if "context" in kwargs and "context" in str(err):
-            del kwargs["context"]
-        return opener(req, *args, **kwargs)
-
-
def find_versions_of_archive(
    archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
):
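Standalone sketch, assuming plain urllib and no Spack configuration: the urllib branch of url_exists above boils down to issuing a HEAD request and treating any response that does not raise URLError as proof of existence.

import urllib.error
import urllib.request


def head_exists(url, timeout=10):
    """Return True if a HEAD request to `url` succeeds, False on URLError."""
    request = urllib.request.Request(url, method="HEAD")
    try:
        urllib.request.urlopen(request, timeout=timeout)
        return True
    except urllib.error.URLError:
        # HTTPError (e.g. 404) subclasses URLError, so missing pages land here too.
        return False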