Use `urllib` handler for `s3://` and `gs://`, improve `url_exists` through HEAD requests (#34324)

* `url_exists` improvements (take 2) Make `url_exists` do HEAD request for http/https/s3 protocols Rework the opener: construct it once and only once, dynamically dispatch to the right one based on config.
author: Harmen Stoppels <harmenstoppels@gmail.com> 2022-12-10 00:20:29 +0100
committer: GitHub <noreply@github.com> 2022-12-10 00:20:29 +0100
commit: db8f115013d3a6991da3f92aeee3e49327a24833 (patch)
tree: 47081e89275f1daf217339bf3030254d268c6c19 /lib
parent: 09b54760493e89387c537ef285b592e99f05d991 (diff)
download: spack-db8f115013d3a6991da3f92aeee3e49327a24833.tar.gz
spack-db8f115013d3a6991da3f92aeee3e49327a24833.tar.bz2
spack-db8f115013d3a6991da3f92aeee3e49327a24833.tar.xz
spack-db8f115013d3a6991da3f92aeee3e49327a24833.zip
4 files changed, 77 insertions, 121 deletions
diff --git a/lib/spack/spack/gcs_handler.py b/lib/spack/spack/gcs_handler.py
index 4b547a78dc..5290cf0ab9 100644
--- a/lib/spack/spack/gcs_handler.py
+++ b/lib/spack/spack/gcs_handler.py
@@ -3,9 +3,10 @@
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 import urllib.response
+from urllib.error import URLError
+from urllib.request import BaseHandler
 
 import spack.util.url as url_util
-import spack.util.web as web_util
 
 
 def gcs_open(req, *args, **kwargs):
@@ -16,8 +17,13 @@ def gcs_open(req, *args, **kwargs):
     gcsblob = gcs_util.GCSBlob(url)
 
     if not gcsblob.exists():
-        raise web_util.SpackWebError("GCS blob {0} does not exist".format(gcsblob.blob_path))
+        raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
     stream = gcsblob.get_blob_byte_stream()
     headers = gcsblob.get_blob_headers()
 
     return urllib.response.addinfourl(stream, headers, url)
+
+
+class GCSHandler(BaseHandler):
+    def gs_open(self, req):
+        return gcs_open(req)
diff --git a/lib/spack/spack/s3_handler.py b/lib/spack/spack/s3_handler.py
index aee5dc8943..140b5fa7b8 100644
--- a/lib/spack/spack/s3_handler.py
+++ b/lib/spack/spack/s3_handler.py
@@ -6,7 +6,7 @@
 import urllib.error
 import urllib.request
 import urllib.response
-from io import BufferedReader, IOBase
+from io import BufferedReader, BytesIO, IOBase
 
 import spack.util.s3 as s3_util
 import spack.util.url as url_util
@@ -42,7 +42,7 @@ class WrapStream(BufferedReader):
         return getattr(self.raw, key)
 
 
-def _s3_open(url):
+def _s3_open(url, method="GET"):
     parsed = url_util.parse(url)
     s3 = s3_util.get_s3_session(url, method="fetch")
 
@@ -52,27 +52,29 @@ def _s3_open(url):
     if key.startswith("/"):
         key = key[1:]
 
-    obj = s3.get_object(Bucket=bucket, Key=key)
+    if method not in ("GET", "HEAD"):
+        raise urllib.error.URLError(
+            "Only GET and HEAD verbs are currently supported for the s3:// scheme"
+        )
+
+    try:
+        if method == "GET":
+            obj = s3.get_object(Bucket=bucket, Key=key)
+            # NOTE(opadron): Apply workaround here (see above)
+            stream = WrapStream(obj["Body"])
+        elif method == "HEAD":
+            obj = s3.head_object(Bucket=bucket, Key=key)
+            stream = BytesIO()
+    except s3.ClientError as e:
+        raise urllib.error.URLError(e) from e
 
-    # NOTE(opadron): Apply workaround here (see above)
-    stream = WrapStream(obj["Body"])
     headers = obj["ResponseMetadata"]["HTTPHeaders"]
 
     return url, headers, stream
 
 
-class UrllibS3Handler(urllib.request.HTTPSHandler):
+class UrllibS3Handler(urllib.request.BaseHandler):
     def s3_open(self, req):
         orig_url = req.get_full_url()
-        from botocore.exceptions import ClientError  # type: ignore[import]
-
-        try:
-            url, headers, stream = _s3_open(orig_url)
-            return urllib.response.addinfourl(stream, headers, url)
-        except ClientError as err:
-            raise urllib.error.URLError(err) from err
-
-
-S3OpenerDirector = urllib.request.build_opener(UrllibS3Handler())
-
-open = S3OpenerDirector.open
+        url, headers, stream = _s3_open(orig_url, method=req.get_method())
+        return urllib.response.addinfourl(stream, headers, url)
diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py
index f4114eb05c..ee33c2dc2e 100644
--- a/lib/spack/spack/test/web.py
+++ b/lib/spack/spack/test/web.py
@@ -223,7 +223,10 @@ class MockPaginator(object):
 
 class MockClientError(Exception):
     def __init__(self):
-        self.response = {"Error": {"Code": "NoSuchKey"}}
+        self.response = {
+            "Error": {"Code": "NoSuchKey"},
+            "ResponseMetadata": {"HTTPStatusCode": 404},
+        }
 
 
 class MockS3Client(object):
@@ -242,7 +245,13 @@ class MockS3Client(object):
     def get_object(self, Bucket=None, Key=None):
         self.ClientError = MockClientError
         if Bucket == "my-bucket" and Key == "subdirectory/my-file":
-            return True
+            return {"ResponseMetadata": {"HTTPHeaders": {}}}
+        raise self.ClientError
+
+    def head_object(self, Bucket=None, Key=None):
+        self.ClientError = MockClientError
+        if Bucket == "my-bucket" and Key == "subdirectory/my-file":
+            return {"ResponseMetadata": {"HTTPHeaders": {}}}
         raise self.ClientError
 
 
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index 1f2c197460..c67df0325c 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -17,7 +17,7 @@ import sys
 import traceback
 from html.parser import HTMLParser
 from urllib.error import URLError
-from urllib.request import Request, urlopen
+from urllib.request import HTTPSHandler, Request, build_opener
 
 import llnl.util.lang
 import llnl.util.tty as tty
@@ -26,6 +26,8 @@ from llnl.util.filesystem import mkdirp, rename, working_dir
 import spack
 import spack.config
 import spack.error
+import spack.gcs_handler
+import spack.s3_handler
 import spack.url
 import spack.util.crypto
 import spack.util.gcs as gcs_util
@@ -35,6 +37,28 @@ from spack.util.compression import ALLOWED_ARCHIVE_TYPES
 from spack.util.executable import CommandNotFoundError, which
 from spack.util.path import convert_to_posix_path
 
+
+def _urlopen():
+    s3 = spack.s3_handler.UrllibS3Handler()
+    gcs = spack.gcs_handler.GCSHandler()
+
+    # One opener with HTTPS ssl enabled
+    with_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl.create_default_context()))
+
+    # One opener with HTTPS ssl disabled
+    without_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl._create_unverified_context()))
+
+    # And dynamically dispatch based on the config:verify_ssl.
+    def dispatch_open(*args, **kwargs):
+        opener = with_ssl if spack.config.get("config:verify_ssl", True) else without_ssl
+        return opener.open(*args, **kwargs)
+
+    return dispatch_open
+
+
+#: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
+urlopen = llnl.util.lang.Singleton(_urlopen)
+
 #: User-Agent used in Request objects
 SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)
 
@@ -59,43 +83,12 @@ class LinkParser(HTMLParser):
                     self.links.append(val)
 
 
-def uses_ssl(parsed_url):
-    if parsed_url.scheme == "https":
-        return True
-
-    if parsed_url.scheme == "s3":
-        endpoint_url = os.environ.get("S3_ENDPOINT_URL")
-        if not endpoint_url:
-            return True
-
-        if url_util.parse(endpoint_url, scheme="https").scheme == "https":
-            return True
-
-    elif parsed_url.scheme == "gs":
-        tty.debug("(uses_ssl) GCS Blob is https")
-        return True
-
-    return False
-
-
 def read_from_url(url, accept_content_type=None):
     url = url_util.parse(url)
-    context = None
 
     # Timeout in seconds for web requests
     timeout = spack.config.get("config:connect_timeout", 10)
 
-    # Don't even bother with a context unless the URL scheme is one that uses
-    # SSL certs.
-    if uses_ssl(url):
-        if spack.config.get("config:verify_ssl"):
-            # User wants SSL verification, and it *can* be provided.
-            context = ssl.create_default_context()
-        else:
-            # User has explicitly indicated that they do not want SSL
-            # verification.
-            context = ssl._create_unverified_context()
-
     url_scheme = url.scheme
     url = url_util.format(url)
     if sys.platform == "win32" and url_scheme == "file":
@@ -111,7 +104,7 @@ def read_from_url(url, accept_content_type=None):
         # one round-trip.  However, most servers seem to ignore the header
         # if you ask for a tarball with Accept: text/html.
         req.get_method = lambda: "HEAD"
-        resp = _urlopen(req, timeout=timeout, context=context)
+        resp = urlopen(req, timeout=timeout)
 
         content_type = get_header(resp.headers, "Content-type")
 
@@ -119,7 +112,7 @@ def read_from_url(url, accept_content_type=None):
     req.get_method = lambda: "GET"
 
     try:
-        response = _urlopen(req, timeout=timeout, context=context)
+        response = urlopen(req, timeout=timeout)
     except URLError as err:
         raise SpackWebError("Download failed: {ERROR}".format(ERROR=str(err)))
 
@@ -351,12 +344,6 @@ def url_exists(url, curl=None):
     Simple Storage Service (`s3`) URLs; otherwise, the configured fetch
     method defined by `config:url_fetch_method` is used.
 
-    If the method is `curl`, it also uses the following configuration option:
-
-        * config:verify_ssl (str): Perform SSL verification
-
-    Otherwise, `urllib` will be used.
-
     Arguments:
         url (str): URL whose existence is being checked
         curl (spack.util.executable.Executable or None): (optional) curl
@@ -367,31 +354,11 @@ def url_exists(url, curl=None):
     tty.debug("Checking existence of {0}".format(url))
     url_result = url_util.parse(url)
 
-    # Check if a local file
-    local_path = url_util.local_file_path(url_result)
-    if local_path:
-        return os.path.exists(local_path)
-
-    # Check if Amazon Simple Storage Service (S3) .. urllib-based fetch
-    if url_result.scheme == "s3":
-        # Check for URL-specific connection information
-        s3 = s3_util.get_s3_session(url_result, method="fetch")
-
-        try:
-            s3.get_object(Bucket=url_result.netloc, Key=url_result.path.lstrip("/"))
-            return True
-        except s3.ClientError as err:
-            if err.response["Error"]["Code"] == "NoSuchKey":
-                return False
-            raise err
-
-    # Check if Google Storage .. urllib-based fetch
-    if url_result.scheme == "gs":
-        gcs = gcs_util.GCSBlob(url_result)
-        return gcs.exists()
-
-    # Otherwise, use the configured fetch method
-    if spack.config.get("config:url_fetch_method") == "curl":
+    # Use curl if configured to do so
+    use_curl = spack.config.get(
+        "config:url_fetch_method", "urllib"
+    ) == "curl" and url_result.scheme not in ("gs", "s3")
+    if use_curl:
         curl_exe = _curl(curl)
         if not curl_exe:
             return False
@@ -404,13 +371,14 @@ def url_exists(url, curl=None):
         _ = curl_exe(*curl_args, fail_on_error=False, output=os.devnull)
         return curl_exe.returncode == 0
 
-    # If we get here, then the only other fetch method option is urllib.
-    # So try to "read" from the URL and assume that *any* non-throwing
-    #  response contains the resource represented by the URL.
+    # Otherwise use urllib.
     try:
-        read_from_url(url)
+        urlopen(
+            Request(url, method="HEAD", headers={"User-Agent": SPACK_USER_AGENT}),
+            timeout=spack.config.get("config:connect_timeout", 10),
+        )
         return True
-    except (SpackWebError, URLError) as e:
+    except URLError as e:
         tty.debug("Failure reading URL: " + str(e))
         return False
 
@@ -693,35 +661,6 @@ def spider(root_urls, depth=0, concurrency=32):
     return pages, links
 
 
-def _urlopen(req, *args, **kwargs):
-    """Wrapper for compatibility with old versions of Python."""
-    url = req
-    try:
-        url = url.get_full_url()
-    except AttributeError:
-        pass
-
-    del kwargs["context"]
-
-    opener = urlopen
-    if url_util.parse(url).scheme == "s3":
-        import spack.s3_handler
-
-        opener = spack.s3_handler.open
-    elif url_util.parse(url).scheme == "gs":
-        import spack.gcs_handler
-
-        opener = spack.gcs_handler.gcs_open
-
-    try:
-        return opener(req, *args, **kwargs)
-    except TypeError as err:
-        # If the above fails because of 'context', call without 'context'.
-        if "context" in kwargs and "context" in str(err):
-            del kwargs["context"]
-        return opener(req, *args, **kwargs)
-
-
 def find_versions_of_archive(
     archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
 ):
author	Harmen Stoppels <harmenstoppels@gmail.com>	2022-12-10 00:20:29 +0100
committer	GitHub <noreply@github.com>	2022-12-10 00:20:29 +0100
commit	db8f115013d3a6991da3f92aeee3e49327a24833 (patch)
tree	47081e89275f1daf217339bf3030254d268c6c19 /lib
parent	09b54760493e89387c537ef285b592e99f05d991 (diff)
download	spack-db8f115013d3a6991da3f92aeee3e49327a24833.tar.gz spack-db8f115013d3a6991da3f92aeee3e49327a24833.tar.bz2 spack-db8f115013d3a6991da3f92aeee3e49327a24833.tar.xz spack-db8f115013d3a6991da3f92aeee3e49327a24833.zip