From 8035eeb36d5068fcbae613e51dd13cb1ae9f4888 Mon Sep 17 00:00:00 2001
From: Todd Gamblin
Date: Sat, 10 Dec 2022 11:24:34 -0800
Subject: Revert "Use `urllib` handler for `s3://` and `gs://`, improve
 `url_exists` through HEAD requests (#34324)"

This reverts commit db8f115013d3a6991da3f92aeee3e49327a24833.
---
 lib/spack/spack/gcs_handler.py |  10 +--
 lib/spack/spack/s3_handler.py  |  38 ++++++------
 lib/spack/spack/test/web.py    |  13 +---
 lib/spack/spack/util/web.py    | 137 +++++++++++++++++++++++++++++------------
 4 files changed, 121 insertions(+), 77 deletions(-)

diff --git a/lib/spack/spack/gcs_handler.py b/lib/spack/spack/gcs_handler.py
index 5290cf0ab9..4b547a78dc 100644
--- a/lib/spack/spack/gcs_handler.py
+++ b/lib/spack/spack/gcs_handler.py
@@ -3,10 +3,9 @@
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 import urllib.response
-from urllib.error import URLError
-from urllib.request import BaseHandler
 
 import spack.util.url as url_util
+import spack.util.web as web_util
 
 
 def gcs_open(req, *args, **kwargs):
@@ -17,13 +16,8 @@ def gcs_open(req, *args, **kwargs):
     gcsblob = gcs_util.GCSBlob(url)
 
     if not gcsblob.exists():
-        raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
+        raise web_util.SpackWebError("GCS blob {0} does not exist".format(gcsblob.blob_path))
     stream = gcsblob.get_blob_byte_stream()
     headers = gcsblob.get_blob_headers()
 
     return urllib.response.addinfourl(stream, headers, url)
-
-
-class GCSHandler(BaseHandler):
-    def gs_open(self, req):
-        return gcs_open(req)
diff --git a/lib/spack/spack/s3_handler.py b/lib/spack/spack/s3_handler.py
index 140b5fa7b8..aee5dc8943 100644
--- a/lib/spack/spack/s3_handler.py
+++ b/lib/spack/spack/s3_handler.py
@@ -6,7 +6,7 @@
 import urllib.error
 import urllib.request
 import urllib.response
-from io import BufferedReader, BytesIO, IOBase
+from io import BufferedReader, IOBase
 
 import spack.util.s3 as s3_util
 import spack.util.url as url_util
@@ -42,7 +42,7 @@ class WrapStream(BufferedReader):
         return getattr(self.raw, key)
 
 
-def _s3_open(url, method="GET"):
+def _s3_open(url):
     parsed = url_util.parse(url)
     s3 = s3_util.get_s3_session(url, method="fetch")
 
@@ -52,29 +52,27 @@ def _s3_open(url):
     if key.startswith("/"):
         key = key[1:]
 
-    if method not in ("GET", "HEAD"):
-        raise urllib.error.URLError(
-            "Only GET and HEAD verbs are currently supported for the s3:// scheme"
-        )
-
-    try:
-        if method == "GET":
-            obj = s3.get_object(Bucket=bucket, Key=key)
-            # NOTE(opadron): Apply workaround here (see above)
-            stream = WrapStream(obj["Body"])
-        elif method == "HEAD":
-            obj = s3.head_object(Bucket=bucket, Key=key)
-            stream = BytesIO()
-    except s3.ClientError as e:
-        raise urllib.error.URLError(e) from e
+    obj = s3.get_object(Bucket=bucket, Key=key)
+    # NOTE(opadron): Apply workaround here (see above)
+    stream = WrapStream(obj["Body"])
 
     headers = obj["ResponseMetadata"]["HTTPHeaders"]
 
     return url, headers, stream
 
 
-class UrllibS3Handler(urllib.request.BaseHandler):
+class UrllibS3Handler(urllib.request.HTTPSHandler):
     def s3_open(self, req):
         orig_url = req.get_full_url()
-        url, headers, stream = _s3_open(orig_url, method=req.get_method())
-        return urllib.response.addinfourl(stream, headers, url)
+        from botocore.exceptions import ClientError  # type: ignore[import]
+
+        try:
+            url, headers, stream = _s3_open(orig_url)
+            return urllib.response.addinfourl(stream, headers, url)
+        except ClientError as err:
+            raise urllib.error.URLError(err) from err
+
+
+S3OpenerDirector = urllib.request.build_opener(UrllibS3Handler())
+
+open = S3OpenerDirector.open
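Both handlers above plug into urllib's extension mechanism: `urllib.request.build_opener()` collects `BaseHandler` subclasses into an `OpenerDirector`, which routes each request to a handler method named `<scheme>_open`. That naming convention is why `UrllibS3Handler.s3_open` ends up servicing `s3://` URLs once the opener is built. Below is a minimal, self-contained sketch of the mechanism; the `demo` scheme and `DemoHandler` are hypothetical names used only for illustration, not part of this patch.

    import io
    import urllib.request
    import urllib.response


    class DemoHandler(urllib.request.BaseHandler):
        def demo_open(self, req):
            # build_opener() registers this method for the "demo" scheme
            # purely because of its name: <scheme>_open.
            body = io.BytesIO(b"hello from the demo scheme")
            headers = {"Content-Type": "text/plain"}
            return urllib.response.addinfourl(body, headers, req.get_full_url())


    opener = urllib.request.build_opener(DemoHandler())
    print(opener.open("demo://anything").read())  # b'hello from the demo scheme'

The patch binds `open = S3OpenerDirector.open` at module scope, so callers can import `spack.s3_handler` and use its `open()` like a scheme-aware `urlopen`.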
diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py
index ee33c2dc2e..f4114eb05c 100644
--- a/lib/spack/spack/test/web.py
+++ b/lib/spack/spack/test/web.py
@@ -223,10 +223,7 @@ class MockPaginator(object):
 
 class MockClientError(Exception):
     def __init__(self):
-        self.response = {
-            "Error": {"Code": "NoSuchKey"},
-            "ResponseMetadata": {"HTTPStatusCode": 404},
-        }
+        self.response = {"Error": {"Code": "NoSuchKey"}}
 
 
 class MockS3Client(object):
@@ -245,13 +242,7 @@ class MockS3Client(object):
     def get_object(self, Bucket=None, Key=None):
         self.ClientError = MockClientError
         if Bucket == "my-bucket" and Key == "subdirectory/my-file":
-            return {"ResponseMetadata": {"HTTPHeaders": {}}}
-        raise self.ClientError
-
-    def head_object(self, Bucket=None, Key=None):
-        self.ClientError = MockClientError
-        if Bucket == "my-bucket" and Key == "subdirectory/my-file":
-            return {"ResponseMetadata": {"HTTPHeaders": {}}}
+            return True
         raise self.ClientError
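The restored `MockClientError` mirrors botocore's `ClientError`, whose `response["Error"]["Code"]` field callers inspect to tell a missing key apart from a genuine failure. Here is a hedged sketch of the existence-check pattern these mocks exercise; `FakeS3Client`, `FakeClientError`, and `s3_key_exists` are illustrative stand-ins, not code from this patch.

    class FakeClientError(Exception):
        """Shaped like botocore's ClientError for a missing key."""

        def __init__(self):
            self.response = {"Error": {"Code": "NoSuchKey"}}


    class FakeS3Client:
        ClientError = FakeClientError

        def get_object(self, Bucket=None, Key=None):
            if Bucket == "my-bucket" and Key == "subdirectory/my-file":
                return True
            raise self.ClientError


    def s3_key_exists(s3, bucket, key):
        # Success means the key exists; NoSuchKey means it does not;
        # any other error is real and propagates to the caller.
        try:
            s3.get_object(Bucket=bucket, Key=key)
            return True
        except s3.ClientError as err:
            if err.response["Error"]["Code"] == "NoSuchKey":
                return False
            raise


    s3 = FakeS3Client()
    assert s3_key_exists(s3, "my-bucket", "subdirectory/my-file")
    assert not s3_key_exists(s3, "my-bucket", "no/such/key")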
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index c67df0325c..1f2c197460 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -17,7 +17,7 @@ import sys
 import traceback
 from html.parser import HTMLParser
 from urllib.error import URLError
-from urllib.request import HTTPSHandler, Request, build_opener
+from urllib.request import Request, urlopen
 
 import llnl.util.lang
 import llnl.util.tty as tty
@@ -26,8 +26,6 @@ from llnl.util.filesystem import mkdirp, rename, working_dir
 import spack
 import spack.config
 import spack.error
-import spack.gcs_handler
-import spack.s3_handler
 import spack.url
 import spack.util.crypto
 import spack.util.gcs as gcs_util
@@ -37,28 +35,6 @@ from spack.util.compression import ALLOWED_ARCHIVE_TYPES
 from spack.util.executable import CommandNotFoundError, which
 from spack.util.path import convert_to_posix_path
 
-
-def _urlopen():
-    s3 = spack.s3_handler.UrllibS3Handler()
-    gcs = spack.gcs_handler.GCSHandler()
-
-    # One opener with HTTPS ssl enabled
-    with_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl.create_default_context()))
-
-    # One opener with HTTPS ssl disabled
-    without_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl._create_unverified_context()))
-
-    # And dynamically dispatch based on the config:verify_ssl.
-    def dispatch_open(*args, **kwargs):
-        opener = with_ssl if spack.config.get("config:verify_ssl", True) else without_ssl
-        return opener.open(*args, **kwargs)
-
-    return dispatch_open
-
-
-#: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
-urlopen = llnl.util.lang.Singleton(_urlopen)
-
 #: User-Agent used in Request objects
 SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)
 
@@ -83,12 +59,43 @@ class LinkParser(HTMLParser):
                     self.links.append(val)
 
 
+def uses_ssl(parsed_url):
+    if parsed_url.scheme == "https":
+        return True
+
+    if parsed_url.scheme == "s3":
+        endpoint_url = os.environ.get("S3_ENDPOINT_URL")
+        if not endpoint_url:
+            return True
+
+        if url_util.parse(endpoint_url, scheme="https").scheme == "https":
+            return True
+
+    elif parsed_url.scheme == "gs":
+        tty.debug("(uses_ssl) GCS Blob is https")
+        return True
+
+    return False
+
+
 def read_from_url(url, accept_content_type=None):
     url = url_util.parse(url)
+    context = None
 
     # Timeout in seconds for web requests
     timeout = spack.config.get("config:connect_timeout", 10)
 
+    # Don't even bother with a context unless the URL scheme is one that uses
+    # SSL certs.
+    if uses_ssl(url):
+        if spack.config.get("config:verify_ssl"):
+            # User wants SSL verification, and it *can* be provided.
+            context = ssl.create_default_context()
+        else:
+            # User has explicitly indicated that they do not want SSL
+            # verification.
+            context = ssl._create_unverified_context()
+
     url_scheme = url.scheme
     url = url_util.format(url)
     if sys.platform == "win32" and url_scheme == "file":
@@ -104,7 +111,7 @@ def read_from_url(url, accept_content_type=None):
         # one round-trip.  However, most servers seem to ignore the header
         # if you ask for a tarball with Accept: text/html.
         req.get_method = lambda: "HEAD"
-        resp = urlopen(req, timeout=timeout)
+        resp = _urlopen(req, timeout=timeout, context=context)
 
         content_type = get_header(resp.headers, "Content-type")
 
@@ -112,7 +119,7 @@ def read_from_url(url, accept_content_type=None):
     req.get_method = lambda: "GET"
 
     try:
-        response = urlopen(req, timeout=timeout)
+        response = _urlopen(req, timeout=timeout, context=context)
     except URLError as err:
         raise SpackWebError("Download failed: {ERROR}".format(ERROR=str(err)))
 
@@ -344,6 +351,12 @@ def url_exists(url, curl=None):
     Simple Storage Service (`s3`) URLs; otherwise, the configured fetch
     method defined by `config:url_fetch_method` is used.
 
+    If the method is `curl`, it also uses the following configuration option:
+
+    * config:verify_ssl (str): Perform SSL verification
+
+    Otherwise, `urllib` will be used.
+
     Arguments:
         url (str): URL whose existence is being checked
         curl (spack.util.executable.Executable or None): (optional) curl
@@ -354,11 +367,31 @@ def url_exists(url, curl=None):
     tty.debug("Checking existence of {0}".format(url))
     url_result = url_util.parse(url)
 
-    # Use curl if configured to do so
-    use_curl = spack.config.get(
-        "config:url_fetch_method", "urllib"
-    ) == "curl" and url_result.scheme not in ("gs", "s3")
-    if use_curl:
+    # Check if a local file
+    local_path = url_util.local_file_path(url_result)
+    if local_path:
+        return os.path.exists(local_path)
+
+    # Check if Amazon Simple Storage Service (S3) .. urllib-based fetch
+    if url_result.scheme == "s3":
+        # Check for URL-specific connection information
+        s3 = s3_util.get_s3_session(url_result, method="fetch")
+
+        try:
+            s3.get_object(Bucket=url_result.netloc, Key=url_result.path.lstrip("/"))
+            return True
+        except s3.ClientError as err:
+            if err.response["Error"]["Code"] == "NoSuchKey":
+                return False
+            raise err
+
+    # Check if Google Storage .. urllib-based fetch
+    if url_result.scheme == "gs":
+        gcs = gcs_util.GCSBlob(url_result)
+        return gcs.exists()
+
+    # Otherwise, use the configured fetch method
+    if spack.config.get("config:url_fetch_method") == "curl":
         curl_exe = _curl(curl)
         if not curl_exe:
             return False
@@ -371,14 +404,13 @@ def url_exists(url, curl=None):
         _ = curl_exe(*curl_args, fail_on_error=False, output=os.devnull)
         return curl_exe.returncode == 0
 
-    # Otherwise use urllib.
+    # If we get here, then the only other fetch method option is urllib.
+    # So try to "read" from the URL and assume that *any* non-throwing
+    # response contains the resource represented by the URL.
     try:
-        urlopen(
-            Request(url, method="HEAD", headers={"User-Agent": SPACK_USER_AGENT}),
-            timeout=spack.config.get("config:connect_timeout", 10),
-        )
+        read_from_url(url)
         return True
-    except URLError as e:
+    except (SpackWebError, URLError) as e:
         tty.debug("Failure reading URL: " + str(e))
         return False
 
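Two behaviors restored above deserve a gloss: `read_from_url()` builds an SSL context only for schemes that need one, honoring `config:verify_ssl`, and `url_exists()` short-circuits `s3://` and `gs://` URLs before consulting `config:url_fetch_method`. A standalone sketch of the context selection follows; `make_context` and its parameters are invented for illustration, while `ssl._create_unverified_context()` is the same private API the restored code calls.

    import ssl


    def make_context(url_uses_ssl, verify_ssl):
        """Mirror the restored read_from_url() context selection."""
        if not url_uses_ssl:
            # http://, file://, etc. never need a context.
            return None
        if verify_ssl:
            # Verify certificates against the system trust store.
            return ssl.create_default_context()
        # The user explicitly opted out of SSL verification.
        return ssl._create_unverified_context()


    assert make_context(False, True) is None
    assert isinstance(make_context(True, True), ssl.SSLContext)
    assert isinstance(make_context(True, False), ssl.SSLContext)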
@@ -661,6 +693,35 @@ def spider(root_urls, depth=0, concurrency=32):
     return pages, links
 
 
+def _urlopen(req, *args, **kwargs):
+    """Wrapper for compatibility with old versions of Python."""
+    url = req
+    try:
+        url = url.get_full_url()
+    except AttributeError:
+        pass
+
+    del kwargs["context"]
+
+    opener = urlopen
+    if url_util.parse(url).scheme == "s3":
+        import spack.s3_handler
+
+        opener = spack.s3_handler.open
+    elif url_util.parse(url).scheme == "gs":
+        import spack.gcs_handler
+
+        opener = spack.gcs_handler.gcs_open
+
+    try:
+        return opener(req, *args, **kwargs)
+    except TypeError as err:
+        # If the above fails because of 'context', call without 'context'.
+        if "context" in kwargs and "context" in str(err):
+            del kwargs["context"]
+            return opener(req, *args, **kwargs)
+
+
 def find_versions_of_archive(
     archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
 ):
-- 
cgit v1.2.3-60-g2f50
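Taken together, the restored `_urlopen()` is a scheme-based dispatcher: `s3://` requests go to `spack.s3_handler.open`, `gs://` to `spack.gcs_handler.gcs_open`, and everything else to stock `urllib.request.urlopen`. A simplified, hypothetical model of that dispatch follows; the stub functions are placeholders, and the real wrapper additionally strips the `context` keyword for compatibility.

    from urllib.parse import urlparse
    from urllib.request import urlopen


    def open_s3(url, **kwargs):
        # Stand-in for spack.s3_handler.open (backed by the S3 client in Spack).
        raise NotImplementedError("would fetch via the S3 client")


    def open_gcs(url, **kwargs):
        # Stand-in for spack.gcs_handler.gcs_open.
        raise NotImplementedError("would fetch via the GCS client")


    def dispatch_open(url, **kwargs):
        # Pick an opener by URL scheme, defaulting to plain urllib.
        opener = {"s3": open_s3, "gs": open_gcs}.get(urlparse(url).scheme, urlopen)
        return opener(url, **kwargs)

Compared with the reverted design, which registered the custom handlers on a single `OpenerDirector`, this restored approach keeps the handlers out of urllib's machinery and chooses an opener per call.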