Diffstat (limited to 'lib')
-rw-r--r--   lib/spack/spack/gcs_handler.py |  11
-rw-r--r--   lib/spack/spack/s3_handler.py  |  38
-rw-r--r--   lib/spack/spack/test/web.py    |  13
-rw-r--r--   lib/spack/spack/util/web.py    | 189
4 files changed, 92 insertions, 159 deletions
diff --git a/lib/spack/spack/gcs_handler.py b/lib/spack/spack/gcs_handler.py
index 441eea6f80..4ee9f08896 100644
--- a/lib/spack/spack/gcs_handler.py
+++ b/lib/spack/spack/gcs_handler.py
@@ -4,8 +4,8 @@
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 import urllib.parse
 import urllib.response
-
-import spack.util.web as web_util
+from urllib.error import URLError
+from urllib.request import BaseHandler


 def gcs_open(req, *args, **kwargs):
@@ -16,8 +16,13 @@ def gcs_open(req, *args, **kwargs):
     gcsblob = gcs_util.GCSBlob(url)

     if not gcsblob.exists():
-        raise web_util.SpackWebError("GCS blob {0} does not exist".format(gcsblob.blob_path))
+        raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
     stream = gcsblob.get_blob_byte_stream()
     headers = gcsblob.get_blob_headers()

     return urllib.response.addinfourl(stream, headers, url)
+
+
+class GCSHandler(BaseHandler):
+    def gs_open(self, req):
+        return gcs_open(req)
diff --git a/lib/spack/spack/s3_handler.py b/lib/spack/spack/s3_handler.py
index a3e0aa991b..77a8a2f7cc 100644
--- a/lib/spack/spack/s3_handler.py
+++ b/lib/spack/spack/s3_handler.py
@@ -7,7 +7,7 @@ import urllib.error
 import urllib.parse
 import urllib.request
 import urllib.response
-from io import BufferedReader, IOBase
+from io import BufferedReader, BytesIO, IOBase

 import spack.util.s3 as s3_util

@@ -42,7 +42,7 @@ class WrapStream(BufferedReader):
         return getattr(self.raw, key)


-def _s3_open(url):
+def _s3_open(url, method="GET"):
     parsed = urllib.parse.urlparse(url)
     s3 = s3_util.get_s3_session(url, method="fetch")

@@ -52,27 +52,29 @@ def _s3_open(url):
     if key.startswith("/"):
         key = key[1:]

-    obj = s3.get_object(Bucket=bucket, Key=key)
+    if method not in ("GET", "HEAD"):
+        raise urllib.error.URLError(
+            "Only GET and HEAD verbs are currently supported for the s3:// scheme"
+        )
+
+    try:
+        if method == "GET":
+            obj = s3.get_object(Bucket=bucket, Key=key)
+            # NOTE(opadron): Apply workaround here (see above)
+            stream = WrapStream(obj["Body"])
+        elif method == "HEAD":
+            obj = s3.head_object(Bucket=bucket, Key=key)
+            stream = BytesIO()
+    except s3.ClientError as e:
+        raise urllib.error.URLError(e) from e

-    # NOTE(opadron): Apply workaround here (see above)
-    stream = WrapStream(obj["Body"])
     headers = obj["ResponseMetadata"]["HTTPHeaders"]

     return url, headers, stream


-class UrllibS3Handler(urllib.request.HTTPSHandler):
+class UrllibS3Handler(urllib.request.BaseHandler):
     def s3_open(self, req):
         orig_url = req.get_full_url()
-        from botocore.exceptions import ClientError  # type: ignore[import]
-
-        try:
-            url, headers, stream = _s3_open(orig_url)
-            return urllib.response.addinfourl(stream, headers, url)
-        except ClientError as err:
-            raise urllib.error.URLError(err) from err
-
-
-S3OpenerDirector = urllib.request.build_opener(UrllibS3Handler())
-
-open = S3OpenerDirector.open
+        url, headers, stream = _s3_open(orig_url, method=req.get_method())
+        return urllib.response.addinfourl(stream, headers, url)
diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py
index 476ea01019..166a577c89 100644
--- a/lib/spack/spack/test/web.py
+++ b/lib/spack/spack/test/web.py
@@ -224,7 +224,10 @@ class MockPaginator(object):

 class MockClientError(Exception):
     def __init__(self):
-        self.response = {"Error": {"Code": "NoSuchKey"}}
+        self.response = {
+            "Error": {"Code": "NoSuchKey"},
+            "ResponseMetadata": {"HTTPStatusCode": 404},
+        }


 class MockS3Client(object):
@@ -243,7 +246,13 @@ class MockS3Client(object):

     def get_object(self, Bucket=None, Key=None):
         self.ClientError = MockClientError
         if Bucket == "my-bucket" and Key == "subdirectory/my-file":
-            return True
+            return {"ResponseMetadata": {"HTTPHeaders": {}}}
+        raise self.ClientError
+
+    def head_object(self, Bucket=None, Key=None):
+        self.ClientError = MockClientError
+        if Bucket == "my-bucket" and Key == "subdirectory/my-file":
+            return {"ResponseMetadata": {"HTTPHeaders": {}}}
         raise self.ClientError

diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index 7c8964b3c9..9398a12dd8 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -18,7 +18,7 @@ import traceback
 import urllib.parse
 from html.parser import HTMLParser
 from urllib.error import URLError
-from urllib.request import Request, urlopen
+from urllib.request import HTTPSHandler, Request, build_opener

 import llnl.util.lang
 import llnl.util.tty as tty
@@ -27,6 +27,8 @@ from llnl.util.filesystem import mkdirp, rename, working_dir
 import spack
 import spack.config
 import spack.error
+import spack.gcs_handler
+import spack.s3_handler
 import spack.url
 import spack.util.crypto
 import spack.util.gcs as gcs_util
@@ -36,6 +38,28 @@ from spack.util.compression import ALLOWED_ARCHIVE_TYPES
 from spack.util.executable import CommandNotFoundError, which
 from spack.util.path import convert_to_posix_path

+
+def _urlopen():
+    s3 = spack.s3_handler.UrllibS3Handler()
+    gcs = spack.gcs_handler.GCSHandler()
+
+    # One opener with HTTPS ssl enabled
+    with_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl.create_default_context()))
+
+    # One opener with HTTPS ssl disabled
+    without_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl._create_unverified_context()))
+
+    # And dynamically dispatch based on the config:verify_ssl.
+    def dispatch_open(*args, **kwargs):
+        opener = with_ssl if spack.config.get("config:verify_ssl", True) else without_ssl
+        return opener.open(*args, **kwargs)
+
+    return dispatch_open
+
+
+#: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
+urlopen = llnl.util.lang.Singleton(_urlopen)
+
 #: User-Agent used in Request objects
 SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)

@@ -60,86 +84,33 @@ class LinkParser(HTMLParser):
                     self.links.append(val)


-def uses_ssl(parsed_url):
-    if parsed_url.scheme == "https":
-        return True
-
-    if parsed_url.scheme == "s3":
-        endpoint_url = os.environ.get("S3_ENDPOINT_URL")
-        if not endpoint_url:
-            return True
-
-        if urllib.parse.urlparse(endpoint_url).scheme == "https":
-            return True
-
-    elif parsed_url.scheme == "gs":
-        tty.debug("(uses_ssl) GCS Blob is https")
-        return True
-
-    return False
-
-
 def read_from_url(url, accept_content_type=None):
     if isinstance(url, str):
         url = urllib.parse.urlparse(url)
-    context = None

     # Timeout in seconds for web requests
     timeout = spack.config.get("config:connect_timeout", 10)
-
-    # Don't even bother with a context unless the URL scheme is one that uses
-    # SSL certs.
-    if uses_ssl(url):
-        if spack.config.get("config:verify_ssl"):
-            # User wants SSL verification, and it *can* be provided.
-            context = ssl.create_default_context()
-        else:
-            # User has explicitly indicated that they do not want SSL
-            # verification.
-            context = ssl._create_unverified_context()
-
-    url_scheme = url.scheme
-    url = url_util.format(url)
-    if sys.platform == "win32" and url_scheme == "file":
-        url = convert_to_posix_path(url)
-
-    req = Request(url, headers={"User-Agent": SPACK_USER_AGENT})
-
-    content_type = None
-    is_web_url = url_scheme in ("http", "https")
-    if accept_content_type and is_web_url:
-        # Make a HEAD request first to check the content type. This lets
-        # us ignore tarballs and gigantic files.
-        # It would be nice to do this with the HTTP Accept header to avoid
-        # one round-trip. However, most servers seem to ignore the header
-        # if you ask for a tarball with Accept: text/html.
-        req.get_method = lambda: "HEAD"
-        resp = _urlopen(req, timeout=timeout, context=context)
-
-        content_type = get_header(resp.headers, "Content-type")
-
-        # Do the real GET request when we know it's just HTML.
-        req.get_method = lambda: "GET"
+    request = Request(url.geturl(), headers={"User-Agent": SPACK_USER_AGENT})

     try:
-        response = _urlopen(req, timeout=timeout, context=context)
+        response = urlopen(request, timeout=timeout)
     except URLError as err:
-        raise SpackWebError("Download failed: {ERROR}".format(ERROR=str(err)))
-
-    if accept_content_type and not is_web_url:
-        content_type = get_header(response.headers, "Content-type")
+        raise SpackWebError("Download failed: {}".format(str(err)))

-    reject_content_type = accept_content_type and (
-        content_type is None or not content_type.startswith(accept_content_type)
-    )
-
-    if reject_content_type:
-        tty.debug(
-            "ignoring page {0}{1}{2}".format(
-                url, " with content type " if content_type is not None else "", content_type or ""
-            )
-        )
-
-        return None, None, None
+    if accept_content_type:
+        try:
+            content_type = get_header(response.headers, "Content-type")
+            reject_content_type = not content_type.startswith(accept_content_type)
+        except KeyError:
+            content_type = None
+            reject_content_type = True
+
+        if reject_content_type:
+            msg = "ignoring page {}".format(url.geturl())
+            if content_type:
+                msg += " with content type {}".format(content_type)
+            tty.debug(msg)
+            return None, None, None

     return response.geturl(), response.headers, response
@@ -349,12 +320,6 @@ def url_exists(url, curl=None):
     Simple Storage Service (`s3`) URLs; otherwise, the configured fetch
     method defined by `config:url_fetch_method` is used.

-    If the method is `curl`, it also uses the following configuration option:
-
-    * config:verify_ssl (str): Perform SSL verification
-
-    Otherwise, `urllib` will be used.
-
     Arguments:
         url (str): URL whose existence is being checked
         curl (spack.util.executable.Executable or None): (optional) curl
@@ -365,31 +330,11 @@ def url_exists(url, curl=None):
     tty.debug("Checking existence of {0}".format(url))
     url_result = urllib.parse.urlparse(url)

-    # Check if a local file
-    local_path = url_util.local_file_path(url_result)
-    if local_path:
-        return os.path.exists(local_path)
-
-    # Check if Amazon Simple Storage Service (S3) .. urllib-based fetch
-    if url_result.scheme == "s3":
-        # Check for URL-specific connection information
-        s3 = s3_util.get_s3_session(url_result, method="fetch")
-
-        try:
-            s3.get_object(Bucket=url_result.netloc, Key=url_result.path.lstrip("/"))
-            return True
-        except s3.ClientError as err:
-            if err.response["Error"]["Code"] == "NoSuchKey":
-                return False
-            raise err
-
-    # Check if Google Storage .. urllib-based fetch
-    if url_result.scheme == "gs":
-        gcs = gcs_util.GCSBlob(url_result)
-        return gcs.exists()
-
-    # Otherwise, use the configured fetch method
-    if spack.config.get("config:url_fetch_method") == "curl":
+    # Use curl if configured to do so
+    use_curl = spack.config.get(
+        "config:url_fetch_method", "urllib"
+    ) == "curl" and url_result.scheme not in ("gs", "s3")
+    if use_curl:
         curl_exe = _curl(curl)
         if not curl_exe:
             return False
@@ -402,13 +347,14 @@
         _ = curl_exe(*curl_args, fail_on_error=False, output=os.devnull)
         return curl_exe.returncode == 0

-    # If we get here, then the only other fetch method option is urllib.
-    # So try to "read" from the URL and assume that *any* non-throwing
-    # response contains the resource represented by the URL.
+    # Otherwise use urllib.
     try:
-        read_from_url(url)
+        urlopen(
+            Request(url, method="HEAD", headers={"User-Agent": SPACK_USER_AGENT}),
+            timeout=spack.config.get("config:connect_timeout", 10),
+        )
         return True
-    except (SpackWebError, URLError) as e:
+    except URLError as e:
         tty.debug("Failure reading URL: " + str(e))
         return False

@@ -691,35 +637,6 @@ def spider(root_urls, depth=0, concurrency=32):
     return pages, links


-def _urlopen(req, *args, **kwargs):
-    """Wrapper for compatibility with old versions of Python."""
-    url = req
-    try:
-        url = url.get_full_url()
-    except AttributeError:
-        pass
-
-    del kwargs["context"]
-
-    opener = urlopen
-    if urllib.parse.urlparse(url).scheme == "s3":
-        import spack.s3_handler
-
-        opener = spack.s3_handler.open
-    elif urllib.parse.urlparse(url).scheme == "gs":
-        import spack.gcs_handler
-
-        opener = spack.gcs_handler.gcs_open
-
-    try:
-        return opener(req, *args, **kwargs)
-    except TypeError as err:
-        # If the above fails because of 'context', call without 'context'.
-        if "context" in kwargs and "context" in str(err):
-            del kwargs["context"]
-            return opener(req, *args, **kwargs)
-
-
 def find_versions_of_archive(
     archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
 ):
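
A note for readers following the mechanics (this commentary and the sketch below are not part of the patch): urllib's OpenerDirector dispatches each request to a handler method named "<scheme>_open" on whichever handler installed via build_opener() defines it. That is the mechanism behind UrllibS3Handler.s3_open and GCSHandler.gs_open above, and it is also why SSL verification can be baked into the opener itself instead of being threaded through every call as a "context" argument, as the deleted _urlopen wrapper used to do. The sketch below uses an invented demo:// scheme; DemoHandler and _make_urlopen are illustrative names only and do not exist in Spack.

import io
import ssl
import urllib.response
from urllib.request import BaseHandler, HTTPSHandler, Request, build_opener


class DemoHandler(BaseHandler):
    # OpenerDirector looks for a method called "<scheme>_open" on every
    # installed handler, so defining demo_open() is enough to claim the
    # (made-up) demo:// scheme -- the same mechanism s3_open/gs_open rely on.
    def demo_open(self, req):
        body = io.BytesIO(b"hello from " + req.get_full_url().encode())
        return urllib.response.addinfourl(body, headers={}, url=req.get_full_url())


def _make_urlopen(verify_ssl=True):
    # Simplified version of the dispatch in the new spack.util.web._urlopen:
    # build an opener whose HTTPS handler carries the desired SSL context,
    # with the scheme-specific handler installed alongside it.
    context = ssl.create_default_context() if verify_ssl else ssl._create_unverified_context()
    opener = build_opener(DemoHandler(), HTTPSHandler(context=context))

    def dispatch_open(*args, **kwargs):
        return opener.open(*args, **kwargs)

    return dispatch_open


if __name__ == "__main__":
    urlopen = _make_urlopen(verify_ssl=True)
    # HEAD requests travel the same path, via Request(url, method="HEAD"),
    # which is how the rewritten url_exists() probes a URL without a body.
    with urlopen(Request("demo://example/resource")) as resp:
        print(resp.read())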