From fb9e5fcc4f5307deaf10fcd571ebea68188d859c Mon Sep 17 00:00:00 2001 From: Massimiliano Culpo Date: Fri, 15 Sep 2023 15:43:23 +0200 Subject: Group primitive url/path handling functions together (#40028) --- lib/spack/llnl/url.py | 459 +++++++++++++++++++++++++++++++ lib/spack/spack/cmd/create.py | 2 +- lib/spack/spack/cmd/url.py | 4 +- lib/spack/spack/fetch_strategy.py | 7 +- lib/spack/spack/gcs_handler.py | 28 -- lib/spack/spack/main.py | 1 - lib/spack/spack/mirror.py | 4 +- lib/spack/spack/package_base.py | 2 +- lib/spack/spack/patch.py | 2 +- lib/spack/spack/s3_handler.py | 80 ------ lib/spack/spack/test/llnl/url.py | 167 +++++++++++ lib/spack/spack/test/url_parse.py | 113 -------- lib/spack/spack/test/util/compression.py | 38 +-- lib/spack/spack/test/web.py | 19 +- lib/spack/spack/url.py | 439 +++++++++-------------------- lib/spack/spack/util/compression.py | 155 ++--------- lib/spack/spack/util/gcs.py | 22 ++ lib/spack/spack/util/path.py | 9 - lib/spack/spack/util/s3.py | 75 ++++- lib/spack/spack/util/web.py | 159 ++--------- 20 files changed, 901 insertions(+), 884 deletions(-) create mode 100644 lib/spack/llnl/url.py delete mode 100644 lib/spack/spack/gcs_handler.py delete mode 100644 lib/spack/spack/s3_handler.py create mode 100644 lib/spack/spack/test/llnl/url.py (limited to 'lib') diff --git a/lib/spack/llnl/url.py b/lib/spack/llnl/url.py new file mode 100644 index 0000000000..40e7606506 --- /dev/null +++ b/lib/spack/llnl/url.py @@ -0,0 +1,459 @@ +# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) +"""URL primitives that just require Python standard library.""" +import itertools +import os.path +import re +from typing import Optional, Set, Tuple +from urllib.parse import urlsplit, urlunsplit + +# Archive extensions allowed in Spack +PREFIX_EXTENSIONS = ("tar", "TAR") +EXTENSIONS = ("gz", "bz2", "xz", "Z") +NO_TAR_EXTENSIONS = ("zip", "tgz", "tbz2", "tbz", "txz") + +# Add PREFIX_EXTENSIONS and EXTENSIONS last so that .tar.gz is matched *before* .tar or .gz +ALLOWED_ARCHIVE_TYPES = ( + tuple(".".join(ext) for ext in itertools.product(PREFIX_EXTENSIONS, EXTENSIONS)) + + PREFIX_EXTENSIONS + + EXTENSIONS + + NO_TAR_EXTENSIONS +) +CONTRACTION_MAP = {"tgz": "tar.gz", "txz": "tar.xz", "tbz": "tar.bz2", "tbz2": "tar.bz2"} + + +def find_list_urls(url: str) -> Set[str]: + r"""Find good list URLs for the supplied URL. + + By default, returns the dirname of the archive path. + + Provides special treatment for the following websites, which have a + unique list URL different from the dirname of the download URL: + + ========= ======================================================= + GitHub https://github.com///releases + GitLab https://gitlab.\*///tags + BitBucket https://bitbucket.org///downloads/?tab=tags + CRAN https://\*.r-project.org/src/contrib/Archive/ + PyPI https://pypi.org/simple// + LuaRocks https://luarocks.org/modules// + ========= ======================================================= + + Note: this function is called by `spack versions`, `spack checksum`, + and `spack create`, but not by `spack fetch` or `spack install`. + + Parameters: + url (str): The download URL for the package + + Returns: + set: One or more list URLs for the package + """ + + url_types = [ + # GitHub + # e.g. 
https://github.com/llnl/callpath/archive/v1.0.1.tar.gz + (r"(.*github\.com/[^/]+/[^/]+)", lambda m: m.group(1) + "/releases"), + # GitLab API endpoint + # e.g. https://gitlab.dkrz.de/api/v4/projects/k202009%2Flibaec/repository/archive.tar.gz?sha=v1.0.2 + ( + r"(.*gitlab[^/]+)/api/v4/projects/([^/]+)%2F([^/]+)", + lambda m: m.group(1) + "/" + m.group(2) + "/" + m.group(3) + "/tags", + ), + # GitLab non-API endpoint + # e.g. https://gitlab.dkrz.de/k202009/libaec/uploads/631e85bcf877c2dcaca9b2e6d6526339/libaec-1.0.0.tar.gz + (r"(.*gitlab[^/]+/(?!api/v4/projects)[^/]+/[^/]+)", lambda m: m.group(1) + "/tags"), + # BitBucket + # e.g. https://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2 + (r"(.*bitbucket.org/[^/]+/[^/]+)", lambda m: m.group(1) + "/downloads/?tab=tags"), + # CRAN + # e.g. https://cran.r-project.org/src/contrib/Rcpp_0.12.9.tar.gz + # e.g. https://cloud.r-project.org/src/contrib/rgl_0.98.1.tar.gz + ( + r"(.*\.r-project\.org/src/contrib)/([^_]+)", + lambda m: m.group(1) + "/Archive/" + m.group(2), + ), + # PyPI + # e.g. https://pypi.io/packages/source/n/numpy/numpy-1.19.4.zip + # e.g. https://www.pypi.io/packages/source/n/numpy/numpy-1.19.4.zip + # e.g. https://pypi.org/packages/source/n/numpy/numpy-1.19.4.zip + # e.g. https://pypi.python.org/packages/source/n/numpy/numpy-1.19.4.zip + # e.g. https://files.pythonhosted.org/packages/source/n/numpy/numpy-1.19.4.zip + # e.g. https://pypi.io/packages/py2.py3/o/opencensus-context/opencensus_context-0.1.1-py2.py3-none-any.whl + ( + r"(?:pypi|pythonhosted)[^/]+/packages/[^/]+/./([^/]+)", + lambda m: "https://pypi.org/simple/" + m.group(1) + "/", + ), + # LuaRocks + # e.g. https://luarocks.org/manifests/gvvaughan/lpeg-1.0.2-1.src.rock + # e.g. https://luarocks.org/manifests/openresty/lua-cjson-2.1.0-1.src.rock + ( + r"luarocks[^/]+/(?:modules|manifests)/(?P[^/]+)/" + + r"(?P.+?)-[0-9.-]*\.src\.rock", + lambda m: "https://luarocks.org/modules/" + + m.group("org") + + "/" + + m.group("name") + + "/", + ), + ] + + list_urls = {os.path.dirname(url)} + + for pattern, fun in url_types: + match = re.search(pattern, url) + if match: + list_urls.add(fun(match)) + + return list_urls + + +def strip_query_and_fragment(url: str) -> Tuple[str, str]: + """Strips query and fragment from a url, then returns the base url and the suffix. + + Args: + url: URL to be stripped + + Raises: + ValueError: when there is any error parsing the URL + """ + components = urlsplit(url) + stripped = components[:3] + (None, None) + + query, frag = components[3:5] + suffix = "" + if query: + suffix += "?" + query + if frag: + suffix += "#" + frag + + return urlunsplit(stripped), suffix + + +SOURCEFORGE_RE = re.compile(r"(.*(?:sourceforge\.net|sf\.net)/.*)(/download)$") + + +def split_url_on_sourceforge_suffix(url: str) -> Tuple[str, ...]: + """If the input is a sourceforge URL, returns base URL and "/download" suffix. Otherwise, + returns the input URL and an empty string. + """ + match = SOURCEFORGE_RE.search(url) + if match is not None: + return match.groups() + return url, "" + + +def has_extension(path_or_url: str, ext: str) -> bool: + """Returns true if the extension in input is present in path, false otherwise.""" + prefix, _ = split_url_on_sourceforge_suffix(path_or_url) + if not ext.startswith(r"\."): + ext = rf"\.{ext}$" + + if re.search(ext, prefix): + return True + return False + + +def extension_from_path(path_or_url: Optional[str]) -> Optional[str]: + """Tries to match an allowed archive extension to the input. 
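+
+    Doctest-style sketch of the intended behavior (illustrative examples
+    only, derived from the ALLOWED_ARCHIVE_TYPES ordering above):
+
+        >>> extension_from_path("foo-1.0.tar.gz")
+        'tar.gz'
+        >>> extension_from_path("foo-1.0.txt") is None
+        True
+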
Returns the first match, + or None if no match was found. + + Raises: + ValueError: if the input is None + """ + if path_or_url is None: + raise ValueError("Can't call extension() on None") + + for t in ALLOWED_ARCHIVE_TYPES: + if has_extension(path_or_url, t): + return t + return None + + +def remove_extension(path_or_url: str, *, extension: str) -> str: + """Returns the input with the extension removed""" + suffix = rf"\.{extension}$" + return re.sub(suffix, "", path_or_url) + + +def check_and_remove_ext(path: str, *, extension: str) -> str: + """Returns the input path with the extension removed, if the extension is present in path. + Otherwise, returns the input unchanged. + """ + if not has_extension(path, extension): + return path + path, _ = split_url_on_sourceforge_suffix(path) + return remove_extension(path, extension=extension) + + +def strip_extension(path_or_url: str, *, extension: Optional[str] = None) -> str: + """If a path contains the extension in input, returns the path stripped of the extension. + Otherwise, returns the input path. + + If extension is None, attempts to strip any allowed extension from path. + """ + if extension is None: + for t in ALLOWED_ARCHIVE_TYPES: + if has_extension(path_or_url, ext=t): + extension = t + break + else: + return path_or_url + + return check_and_remove_ext(path_or_url, extension=extension) + + +def split_url_extension(url: str) -> Tuple[str, ...]: + """Some URLs have a query string, e.g.: + + 1. https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7.tgz?raw=true + 2. http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin.tar.gz + 3. https://gitlab.kitware.com/vtk/vtk/repository/archive.tar.bz2?ref=v7.0.0 + + In (1), the query string needs to be stripped to get at the + extension, but in (2) & (3), the filename is IN a single final query + argument. + + This strips the URL into three pieces: ``prefix``, ``ext``, and ``suffix``. + The suffix contains anything that was stripped off the URL to + get at the file extension. In (1), it will be ``'?raw=true'``, but + in (2), it will be empty. In (3) the suffix is a parameter that follows + after the file extension, e.g.: + + 1. ``('https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7', '.tgz', '?raw=true')`` + 2. ``('http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin', '.tar.gz', None)`` + 3. ``('https://gitlab.kitware.com/vtk/vtk/repository/archive', '.tar.bz2', '?ref=v7.0.0')`` + """ + # Strip off sourceforge download suffix. + # e.g. https://sourceforge.net/projects/glew/files/glew/2.0.0/glew-2.0.0.tgz/download + prefix, suffix = split_url_on_sourceforge_suffix(url) + + ext = extension_from_path(prefix) + if ext is not None: + prefix = strip_extension(prefix) + return prefix, ext, suffix + + try: + prefix, suf = strip_query_and_fragment(prefix) + except ValueError: + # FIXME: tty.debug("Got error parsing path %s" % path) + # Ignore URL parse errors here + return url, "" + + ext = extension_from_path(prefix) + prefix = strip_extension(prefix) + suffix = suf + suffix + if ext is None: + ext = "" + + return prefix, ext, suffix + + +def strip_version_suffixes(path_or_url: str) -> str: + """Some tarballs contain extraneous information after the version: + + * ``bowtie2-2.2.5-source`` + * ``libevent-2.0.21-stable`` + * ``cuda_8.0.44_linux.run`` + + These strings are not part of the version number and should be ignored. + This function strips those suffixes off and returns the remaining string. 
+ The goal is that the version is always the last thing in ``path``: + + * ``bowtie2-2.2.5`` + * ``libevent-2.0.21`` + * ``cuda_8.0.44`` + + Args: + path_or_url: The filename or URL for the package + + Returns: + The ``path`` with any extraneous suffixes removed + """ + # NOTE: This could be done with complicated regexes in parse_version_offset + # NOTE: The problem is that we would have to add these regexes to the end + # NOTE: of every single version regex. Easier to just strip them off + # NOTE: permanently + + suffix_regexes = [ + # Download type + r"[Ii]nstall", + r"all", + r"code", + r"[Ss]ources?", + r"file", + r"full", + r"single", + r"with[a-zA-Z_-]+", + r"rock", + r"src(_0)?", + r"public", + r"bin", + r"binary", + r"run", + r"[Uu]niversal", + r"jar", + r"complete", + r"dynamic", + r"oss", + r"gem", + r"tar", + r"sh", + # Download version + r"release", + r"bin", + r"stable", + r"[Ff]inal", + r"rel", + r"orig", + r"dist", + r"\+", + # License + r"gpl", + # Arch + # Needs to come before and after OS, appears in both orders + r"ia32", + r"intel", + r"amd64", + r"linux64", + r"x64", + r"64bit", + r"x86[_-]64", + r"i586_64", + r"x86", + r"i[36]86", + r"ppc64(le)?", + r"armv?(7l|6l|64)", + # Other + r"cpp", + r"gtk", + r"incubating", + # OS + r"[Ll]inux(_64)?", + r"LINUX", + r"[Uu]ni?x", + r"[Ss]un[Oo][Ss]", + r"[Mm]ac[Oo][Ss][Xx]?", + r"[Oo][Ss][Xx]", + r"[Dd]arwin(64)?", + r"[Aa]pple", + r"[Ww]indows", + r"[Ww]in(64|32)?", + r"[Cc]ygwin(64|32)?", + r"[Mm]ingw", + r"centos", + # Arch + # Needs to come before and after OS, appears in both orders + r"ia32", + r"intel", + r"amd64", + r"linux64", + r"x64", + r"64bit", + r"x86[_-]64", + r"i586_64", + r"x86", + r"i[36]86", + r"ppc64(le)?", + r"armv?(7l|6l|64)?", + # PyPI + r"[._-]py[23].*\.whl", + r"[._-]cp[23].*\.whl", + r"[._-]win.*\.exe", + ] + + for regex in suffix_regexes: + # Remove the suffix from the end of the path + # This may be done multiple times + path_or_url = re.sub(r"[._-]?" + regex + "$", "", path_or_url) + + return path_or_url + + +def expand_contracted_extension(extension: str) -> str: + """Returns the expanded version of a known contracted extension. + + This function maps extensions like ".tgz" to ".tar.gz". On unknown extensions, + return the input unmodified. + """ + extension = extension.strip(".") + return CONTRACTION_MAP.get(extension, extension) + + +def expand_contracted_extension_in_path( + path_or_url: str, *, extension: Optional[str] = None +) -> str: + """Returns the input path or URL with any contraction extension expanded. + + Args: + path_or_url: path or URL to be expanded + extension: if specified, only attempt to expand that extension + """ + extension = extension or extension_from_path(path_or_url) + if extension is None: + return path_or_url + + expanded = expand_contracted_extension(extension) + if expanded != extension: + return re.sub(rf"{extension}", rf"{expanded}", path_or_url) + return path_or_url + + +def compression_ext_from_compressed_archive(extension: str) -> Optional[str]: + """Returns compression extension for a compressed archive""" + extension = expand_contracted_extension(extension) + for ext in [*EXTENSIONS]: + if ext in extension: + return ext + return None + + +def strip_compression_extension(path_or_url: str, ext: Optional[str] = None) -> str: + """Strips the compression extension from the input, and returns it. For instance, + "foo.tgz" becomes "foo.tar". + + If no extension is given, try a default list of extensions. 
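+
+    Doctest-style sketch of both modes (illustrative examples only):
+
+        >>> strip_compression_extension("foo.xz")
+        'foo'
+        >>> strip_compression_extension("foo.tar.gz", "gz")
+        'foo.tar'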
+ + Args: + path_or_url: input to be stripped + ext: if given, extension to be stripped + """ + if not extension_from_path(path_or_url): + return path_or_url + + expanded_path = expand_contracted_extension_in_path(path_or_url) + candidates = [ext] if ext is not None else EXTENSIONS + for current_extension in candidates: + modified_path = check_and_remove_ext(expanded_path, extension=current_extension) + if modified_path != expanded_path: + return modified_path + return expanded_path + + +def allowed_archive(path_or_url: str) -> bool: + """Returns true if the input is a valid archive, False otherwise.""" + return ( + False if not path_or_url else any(path_or_url.endswith(t) for t in ALLOWED_ARCHIVE_TYPES) + ) + + +def determine_url_file_extension(path: str) -> str: + """This returns the type of archive a URL refers to. This is + sometimes confusing because of URLs like: + + (1) https://github.com/petdance/ack/tarball/1.93_02 + + Where the URL doesn't actually contain the filename. We need + to know what type it is so that we can appropriately name files + in mirrors. + """ + match = re.search(r"github.com/.+/(zip|tar)ball/", path) + if match: + if match.group(1) == "zip": + return "zip" + elif match.group(1) == "tar": + return "tar.gz" + + prefix, ext, suffix = split_url_extension(path) + return ext diff --git a/lib/spack/spack/cmd/create.py b/lib/spack/spack/cmd/create.py index 9c923c4a17..e3569d998f 100644 --- a/lib/spack/spack/cmd/create.py +++ b/lib/spack/spack/cmd/create.py @@ -822,7 +822,7 @@ def get_versions(args, name): if args.url is not None and args.template != "bundle" and valid_url: # Find available versions try: - url_dict = spack.util.web.find_versions_of_archive(args.url) + url_dict = spack.url.find_versions_of_archive(args.url) except UndetectableVersionError: # Use fake versions tty.warn("Couldn't detect version in: {0}".format(args.url)) diff --git a/lib/spack/spack/cmd/url.py b/lib/spack/spack/cmd/url.py index 8f7866c406..25f8ad382a 100644 --- a/lib/spack/spack/cmd/url.py +++ b/lib/spack/spack/cmd/url.py @@ -12,6 +12,7 @@ from llnl.util import tty import spack.fetch_strategy as fs import spack.repo import spack.spec +import spack.url import spack.util.crypto as crypto from spack.url import ( UndetectableNameError, @@ -26,7 +27,6 @@ from spack.url import ( substitution_offsets, ) from spack.util.naming import simplify_name -from spack.util.web import find_versions_of_archive description = "debugging tool for url parsing" section = "developer" @@ -139,7 +139,7 @@ def url_parse(args): if args.spider: print() tty.msg("Spidering for versions:") - versions = find_versions_of_archive(url) + versions = spack.url.find_versions_of_archive(url) if not versions: print(" Found no versions for {0}".format(name)) diff --git a/lib/spack/spack/fetch_strategy.py b/lib/spack/spack/fetch_strategy.py index 1f99c4ce9e..87c6e0fc61 100644 --- a/lib/spack/spack/fetch_strategy.py +++ b/lib/spack/spack/fetch_strategy.py @@ -31,6 +31,7 @@ import shutil import urllib.parse from typing import List, Optional +import llnl.url import llnl.util import llnl.util.filesystem as fs import llnl.util.tty as tty @@ -46,7 +47,7 @@ import spack.util.url as url_util import spack.util.web as web_util import spack.version import spack.version.git_ref_lookup -from spack.util.compression import decompressor_for, extension_from_path +from spack.util.compression import decompressor_for from spack.util.executable import CommandNotFoundError, which from spack.util.string import comma_and, quote @@ -441,7 +442,7 @@ class 
URLFetchStrategy(FetchStrategy): # TODO: replace this by mime check. if not self.extension: - self.extension = spack.url.determine_url_file_extension(self.url) + self.extension = llnl.url.determine_url_file_extension(self.url) if self.stage.expanded: tty.debug("Source already staged to %s" % self.stage.source_path) @@ -570,7 +571,7 @@ class VCSFetchStrategy(FetchStrategy): @_needs_stage def archive(self, destination, **kwargs): - assert extension_from_path(destination) == "tar.gz" + assert llnl.url.extension_from_path(destination) == "tar.gz" assert self.stage.source_path.startswith(self.stage.path) tar = which("tar", required=True) diff --git a/lib/spack/spack/gcs_handler.py b/lib/spack/spack/gcs_handler.py deleted file mode 100644 index b002fa70ac..0000000000 --- a/lib/spack/spack/gcs_handler.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other -# Spack Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: (Apache-2.0 OR MIT) -import urllib.parse -import urllib.response -from urllib.error import URLError -from urllib.request import BaseHandler - - -def gcs_open(req, *args, **kwargs): - """Open a reader stream to a blob object on GCS""" - import spack.util.gcs as gcs_util - - url = urllib.parse.urlparse(req.get_full_url()) - gcsblob = gcs_util.GCSBlob(url) - - if not gcsblob.exists(): - raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path)) - stream = gcsblob.get_blob_byte_stream() - headers = gcsblob.get_blob_headers() - - return urllib.response.addinfourl(stream, headers, url) - - -class GCSHandler(BaseHandler): - def gs_open(self, req): - return gcs_open(req) diff --git a/lib/spack/spack/main.py b/lib/spack/spack/main.py index 009190829f..3b330c08d4 100644 --- a/lib/spack/spack/main.py +++ b/lib/spack/spack/main.py @@ -30,7 +30,6 @@ import llnl.util.tty.colify import llnl.util.tty.color as color from llnl.util.tty.log import log_output -import spack import spack.cmd import spack.config import spack.environment as ev diff --git a/lib/spack/spack/mirror.py b/lib/spack/spack/mirror.py index e4825537db..32037502c5 100644 --- a/lib/spack/spack/mirror.py +++ b/lib/spack/spack/mirror.py @@ -20,6 +20,7 @@ import traceback import urllib.parse from typing import Optional, Union +import llnl.url import llnl.util.tty as tty from llnl.util.filesystem import mkdirp @@ -29,7 +30,6 @@ import spack.error import spack.fetch_strategy as fs import spack.mirror import spack.spec -import spack.url as url import spack.util.path import spack.util.spack_json as sjson import spack.util.spack_yaml as syaml @@ -375,7 +375,7 @@ def _determine_extension(fetcher): if isinstance(fetcher, fs.URLFetchStrategy): if fetcher.expand_archive: # If we fetch with a URLFetchStrategy, use URL's archive type - ext = url.determine_url_file_extension(fetcher.url) + ext = llnl.url.determine_url_file_extension(fetcher.url) if ext: # Remove any leading dots diff --git a/lib/spack/spack/package_base.py b/lib/spack/spack/package_base.py index 5a14f44f31..67cebb3a8f 100644 --- a/lib/spack/spack/package_base.py +++ b/lib/spack/spack/package_base.py @@ -2377,7 +2377,7 @@ class PackageBase(WindowsRPath, PackageViewMixin, metaclass=PackageMeta): return {} try: - return spack.util.web.find_versions_of_archive( + return spack.url.find_versions_of_archive( self.all_urls, self.list_url, self.list_depth, concurrency, reference_package=self ) except spack.util.web.NoNetworkConnectionError as e: diff --git a/lib/spack/spack/patch.py 
b/lib/spack/spack/patch.py index a7fb3620ee..7bbab326d1 100644 --- a/lib/spack/spack/patch.py +++ b/lib/spack/spack/patch.py @@ -11,6 +11,7 @@ import sys import llnl.util.filesystem import llnl.util.lang +from llnl.url import allowed_archive import spack import spack.error @@ -19,7 +20,6 @@ import spack.mirror import spack.repo import spack.stage import spack.util.spack_json as sjson -from spack.util.compression import allowed_archive from spack.util.crypto import Checker, checksum from spack.util.executable import which, which_string diff --git a/lib/spack/spack/s3_handler.py b/lib/spack/spack/s3_handler.py deleted file mode 100644 index efab23a5ea..0000000000 --- a/lib/spack/spack/s3_handler.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other -# Spack Project Developers. See the top-level COPYRIGHT file for details. -# -# SPDX-License-Identifier: (Apache-2.0 OR MIT) - -import urllib.error -import urllib.parse -import urllib.request -import urllib.response -from io import BufferedReader, BytesIO, IOBase - -import spack.util.s3 as s3_util - - -# NOTE(opadron): Workaround issue in boto where its StreamingBody -# implementation is missing several APIs expected from IOBase. These missing -# APIs prevent the streams returned by boto from being passed as-are along to -# urllib. -# -# https://github.com/boto/botocore/issues/879 -# https://github.com/python/cpython/pull/3249 -class WrapStream(BufferedReader): - def __init__(self, raw): - # In botocore >=1.23.47, StreamingBody inherits from IOBase, so we - # only add missing attributes in older versions. - # https://github.com/boto/botocore/commit/a624815eabac50442ed7404f3c4f2664cd0aa784 - if not isinstance(raw, IOBase): - raw.readable = lambda: True - raw.writable = lambda: False - raw.seekable = lambda: False - raw.closed = False - raw.flush = lambda: None - super().__init__(raw) - - def detach(self): - self.raw = None - - def read(self, *args, **kwargs): - return self.raw.read(*args, **kwargs) - - def __getattr__(self, key): - return getattr(self.raw, key) - - -def _s3_open(url, method="GET"): - parsed = urllib.parse.urlparse(url) - s3 = s3_util.get_s3_session(url, method="fetch") - - bucket = parsed.netloc - key = parsed.path - - if key.startswith("/"): - key = key[1:] - - if method not in ("GET", "HEAD"): - raise urllib.error.URLError( - "Only GET and HEAD verbs are currently supported for the s3:// scheme" - ) - - try: - if method == "GET": - obj = s3.get_object(Bucket=bucket, Key=key) - # NOTE(opadron): Apply workaround here (see above) - stream = WrapStream(obj["Body"]) - elif method == "HEAD": - obj = s3.head_object(Bucket=bucket, Key=key) - stream = BytesIO() - except s3.ClientError as e: - raise urllib.error.URLError(e) from e - - headers = obj["ResponseMetadata"]["HTTPHeaders"] - - return url, headers, stream - - -class UrllibS3Handler(urllib.request.BaseHandler): - def s3_open(self, req): - orig_url = req.get_full_url() - url, headers, stream = _s3_open(orig_url, method=req.get_method()) - return urllib.response.addinfourl(stream, headers, url) diff --git a/lib/spack/spack/test/llnl/url.py b/lib/spack/spack/test/llnl/url.py new file mode 100644 index 0000000000..8da8e727ec --- /dev/null +++ b/lib/spack/spack/test/llnl/url.py @@ -0,0 +1,167 @@ +# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) +"""Tests for llnl.url functions""" +import itertools + +import pytest + +import llnl.url + + +@pytest.fixture(params=llnl.url.ALLOWED_ARCHIVE_TYPES) +def archive_and_expected(request): + archive_name = ".".join(["Foo", request.param]) + return archive_name, request.param + + +def test_get_extension(archive_and_expected): + """Tests that we can predict correctly known extensions for simple cases.""" + archive, expected = archive_and_expected + result = llnl.url.extension_from_path(archive) + assert result == expected + + +def test_get_bad_extension(): + """Tests that a bad extension returns None""" + result = llnl.url.extension_from_path("Foo.cxx") + assert result is None + + +@pytest.mark.parametrize( + "url,expected", + [ + # No suffix + ("rgb-1.0.6", "rgb-1.0.6"), + # Misleading prefix + ("jpegsrc.v9b", "jpegsrc.v9b"), + ("turbolinux702", "turbolinux702"), + ("converge_install_2.3.16", "converge_install_2.3.16"), + # Download type - code, source + ("cistem-1.0.0-beta-source-code", "cistem-1.0.0-beta"), + # Download type - src + ("apache-ant-1.9.7-src", "apache-ant-1.9.7"), + ("go1.7.4.src", "go1.7.4"), + # Download type - source + ("bowtie2-2.2.5-source", "bowtie2-2.2.5"), + ("grib_api-1.17.0-Source", "grib_api-1.17.0"), + # Download type - full + ("julia-0.4.3-full", "julia-0.4.3"), + # Download type - bin + ("apache-maven-3.3.9-bin", "apache-maven-3.3.9"), + # Download type - binary + ("Jmol-14.8.0-binary", "Jmol-14.8.0"), + # Download type - gem + ("rubysl-date-2.0.9.gem", "rubysl-date-2.0.9"), + # Download type - tar + ("gromacs-4.6.1-tar", "gromacs-4.6.1"), + # Download type - sh + ("Miniconda2-4.3.11-Linux-x86_64.sh", "Miniconda2-4.3.11"), + # Download version - release + ("v1.0.4-release", "v1.0.4"), + # Download version - stable + ("libevent-2.0.21-stable", "libevent-2.0.21"), + # Download version - final + ("2.6.7-final", "2.6.7"), + # Download version - rel + ("v1.9.5.1rel", "v1.9.5.1"), + # Download version - orig + ("dash_0.5.5.1.orig", "dash_0.5.5.1"), + # Download version - plus + ("ncbi-blast-2.6.0+-src", "ncbi-blast-2.6.0"), + # License + ("cppad-20170114.gpl", "cppad-20170114"), + # Arch + ("pcraster-4.1.0_x86-64", "pcraster-4.1.0"), + ("dislin-11.0.linux.i586_64", "dislin-11.0"), + ("PAGIT.V1.01.64bit", "PAGIT.V1.01"), + # OS - linux + ("astyle_2.04_linux", "astyle_2.04"), + # OS - unix + ("install-tl-unx", "install-tl"), + # OS - macos + ("astyle_1.23_macosx", "astyle_1.23"), + ("haxe-2.08-osx", "haxe-2.08"), + # PyPI - wheel + ("entrypoints-0.2.2-py2.py3-none-any.whl", "entrypoints-0.2.2"), + ( + "numpy-1.12.0-cp27-cp27m-macosx_10_6_intel.macosx_10_9_intel." 
+ "macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", + "numpy-1.12.0", + ), + # PyPI - exe + ("PyYAML-3.12.win-amd64-py3.5.exe", "PyYAML-3.12"), + # Combinations of multiple patterns - bin, release + ("rocketmq-all-4.5.2-bin-release", "rocketmq-all-4.5.2"), + # Combinations of multiple patterns - all + ("p7zip_9.04_src_all", "p7zip_9.04"), + # Combinations of multiple patterns - run + ("cuda_8.0.44_linux.run", "cuda_8.0.44"), + # Combinations of multiple patterns - file + ("ack-2.14-single-file", "ack-2.14"), + # Combinations of multiple patterns - jar + ("antlr-3.4-complete.jar", "antlr-3.4"), + # Combinations of multiple patterns - oss + ("tbb44_20160128oss_src_0", "tbb44_20160128"), + # Combinations of multiple patterns - darwin + ("ghc-7.0.4-x86_64-apple-darwin", "ghc-7.0.4"), + ("ghc-7.0.4-i386-apple-darwin", "ghc-7.0.4"), + # Combinations of multiple patterns - centos + ("sratoolkit.2.8.2-1-centos_linux64", "sratoolkit.2.8.2-1"), + # Combinations of multiple patterns - arch + ( + "VizGlow_v2.2alpha17-R21November2016-Linux-x86_64-Install", + "VizGlow_v2.2alpha17-R21November2016", + ), + ("jdk-8u92-linux-x64", "jdk-8u92"), + ("cuda_6.5.14_linux_64.run", "cuda_6.5.14"), + ("Mathematica_12.0.0_LINUX.sh", "Mathematica_12.0.0"), + ("trf407b.linux64", "trf407b"), + # Combinations of multiple patterns - with + ("mafft-7.221-with-extensions-src", "mafft-7.221"), + ("spark-2.0.0-bin-without-hadoop", "spark-2.0.0"), + ("conduit-v0.3.0-src-with-blt", "conduit-v0.3.0"), + # Combinations of multiple patterns - rock + ("bitlib-23-2.src.rock", "bitlib-23-2"), + # Combinations of multiple patterns - public + ("dakota-6.3-public.src", "dakota-6.3"), + # Combinations of multiple patterns - universal + ("synergy-1.3.6p2-MacOSX-Universal", "synergy-1.3.6p2"), + # Combinations of multiple patterns - dynamic + ("snptest_v2.5.2_linux_x86_64_dynamic", "snptest_v2.5.2"), + # Combinations of multiple patterns - other + ("alglib-3.11.0.cpp.gpl", "alglib-3.11.0"), + ("hpcviewer-2019.08-linux.gtk.x86_64", "hpcviewer-2019.08"), + ("apache-mxnet-src-1.3.0-incubating", "apache-mxnet-src-1.3.0"), + ], +) +def test_url_strip_version_suffixes(url, expected): + stripped = llnl.url.strip_version_suffixes(url) + assert stripped == expected + + +def test_strip_compression_extension(archive_and_expected): + archive, extension = archive_and_expected + stripped = llnl.url.strip_compression_extension(archive) + if extension == "zip": + assert stripped == "Foo.zip" + stripped = llnl.url.strip_compression_extension(archive, "zip") + assert stripped == "Foo" + elif ( + extension.lower() == "tar" + or extension in llnl.url.CONTRACTION_MAP + or extension + in [ + ".".join(ext) + for ext in itertools.product(llnl.url.PREFIX_EXTENSIONS, llnl.url.EXTENSIONS) + ] + ): + assert stripped == "Foo.tar" or stripped == "Foo.TAR" + else: + assert stripped == "Foo" + + +def test_allowed_archive(archive_and_expected): + archive, _ = archive_and_expected + assert llnl.url.allowed_archive(archive) diff --git a/lib/spack/spack/test/url_parse.py b/lib/spack/spack/test/url_parse.py index 86ebf84fa7..dd094ed230 100644 --- a/lib/spack/spack/test/url_parse.py +++ b/lib/spack/spack/test/url_parse.py @@ -17,124 +17,11 @@ from spack.url import ( parse_name_offset, parse_version_offset, strip_name_suffixes, - strip_version_suffixes, substitute_version, ) from spack.version import Version -@pytest.mark.parametrize( - "url,expected", - [ - # No suffix - ("rgb-1.0.6", "rgb-1.0.6"), - # Misleading prefix - ("jpegsrc.v9b", "jpegsrc.v9b"), - 
("turbolinux702", "turbolinux702"), - ("converge_install_2.3.16", "converge_install_2.3.16"), - # Download type - code, source - ("cistem-1.0.0-beta-source-code", "cistem-1.0.0-beta"), - # Download type - src - ("apache-ant-1.9.7-src", "apache-ant-1.9.7"), - ("go1.7.4.src", "go1.7.4"), - # Download type - source - ("bowtie2-2.2.5-source", "bowtie2-2.2.5"), - ("grib_api-1.17.0-Source", "grib_api-1.17.0"), - # Download type - full - ("julia-0.4.3-full", "julia-0.4.3"), - # Download type - bin - ("apache-maven-3.3.9-bin", "apache-maven-3.3.9"), - # Download type - binary - ("Jmol-14.8.0-binary", "Jmol-14.8.0"), - # Download type - gem - ("rubysl-date-2.0.9.gem", "rubysl-date-2.0.9"), - # Download type - tar - ("gromacs-4.6.1-tar", "gromacs-4.6.1"), - # Download type - sh - ("Miniconda2-4.3.11-Linux-x86_64.sh", "Miniconda2-4.3.11"), - # Download version - release - ("v1.0.4-release", "v1.0.4"), - # Download version - stable - ("libevent-2.0.21-stable", "libevent-2.0.21"), - # Download version - final - ("2.6.7-final", "2.6.7"), - # Download version - rel - ("v1.9.5.1rel", "v1.9.5.1"), - # Download version - orig - ("dash_0.5.5.1.orig", "dash_0.5.5.1"), - # Download version - plus - ("ncbi-blast-2.6.0+-src", "ncbi-blast-2.6.0"), - # License - ("cppad-20170114.gpl", "cppad-20170114"), - # Arch - ("pcraster-4.1.0_x86-64", "pcraster-4.1.0"), - ("dislin-11.0.linux.i586_64", "dislin-11.0"), - ("PAGIT.V1.01.64bit", "PAGIT.V1.01"), - # OS - linux - ("astyle_2.04_linux", "astyle_2.04"), - # OS - unix - ("install-tl-unx", "install-tl"), - # OS - macos - ("astyle_1.23_macosx", "astyle_1.23"), - ("haxe-2.08-osx", "haxe-2.08"), - # PyPI - wheel - ("entrypoints-0.2.2-py2.py3-none-any.whl", "entrypoints-0.2.2"), - ( - "numpy-1.12.0-cp27-cp27m-macosx_10_6_intel.macosx_10_9_intel." 
- "macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", - "numpy-1.12.0", - ), - # PyPI - exe - ("PyYAML-3.12.win-amd64-py3.5.exe", "PyYAML-3.12"), - # Combinations of multiple patterns - bin, release - ("rocketmq-all-4.5.2-bin-release", "rocketmq-all-4.5.2"), - # Combinations of multiple patterns - all - ("p7zip_9.04_src_all", "p7zip_9.04"), - # Combinations of multiple patterns - run - ("cuda_8.0.44_linux.run", "cuda_8.0.44"), - # Combinations of multiple patterns - file - ("ack-2.14-single-file", "ack-2.14"), - # Combinations of multiple patterns - jar - ("antlr-3.4-complete.jar", "antlr-3.4"), - # Combinations of multiple patterns - oss - ("tbb44_20160128oss_src_0", "tbb44_20160128"), - # Combinations of multiple patterns - darwin - ("ghc-7.0.4-x86_64-apple-darwin", "ghc-7.0.4"), - ("ghc-7.0.4-i386-apple-darwin", "ghc-7.0.4"), - # Combinations of multiple patterns - centos - ("sratoolkit.2.8.2-1-centos_linux64", "sratoolkit.2.8.2-1"), - # Combinations of multiple patterns - arch - ( - "VizGlow_v2.2alpha17-R21November2016-Linux-x86_64-Install", - "VizGlow_v2.2alpha17-R21November2016", - ), - ("jdk-8u92-linux-x64", "jdk-8u92"), - ("cuda_6.5.14_linux_64.run", "cuda_6.5.14"), - ("Mathematica_12.0.0_LINUX.sh", "Mathematica_12.0.0"), - ("trf407b.linux64", "trf407b"), - # Combinations of multiple patterns - with - ("mafft-7.221-with-extensions-src", "mafft-7.221"), - ("spark-2.0.0-bin-without-hadoop", "spark-2.0.0"), - ("conduit-v0.3.0-src-with-blt", "conduit-v0.3.0"), - # Combinations of multiple patterns - rock - ("bitlib-23-2.src.rock", "bitlib-23-2"), - # Combinations of multiple patterns - public - ("dakota-6.3-public.src", "dakota-6.3"), - # Combinations of multiple patterns - universal - ("synergy-1.3.6p2-MacOSX-Universal", "synergy-1.3.6p2"), - # Combinations of multiple patterns - dynamic - ("snptest_v2.5.2_linux_x86_64_dynamic", "snptest_v2.5.2"), - # Combinations of multiple patterns - other - ("alglib-3.11.0.cpp.gpl", "alglib-3.11.0"), - ("hpcviewer-2019.08-linux.gtk.x86_64", "hpcviewer-2019.08"), - ("apache-mxnet-src-1.3.0-incubating", "apache-mxnet-src-1.3.0"), - ], -) -def test_url_strip_version_suffixes(url, expected): - stripped = strip_version_suffixes(url) - assert stripped == expected - - @pytest.mark.parametrize( "url,version,expected", [ diff --git a/lib/spack/spack/test/util/compression.py b/lib/spack/spack/test/util/compression.py index 7cbcfb283c..29007a7e33 100644 --- a/lib/spack/spack/test/util/compression.py +++ b/lib/spack/spack/test/util/compression.py @@ -10,6 +10,7 @@ from itertools import product import pytest +import llnl.url from llnl.util.filesystem import working_dir from spack.paths import spack_root @@ -21,7 +22,7 @@ datadir = os.path.join(spack_root, "lib", "spack", "spack", "test", "data", "com ext_archive = {} [ ext_archive.update({ext: ".".join(["Foo", ext])}) - for ext in scomp.ALLOWED_ARCHIVE_TYPES + for ext in llnl.url.ALLOWED_ARCHIVE_TYPES if "TAR" not in ext ] # Spack does not use Python native handling for tarballs or zip @@ -95,38 +96,3 @@ def test_unallowed_extension(): bad_ext_archive = "Foo.cxx" with pytest.raises(CommandNotFoundError): scomp.decompressor_for(bad_ext_archive) - - -@pytest.mark.parametrize("archive", ext_archive.values()) -def test_get_extension(archive): - ext = scomp.extension_from_path(archive) - assert ext_archive[ext] == archive - - -def test_get_bad_extension(): - archive = "Foo.cxx" - ext = scomp.extension_from_path(archive) - assert ext is None - - -@pytest.mark.parametrize("path", ext_archive.values()) 
-def test_allowed_archive(path): - assert scomp.allowed_archive(path) - - -@pytest.mark.parametrize("ext_path", ext_archive.items()) -def test_strip_compression_extension(ext_path): - ext, path = ext_path - stripped = scomp.strip_compression_extension(path) - if ext == "zip": - assert stripped == "Foo.zip" - stripped = scomp.strip_compression_extension(path, "zip") - assert stripped == "Foo" - elif ( - ext == "tar" - or ext in scomp.CONTRACTION_MAP.keys() - or ext in [".".join(ext) for ext in product(scomp.PRE_EXTS, scomp.EXTS)] - ): - assert stripped == "Foo.tar" or stripped == "Foo.TAR" - else: - assert stripped == "Foo" diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py index 2d6f577799..a012e7524e 100644 --- a/lib/spack/spack/test/web.py +++ b/lib/spack/spack/test/web.py @@ -15,6 +15,7 @@ import llnl.util.tty as tty import spack.config import spack.mirror import spack.paths +import spack.url import spack.util.path import spack.util.s3 import spack.util.url as url_util @@ -102,31 +103,31 @@ def test_spider_no_response(monkeypatch): def test_find_versions_of_archive_0(): - versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=0) + versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=0) assert Version("0.0.0") in versions def test_find_versions_of_archive_1(): - versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=1) + versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=1) assert Version("0.0.0") in versions assert Version("1.0.0") in versions def test_find_versions_of_archive_2(): - versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=2) + versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=2) assert Version("0.0.0") in versions assert Version("1.0.0") in versions assert Version("2.0.0") in versions def test_find_exotic_versions_of_archive_2(): - versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=2) + versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=2) # up for grabs to make this better. 
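    # "Exotic" here means pre-release style versions (e.g. 2.0.0b2) that the
    # scraper is still expected to parse out of the listing pages.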
assert Version("2.0.0b2") in versions def test_find_versions_of_archive_3(): - versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=3) + versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=3) assert Version("0.0.0") in versions assert Version("1.0.0") in versions assert Version("2.0.0") in versions @@ -135,16 +136,14 @@ def test_find_versions_of_archive_3(): def test_find_exotic_versions_of_archive_3(): - versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=3) + versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=3) assert Version("2.0.0b2") in versions assert Version("3.0a1") in versions assert Version("4.5-rc5") in versions def test_find_versions_of_archive_with_fragment(): - versions = spack.util.web.find_versions_of_archive( - root_tarball, root_with_fragment, list_depth=0 - ) + versions = spack.url.find_versions_of_archive(root_tarball, root_with_fragment, list_depth=0) assert Version("5.0.0") in versions @@ -311,7 +310,7 @@ def test_remove_s3_url(monkeypatch, capfd): def get_s3_session(url, method="fetch"): return MockS3Client() - monkeypatch.setattr(spack.util.s3, "get_s3_session", get_s3_session) + monkeypatch.setattr(spack.util.web, "get_s3_session", get_s3_session) current_debug_level = tty.debug_level() tty.set_debug(1) diff --git a/lib/spack/spack/url.py b/lib/spack/spack/url.py index bf2990f42f..c5e47232c0 100644 --- a/lib/spack/spack/url.py +++ b/lib/spack/spack/url.py @@ -27,246 +27,22 @@ it's never been told about that version before. """ import io import os +import pathlib import re -from urllib.parse import urlsplit, urlunsplit -import llnl.util.tty as tty +import llnl.url from llnl.util.tty.color import cescape, colorize import spack.error -import spack.util.compression as comp -import spack.util.path as spath +import spack.util.web import spack.version - +from spack.util.path import convert_to_posix_path # # Note: We call the input to most of these functions a "path" but the functions # work on paths and URLs. There's not a good word for both of these, but # "path" seemed like the most generic term. # -def find_list_urls(url): - r"""Find good list URLs for the supplied URL. - - By default, returns the dirname of the archive path. - - Provides special treatment for the following websites, which have a - unique list URL different from the dirname of the download URL: - - ========= ======================================================= - GitHub https://github.com///releases - GitLab https://gitlab.\*///tags - BitBucket https://bitbucket.org///downloads/?tab=tags - CRAN https://\*.r-project.org/src/contrib/Archive/ - PyPI https://pypi.org/simple// - LuaRocks https://luarocks.org/modules// - ========= ======================================================= - - Note: this function is called by `spack versions`, `spack checksum`, - and `spack create`, but not by `spack fetch` or `spack install`. - - Parameters: - url (str): The download URL for the package - - Returns: - set: One or more list URLs for the package - """ - - url_types = [ - # GitHub - # e.g. https://github.com/llnl/callpath/archive/v1.0.1.tar.gz - (r"(.*github\.com/[^/]+/[^/]+)", lambda m: m.group(1) + "/releases"), - # GitLab API endpoint - # e.g. 
https://gitlab.dkrz.de/api/v4/projects/k202009%2Flibaec/repository/archive.tar.gz?sha=v1.0.2 - ( - r"(.*gitlab[^/]+)/api/v4/projects/([^/]+)%2F([^/]+)", - lambda m: m.group(1) + "/" + m.group(2) + "/" + m.group(3) + "/tags", - ), - # GitLab non-API endpoint - # e.g. https://gitlab.dkrz.de/k202009/libaec/uploads/631e85bcf877c2dcaca9b2e6d6526339/libaec-1.0.0.tar.gz - (r"(.*gitlab[^/]+/(?!api/v4/projects)[^/]+/[^/]+)", lambda m: m.group(1) + "/tags"), - # BitBucket - # e.g. https://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2 - (r"(.*bitbucket.org/[^/]+/[^/]+)", lambda m: m.group(1) + "/downloads/?tab=tags"), - # CRAN - # e.g. https://cran.r-project.org/src/contrib/Rcpp_0.12.9.tar.gz - # e.g. https://cloud.r-project.org/src/contrib/rgl_0.98.1.tar.gz - ( - r"(.*\.r-project\.org/src/contrib)/([^_]+)", - lambda m: m.group(1) + "/Archive/" + m.group(2), - ), - # PyPI - # e.g. https://pypi.io/packages/source/n/numpy/numpy-1.19.4.zip - # e.g. https://www.pypi.io/packages/source/n/numpy/numpy-1.19.4.zip - # e.g. https://pypi.org/packages/source/n/numpy/numpy-1.19.4.zip - # e.g. https://pypi.python.org/packages/source/n/numpy/numpy-1.19.4.zip - # e.g. https://files.pythonhosted.org/packages/source/n/numpy/numpy-1.19.4.zip - # e.g. https://pypi.io/packages/py2.py3/o/opencensus-context/opencensus_context-0.1.1-py2.py3-none-any.whl - ( - r"(?:pypi|pythonhosted)[^/]+/packages/[^/]+/./([^/]+)", - lambda m: "https://pypi.org/simple/" + m.group(1) + "/", - ), - # LuaRocks - # e.g. https://luarocks.org/manifests/gvvaughan/lpeg-1.0.2-1.src.rock - # e.g. https://luarocks.org/manifests/openresty/lua-cjson-2.1.0-1.src.rock - ( - r"luarocks[^/]+/(?:modules|manifests)/(?P[^/]+)/" - + r"(?P.+?)-[0-9.-]*\.src\.rock", - lambda m: "https://luarocks.org/modules/" - + m.group("org") - + "/" - + m.group("name") - + "/", - ), - ] - - list_urls = set([os.path.dirname(url)]) - - for pattern, fun in url_types: - match = re.search(pattern, url) - if match: - list_urls.add(fun(match)) - - return list_urls - - -def strip_query_and_fragment(path): - try: - components = urlsplit(path) - stripped = components[:3] + (None, None) - - query, frag = components[3:5] - suffix = "" - if query: - suffix += "?" + query - if frag: - suffix += "#" + frag - - return (urlunsplit(stripped), suffix) - - except ValueError: - tty.debug("Got error parsing path %s" % path) - return (path, "") # Ignore URL parse errors here - - -def strip_version_suffixes(path): - """Some tarballs contain extraneous information after the version: - - * ``bowtie2-2.2.5-source`` - * ``libevent-2.0.21-stable`` - * ``cuda_8.0.44_linux.run`` - - These strings are not part of the version number and should be ignored. - This function strips those suffixes off and returns the remaining string. - The goal is that the version is always the last thing in ``path``: - - * ``bowtie2-2.2.5`` - * ``libevent-2.0.21`` - * ``cuda_8.0.44`` - - Args: - path (str): The filename or URL for the package - - Returns: - str: The ``path`` with any extraneous suffixes removed - """ - # NOTE: This could be done with complicated regexes in parse_version_offset - # NOTE: The problem is that we would have to add these regexes to the end - # NOTE: of every single version regex. 
Easier to just strip them off - # NOTE: permanently - - suffix_regexes = [ - # Download type - r"[Ii]nstall", - r"all", - r"code", - r"[Ss]ources?", - r"file", - r"full", - r"single", - r"with[a-zA-Z_-]+", - r"rock", - r"src(_0)?", - r"public", - r"bin", - r"binary", - r"run", - r"[Uu]niversal", - r"jar", - r"complete", - r"dynamic", - r"oss", - r"gem", - r"tar", - r"sh", - # Download version - r"release", - r"bin", - r"stable", - r"[Ff]inal", - r"rel", - r"orig", - r"dist", - r"\+", - # License - r"gpl", - # Arch - # Needs to come before and after OS, appears in both orders - r"ia32", - r"intel", - r"amd64", - r"linux64", - r"x64", - r"64bit", - r"x86[_-]64", - r"i586_64", - r"x86", - r"i[36]86", - r"ppc64(le)?", - r"armv?(7l|6l|64)", - # Other - r"cpp", - r"gtk", - r"incubating", - # OS - r"[Ll]inux(_64)?", - r"LINUX", - r"[Uu]ni?x", - r"[Ss]un[Oo][Ss]", - r"[Mm]ac[Oo][Ss][Xx]?", - r"[Oo][Ss][Xx]", - r"[Dd]arwin(64)?", - r"[Aa]pple", - r"[Ww]indows", - r"[Ww]in(64|32)?", - r"[Cc]ygwin(64|32)?", - r"[Mm]ingw", - r"centos", - # Arch - # Needs to come before and after OS, appears in both orders - r"ia32", - r"intel", - r"amd64", - r"linux64", - r"x64", - r"64bit", - r"x86[_-]64", - r"i586_64", - r"x86", - r"i[36]86", - r"ppc64(le)?", - r"armv?(7l|6l|64)?", - # PyPI - r"[._-]py[23].*\.whl", - r"[._-]cp[23].*\.whl", - r"[._-]win.*\.exe", - ] - - for regex in suffix_regexes: - # Remove the suffix from the end of the path - # This may be done multiple times - path = re.sub(r"[._-]?" + regex + "$", "", path) - - return path def strip_name_suffixes(path, version): @@ -341,69 +117,6 @@ def strip_name_suffixes(path, version): return path -def split_url_extension(path): - """Some URLs have a query string, e.g.: - - 1. https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7.tgz?raw=true - 2. http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin.tar.gz - 3. https://gitlab.kitware.com/vtk/vtk/repository/archive.tar.bz2?ref=v7.0.0 - - In (1), the query string needs to be stripped to get at the - extension, but in (2) & (3), the filename is IN a single final query - argument. - - This strips the URL into three pieces: ``prefix``, ``ext``, and ``suffix``. - The suffix contains anything that was stripped off the URL to - get at the file extension. In (1), it will be ``'?raw=true'``, but - in (2), it will be empty. In (3) the suffix is a parameter that follows - after the file extension, e.g.: - - 1. ``('https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7', '.tgz', '?raw=true')`` - 2. ``('http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin', '.tar.gz', None)`` - 3. ``('https://gitlab.kitware.com/vtk/vtk/repository/archive', '.tar.bz2', '?ref=v7.0.0')`` - """ - prefix, ext, suffix = path, "", "" - - # Strip off sourceforge download suffix. - # e.g. https://sourceforge.net/projects/glew/files/glew/2.0.0/glew-2.0.0.tgz/download - prefix, suffix = spath.find_sourceforge_suffix(path) - - ext = comp.extension_from_path(prefix) - if ext is not None: - prefix = comp.strip_extension(prefix) - - else: - prefix, suf = strip_query_and_fragment(prefix) - ext = comp.extension_from_path(prefix) - prefix = comp.strip_extension(prefix) - suffix = suf + suffix - if ext is None: - ext = "" - - return prefix, ext, suffix - - -def determine_url_file_extension(path): - """This returns the type of archive a URL refers to. 
This is - sometimes confusing because of URLs like: - - (1) https://github.com/petdance/ack/tarball/1.93_02 - - Where the URL doesn't actually contain the filename. We need - to know what type it is so that we can appropriately name files - in mirrors. - """ - match = re.search(r"github.com/.+/(zip|tar)ball/", path) - if match: - if match.group(1) == "zip": - return "zip" - elif match.group(1) == "tar": - return "tar.gz" - - prefix, ext, suffix = split_url_extension(path) - return ext - - def parse_version_offset(path): """Try to extract a version string from a filename or URL. @@ -426,13 +139,13 @@ def parse_version_offset(path): # path: The prefix of the URL, everything before the ext and suffix # ext: The file extension # suffix: Any kind of query string that begins with a '?' - path, ext, suffix = split_url_extension(path) + path, ext, suffix = llnl.url.split_url_extension(path) # stem: Everything from path after the final '/' original_stem = os.path.basename(path) # Try to strip off anything after the version number - stem = strip_version_suffixes(original_stem) + stem = llnl.url.strip_version_suffixes(original_stem) # Assumptions: # @@ -620,7 +333,7 @@ def parse_name_offset(path, v=None): # path: The prefix of the URL, everything before the ext and suffix # ext: The file extension # suffix: Any kind of query string that begins with a '?' - path, ext, suffix = split_url_extension(path) + path, ext, suffix = llnl.url.split_url_extension(path) # stem: Everything from path after the final '/' original_stem = os.path.basename(path) @@ -735,28 +448,6 @@ def parse_name_and_version(path): return (name, ver) -def insensitize(string): - """Change upper and lowercase letters to be case insensitive in - the provided string. e.g., 'a' becomes '[Aa]', 'B' becomes - '[bB]', etc. Use for building regexes.""" - - def to_ins(match): - char = match.group(1) - return "[%s%s]" % (char.lower(), char.upper()) - - return re.sub(r"([a-zA-Z])", to_ins, string) - - -def cumsum(elts, init=0, fn=lambda x: x): - """Return cumulative sum of result of fn on each element in elts.""" - sums = [] - s = init - for i, e in enumerate(elts): - sums.append(s) - s += fn(e) - return sums - - def find_all(substring, string): """Returns a list containing the indices of every occurrence of substring in string.""" @@ -912,6 +603,122 @@ def color_url(path, **kwargs): return colorize(out.getvalue()) +def find_versions_of_archive( + archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None +): + """Scrape web pages for new versions of a tarball. This function prefers URLs in the + following order: links found on the scraped page that match a url generated by the + reference package, found and in the archive_urls list, found and derived from those + in the archive_urls list, and if none are found for a version then the item in the + archive_urls list is included for the version. + + Args: + archive_urls (str or list or tuple): URL or sequence of URLs for + different versions of a package. Typically these are just the + tarballs from the package file itself. By default, this searches + the parent directories of archives. + list_url (str or None): URL for a listing of archives. + Spack will scrape these pages for download links that look + like the archive URL. + list_depth (int): max depth to follow links on list_url pages. + Defaults to 0. 
+ concurrency (int): maximum number of concurrent requests + reference_package (spack.package_base.PackageBase or None): a spack package + used as a reference for url detection. Uses the url_for_version + method on the package to produce reference urls which, if found, + are preferred. + """ + if not isinstance(archive_urls, (list, tuple)): + archive_urls = [archive_urls] + + # Generate a list of list_urls based on archive urls and any + # explicitly listed list_url in the package + list_urls = set() + if list_url is not None: + list_urls.add(list_url) + for aurl in archive_urls: + list_urls |= llnl.url.find_list_urls(aurl) + + # Add '/' to the end of the URL. Some web servers require this. + additional_list_urls = set() + for lurl in list_urls: + if not lurl.endswith("/"): + additional_list_urls.add(lurl + "/") + list_urls |= additional_list_urls + + # Grab some web pages to scrape. + pages, links = spack.util.web.spider(list_urls, depth=list_depth, concurrency=concurrency) + + # Scrape them for archive URLs + regexes = [] + for aurl in archive_urls: + # This creates a regex from the URL with a capture group for + # the version part of the URL. The capture group is converted + # to a generic wildcard, so we can use this to extract things + # on a page that look like archive URLs. + url_regex = wildcard_version(aurl) + + # We'll be a bit more liberal and just look for the archive + # part, not the full path. + # this is a URL so it is a posixpath even on Windows + url_regex = pathlib.PurePosixPath(url_regex).name + + # We need to add a / to the beginning of the regex to prevent + # Spack from picking up similarly named packages like: + # https://cran.r-project.org/src/contrib/pls_2.6-0.tar.gz + # https://cran.r-project.org/src/contrib/enpls_5.7.tar.gz + # https://cran.r-project.org/src/contrib/autopls_1.3.tar.gz + # https://cran.r-project.org/src/contrib/matrixpls_1.0.4.tar.gz + url_regex = "/" + url_regex + + # We need to add a $ anchor to the end of the regex to prevent + # Spack from picking up signature files like: + # .asc + # .md5 + # .sha256 + # .sig + # However, SourceForge downloads still need to end in '/download'. + url_regex += r"(\/download)?" + # PyPI adds #sha256=... to the end of the URL + url_regex += "(#sha256=.*)?" + url_regex += "$" + + regexes.append(url_regex) + + regexes = [re.compile(r) for r in regexes] + # Build a dict version -> URL from any links that match the wildcards. + # Walk through archive_url links first. + # Any conflicting versions will be overwritten by the list_url links. 
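+    # A version is added to "matched" only when its link agrees with the
+    # reference package's url_for_version() or with a version-substituted
+    # archive URL; matched versions are never overwritten by later links.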
+ versions = {} + matched = set() + for url in sorted(links): + url = convert_to_posix_path(url) + if any(r.search(url) for r in regexes): + try: + ver = parse_version(url) + if ver in matched: + continue + versions[ver] = url + # prevent this version from getting overwritten + if reference_package is not None: + if url == reference_package.url_for_version(ver): + matched.add(ver) + else: + extrapolated_urls = [substitute_version(u, ver) for u in archive_urls] + if url in extrapolated_urls: + matched.add(ver) + except UndetectableVersionError: + continue + + for url in archive_urls: + url = convert_to_posix_path(url) + ver = parse_version(url) + if ver not in versions: + versions[ver] = url + + return versions + + class UrlParseError(spack.error.SpackError): """Raised when the URL module can't parse something correctly.""" diff --git a/lib/spack/spack/util/compression.py b/lib/spack/spack/util/compression.py index b8dcd032f4..25ccfdf0bb 100644 --- a/lib/spack/spack/util/compression.py +++ b/lib/spack/spack/util/compression.py @@ -9,27 +9,13 @@ import os import re import shutil import sys -from itertools import product +import llnl.url from llnl.util import tty -import spack.util.path as spath from spack.error import SpackError from spack.util.executable import CommandNotFoundError, which -# Supported archive extensions. -PRE_EXTS = ["tar", "TAR"] -EXTS = ["gz", "bz2", "xz", "Z"] -NOTAR_EXTS = ["zip", "tgz", "tbz2", "tbz", "txz"] -CONTRACTION_MAP = {"tgz": "tar.gz", "txz": "tar.xz", "tbz": "tar.bz2", "tbz2": "tar.bz2"} - -# Add PRE_EXTS and EXTS last so that .tar.gz is matched *before* .tar or .gz -ALLOWED_ARCHIVE_TYPES = ( - [".".join(ext) for ext in product(PRE_EXTS, EXTS)] + PRE_EXTS + EXTS + NOTAR_EXTS -) - -ALLOWED_SINGLE_EXT_ARCHIVE_TYPES = PRE_EXTS + EXTS + NOTAR_EXTS - try: import bz2 # noqa @@ -66,10 +52,6 @@ def is_bz2_supported(): return _bz2_support -def allowed_archive(path): - return False if not path else any(path.endswith(t) for t in ALLOWED_ARCHIVE_TYPES) - - def _system_untar(archive_file, remove_archive_file=False): """Returns path to unarchived tar file. Untars archive via system tar. @@ -78,7 +60,7 @@ def _system_untar(archive_file, remove_archive_file=False): archive_file (str): absolute path to the archive to be extracted. Can be one of .tar(.[gz|bz2|xz|Z]) or .(tgz|tbz|tbz2|txz). """ - archive_file_no_ext = strip_extension(archive_file) + archive_file_no_ext = llnl.url.strip_extension(archive_file) outfile = os.path.basename(archive_file_no_ext) if archive_file_no_ext == archive_file: # the archive file has no extension. Tar on windows cannot untar onto itself @@ -114,7 +96,7 @@ def _bunzip2(archive_file): def _py_bunzip(archive_file): """Returns path to decompressed file. Decompresses bz2 compressed archives/files via python's bz2 module""" - decompressed_file = os.path.basename(strip_compression_extension(archive_file, "bz2")) + decompressed_file = os.path.basename(llnl.url.strip_compression_extension(archive_file, "bz2")) working_dir = os.getcwd() archive_out = os.path.join(working_dir, decompressed_file) f_bz = bz2.BZ2File(archive_file, mode="rb") @@ -128,7 +110,7 @@ def _system_bunzip(archive_file): """Returns path to decompressed file. 
diff --git a/lib/spack/spack/util/compression.py b/lib/spack/spack/util/compression.py
index b8dcd032f4..25ccfdf0bb 100644
--- a/lib/spack/spack/util/compression.py
+++ b/lib/spack/spack/util/compression.py
@@ -9,27 +9,13 @@ import os
 import re
 import shutil
 import sys
-from itertools import product
 
+import llnl.url
 from llnl.util import tty
 
-import spack.util.path as spath
 from spack.error import SpackError
 from spack.util.executable import CommandNotFoundError, which
 
-# Supported archive extensions.
-PRE_EXTS = ["tar", "TAR"]
-EXTS = ["gz", "bz2", "xz", "Z"]
-NOTAR_EXTS = ["zip", "tgz", "tbz2", "tbz", "txz"]
-CONTRACTION_MAP = {"tgz": "tar.gz", "txz": "tar.xz", "tbz": "tar.bz2", "tbz2": "tar.bz2"}
-
-# Add PRE_EXTS and EXTS last so that .tar.gz is matched *before* .tar or .gz
-ALLOWED_ARCHIVE_TYPES = (
-    [".".join(ext) for ext in product(PRE_EXTS, EXTS)] + PRE_EXTS + EXTS + NOTAR_EXTS
-)
-
-ALLOWED_SINGLE_EXT_ARCHIVE_TYPES = PRE_EXTS + EXTS + NOTAR_EXTS
-
 try:
     import bz2  # noqa
@@ -66,10 +52,6 @@ def is_bz2_supported():
     return _bz2_support
 
 
-def allowed_archive(path):
-    return False if not path else any(path.endswith(t) for t in ALLOWED_ARCHIVE_TYPES)
-
-
 def _system_untar(archive_file, remove_archive_file=False):
     """Returns path to unarchived tar file. Untars archive via system tar.
 
@@ -78,7 +60,7 @@
         archive_file (str): absolute path to the archive to be extracted. Can be one of
         .tar(.[gz|bz2|xz|Z]) or .(tgz|tbz|tbz2|txz).
     """
-    archive_file_no_ext = strip_extension(archive_file)
+    archive_file_no_ext = llnl.url.strip_extension(archive_file)
     outfile = os.path.basename(archive_file_no_ext)
     if archive_file_no_ext == archive_file:
         # the archive file has no extension. Tar on windows cannot untar onto itself
@@ -114,7 +96,7 @@ def _bunzip2(archive_file):
 
 def _py_bunzip(archive_file):
     """Returns path to decompressed file.
     Decompresses bz2 compressed archives/files via python's bz2 module"""
-    decompressed_file = os.path.basename(strip_compression_extension(archive_file, "bz2"))
+    decompressed_file = os.path.basename(llnl.url.strip_compression_extension(archive_file, "bz2"))
     working_dir = os.getcwd()
     archive_out = os.path.join(working_dir, decompressed_file)
     f_bz = bz2.BZ2File(archive_file, mode="rb")
@@ -128,7 +110,7 @@ def _system_bunzip(archive_file):
     """Returns path to decompressed file.
     Decompresses bz2 compressed archives/files via system bzip2 utility"""
     compressed_file_name = os.path.basename(archive_file)
-    decompressed_file = os.path.basename(strip_compression_extension(archive_file, "bz2"))
+    decompressed_file = os.path.basename(llnl.url.strip_compression_extension(archive_file, "bz2"))
     working_dir = os.getcwd()
     archive_out = os.path.join(working_dir, decompressed_file)
     copy_path = os.path.join(working_dir, compressed_file_name)
@@ -158,7 +140,7 @@ def _gunzip(archive_file):
 
 def _py_gunzip(archive_file):
     """Returns path to gunzip'd file
     Decompresses `.gz` compressed archives via python gzip module"""
-    decompressed_file = os.path.basename(strip_compression_extension(archive_file, "gz"))
+    decompressed_file = os.path.basename(llnl.url.strip_compression_extension(archive_file, "gz"))
     working_dir = os.getcwd()
     destination_abspath = os.path.join(working_dir, decompressed_file)
     f_in = gzip.open(archive_file, "rb")
@@ -171,7 +153,7 @@ def _py_gunzip(archive_file):
 
 def _system_gunzip(archive_file):
     """Returns path to gunzip'd file
     Decompresses `.gz` compressed files via system gzip"""
-    archive_file_no_ext = strip_compression_extension(archive_file)
+    archive_file_no_ext = llnl.url.strip_compression_extension(archive_file)
     if archive_file_no_ext == archive_file:
         # the zip file has no extension. On Unix gunzip cannot unzip onto itself
         archive_file = archive_file + ".gz"
@@ -196,7 +178,7 @@ def _unzip(archive_file):
     Args:
         archive_file (str): absolute path of the file to be decompressed
     """
-    extracted_file = os.path.basename(strip_extension(archive_file, "zip"))
+    extracted_file = os.path.basename(llnl.url.strip_extension(archive_file, extension="zip"))
     if sys.platform == "win32":
         return _system_untar(archive_file)
     else:
@@ -259,7 +241,7 @@ def _win_compressed_tarball_handler(decompressor):
 
 def _py_lzma(archive_file):
     """Returns path to decompressed .xz files
     Decompress lzma compressed .xz files via python lzma module"""
-    decompressed_file = os.path.basename(strip_compression_extension(archive_file, "xz"))
+    decompressed_file = os.path.basename(llnl.url.strip_compression_extension(archive_file, "xz"))
     archive_out = os.path.join(os.getcwd(), decompressed_file)
     with open(archive_out, "wb") as ar:
         with lzma.open(archive_file) as lar:
@@ -272,7 +254,7 @@ def _xz(archive_file):
     Decompress lzma compressed .xz files via xz command line tool.
     """
-    decompressed_file = os.path.basename(strip_extension(archive_file, "xz"))
+    decompressed_file = os.path.basename(llnl.url.strip_extension(archive_file, extension="xz"))
     working_dir = os.getcwd()
     destination_abspath = os.path.join(working_dir, decompressed_file)
     compressed_file = os.path.basename(archive_file)
@@ -297,13 +279,13 @@ def _system_7zip(archive_file):
     Args:
         archive_file (str): absolute path of file to be unarchived
     """
-    outfile = os.path.basename(strip_compression_extension(archive_file))
+    outfile = os.path.basename(llnl.url.strip_compression_extension(archive_file))
     _7z = which("7z")
     if not _7z:
         raise CommandNotFoundError(
             "7z unavailable,\
 unable to extract %s files. 7z can be installed via Spack"
-            % extension_from_path(archive_file)
+            % llnl.url.extension_from_path(archive_file)
         )
     _7z.add_default_arg("e")
     _7z(archive_file)
@@ -318,7 +300,7 @@ def decompressor_for(path, extension=None):
     if not extension:
         extension = extension_from_file(path, decompress=True)
 
-    if not allowed_archive(extension):
+    if not llnl.url.allowed_archive(extension):
         raise CommandNotFoundError(
             "Cannot extract archive, \
 unrecognized file extension: '%s'"
@@ -394,7 +376,7 @@ def decompressor_for_win(extension):
         path (str): path of the archive file requiring decompression
         extension (str): extension
     """
-    extension = expand_contracted_extension(extension)
+    extension = llnl.url.expand_contracted_extension(extension)
     # Windows native tar can handle .zip extensions, use standard
     # unzip method
     if re.match(r"zip$", extension):
@@ -415,7 +397,7 @@ def decompressor_for_win(extension):
     # python based decompression strategy
     # Expand extension from contracted extension i.e. tar.gz from .tgz
     # no-op on non contracted extensions
-    compression_extension = compression_ext_from_compressed_archive(extension)
+    compression_extension = llnl.url.compression_ext_from_compressed_archive(extension)
     decompressor = _determine_py_decomp_archive_strategy(compression_extension)
     if not decompressor:
         raise SpackError(
@@ -657,7 +639,7 @@ def extension_from_stream(stream, decompress=False):
             "Cannot derive file extension from magic number;"
             " falling back to regex path parsing."
         )
-        return extension_from_path(stream.name)
+        return llnl.url.extension_from_path(stream.name)
     resultant_ext = suffix_ext if not prefix_ext else ".".join([prefix_ext, suffix_ext])
     tty.debug("File extension %s successfully derived by magic number." % resultant_ext)
     return resultant_ext
@@ -693,114 +675,11 @@ def extension_from_file(file, decompress=False):
         if ext and ext.startswith("tar."):
             suf = ext.split(".")[1]
             abbr = "t" + suf
-            if check_extension(file, abbr):
+            if llnl.url.has_extension(file, abbr):
                 return abbr
         if not ext:
             # If unable to parse extension from stream,
             # attempt to fall back to string parsing
-            ext = extension_from_path(file)
+            ext = llnl.url.extension_from_path(file)
         return ext
     return None
-
-
-def extension_from_path(path):
-    """Returns the allowed archive extension for a path.
-    If path does not include a valid archive extension
-    (see`spack.util.compression.ALLOWED_ARCHIVE_TYPES`) return None
-    """
-    if path is None:
-        raise ValueError("Can't call extension() on None")
-
-    for t in ALLOWED_ARCHIVE_TYPES:
-        if check_extension(path, t):
-            return t
-    return None
-
-
-def strip_compression_extension(path, ext=None):
-    """Returns path with last supported (can be combined with tar) or
-    provided archive extension stripped"""
-    path_ext = extension_from_path(path)
-    if path_ext:
-        path = expand_contracted_extension_in_path(path)
-        exts_to_check = EXTS
-        if ext:
-            exts_to_check = [ext]
-        for ext_check in exts_to_check:
-            mod_path = check_and_remove_ext(path, ext_check)
-            if mod_path != path:
-                return mod_path
-    return path
-
-
-def strip_extension(path, ext=None):
-    """Returns the part of a path that does not include extension.
-    If ext is given, only attempts to remove that extension. If no
-    extension given, attempts to strip any valid extension from path"""
-    if ext:
-        return check_and_remove_ext(path, ext)
-    for t in ALLOWED_ARCHIVE_TYPES:
-        mod_path = check_and_remove_ext(path, t)
-        if mod_path != path:
-            return mod_path
-    return path
-
-
-def check_extension(path, ext):
-    """Returns true if extension is present in path
-    false otherwise"""
-    # Strip sourceforge suffix.
-    prefix, _ = spath.find_sourceforge_suffix(path)
-    if not ext.startswith(r"\."):
-        ext = r"\.%s$" % ext
-    if re.search(ext, prefix):
-        return True
-    return False
-
-
-def reg_remove_ext(path, ext):
-    """Returns path with ext remove via regex"""
-    if path and ext:
-        suffix = r"\.%s$" % ext
-        return re.sub(suffix, "", path)
-    return path
-
-
-def check_and_remove_ext(path, ext):
-    """Returns path with extension removed if extension
-    is present in path. Otherwise just returns path"""
-    if check_extension(path, ext):
-        return reg_remove_ext(path, ext)
-    return path
-
-
-def _substitute_extension(path, old_ext, new_ext):
-    """Returns path with old_ext replaced with new_ext.
-    old_ext and new_ext can be extension strings or regexs"""
-    return re.sub(rf"{old_ext}", rf"{new_ext}", path)
-
-
-def expand_contracted_extension_in_path(path, ext=None):
-    """Returns path with any contraction extension (i.e. tgz) expanded
-    (i.e. tar.gz). If ext is specified, only attempt to expand that extension"""
-    if not ext:
-        ext = extension_from_path(path)
-    expanded_ext = expand_contracted_extension(ext)
-    if expanded_ext != ext:
-        return _substitute_extension(path, ext, expanded_ext)
-    return path
-
-
-def expand_contracted_extension(extension):
-    """Return expanded version of contracted extension
-    i.e. .tgz -> .tar.gz, no op on non contracted extensions"""
-    extension = extension.strip(".")
-    return CONTRACTION_MAP.get(extension, extension)
-
-
-def compression_ext_from_compressed_archive(extension):
-    """Returns compression extension for a compressed archive"""
-    extension = expand_contracted_extension(extension)
-    for ext in [*EXTS]:
-        if ext in extension:
-            return ext
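A minimal sketch of the llnl.url helpers this file now delegates to; behavior is inferred from the call sites rewritten above, and the sample filenames are invented:

    # Illustrative only: mirrors the llnl.url calls patched in above.
    import llnl.url

    llnl.url.allowed_archive("mylib-1.2.3.tgz")            # True: ".tgz" is an allowed type
    llnl.url.expand_contracted_extension("tgz")            # "tar.gz", via CONTRACTION_MAP
    llnl.url.strip_extension("mylib-1.2.3.tgz", extension="tgz")      # "mylib-1.2.3"
    llnl.url.strip_compression_extension("mylib-1.2.3.tar.gz", "gz")  # "mylib-1.2.3.tar"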
diff --git a/lib/spack/spack/util/gcs.py b/lib/spack/spack/util/gcs.py
index 856fe73001..4e997df52b 100644
--- a/lib/spack/spack/util/gcs.py
+++ b/lib/spack/spack/util/gcs.py
@@ -10,6 +10,10 @@ integrate GCS Blob storage with spack buildcache.
 import os
 import sys
+import urllib.parse
+import urllib.response
+from urllib.error import URLError
+from urllib.request import BaseHandler
 
 import llnl.util.tty as tty
 
@@ -222,3 +226,21 @@ class GCSBlob:
         }
 
         return headers
+
+
+def gcs_open(req, *args, **kwargs):
+    """Open a reader stream to a blob object on GCS"""
+    url = urllib.parse.urlparse(req.get_full_url())
+    gcsblob = GCSBlob(url)
+
+    if not gcsblob.exists():
+        raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
+    stream = gcsblob.get_blob_byte_stream()
+    headers = gcsblob.get_blob_headers()
+
+    return urllib.response.addinfourl(stream, headers, url)
+
+
+class GCSHandler(BaseHandler):
+    def gs_open(self, req):
+        return gcs_open(req)
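As a usage sketch, the handler plugs into a standard urllib opener so gs:// requests resolve through GCSBlob; the bucket and object below are fictitious, and google-cloud-storage credentials are assumed to be configured:

    # Illustrative only: wires the GCSHandler added above into a urllib opener.
    # urllib dispatches "gs://" URLs to the handler's gs_open method.
    import urllib.request

    from spack.util.gcs import GCSHandler

    opener = urllib.request.build_opener(GCSHandler())
    with opener.open("gs://my-bucket/path/to/blob.tar.gz") as response:
        data = response.read()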
diff --git a/lib/spack/spack/util/path.py b/lib/spack/spack/util/path.py
index ef6fb883c7..3dc0ea676c 100644
--- a/lib/spack/spack/util/path.py
+++ b/lib/spack/spack/util/path.py
@@ -109,15 +109,6 @@ def win_exe_ext():
     return ".exe"
 
 
-def find_sourceforge_suffix(path):
-    """find and match sourceforge filepath components
-    Return match object"""
-    match = re.search(r"(.*(?:sourceforge\.net|sf\.net)/.*)(/download)$", path)
-    if match:
-        return match.groups()
-    return path, ""
-
-
 def path_to_os_path(*pths):
     """
     Takes an arbitrary number of positional parameters
diff --git a/lib/spack/spack/util/s3.py b/lib/spack/spack/util/s3.py
index c4d53d86b6..796c49a8c8 100644
--- a/lib/spack/spack/util/s3.py
+++ b/lib/spack/spack/util/s3.py
@@ -3,10 +3,13 @@
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 import os
+import urllib.error
 import urllib.parse
+import urllib.request
+import urllib.response
+from io import BufferedReader, BytesIO, IOBase
 from typing import Any, Dict, Tuple
 
-import spack
 import spack.config
 
 #: Map (mirror name, method) tuples to s3 client instances.
@@ -114,4 +117,72 @@ def get_mirror_s3_connection_info(mirror, method):
     if endpoint_url:
         s3_client_args["endpoint_url"] = _parse_s3_endpoint_url(endpoint_url)
 
-    return (s3_connection, s3_client_args)
+    return s3_connection, s3_client_args
+
+
+# NOTE(opadron): Workaround issue in boto where its StreamingBody
+# implementation is missing several APIs expected from IOBase. These missing
+# APIs prevent the streams returned by boto from being passed as-is along to
+# urllib.
+#
+# https://github.com/boto/botocore/issues/879
+# https://github.com/python/cpython/pull/3249
+class WrapStream(BufferedReader):
+    def __init__(self, raw):
+        # In botocore >=1.23.47, StreamingBody inherits from IOBase, so we
+        # only add missing attributes in older versions.
+        # https://github.com/boto/botocore/commit/a624815eabac50442ed7404f3c4f2664cd0aa784
+        if not isinstance(raw, IOBase):
+            raw.readable = lambda: True
+            raw.writable = lambda: False
+            raw.seekable = lambda: False
+            raw.closed = False
+            raw.flush = lambda: None
+        super().__init__(raw)
+
+    def detach(self):
+        self.raw = None
+
+    def read(self, *args, **kwargs):
+        return self.raw.read(*args, **kwargs)
+
+    def __getattr__(self, key):
+        return getattr(self.raw, key)
+
+
+def _s3_open(url, method="GET"):
+    parsed = urllib.parse.urlparse(url)
+    s3 = get_s3_session(url, method="fetch")
+
+    bucket = parsed.netloc
+    key = parsed.path
+
+    if key.startswith("/"):
+        key = key[1:]
+
+    if method not in ("GET", "HEAD"):
+        raise urllib.error.URLError(
+            "Only GET and HEAD verbs are currently supported for the s3:// scheme"
+        )
+
+    try:
+        if method == "GET":
+            obj = s3.get_object(Bucket=bucket, Key=key)
+            # NOTE(opadron): Apply workaround here (see above)
+            stream = WrapStream(obj["Body"])
+        elif method == "HEAD":
+            obj = s3.head_object(Bucket=bucket, Key=key)
+            stream = BytesIO()
+    except s3.ClientError as e:
+        raise urllib.error.URLError(e) from e
+
+    headers = obj["ResponseMetadata"]["HTTPHeaders"]
+
+    return url, headers, stream
+
+
+class UrllibS3Handler(urllib.request.BaseHandler):
+    def s3_open(self, req):
+        orig_url = req.get_full_url()
+        url, headers, stream = _s3_open(orig_url, method=req.get_method())
+        return urllib.response.addinfourl(stream, headers, url)
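The S3 handler is used the same way; a sketch with an invented bucket and key (boto3/botocore credentials are assumed, and note that _s3_open above accepts only GET and HEAD):

    # Illustrative only: wires the UrllibS3Handler added above into an opener.
    # urllib dispatches "s3://" URLs to the handler's s3_open method.
    import urllib.request

    from spack.util.s3 import UrllibS3Handler

    opener = urllib.request.build_opener(UrllibS3Handler())
    with opener.open("s3://my-bucket/releases/mylib-1.2.3.tar.gz") as response:
        headers = response.headers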
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index 22309ba87f..79ad39ebd7 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -21,23 +21,17 @@ from typing import IO, Optional
 from urllib.error import HTTPError, URLError
 from urllib.request import HTTPSHandler, Request, build_opener
 
-import llnl.util.lang
-import llnl.util.tty as tty
+import llnl.url
+from llnl.util import lang, tty
 from llnl.util.filesystem import mkdirp, rename, working_dir
 
-import spack
 import spack.config
 import spack.error
-import spack.gcs_handler
-import spack.s3_handler
-import spack.url
-import spack.util.crypto
-import spack.util.gcs as gcs_util
-import spack.util.s3 as s3_util
 import spack.util.url as url_util
-from spack.util.compression import ALLOWED_ARCHIVE_TYPES
-from spack.util.executable import CommandNotFoundError, which
-from spack.util.path import convert_to_posix_path
+
+from .executable import CommandNotFoundError, which
+from .gcs import GCSBlob, GCSBucket, GCSHandler
+from .s3 import UrllibS3Handler, get_s3_session
 
 
 class DetailedHTTPError(HTTPError):
@@ -66,8 +60,8 @@ class SpackHTTPDefaultErrorHandler(urllib.request.HTTPDefaultErrorHandler):
 
 
 def _urlopen():
-    s3 = spack.s3_handler.UrllibS3Handler()
-    gcs = spack.gcs_handler.GCSHandler()
+    s3 = UrllibS3Handler()
+    gcs = GCSHandler()
     error_handler = SpackHTTPDefaultErrorHandler()
 
     # One opener with HTTPS ssl enabled
@@ -90,7 +84,7 @@ def _urlopen():
 
 
 #: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
-urlopen = llnl.util.lang.Singleton(_urlopen)
+urlopen = lang.Singleton(_urlopen)
 
 #: User-Agent used in Request objects
 SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)
@@ -190,14 +184,14 @@ def push_to_url(local_file_path, remote_path, keep_original=True, extra_args=Non
         while remote_path.startswith("/"):
             remote_path = remote_path[1:]
 
-        s3 = s3_util.get_s3_session(remote_url, method="push")
+        s3 = get_s3_session(remote_url, method="push")
         s3.upload_file(local_file_path, remote_url.netloc, remote_path, ExtraArgs=extra_args)
 
         if not keep_original:
             os.remove(local_file_path)
 
     elif remote_url.scheme == "gs":
-        gcs = gcs_util.GCSBlob(remote_url)
+        gcs = GCSBlob(remote_url)
         gcs.upload_to_blob(local_file_path)
         if not keep_original:
             os.remove(local_file_path)
@@ -427,7 +421,7 @@ def remove_url(url, recursive=False):
 
     if url.scheme == "s3":
         # Try to find a mirror for potential connection information
-        s3 = s3_util.get_s3_session(url, method="push")
+        s3 = get_s3_session(url, method="push")
         bucket = url.netloc
         if recursive:
             # Because list_objects_v2 can only return up to 1000 items
@@ -460,10 +454,10 @@ def remove_url(url, recursive=False):
 
     elif url.scheme == "gs":
         if recursive:
-            bucket = gcs_util.GCSBucket(url)
+            bucket = GCSBucket(url)
             bucket.destroy(recursive=recursive)
         else:
-            blob = gcs_util.GCSBlob(url)
+            blob = GCSBlob(url)
             blob.delete_blob()
         return
@@ -538,14 +532,14 @@ def list_url(url, recursive=False):
         ]
 
     if url.scheme == "s3":
-        s3 = s3_util.get_s3_session(url, method="fetch")
+        s3 = get_s3_session(url, method="fetch")
         if recursive:
             return list(_iter_s3_prefix(s3, url))
 
         return list(set(key.split("/", 1)[0] for key in _iter_s3_prefix(s3, url)))
 
     elif url.scheme == "gs":
-        gcs = gcs_util.GCSBucket(url)
+        gcs = GCSBucket(url)
         return gcs.get_all_blobs(recursive=recursive)
@@ -636,7 +630,7 @@ def spider(root_urls, depth=0, concurrency=32):
                 links.add(abs_link)
 
                 # Skip stuff that looks like an archive
-                if any(raw_link.endswith(s) for s in ALLOWED_ARCHIVE_TYPES):
+                if any(raw_link.endswith(s) for s in llnl.url.ALLOWED_ARCHIVE_TYPES):
                     continue
 
                 # Skip already-visited links
@@ -696,7 +690,7 @@ def spider(root_urls, depth=0, concurrency=32):
                     current_depth, depth, len(spider_args)
                 )
             )
-            results = tp.map(llnl.util.lang.star(_spider), spider_args)
+            results = tp.map(lang.star(_spider), spider_args)
             spider_args = []
             collect = current_depth < depth
             for sub_pages, sub_links, sub_spider_args in results:
@@ -713,123 +707,6 @@ def spider(root_urls, depth=0, concurrency=32):
     return pages, links
 
 
-def find_versions_of_archive(
-    archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
-):
-    """Scrape web pages for new versions of a tarball. This function prefers URLs in the
-    following order: links found on the scraped page that match a url generated by the
-    reference package, found and in the archive_urls list, found and derived from those
-    in the archive_urls list, and if none are found for a version then the item in the
-    archive_urls list is included for the version.
-
-    Args:
-        archive_urls (str or list or tuple): URL or sequence of URLs for
-            different versions of a package. Typically these are just the
-            tarballs from the package file itself. By default, this searches
-            the parent directories of archives.
-        list_url (str or None): URL for a listing of archives.
-            Spack will scrape these pages for download links that look
-            like the archive URL.
-        list_depth (int): max depth to follow links on list_url pages.
-            Defaults to 0.
-        concurrency (int): maximum number of concurrent requests
-        reference_package (spack.package_base.PackageBase or None): a spack package
-            used as a reference for url detection. Uses the url_for_version
-            method on the package to produce reference urls which, if found,
-            are preferred.
-    """
-    if not isinstance(archive_urls, (list, tuple)):
-        archive_urls = [archive_urls]
-
-    # Generate a list of list_urls based on archive urls and any
-    # explicitly listed list_url in the package
-    list_urls = set()
-    if list_url is not None:
-        list_urls.add(list_url)
-    for aurl in archive_urls:
-        list_urls |= spack.url.find_list_urls(aurl)
-
-    # Add '/' to the end of the URL. Some web servers require this.
-    additional_list_urls = set()
-    for lurl in list_urls:
-        if not lurl.endswith("/"):
-            additional_list_urls.add(lurl + "/")
-    list_urls |= additional_list_urls
-
-    # Grab some web pages to scrape.
-    pages, links = spider(list_urls, depth=list_depth, concurrency=concurrency)
-
-    # Scrape them for archive URLs
-    regexes = []
-    for aurl in archive_urls:
-        # This creates a regex from the URL with a capture group for
-        # the version part of the URL. The capture group is converted
-        # to a generic wildcard, so we can use this to extract things
-        # on a page that look like archive URLs.
-        url_regex = spack.url.wildcard_version(aurl)
-
-        # We'll be a bit more liberal and just look for the archive
-        # part, not the full path.
-        # this is a URL so it is a posixpath even on Windows
-        url_regex = PurePosixPath(url_regex).name
-
-        # We need to add a / to the beginning of the regex to prevent
-        # Spack from picking up similarly named packages like:
-        #   https://cran.r-project.org/src/contrib/pls_2.6-0.tar.gz
-        #   https://cran.r-project.org/src/contrib/enpls_5.7.tar.gz
-        #   https://cran.r-project.org/src/contrib/autopls_1.3.tar.gz
-        #   https://cran.r-project.org/src/contrib/matrixpls_1.0.4.tar.gz
-        url_regex = "/" + url_regex
-
-        # We need to add a $ anchor to the end of the regex to prevent
-        # Spack from picking up signature files like:
-        #   .asc
-        #   .md5
-        #   .sha256
-        #   .sig
-        # However, SourceForge downloads still need to end in '/download'.
-        url_regex += r"(\/download)?"
-        # PyPI adds #sha256=... to the end of the URL
-        url_regex += "(#sha256=.*)?"
-        url_regex += "$"
-
-        regexes.append(url_regex)
-
-    # Build a dict version -> URL from any links that match the wildcards.
-    # Walk through archive_url links first.
-    # Any conflicting versions will be overwritten by the list_url links.
-    versions = {}
-    matched = set()
-    for url in sorted(links):
-        url = convert_to_posix_path(url)
-        if any(re.search(r, url) for r in regexes):
-            try:
-                ver = spack.url.parse_version(url)
-                if ver in matched:
-                    continue
-                versions[ver] = url
-                # prevent this version from getting overwritten
-                if reference_package is not None:
-                    if url == reference_package.url_for_version(ver):
-                        matched.add(ver)
-                else:
-                    extrapolated_urls = [
-                        spack.url.substitute_version(u, ver) for u in archive_urls
-                    ]
-                    if url in extrapolated_urls:
-                        matched.add(ver)
-            except spack.url.UndetectableVersionError:
-                continue
-
-    for url in archive_urls:
-        url = convert_to_posix_path(url)
-        ver = spack.url.parse_version(url)
-        if ver not in versions:
-            versions[ver] = url
-
-    return versions
-
-
 def get_header(headers, header_name):
     """Looks up a dict of headers for the given header value.
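Finally, a sketch of the archive-type filter that spider() now takes from llnl.url in the web.py hunk above; the links are invented:

    # Illustrative only: the same filter spider() applies while crawling,
    # now using llnl.url.ALLOWED_ARCHIVE_TYPES instead of the constant
    # deleted from spack.util.compression.
    import llnl.url

    raw_links = [
        "https://example.org/pkg-1.0.tar.gz",  # looks like an archive: skipped
        "https://example.org/downloads.html",  # regular page: crawled further
    ]
    crawlable = [
        link
        for link in raw_links
        if not any(link.endswith(s) for s in llnl.url.ALLOWED_ARCHIVE_TYPES)
    ]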