author     Massimiliano Culpo <massimiliano.culpo@gmail.com>  2023-09-15 15:43:23 +0200
committer  GitHub <noreply@github.com>  2023-09-15 15:43:23 +0200
commit     fb9e5fcc4f5307deaf10fcd571ebea68188d859c (patch)
tree       0c048fbcd27b73b516922dfb1cc37003c440935b
parent     bc02453f6dd06b82f0324d208b67559125e135ea (diff)
download   spack-fb9e5fcc4f5307deaf10fcd571ebea68188d859c.tar.gz
           spack-fb9e5fcc4f5307deaf10fcd571ebea68188d859c.tar.bz2
           spack-fb9e5fcc4f5307deaf10fcd571ebea68188d859c.tar.xz
           spack-fb9e5fcc4f5307deaf10fcd571ebea68188d859c.zip
Group primitive url/path handling functions together (#40028)
-rw-r--r--  lib/spack/llnl/url.py                                  459
-rw-r--r--  lib/spack/spack/cmd/create.py                            2
-rw-r--r--  lib/spack/spack/cmd/url.py                               4
-rw-r--r--  lib/spack/spack/fetch_strategy.py                        7
-rw-r--r--  lib/spack/spack/gcs_handler.py                          28
-rw-r--r--  lib/spack/spack/main.py                                  1
-rw-r--r--  lib/spack/spack/mirror.py                                4
-rw-r--r--  lib/spack/spack/package_base.py                          2
-rw-r--r--  lib/spack/spack/patch.py                                 2
-rw-r--r--  lib/spack/spack/s3_handler.py                           80
-rw-r--r--  lib/spack/spack/test/llnl/url.py                       167
-rw-r--r--  lib/spack/spack/test/url_parse.py                      113
-rw-r--r--  lib/spack/spack/test/util/compression.py                38
-rw-r--r--  lib/spack/spack/test/web.py                             19
-rw-r--r--  lib/spack/spack/url.py                                 439
-rw-r--r--  lib/spack/spack/util/compression.py                    155
-rw-r--r--  lib/spack/spack/util/gcs.py                             22
-rw-r--r--  lib/spack/spack/util/path.py                             9
-rw-r--r--  lib/spack/spack/util/s3.py                              75
-rw-r--r--  lib/spack/spack/util/web.py                            159
-rw-r--r--  var/spack/repos/builtin/packages/protobuf/package.py     6
21 files changed, 903 insertions, 888 deletions
diff --git a/lib/spack/llnl/url.py b/lib/spack/llnl/url.py
new file mode 100644
index 0000000000..40e7606506
--- /dev/null
+++ b/lib/spack/llnl/url.py
@@ -0,0 +1,459 @@
+# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+"""URL primitives that just require Python standard library."""
+import itertools
+import os.path
+import re
+from typing import Optional, Set, Tuple
+from urllib.parse import urlsplit, urlunsplit
+
+# Archive extensions allowed in Spack
+PREFIX_EXTENSIONS = ("tar", "TAR")
+EXTENSIONS = ("gz", "bz2", "xz", "Z")
+NO_TAR_EXTENSIONS = ("zip", "tgz", "tbz2", "tbz", "txz")
+
+# Add PREFIX_EXTENSIONS and EXTENSIONS last so that .tar.gz is matched *before* .tar or .gz
+ALLOWED_ARCHIVE_TYPES = (
+ tuple(".".join(ext) for ext in itertools.product(PREFIX_EXTENSIONS, EXTENSIONS))
+ + PREFIX_EXTENSIONS
+ + EXTENSIONS
+ + NO_TAR_EXTENSIONS
+)
+CONTRACTION_MAP = {"tgz": "tar.gz", "txz": "tar.xz", "tbz": "tar.bz2", "tbz2": "tar.bz2"}
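+
+# Illustrative note: because the compound "tar.*" extensions come first in
+# ALLOWED_ARCHIVE_TYPES, extension matching prefers them, e.g.
+# extension_from_path("archive.tar.gz") (defined below) yields "tar.gz", not "gz".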
+
+
+def find_list_urls(url: str) -> Set[str]:
+ r"""Find good list URLs for the supplied URL.
+
+ By default, returns the dirname of the archive path.
+
+ Provides special treatment for the following websites, which have a
+ unique list URL different from the dirname of the download URL:
+
+ ========= =======================================================
+ GitHub https://github.com/<repo>/<name>/releases
+ GitLab https://gitlab.\*/<repo>/<name>/tags
+ BitBucket https://bitbucket.org/<repo>/<name>/downloads/?tab=tags
+ CRAN https://\*.r-project.org/src/contrib/Archive/<name>
+ PyPI https://pypi.org/simple/<name>/
+ LuaRocks https://luarocks.org/modules/<repo>/<name>
+ ========= =======================================================
+
+ Note: this function is called by `spack versions`, `spack checksum`,
+ and `spack create`, but not by `spack fetch` or `spack install`.
+
+ Parameters:
+ url (str): The download URL for the package
+
+ Returns:
+ set: One or more list URLs for the package
+ """
+
+ url_types = [
+ # GitHub
+ # e.g. https://github.com/llnl/callpath/archive/v1.0.1.tar.gz
+ (r"(.*github\.com/[^/]+/[^/]+)", lambda m: m.group(1) + "/releases"),
+ # GitLab API endpoint
+ # e.g. https://gitlab.dkrz.de/api/v4/projects/k202009%2Flibaec/repository/archive.tar.gz?sha=v1.0.2
+ (
+ r"(.*gitlab[^/]+)/api/v4/projects/([^/]+)%2F([^/]+)",
+ lambda m: m.group(1) + "/" + m.group(2) + "/" + m.group(3) + "/tags",
+ ),
+ # GitLab non-API endpoint
+ # e.g. https://gitlab.dkrz.de/k202009/libaec/uploads/631e85bcf877c2dcaca9b2e6d6526339/libaec-1.0.0.tar.gz
+ (r"(.*gitlab[^/]+/(?!api/v4/projects)[^/]+/[^/]+)", lambda m: m.group(1) + "/tags"),
+ # BitBucket
+ # e.g. https://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2
+ (r"(.*bitbucket.org/[^/]+/[^/]+)", lambda m: m.group(1) + "/downloads/?tab=tags"),
+ # CRAN
+ # e.g. https://cran.r-project.org/src/contrib/Rcpp_0.12.9.tar.gz
+ # e.g. https://cloud.r-project.org/src/contrib/rgl_0.98.1.tar.gz
+ (
+ r"(.*\.r-project\.org/src/contrib)/([^_]+)",
+ lambda m: m.group(1) + "/Archive/" + m.group(2),
+ ),
+ # PyPI
+ # e.g. https://pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
+ # e.g. https://www.pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
+ # e.g. https://pypi.org/packages/source/n/numpy/numpy-1.19.4.zip
+ # e.g. https://pypi.python.org/packages/source/n/numpy/numpy-1.19.4.zip
+ # e.g. https://files.pythonhosted.org/packages/source/n/numpy/numpy-1.19.4.zip
+ # e.g. https://pypi.io/packages/py2.py3/o/opencensus-context/opencensus_context-0.1.1-py2.py3-none-any.whl
+ (
+ r"(?:pypi|pythonhosted)[^/]+/packages/[^/]+/./([^/]+)",
+ lambda m: "https://pypi.org/simple/" + m.group(1) + "/",
+ ),
+ # LuaRocks
+ # e.g. https://luarocks.org/manifests/gvvaughan/lpeg-1.0.2-1.src.rock
+ # e.g. https://luarocks.org/manifests/openresty/lua-cjson-2.1.0-1.src.rock
+ (
+ r"luarocks[^/]+/(?:modules|manifests)/(?P<org>[^/]+)/"
+ + r"(?P<name>.+?)-[0-9.-]*\.src\.rock",
+ lambda m: "https://luarocks.org/modules/"
+ + m.group("org")
+ + "/"
+ + m.group("name")
+ + "/",
+ ),
+ ]
+
+ list_urls = {os.path.dirname(url)}
+
+ for pattern, fun in url_types:
+ match = re.search(pattern, url)
+ if match:
+ list_urls.add(fun(match))
+
+ return list_urls
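+
+# Illustrative example, using the GitHub URL from the comment above:
+#   find_list_urls("https://github.com/llnl/callpath/archive/v1.0.1.tar.gz")
+#   -> {"https://github.com/llnl/callpath/archive",
+#       "https://github.com/llnl/callpath/releases"}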
+
+
+def strip_query_and_fragment(url: str) -> Tuple[str, str]:
+ """Strips query and fragment from a url, then returns the base url and the suffix.
+
+ Args:
+ url: URL to be stripped
+
+ Raises:
+ ValueError: when there is any error parsing the URL
+ """
+ components = urlsplit(url)
+ stripped = components[:3] + (None, None)
+
+ query, frag = components[3:5]
+ suffix = ""
+ if query:
+ suffix += "?" + query
+ if frag:
+ suffix += "#" + frag
+
+ return urlunsplit(stripped), suffix
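+
+# Illustrative example (assumed input):
+#   strip_query_and_fragment("https://example.com/foo.tgz?raw=true")
+#   -> ("https://example.com/foo.tgz", "?raw=true")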
+
+
+SOURCEFORGE_RE = re.compile(r"(.*(?:sourceforge\.net|sf\.net)/.*)(/download)$")
+
+
+def split_url_on_sourceforge_suffix(url: str) -> Tuple[str, ...]:
+ """If the input is a sourceforge URL, returns base URL and "/download" suffix. Otherwise,
+ returns the input URL and an empty string.
+ """
+ match = SOURCEFORGE_RE.search(url)
+ if match is not None:
+ return match.groups()
+ return url, ""
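+
+# Illustrative example, using the sourceforge URL pattern handled above:
+#   split_url_on_sourceforge_suffix(
+#       "https://sourceforge.net/projects/glew/files/glew-2.0.0.tgz/download")
+#   -> ("https://sourceforge.net/projects/glew/files/glew-2.0.0.tgz", "/download")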
+
+
+def has_extension(path_or_url: str, ext: str) -> bool:
+ """Returns true if the extension in input is present in path, false otherwise."""
+ prefix, _ = split_url_on_sourceforge_suffix(path_or_url)
+ if not ext.startswith(r"\."):
+ ext = rf"\.{ext}$"
+
+ if re.search(ext, prefix):
+ return True
+ return False
+
+
+def extension_from_path(path_or_url: Optional[str]) -> Optional[str]:
+ """Tries to match an allowed archive extension to the input. Returns the first match,
+ or None if no match was found.
+
+ Raises:
+ ValueError: if the input is None
+ """
+ if path_or_url is None:
+ raise ValueError("Can't call extension() on None")
+
+ for t in ALLOWED_ARCHIVE_TYPES:
+ if has_extension(path_or_url, t):
+ return t
+ return None
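+
+# Illustrative examples (assumed inputs):
+#   extension_from_path("foo.tar.bz2") -> "tar.bz2" (the compound match wins)
+#   extension_from_path("foo.txt")     -> None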
+
+
+def remove_extension(path_or_url: str, *, extension: str) -> str:
+ """Returns the input with the extension removed"""
+ suffix = rf"\.{extension}$"
+ return re.sub(suffix, "", path_or_url)
+
+
+def check_and_remove_ext(path: str, *, extension: str) -> str:
+ """Returns the input path with the extension removed, if the extension is present in path.
+ Otherwise, returns the input unchanged.
+ """
+ if not has_extension(path, extension):
+ return path
+ path, _ = split_url_on_sourceforge_suffix(path)
+ return remove_extension(path, extension=extension)
+
+
+def strip_extension(path_or_url: str, *, extension: Optional[str] = None) -> str:
+ """If a path contains the extension in input, returns the path stripped of the extension.
+ Otherwise, returns the input path.
+
+ If extension is None, attempts to strip any allowed extension from path.
+ """
+ if extension is None:
+ for t in ALLOWED_ARCHIVE_TYPES:
+ if has_extension(path_or_url, ext=t):
+ extension = t
+ break
+ else:
+ return path_or_url
+
+ return check_and_remove_ext(path_or_url, extension=extension)
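+
+# Illustrative examples (assumed inputs):
+#   strip_extension("foo-1.0.tar.gz")                 -> "foo-1.0"
+#   strip_extension("foo-1.0.tar.gz", extension="gz") -> "foo-1.0.tar"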
+
+
+def split_url_extension(url: str) -> Tuple[str, ...]:
+ """Some URLs have a query string, e.g.:
+
+ 1. https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7.tgz?raw=true
+ 2. http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin.tar.gz
+ 3. https://gitlab.kitware.com/vtk/vtk/repository/archive.tar.bz2?ref=v7.0.0
+
+ In (1), the query string needs to be stripped to get at the
+ extension, but in (2) & (3), the filename is IN a single final query
+ argument.
+
+ This strips the URL into three pieces: ``prefix``, ``ext``, and ``suffix``.
+ The suffix contains anything that was stripped off the URL to
+ get at the file extension. In (1), it will be ``'?raw=true'``, but
+ in (2), it will be empty. In (3) the suffix is a parameter that follows
+ after the file extension, e.g.:
+
+ 1. ``('https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7', 'tgz', '?raw=true')``
+ 2. ``('http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin', 'tar.gz', '')``
+ 3. ``('https://gitlab.kitware.com/vtk/vtk/repository/archive', 'tar.bz2', '?ref=v7.0.0')``
+ """
+ # Strip off sourceforge download suffix.
+ # e.g. https://sourceforge.net/projects/glew/files/glew/2.0.0/glew-2.0.0.tgz/download
+ prefix, suffix = split_url_on_sourceforge_suffix(url)
+
+ ext = extension_from_path(prefix)
+ if ext is not None:
+ prefix = strip_extension(prefix)
+ return prefix, ext, suffix
+
+ try:
+ prefix, suf = strip_query_and_fragment(prefix)
+ except ValueError:
+ # FIXME: tty.debug("Got error parsing path %s" % path)
+ # Ignore URL parse errors here
+ return url, "", ""
+
+ ext = extension_from_path(prefix)
+ prefix = strip_extension(prefix)
+ suffix = suf + suffix
+ if ext is None:
+ ext = ""
+
+ return prefix, ext, suffix
+
+
+def strip_version_suffixes(path_or_url: str) -> str:
+ """Some tarballs contain extraneous information after the version:
+
+ * ``bowtie2-2.2.5-source``
+ * ``libevent-2.0.21-stable``
+ * ``cuda_8.0.44_linux.run``
+
+ These strings are not part of the version number and should be ignored.
+ This function strips those suffixes off and returns the remaining string.
+ The goal is that the version is always the last thing in ``path``:
+
+ * ``bowtie2-2.2.5``
+ * ``libevent-2.0.21``
+ * ``cuda_8.0.44``
+
+ Args:
+ path_or_url: The filename or URL for the package
+
+ Returns:
+ The ``path`` with any extraneous suffixes removed
+ """
+ # NOTE: This could be done with complicated regexes in parse_version_offset
+ # NOTE: The problem is that we would have to add these regexes to the end
+ # NOTE: of every single version regex. Easier to just strip them off
+ # NOTE: permanently
+
+ suffix_regexes = [
+ # Download type
+ r"[Ii]nstall",
+ r"all",
+ r"code",
+ r"[Ss]ources?",
+ r"file",
+ r"full",
+ r"single",
+ r"with[a-zA-Z_-]+",
+ r"rock",
+ r"src(_0)?",
+ r"public",
+ r"bin",
+ r"binary",
+ r"run",
+ r"[Uu]niversal",
+ r"jar",
+ r"complete",
+ r"dynamic",
+ r"oss",
+ r"gem",
+ r"tar",
+ r"sh",
+ # Download version
+ r"release",
+ r"bin",
+ r"stable",
+ r"[Ff]inal",
+ r"rel",
+ r"orig",
+ r"dist",
+ r"\+",
+ # License
+ r"gpl",
+ # Arch
+ # Needs to come before and after OS, appears in both orders
+ r"ia32",
+ r"intel",
+ r"amd64",
+ r"linux64",
+ r"x64",
+ r"64bit",
+ r"x86[_-]64",
+ r"i586_64",
+ r"x86",
+ r"i[36]86",
+ r"ppc64(le)?",
+ r"armv?(7l|6l|64)",
+ # Other
+ r"cpp",
+ r"gtk",
+ r"incubating",
+ # OS
+ r"[Ll]inux(_64)?",
+ r"LINUX",
+ r"[Uu]ni?x",
+ r"[Ss]un[Oo][Ss]",
+ r"[Mm]ac[Oo][Ss][Xx]?",
+ r"[Oo][Ss][Xx]",
+ r"[Dd]arwin(64)?",
+ r"[Aa]pple",
+ r"[Ww]indows",
+ r"[Ww]in(64|32)?",
+ r"[Cc]ygwin(64|32)?",
+ r"[Mm]ingw",
+ r"centos",
+ # Arch
+ # Needs to come before and after OS, appears in both orders
+ r"ia32",
+ r"intel",
+ r"amd64",
+ r"linux64",
+ r"x64",
+ r"64bit",
+ r"x86[_-]64",
+ r"i586_64",
+ r"x86",
+ r"i[36]86",
+ r"ppc64(le)?",
+ r"armv?(7l|6l|64)?",
+ # PyPI
+ r"[._-]py[23].*\.whl",
+ r"[._-]cp[23].*\.whl",
+ r"[._-]win.*\.exe",
+ ]
+
+ for regex in suffix_regexes:
+ # Remove the suffix from the end of the path
+ # This may be done multiple times
+ path_or_url = re.sub(r"[._-]?" + regex + "$", "", path_or_url)
+
+ return path_or_url
+
+
+def expand_contracted_extension(extension: str) -> str:
+ """Returns the expanded version of a known contracted extension.
+
+ This function maps extensions like ".tgz" to ".tar.gz". For unknown extensions,
+ the input is returned unmodified.
+ """
+ extension = extension.strip(".")
+ return CONTRACTION_MAP.get(extension, extension)
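+
+# Illustrative examples (assumed inputs):
+#   expand_contracted_extension("tgz")   -> "tar.gz"
+#   expand_contracted_extension(".tbz2") -> "tar.bz2" (leading dot is stripped)
+#   expand_contracted_extension("zip")   -> "zip" (not a contraction, unchanged)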
+
+
+def expand_contracted_extension_in_path(
+ path_or_url: str, *, extension: Optional[str] = None
+) -> str:
+ """Returns the input path or URL with any contraction extension expanded.
+
+ Args:
+ path_or_url: path or URL to be expanded
+ extension: if specified, only attempt to expand that extension
+ """
+ extension = extension or extension_from_path(path_or_url)
+ if extension is None:
+ return path_or_url
+
+ expanded = expand_contracted_extension(extension)
+ if expanded != extension:
+ return re.sub(rf"{extension}", rf"{expanded}", path_or_url)
+ return path_or_url
+
+
+def compression_ext_from_compressed_archive(extension: str) -> Optional[str]:
+ """Returns compression extension for a compressed archive"""
+ extension = expand_contracted_extension(extension)
+ for ext in EXTENSIONS:
+ if ext in extension:
+ return ext
+ return None
+
+
+def strip_compression_extension(path_or_url: str, ext: Optional[str] = None) -> str:
+ """Strips the compression extension from the input, and returns it. For instance,
+ "foo.tgz" becomes "foo.tar".
+
+ If no extension is given, try a default list of extensions.
+
+ Args:
+ path_or_url: input to be stripped
+ ext: if given, extension to be stripped
+ """
+ if not extension_from_path(path_or_url):
+ return path_or_url
+
+ expanded_path = expand_contracted_extension_in_path(path_or_url)
+ candidates = [ext] if ext is not None else EXTENSIONS
+ for current_extension in candidates:
+ modified_path = check_and_remove_ext(expanded_path, extension=current_extension)
+ if modified_path != expanded_path:
+ return modified_path
+ return expanded_path
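+
+# Illustrative examples (assumed inputs):
+#   strip_compression_extension("foo.tgz")          -> "foo.tar"
+#   strip_compression_extension("foo.tar.gz", "gz") -> "foo.tar"
+#   strip_compression_extension("foo.zip")          -> "foo.zip" (zip is not
+#   treated as a pure compression extension)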
+
+
+def allowed_archive(path_or_url: str) -> bool:
+ """Returns true if the input is a valid archive, False otherwise."""
+ return (
+ False if not path_or_url else any(path_or_url.endswith(t) for t in ALLOWED_ARCHIVE_TYPES)
+ )
+
+
+def determine_url_file_extension(path: str) -> str:
+ """This returns the type of archive a URL refers to. This is
+ sometimes confusing because of URLs like:
+
+ (1) https://github.com/petdance/ack/tarball/1.93_02
+
+ Where the URL doesn't actually contain the filename. We need
+ to know what type it is so that we can appropriately name files
+ in mirrors.
+ """
+ match = re.search(r"github.com/.+/(zip|tar)ball/", path)
+ if match:
+ if match.group(1) == "zip":
+ return "zip"
+ elif match.group(1) == "tar":
+ return "tar.gz"
+
+ prefix, ext, suffix = split_url_extension(path)
+ return ext
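+
+# Illustrative examples; the first uses the GitHub URL from the docstring:
+#   determine_url_file_extension("https://github.com/petdance/ack/tarball/1.93_02")
+#   -> "tar.gz"
+#   determine_url_file_extension("https://example.com/foo-1.0.tar.bz2") -> "tar.bz2"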
diff --git a/lib/spack/spack/cmd/create.py b/lib/spack/spack/cmd/create.py
index 9c923c4a17..e3569d998f 100644
--- a/lib/spack/spack/cmd/create.py
+++ b/lib/spack/spack/cmd/create.py
@@ -822,7 +822,7 @@ def get_versions(args, name):
if args.url is not None and args.template != "bundle" and valid_url:
# Find available versions
try:
- url_dict = spack.util.web.find_versions_of_archive(args.url)
+ url_dict = spack.url.find_versions_of_archive(args.url)
except UndetectableVersionError:
# Use fake versions
tty.warn("Couldn't detect version in: {0}".format(args.url))
diff --git a/lib/spack/spack/cmd/url.py b/lib/spack/spack/cmd/url.py
index 8f7866c406..25f8ad382a 100644
--- a/lib/spack/spack/cmd/url.py
+++ b/lib/spack/spack/cmd/url.py
@@ -12,6 +12,7 @@ from llnl.util import tty
import spack.fetch_strategy as fs
import spack.repo
import spack.spec
+import spack.url
import spack.util.crypto as crypto
from spack.url import (
UndetectableNameError,
@@ -26,7 +27,6 @@ from spack.url import (
substitution_offsets,
)
from spack.util.naming import simplify_name
-from spack.util.web import find_versions_of_archive
description = "debugging tool for url parsing"
section = "developer"
@@ -139,7 +139,7 @@ def url_parse(args):
if args.spider:
print()
tty.msg("Spidering for versions:")
- versions = find_versions_of_archive(url)
+ versions = spack.url.find_versions_of_archive(url)
if not versions:
print(" Found no versions for {0}".format(name))
diff --git a/lib/spack/spack/fetch_strategy.py b/lib/spack/spack/fetch_strategy.py
index 1f99c4ce9e..87c6e0fc61 100644
--- a/lib/spack/spack/fetch_strategy.py
+++ b/lib/spack/spack/fetch_strategy.py
@@ -31,6 +31,7 @@ import shutil
import urllib.parse
from typing import List, Optional
+import llnl.url
import llnl.util
import llnl.util.filesystem as fs
import llnl.util.tty as tty
@@ -46,7 +47,7 @@ import spack.util.url as url_util
import spack.util.web as web_util
import spack.version
import spack.version.git_ref_lookup
-from spack.util.compression import decompressor_for, extension_from_path
+from spack.util.compression import decompressor_for
from spack.util.executable import CommandNotFoundError, which
from spack.util.string import comma_and, quote
@@ -441,7 +442,7 @@ class URLFetchStrategy(FetchStrategy):
# TODO: replace this by mime check.
if not self.extension:
- self.extension = spack.url.determine_url_file_extension(self.url)
+ self.extension = llnl.url.determine_url_file_extension(self.url)
if self.stage.expanded:
tty.debug("Source already staged to %s" % self.stage.source_path)
@@ -570,7 +571,7 @@ class VCSFetchStrategy(FetchStrategy):
@_needs_stage
def archive(self, destination, **kwargs):
- assert extension_from_path(destination) == "tar.gz"
+ assert llnl.url.extension_from_path(destination) == "tar.gz"
assert self.stage.source_path.startswith(self.stage.path)
tar = which("tar", required=True)
diff --git a/lib/spack/spack/gcs_handler.py b/lib/spack/spack/gcs_handler.py
deleted file mode 100644
index b002fa70ac..0000000000
--- a/lib/spack/spack/gcs_handler.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
-# Spack Project Developers. See the top-level COPYRIGHT file for details.
-#
-# SPDX-License-Identifier: (Apache-2.0 OR MIT)
-import urllib.parse
-import urllib.response
-from urllib.error import URLError
-from urllib.request import BaseHandler
-
-
-def gcs_open(req, *args, **kwargs):
- """Open a reader stream to a blob object on GCS"""
- import spack.util.gcs as gcs_util
-
- url = urllib.parse.urlparse(req.get_full_url())
- gcsblob = gcs_util.GCSBlob(url)
-
- if not gcsblob.exists():
- raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
- stream = gcsblob.get_blob_byte_stream()
- headers = gcsblob.get_blob_headers()
-
- return urllib.response.addinfourl(stream, headers, url)
-
-
-class GCSHandler(BaseHandler):
- def gs_open(self, req):
- return gcs_open(req)
diff --git a/lib/spack/spack/main.py b/lib/spack/spack/main.py
index 009190829f..3b330c08d4 100644
--- a/lib/spack/spack/main.py
+++ b/lib/spack/spack/main.py
@@ -30,7 +30,6 @@ import llnl.util.tty.colify
import llnl.util.tty.color as color
from llnl.util.tty.log import log_output
-import spack
import spack.cmd
import spack.config
import spack.environment as ev
diff --git a/lib/spack/spack/mirror.py b/lib/spack/spack/mirror.py
index e4825537db..32037502c5 100644
--- a/lib/spack/spack/mirror.py
+++ b/lib/spack/spack/mirror.py
@@ -20,6 +20,7 @@ import traceback
import urllib.parse
from typing import Optional, Union
+import llnl.url
import llnl.util.tty as tty
from llnl.util.filesystem import mkdirp
@@ -29,7 +30,6 @@ import spack.error
import spack.fetch_strategy as fs
import spack.mirror
import spack.spec
-import spack.url as url
import spack.util.path
import spack.util.spack_json as sjson
import spack.util.spack_yaml as syaml
@@ -375,7 +375,7 @@ def _determine_extension(fetcher):
if isinstance(fetcher, fs.URLFetchStrategy):
if fetcher.expand_archive:
# If we fetch with a URLFetchStrategy, use URL's archive type
- ext = url.determine_url_file_extension(fetcher.url)
+ ext = llnl.url.determine_url_file_extension(fetcher.url)
if ext:
# Remove any leading dots
diff --git a/lib/spack/spack/package_base.py b/lib/spack/spack/package_base.py
index 5a14f44f31..67cebb3a8f 100644
--- a/lib/spack/spack/package_base.py
+++ b/lib/spack/spack/package_base.py
@@ -2377,7 +2377,7 @@ class PackageBase(WindowsRPath, PackageViewMixin, metaclass=PackageMeta):
return {}
try:
- return spack.util.web.find_versions_of_archive(
+ return spack.url.find_versions_of_archive(
self.all_urls, self.list_url, self.list_depth, concurrency, reference_package=self
)
except spack.util.web.NoNetworkConnectionError as e:
diff --git a/lib/spack/spack/patch.py b/lib/spack/spack/patch.py
index a7fb3620ee..7bbab326d1 100644
--- a/lib/spack/spack/patch.py
+++ b/lib/spack/spack/patch.py
@@ -11,6 +11,7 @@ import sys
import llnl.util.filesystem
import llnl.util.lang
+from llnl.url import allowed_archive
import spack
import spack.error
@@ -19,7 +20,6 @@ import spack.mirror
import spack.repo
import spack.stage
import spack.util.spack_json as sjson
-from spack.util.compression import allowed_archive
from spack.util.crypto import Checker, checksum
from spack.util.executable import which, which_string
diff --git a/lib/spack/spack/s3_handler.py b/lib/spack/spack/s3_handler.py
deleted file mode 100644
index efab23a5ea..0000000000
--- a/lib/spack/spack/s3_handler.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
-# Spack Project Developers. See the top-level COPYRIGHT file for details.
-#
-# SPDX-License-Identifier: (Apache-2.0 OR MIT)
-
-import urllib.error
-import urllib.parse
-import urllib.request
-import urllib.response
-from io import BufferedReader, BytesIO, IOBase
-
-import spack.util.s3 as s3_util
-
-
-# NOTE(opadron): Workaround issue in boto where its StreamingBody
-# implementation is missing several APIs expected from IOBase. These missing
-# APIs prevent the streams returned by boto from being passed as-are along to
-# urllib.
-#
-# https://github.com/boto/botocore/issues/879
-# https://github.com/python/cpython/pull/3249
-class WrapStream(BufferedReader):
- def __init__(self, raw):
- # In botocore >=1.23.47, StreamingBody inherits from IOBase, so we
- # only add missing attributes in older versions.
- # https://github.com/boto/botocore/commit/a624815eabac50442ed7404f3c4f2664cd0aa784
- if not isinstance(raw, IOBase):
- raw.readable = lambda: True
- raw.writable = lambda: False
- raw.seekable = lambda: False
- raw.closed = False
- raw.flush = lambda: None
- super().__init__(raw)
-
- def detach(self):
- self.raw = None
-
- def read(self, *args, **kwargs):
- return self.raw.read(*args, **kwargs)
-
- def __getattr__(self, key):
- return getattr(self.raw, key)
-
-
-def _s3_open(url, method="GET"):
- parsed = urllib.parse.urlparse(url)
- s3 = s3_util.get_s3_session(url, method="fetch")
-
- bucket = parsed.netloc
- key = parsed.path
-
- if key.startswith("/"):
- key = key[1:]
-
- if method not in ("GET", "HEAD"):
- raise urllib.error.URLError(
- "Only GET and HEAD verbs are currently supported for the s3:// scheme"
- )
-
- try:
- if method == "GET":
- obj = s3.get_object(Bucket=bucket, Key=key)
- # NOTE(opadron): Apply workaround here (see above)
- stream = WrapStream(obj["Body"])
- elif method == "HEAD":
- obj = s3.head_object(Bucket=bucket, Key=key)
- stream = BytesIO()
- except s3.ClientError as e:
- raise urllib.error.URLError(e) from e
-
- headers = obj["ResponseMetadata"]["HTTPHeaders"]
-
- return url, headers, stream
-
-
-class UrllibS3Handler(urllib.request.BaseHandler):
- def s3_open(self, req):
- orig_url = req.get_full_url()
- url, headers, stream = _s3_open(orig_url, method=req.get_method())
- return urllib.response.addinfourl(stream, headers, url)
diff --git a/lib/spack/spack/test/llnl/url.py b/lib/spack/spack/test/llnl/url.py
new file mode 100644
index 0000000000..8da8e727ec
--- /dev/null
+++ b/lib/spack/spack/test/llnl/url.py
@@ -0,0 +1,167 @@
+# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+"""Tests for llnl.url functions"""
+import itertools
+
+import pytest
+
+import llnl.url
+
+
+@pytest.fixture(params=llnl.url.ALLOWED_ARCHIVE_TYPES)
+def archive_and_expected(request):
+ archive_name = ".".join(["Foo", request.param])
+ return archive_name, request.param
+
+
+def test_get_extension(archive_and_expected):
+ """Tests that we can predict correctly known extensions for simple cases."""
+ archive, expected = archive_and_expected
+ result = llnl.url.extension_from_path(archive)
+ assert result == expected
+
+
+def test_get_bad_extension():
+ """Tests that a bad extension returns None"""
+ result = llnl.url.extension_from_path("Foo.cxx")
+ assert result is None
+
+
+@pytest.mark.parametrize(
+ "url,expected",
+ [
+ # No suffix
+ ("rgb-1.0.6", "rgb-1.0.6"),
+ # Misleading prefix
+ ("jpegsrc.v9b", "jpegsrc.v9b"),
+ ("turbolinux702", "turbolinux702"),
+ ("converge_install_2.3.16", "converge_install_2.3.16"),
+ # Download type - code, source
+ ("cistem-1.0.0-beta-source-code", "cistem-1.0.0-beta"),
+ # Download type - src
+ ("apache-ant-1.9.7-src", "apache-ant-1.9.7"),
+ ("go1.7.4.src", "go1.7.4"),
+ # Download type - source
+ ("bowtie2-2.2.5-source", "bowtie2-2.2.5"),
+ ("grib_api-1.17.0-Source", "grib_api-1.17.0"),
+ # Download type - full
+ ("julia-0.4.3-full", "julia-0.4.3"),
+ # Download type - bin
+ ("apache-maven-3.3.9-bin", "apache-maven-3.3.9"),
+ # Download type - binary
+ ("Jmol-14.8.0-binary", "Jmol-14.8.0"),
+ # Download type - gem
+ ("rubysl-date-2.0.9.gem", "rubysl-date-2.0.9"),
+ # Download type - tar
+ ("gromacs-4.6.1-tar", "gromacs-4.6.1"),
+ # Download type - sh
+ ("Miniconda2-4.3.11-Linux-x86_64.sh", "Miniconda2-4.3.11"),
+ # Download version - release
+ ("v1.0.4-release", "v1.0.4"),
+ # Download version - stable
+ ("libevent-2.0.21-stable", "libevent-2.0.21"),
+ # Download version - final
+ ("2.6.7-final", "2.6.7"),
+ # Download version - rel
+ ("v1.9.5.1rel", "v1.9.5.1"),
+ # Download version - orig
+ ("dash_0.5.5.1.orig", "dash_0.5.5.1"),
+ # Download version - plus
+ ("ncbi-blast-2.6.0+-src", "ncbi-blast-2.6.0"),
+ # License
+ ("cppad-20170114.gpl", "cppad-20170114"),
+ # Arch
+ ("pcraster-4.1.0_x86-64", "pcraster-4.1.0"),
+ ("dislin-11.0.linux.i586_64", "dislin-11.0"),
+ ("PAGIT.V1.01.64bit", "PAGIT.V1.01"),
+ # OS - linux
+ ("astyle_2.04_linux", "astyle_2.04"),
+ # OS - unix
+ ("install-tl-unx", "install-tl"),
+ # OS - macos
+ ("astyle_1.23_macosx", "astyle_1.23"),
+ ("haxe-2.08-osx", "haxe-2.08"),
+ # PyPI - wheel
+ ("entrypoints-0.2.2-py2.py3-none-any.whl", "entrypoints-0.2.2"),
+ (
+ "numpy-1.12.0-cp27-cp27m-macosx_10_6_intel.macosx_10_9_intel."
+ "macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl",
+ "numpy-1.12.0",
+ ),
+ # PyPI - exe
+ ("PyYAML-3.12.win-amd64-py3.5.exe", "PyYAML-3.12"),
+ # Combinations of multiple patterns - bin, release
+ ("rocketmq-all-4.5.2-bin-release", "rocketmq-all-4.5.2"),
+ # Combinations of multiple patterns - all
+ ("p7zip_9.04_src_all", "p7zip_9.04"),
+ # Combinations of multiple patterns - run
+ ("cuda_8.0.44_linux.run", "cuda_8.0.44"),
+ # Combinations of multiple patterns - file
+ ("ack-2.14-single-file", "ack-2.14"),
+ # Combinations of multiple patterns - jar
+ ("antlr-3.4-complete.jar", "antlr-3.4"),
+ # Combinations of multiple patterns - oss
+ ("tbb44_20160128oss_src_0", "tbb44_20160128"),
+ # Combinations of multiple patterns - darwin
+ ("ghc-7.0.4-x86_64-apple-darwin", "ghc-7.0.4"),
+ ("ghc-7.0.4-i386-apple-darwin", "ghc-7.0.4"),
+ # Combinations of multiple patterns - centos
+ ("sratoolkit.2.8.2-1-centos_linux64", "sratoolkit.2.8.2-1"),
+ # Combinations of multiple patterns - arch
+ (
+ "VizGlow_v2.2alpha17-R21November2016-Linux-x86_64-Install",
+ "VizGlow_v2.2alpha17-R21November2016",
+ ),
+ ("jdk-8u92-linux-x64", "jdk-8u92"),
+ ("cuda_6.5.14_linux_64.run", "cuda_6.5.14"),
+ ("Mathematica_12.0.0_LINUX.sh", "Mathematica_12.0.0"),
+ ("trf407b.linux64", "trf407b"),
+ # Combinations of multiple patterns - with
+ ("mafft-7.221-with-extensions-src", "mafft-7.221"),
+ ("spark-2.0.0-bin-without-hadoop", "spark-2.0.0"),
+ ("conduit-v0.3.0-src-with-blt", "conduit-v0.3.0"),
+ # Combinations of multiple patterns - rock
+ ("bitlib-23-2.src.rock", "bitlib-23-2"),
+ # Combinations of multiple patterns - public
+ ("dakota-6.3-public.src", "dakota-6.3"),
+ # Combinations of multiple patterns - universal
+ ("synergy-1.3.6p2-MacOSX-Universal", "synergy-1.3.6p2"),
+ # Combinations of multiple patterns - dynamic
+ ("snptest_v2.5.2_linux_x86_64_dynamic", "snptest_v2.5.2"),
+ # Combinations of multiple patterns - other
+ ("alglib-3.11.0.cpp.gpl", "alglib-3.11.0"),
+ ("hpcviewer-2019.08-linux.gtk.x86_64", "hpcviewer-2019.08"),
+ ("apache-mxnet-src-1.3.0-incubating", "apache-mxnet-src-1.3.0"),
+ ],
+)
+def test_url_strip_version_suffixes(url, expected):
+ stripped = llnl.url.strip_version_suffixes(url)
+ assert stripped == expected
+
+
+def test_strip_compression_extension(archive_and_expected):
+ archive, extension = archive_and_expected
+ stripped = llnl.url.strip_compression_extension(archive)
+ if extension == "zip":
+ assert stripped == "Foo.zip"
+ stripped = llnl.url.strip_compression_extension(archive, "zip")
+ assert stripped == "Foo"
+ elif (
+ extension.lower() == "tar"
+ or extension in llnl.url.CONTRACTION_MAP
+ or extension
+ in [
+ ".".join(ext)
+ for ext in itertools.product(llnl.url.PREFIX_EXTENSIONS, llnl.url.EXTENSIONS)
+ ]
+ ):
+ assert stripped == "Foo.tar" or stripped == "Foo.TAR"
+ else:
+ assert stripped == "Foo"
+
+
+def test_allowed_archive(archive_and_expected):
+ archive, _ = archive_and_expected
+ assert llnl.url.allowed_archive(archive)
diff --git a/lib/spack/spack/test/url_parse.py b/lib/spack/spack/test/url_parse.py
index 86ebf84fa7..dd094ed230 100644
--- a/lib/spack/spack/test/url_parse.py
+++ b/lib/spack/spack/test/url_parse.py
@@ -17,125 +17,12 @@ from spack.url import (
parse_name_offset,
parse_version_offset,
strip_name_suffixes,
- strip_version_suffixes,
substitute_version,
)
from spack.version import Version
@pytest.mark.parametrize(
- "url,expected",
- [
- # No suffix
- ("rgb-1.0.6", "rgb-1.0.6"),
- # Misleading prefix
- ("jpegsrc.v9b", "jpegsrc.v9b"),
- ("turbolinux702", "turbolinux702"),
- ("converge_install_2.3.16", "converge_install_2.3.16"),
- # Download type - code, source
- ("cistem-1.0.0-beta-source-code", "cistem-1.0.0-beta"),
- # Download type - src
- ("apache-ant-1.9.7-src", "apache-ant-1.9.7"),
- ("go1.7.4.src", "go1.7.4"),
- # Download type - source
- ("bowtie2-2.2.5-source", "bowtie2-2.2.5"),
- ("grib_api-1.17.0-Source", "grib_api-1.17.0"),
- # Download type - full
- ("julia-0.4.3-full", "julia-0.4.3"),
- # Download type - bin
- ("apache-maven-3.3.9-bin", "apache-maven-3.3.9"),
- # Download type - binary
- ("Jmol-14.8.0-binary", "Jmol-14.8.0"),
- # Download type - gem
- ("rubysl-date-2.0.9.gem", "rubysl-date-2.0.9"),
- # Download type - tar
- ("gromacs-4.6.1-tar", "gromacs-4.6.1"),
- # Download type - sh
- ("Miniconda2-4.3.11-Linux-x86_64.sh", "Miniconda2-4.3.11"),
- # Download version - release
- ("v1.0.4-release", "v1.0.4"),
- # Download version - stable
- ("libevent-2.0.21-stable", "libevent-2.0.21"),
- # Download version - final
- ("2.6.7-final", "2.6.7"),
- # Download version - rel
- ("v1.9.5.1rel", "v1.9.5.1"),
- # Download version - orig
- ("dash_0.5.5.1.orig", "dash_0.5.5.1"),
- # Download version - plus
- ("ncbi-blast-2.6.0+-src", "ncbi-blast-2.6.0"),
- # License
- ("cppad-20170114.gpl", "cppad-20170114"),
- # Arch
- ("pcraster-4.1.0_x86-64", "pcraster-4.1.0"),
- ("dislin-11.0.linux.i586_64", "dislin-11.0"),
- ("PAGIT.V1.01.64bit", "PAGIT.V1.01"),
- # OS - linux
- ("astyle_2.04_linux", "astyle_2.04"),
- # OS - unix
- ("install-tl-unx", "install-tl"),
- # OS - macos
- ("astyle_1.23_macosx", "astyle_1.23"),
- ("haxe-2.08-osx", "haxe-2.08"),
- # PyPI - wheel
- ("entrypoints-0.2.2-py2.py3-none-any.whl", "entrypoints-0.2.2"),
- (
- "numpy-1.12.0-cp27-cp27m-macosx_10_6_intel.macosx_10_9_intel."
- "macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl",
- "numpy-1.12.0",
- ),
- # PyPI - exe
- ("PyYAML-3.12.win-amd64-py3.5.exe", "PyYAML-3.12"),
- # Combinations of multiple patterns - bin, release
- ("rocketmq-all-4.5.2-bin-release", "rocketmq-all-4.5.2"),
- # Combinations of multiple patterns - all
- ("p7zip_9.04_src_all", "p7zip_9.04"),
- # Combinations of multiple patterns - run
- ("cuda_8.0.44_linux.run", "cuda_8.0.44"),
- # Combinations of multiple patterns - file
- ("ack-2.14-single-file", "ack-2.14"),
- # Combinations of multiple patterns - jar
- ("antlr-3.4-complete.jar", "antlr-3.4"),
- # Combinations of multiple patterns - oss
- ("tbb44_20160128oss_src_0", "tbb44_20160128"),
- # Combinations of multiple patterns - darwin
- ("ghc-7.0.4-x86_64-apple-darwin", "ghc-7.0.4"),
- ("ghc-7.0.4-i386-apple-darwin", "ghc-7.0.4"),
- # Combinations of multiple patterns - centos
- ("sratoolkit.2.8.2-1-centos_linux64", "sratoolkit.2.8.2-1"),
- # Combinations of multiple patterns - arch
- (
- "VizGlow_v2.2alpha17-R21November2016-Linux-x86_64-Install",
- "VizGlow_v2.2alpha17-R21November2016",
- ),
- ("jdk-8u92-linux-x64", "jdk-8u92"),
- ("cuda_6.5.14_linux_64.run", "cuda_6.5.14"),
- ("Mathematica_12.0.0_LINUX.sh", "Mathematica_12.0.0"),
- ("trf407b.linux64", "trf407b"),
- # Combinations of multiple patterns - with
- ("mafft-7.221-with-extensions-src", "mafft-7.221"),
- ("spark-2.0.0-bin-without-hadoop", "spark-2.0.0"),
- ("conduit-v0.3.0-src-with-blt", "conduit-v0.3.0"),
- # Combinations of multiple patterns - rock
- ("bitlib-23-2.src.rock", "bitlib-23-2"),
- # Combinations of multiple patterns - public
- ("dakota-6.3-public.src", "dakota-6.3"),
- # Combinations of multiple patterns - universal
- ("synergy-1.3.6p2-MacOSX-Universal", "synergy-1.3.6p2"),
- # Combinations of multiple patterns - dynamic
- ("snptest_v2.5.2_linux_x86_64_dynamic", "snptest_v2.5.2"),
- # Combinations of multiple patterns - other
- ("alglib-3.11.0.cpp.gpl", "alglib-3.11.0"),
- ("hpcviewer-2019.08-linux.gtk.x86_64", "hpcviewer-2019.08"),
- ("apache-mxnet-src-1.3.0-incubating", "apache-mxnet-src-1.3.0"),
- ],
-)
-def test_url_strip_version_suffixes(url, expected):
- stripped = strip_version_suffixes(url)
- assert stripped == expected
-
-
-@pytest.mark.parametrize(
"url,version,expected",
[
# No suffix
diff --git a/lib/spack/spack/test/util/compression.py b/lib/spack/spack/test/util/compression.py
index 7cbcfb283c..29007a7e33 100644
--- a/lib/spack/spack/test/util/compression.py
+++ b/lib/spack/spack/test/util/compression.py
@@ -10,6 +10,7 @@ from itertools import product
import pytest
+import llnl.url
from llnl.util.filesystem import working_dir
from spack.paths import spack_root
@@ -21,7 +22,7 @@ datadir = os.path.join(spack_root, "lib", "spack", "spack", "test", "data", "com
ext_archive = {}
[
ext_archive.update({ext: ".".join(["Foo", ext])})
- for ext in scomp.ALLOWED_ARCHIVE_TYPES
+ for ext in llnl.url.ALLOWED_ARCHIVE_TYPES
if "TAR" not in ext
]
# Spack does not use Python native handling for tarballs or zip
@@ -95,38 +96,3 @@ def test_unallowed_extension():
bad_ext_archive = "Foo.cxx"
with pytest.raises(CommandNotFoundError):
scomp.decompressor_for(bad_ext_archive)
-
-
-@pytest.mark.parametrize("archive", ext_archive.values())
-def test_get_extension(archive):
- ext = scomp.extension_from_path(archive)
- assert ext_archive[ext] == archive
-
-
-def test_get_bad_extension():
- archive = "Foo.cxx"
- ext = scomp.extension_from_path(archive)
- assert ext is None
-
-
-@pytest.mark.parametrize("path", ext_archive.values())
-def test_allowed_archive(path):
- assert scomp.allowed_archive(path)
-
-
-@pytest.mark.parametrize("ext_path", ext_archive.items())
-def test_strip_compression_extension(ext_path):
- ext, path = ext_path
- stripped = scomp.strip_compression_extension(path)
- if ext == "zip":
- assert stripped == "Foo.zip"
- stripped = scomp.strip_compression_extension(path, "zip")
- assert stripped == "Foo"
- elif (
- ext == "tar"
- or ext in scomp.CONTRACTION_MAP.keys()
- or ext in [".".join(ext) for ext in product(scomp.PRE_EXTS, scomp.EXTS)]
- ):
- assert stripped == "Foo.tar" or stripped == "Foo.TAR"
- else:
- assert stripped == "Foo"
diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py
index 2d6f577799..a012e7524e 100644
--- a/lib/spack/spack/test/web.py
+++ b/lib/spack/spack/test/web.py
@@ -15,6 +15,7 @@ import llnl.util.tty as tty
import spack.config
import spack.mirror
import spack.paths
+import spack.url
import spack.util.path
import spack.util.s3
import spack.util.url as url_util
@@ -102,31 +103,31 @@ def test_spider_no_response(monkeypatch):
def test_find_versions_of_archive_0():
- versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=0)
+ versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=0)
assert Version("0.0.0") in versions
def test_find_versions_of_archive_1():
- versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=1)
+ versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=1)
assert Version("0.0.0") in versions
assert Version("1.0.0") in versions
def test_find_versions_of_archive_2():
- versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=2)
+ versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=2)
assert Version("0.0.0") in versions
assert Version("1.0.0") in versions
assert Version("2.0.0") in versions
def test_find_exotic_versions_of_archive_2():
- versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=2)
+ versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=2)
# up for grabs to make this better.
assert Version("2.0.0b2") in versions
def test_find_versions_of_archive_3():
- versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=3)
+ versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=3)
assert Version("0.0.0") in versions
assert Version("1.0.0") in versions
assert Version("2.0.0") in versions
@@ -135,16 +136,14 @@ def test_find_versions_of_archive_3():
def test_find_exotic_versions_of_archive_3():
- versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=3)
+ versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=3)
assert Version("2.0.0b2") in versions
assert Version("3.0a1") in versions
assert Version("4.5-rc5") in versions
def test_find_versions_of_archive_with_fragment():
- versions = spack.util.web.find_versions_of_archive(
- root_tarball, root_with_fragment, list_depth=0
- )
+ versions = spack.url.find_versions_of_archive(root_tarball, root_with_fragment, list_depth=0)
assert Version("5.0.0") in versions
@@ -311,7 +310,7 @@ def test_remove_s3_url(monkeypatch, capfd):
def get_s3_session(url, method="fetch"):
return MockS3Client()
- monkeypatch.setattr(spack.util.s3, "get_s3_session", get_s3_session)
+ monkeypatch.setattr(spack.util.web, "get_s3_session", get_s3_session)
current_debug_level = tty.debug_level()
tty.set_debug(1)
diff --git a/lib/spack/spack/url.py b/lib/spack/spack/url.py
index bf2990f42f..c5e47232c0 100644
--- a/lib/spack/spack/url.py
+++ b/lib/spack/spack/url.py
@@ -27,246 +27,22 @@ it's never been told about that version before.
"""
import io
import os
+import pathlib
import re
-from urllib.parse import urlsplit, urlunsplit
-import llnl.util.tty as tty
+import llnl.url
from llnl.util.tty.color import cescape, colorize
import spack.error
-import spack.util.compression as comp
-import spack.util.path as spath
+import spack.util.web
import spack.version
-
+from spack.util.path import convert_to_posix_path
#
# Note: We call the input to most of these functions a "path" but the functions
# work on paths and URLs. There's not a good word for both of these, but
# "path" seemed like the most generic term.
#
-def find_list_urls(url):
- r"""Find good list URLs for the supplied URL.
-
- By default, returns the dirname of the archive path.
-
- Provides special treatment for the following websites, which have a
- unique list URL different from the dirname of the download URL:
-
- ========= =======================================================
- GitHub https://github.com/<repo>/<name>/releases
- GitLab https://gitlab.\*/<repo>/<name>/tags
- BitBucket https://bitbucket.org/<repo>/<name>/downloads/?tab=tags
- CRAN https://\*.r-project.org/src/contrib/Archive/<name>
- PyPI https://pypi.org/simple/<name>/
- LuaRocks https://luarocks.org/modules/<repo>/<name>
- ========= =======================================================
-
- Note: this function is called by `spack versions`, `spack checksum`,
- and `spack create`, but not by `spack fetch` or `spack install`.
-
- Parameters:
- url (str): The download URL for the package
-
- Returns:
- set: One or more list URLs for the package
- """
-
- url_types = [
- # GitHub
- # e.g. https://github.com/llnl/callpath/archive/v1.0.1.tar.gz
- (r"(.*github\.com/[^/]+/[^/]+)", lambda m: m.group(1) + "/releases"),
- # GitLab API endpoint
- # e.g. https://gitlab.dkrz.de/api/v4/projects/k202009%2Flibaec/repository/archive.tar.gz?sha=v1.0.2
- (
- r"(.*gitlab[^/]+)/api/v4/projects/([^/]+)%2F([^/]+)",
- lambda m: m.group(1) + "/" + m.group(2) + "/" + m.group(3) + "/tags",
- ),
- # GitLab non-API endpoint
- # e.g. https://gitlab.dkrz.de/k202009/libaec/uploads/631e85bcf877c2dcaca9b2e6d6526339/libaec-1.0.0.tar.gz
- (r"(.*gitlab[^/]+/(?!api/v4/projects)[^/]+/[^/]+)", lambda m: m.group(1) + "/tags"),
- # BitBucket
- # e.g. https://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2
- (r"(.*bitbucket.org/[^/]+/[^/]+)", lambda m: m.group(1) + "/downloads/?tab=tags"),
- # CRAN
- # e.g. https://cran.r-project.org/src/contrib/Rcpp_0.12.9.tar.gz
- # e.g. https://cloud.r-project.org/src/contrib/rgl_0.98.1.tar.gz
- (
- r"(.*\.r-project\.org/src/contrib)/([^_]+)",
- lambda m: m.group(1) + "/Archive/" + m.group(2),
- ),
- # PyPI
- # e.g. https://pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
- # e.g. https://www.pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
- # e.g. https://pypi.org/packages/source/n/numpy/numpy-1.19.4.zip
- # e.g. https://pypi.python.org/packages/source/n/numpy/numpy-1.19.4.zip
- # e.g. https://files.pythonhosted.org/packages/source/n/numpy/numpy-1.19.4.zip
- # e.g. https://pypi.io/packages/py2.py3/o/opencensus-context/opencensus_context-0.1.1-py2.py3-none-any.whl
- (
- r"(?:pypi|pythonhosted)[^/]+/packages/[^/]+/./([^/]+)",
- lambda m: "https://pypi.org/simple/" + m.group(1) + "/",
- ),
- # LuaRocks
- # e.g. https://luarocks.org/manifests/gvvaughan/lpeg-1.0.2-1.src.rock
- # e.g. https://luarocks.org/manifests/openresty/lua-cjson-2.1.0-1.src.rock
- (
- r"luarocks[^/]+/(?:modules|manifests)/(?P<org>[^/]+)/"
- + r"(?P<name>.+?)-[0-9.-]*\.src\.rock",
- lambda m: "https://luarocks.org/modules/"
- + m.group("org")
- + "/"
- + m.group("name")
- + "/",
- ),
- ]
-
- list_urls = set([os.path.dirname(url)])
-
- for pattern, fun in url_types:
- match = re.search(pattern, url)
- if match:
- list_urls.add(fun(match))
-
- return list_urls
-
-
-def strip_query_and_fragment(path):
- try:
- components = urlsplit(path)
- stripped = components[:3] + (None, None)
-
- query, frag = components[3:5]
- suffix = ""
- if query:
- suffix += "?" + query
- if frag:
- suffix += "#" + frag
-
- return (urlunsplit(stripped), suffix)
-
- except ValueError:
- tty.debug("Got error parsing path %s" % path)
- return (path, "") # Ignore URL parse errors here
-
-
-def strip_version_suffixes(path):
- """Some tarballs contain extraneous information after the version:
-
- * ``bowtie2-2.2.5-source``
- * ``libevent-2.0.21-stable``
- * ``cuda_8.0.44_linux.run``
-
- These strings are not part of the version number and should be ignored.
- This function strips those suffixes off and returns the remaining string.
- The goal is that the version is always the last thing in ``path``:
-
- * ``bowtie2-2.2.5``
- * ``libevent-2.0.21``
- * ``cuda_8.0.44``
-
- Args:
- path (str): The filename or URL for the package
-
- Returns:
- str: The ``path`` with any extraneous suffixes removed
- """
- # NOTE: This could be done with complicated regexes in parse_version_offset
- # NOTE: The problem is that we would have to add these regexes to the end
- # NOTE: of every single version regex. Easier to just strip them off
- # NOTE: permanently
-
- suffix_regexes = [
- # Download type
- r"[Ii]nstall",
- r"all",
- r"code",
- r"[Ss]ources?",
- r"file",
- r"full",
- r"single",
- r"with[a-zA-Z_-]+",
- r"rock",
- r"src(_0)?",
- r"public",
- r"bin",
- r"binary",
- r"run",
- r"[Uu]niversal",
- r"jar",
- r"complete",
- r"dynamic",
- r"oss",
- r"gem",
- r"tar",
- r"sh",
- # Download version
- r"release",
- r"bin",
- r"stable",
- r"[Ff]inal",
- r"rel",
- r"orig",
- r"dist",
- r"\+",
- # License
- r"gpl",
- # Arch
- # Needs to come before and after OS, appears in both orders
- r"ia32",
- r"intel",
- r"amd64",
- r"linux64",
- r"x64",
- r"64bit",
- r"x86[_-]64",
- r"i586_64",
- r"x86",
- r"i[36]86",
- r"ppc64(le)?",
- r"armv?(7l|6l|64)",
- # Other
- r"cpp",
- r"gtk",
- r"incubating",
- # OS
- r"[Ll]inux(_64)?",
- r"LINUX",
- r"[Uu]ni?x",
- r"[Ss]un[Oo][Ss]",
- r"[Mm]ac[Oo][Ss][Xx]?",
- r"[Oo][Ss][Xx]",
- r"[Dd]arwin(64)?",
- r"[Aa]pple",
- r"[Ww]indows",
- r"[Ww]in(64|32)?",
- r"[Cc]ygwin(64|32)?",
- r"[Mm]ingw",
- r"centos",
- # Arch
- # Needs to come before and after OS, appears in both orders
- r"ia32",
- r"intel",
- r"amd64",
- r"linux64",
- r"x64",
- r"64bit",
- r"x86[_-]64",
- r"i586_64",
- r"x86",
- r"i[36]86",
- r"ppc64(le)?",
- r"armv?(7l|6l|64)?",
- # PyPI
- r"[._-]py[23].*\.whl",
- r"[._-]cp[23].*\.whl",
- r"[._-]win.*\.exe",
- ]
-
- for regex in suffix_regexes:
- # Remove the suffix from the end of the path
- # This may be done multiple times
- path = re.sub(r"[._-]?" + regex + "$", "", path)
-
- return path
def strip_name_suffixes(path, version):
@@ -341,69 +117,6 @@ def strip_name_suffixes(path, version):
return path
-def split_url_extension(path):
- """Some URLs have a query string, e.g.:
-
- 1. https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7.tgz?raw=true
- 2. http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin.tar.gz
- 3. https://gitlab.kitware.com/vtk/vtk/repository/archive.tar.bz2?ref=v7.0.0
-
- In (1), the query string needs to be stripped to get at the
- extension, but in (2) & (3), the filename is IN a single final query
- argument.
-
- This strips the URL into three pieces: ``prefix``, ``ext``, and ``suffix``.
- The suffix contains anything that was stripped off the URL to
- get at the file extension. In (1), it will be ``'?raw=true'``, but
- in (2), it will be empty. In (3) the suffix is a parameter that follows
- after the file extension, e.g.:
-
- 1. ``('https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7', '.tgz', '?raw=true')``
- 2. ``('http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin', '.tar.gz', None)``
- 3. ``('https://gitlab.kitware.com/vtk/vtk/repository/archive', '.tar.bz2', '?ref=v7.0.0')``
- """
- prefix, ext, suffix = path, "", ""
-
- # Strip off sourceforge download suffix.
- # e.g. https://sourceforge.net/projects/glew/files/glew/2.0.0/glew-2.0.0.tgz/download
- prefix, suffix = spath.find_sourceforge_suffix(path)
-
- ext = comp.extension_from_path(prefix)
- if ext is not None:
- prefix = comp.strip_extension(prefix)
-
- else:
- prefix, suf = strip_query_and_fragment(prefix)
- ext = comp.extension_from_path(prefix)
- prefix = comp.strip_extension(prefix)
- suffix = suf + suffix
- if ext is None:
- ext = ""
-
- return prefix, ext, suffix
-
-
-def determine_url_file_extension(path):
- """This returns the type of archive a URL refers to. This is
- sometimes confusing because of URLs like:
-
- (1) https://github.com/petdance/ack/tarball/1.93_02
-
- Where the URL doesn't actually contain the filename. We need
- to know what type it is so that we can appropriately name files
- in mirrors.
- """
- match = re.search(r"github.com/.+/(zip|tar)ball/", path)
- if match:
- if match.group(1) == "zip":
- return "zip"
- elif match.group(1) == "tar":
- return "tar.gz"
-
- prefix, ext, suffix = split_url_extension(path)
- return ext
-
-
def parse_version_offset(path):
"""Try to extract a version string from a filename or URL.
@@ -426,13 +139,13 @@ def parse_version_offset(path):
# path: The prefix of the URL, everything before the ext and suffix
# ext: The file extension
# suffix: Any kind of query string that begins with a '?'
- path, ext, suffix = split_url_extension(path)
+ path, ext, suffix = llnl.url.split_url_extension(path)
# stem: Everything from path after the final '/'
original_stem = os.path.basename(path)
# Try to strip off anything after the version number
- stem = strip_version_suffixes(original_stem)
+ stem = llnl.url.strip_version_suffixes(original_stem)
# Assumptions:
#
@@ -620,7 +333,7 @@ def parse_name_offset(path, v=None):
# path: The prefix of the URL, everything before the ext and suffix
# ext: The file extension
# suffix: Any kind of query string that begins with a '?'
- path, ext, suffix = split_url_extension(path)
+ path, ext, suffix = llnl.url.split_url_extension(path)
# stem: Everything from path after the final '/'
original_stem = os.path.basename(path)
@@ -735,28 +448,6 @@ def parse_name_and_version(path):
return (name, ver)
-def insensitize(string):
- """Change upper and lowercase letters to be case insensitive in
- the provided string. e.g., 'a' becomes '[Aa]', 'B' becomes
- '[bB]', etc. Use for building regexes."""
-
- def to_ins(match):
- char = match.group(1)
- return "[%s%s]" % (char.lower(), char.upper())
-
- return re.sub(r"([a-zA-Z])", to_ins, string)
-
-
-def cumsum(elts, init=0, fn=lambda x: x):
- """Return cumulative sum of result of fn on each element in elts."""
- sums = []
- s = init
- for i, e in enumerate(elts):
- sums.append(s)
- s += fn(e)
- return sums
-
-
def find_all(substring, string):
"""Returns a list containing the indices of
every occurrence of substring in string."""
@@ -912,6 +603,122 @@ def color_url(path, **kwargs):
return colorize(out.getvalue())
+def find_versions_of_archive(
+ archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
+):
+ """Scrape web pages for new versions of a tarball. This function prefers URLs in the
+ following order: links found on the scraped page that match a url generated by the
+ reference package, found and in the archive_urls list, found and derived from those
+ in the archive_urls list, and if none are found for a version then the item in the
+ archive_urls list is included for the version.
+
+ Args:
+ archive_urls (str or list or tuple): URL or sequence of URLs for
+ different versions of a package. Typically these are just the
+ tarballs from the package file itself. By default, this searches
+ the parent directories of archives.
+ list_url (str or None): URL for a listing of archives.
+ Spack will scrape these pages for download links that look
+ like the archive URL.
+ list_depth (int): max depth to follow links on list_url pages.
+ Defaults to 0.
+ concurrency (int): maximum number of concurrent requests
+ reference_package (spack.package_base.PackageBase or None): a spack package
+ used as a reference for url detection. Uses the url_for_version
+ method on the package to produce reference urls which, if found,
+ are preferred.
+ """
+ if not isinstance(archive_urls, (list, tuple)):
+ archive_urls = [archive_urls]
+
+ # Generate a list of list_urls based on archive urls and any
+ # explicitly listed list_url in the package
+ list_urls = set()
+ if list_url is not None:
+ list_urls.add(list_url)
+ for aurl in archive_urls:
+ list_urls |= llnl.url.find_list_urls(aurl)
+
+ # Add '/' to the end of the URL. Some web servers require this.
+ additional_list_urls = set()
+ for lurl in list_urls:
+ if not lurl.endswith("/"):
+ additional_list_urls.add(lurl + "/")
+ list_urls |= additional_list_urls
+
+ # Grab some web pages to scrape.
+ pages, links = spack.util.web.spider(list_urls, depth=list_depth, concurrency=concurrency)
+
+ # Scrape them for archive URLs
+ regexes = []
+ for aurl in archive_urls:
+ # This creates a regex from the URL with a capture group for
+ # the version part of the URL. The capture group is converted
+ # to a generic wildcard, so we can use this to extract things
+ # on a page that look like archive URLs.
+ url_regex = wildcard_version(aurl)
+
+ # We'll be a bit more liberal and just look for the archive
+ # part, not the full path.
+ # this is a URL so it is a posixpath even on Windows
+ url_regex = pathlib.PurePosixPath(url_regex).name
+
+ # We need to add a / to the beginning of the regex to prevent
+ # Spack from picking up similarly named packages like:
+ # https://cran.r-project.org/src/contrib/pls_2.6-0.tar.gz
+ # https://cran.r-project.org/src/contrib/enpls_5.7.tar.gz
+ # https://cran.r-project.org/src/contrib/autopls_1.3.tar.gz
+ # https://cran.r-project.org/src/contrib/matrixpls_1.0.4.tar.gz
+ url_regex = "/" + url_regex
+
+ # We need to add a $ anchor to the end of the regex to prevent
+ # Spack from picking up signature files like:
+ # .asc
+ # .md5
+ # .sha256
+ # .sig
+ # However, SourceForge downloads still need to end in '/download'.
+ url_regex += r"(\/download)?"
+ # PyPI adds #sha256=... to the end of the URL
+ url_regex += "(#sha256=.*)?"
+ url_regex += "$"
+
+ regexes.append(url_regex)
+
+ regexes = [re.compile(r) for r in regexes]
+ # Build a dict version -> URL from any links that match the wildcards.
+ # Walk through archive_url links first.
+ # Any conflicting versions will be overwritten by the list_url links.
+ versions = {}
+ matched = set()
+ for url in sorted(links):
+ url = convert_to_posix_path(url)
+ if any(r.search(url) for r in regexes):
+ try:
+ ver = parse_version(url)
+ if ver in matched:
+ continue
+ versions[ver] = url
+ # prevent this version from getting overwritten
+ if reference_package is not None:
+ if url == reference_package.url_for_version(ver):
+ matched.add(ver)
+ else:
+ extrapolated_urls = [substitute_version(u, ver) for u in archive_urls]
+ if url in extrapolated_urls:
+ matched.add(ver)
+ except UndetectableVersionError:
+ continue
+
+ for url in archive_urls:
+ url = convert_to_posix_path(url)
+ ver = parse_version(url)
+ if ver not in versions:
+ versions[ver] = url
+
+ return versions
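+
+# Illustrative usage (assumed URL; spidering requires network access):
+#   versions = find_versions_of_archive("https://example.com/foo-1.0.tar.gz")
+#   # -> a dict mapping parsed versions to the URLs they were found at, e.g.
+#   #    {Version("1.0"): "https://example.com/foo-1.0.tar.gz", ...}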
+
+
class UrlParseError(spack.error.SpackError):
"""Raised when the URL module can't parse something correctly."""
diff --git a/lib/spack/spack/util/compression.py b/lib/spack/spack/util/compression.py
index b8dcd032f4..25ccfdf0bb 100644
--- a/lib/spack/spack/util/compression.py
+++ b/lib/spack/spack/util/compression.py
@@ -9,27 +9,13 @@ import os
import re
import shutil
import sys
-from itertools import product
+import llnl.url
from llnl.util import tty
-import spack.util.path as spath
from spack.error import SpackError
from spack.util.executable import CommandNotFoundError, which
-# Supported archive extensions.
-PRE_EXTS = ["tar", "TAR"]
-EXTS = ["gz", "bz2", "xz", "Z"]
-NOTAR_EXTS = ["zip", "tgz", "tbz2", "tbz", "txz"]
-CONTRACTION_MAP = {"tgz": "tar.gz", "txz": "tar.xz", "tbz": "tar.bz2", "tbz2": "tar.bz2"}
-
-# Add PRE_EXTS and EXTS last so that .tar.gz is matched *before* .tar or .gz
-ALLOWED_ARCHIVE_TYPES = (
- [".".join(ext) for ext in product(PRE_EXTS, EXTS)] + PRE_EXTS + EXTS + NOTAR_EXTS
-)
-
-ALLOWED_SINGLE_EXT_ARCHIVE_TYPES = PRE_EXTS + EXTS + NOTAR_EXTS
-
try:
import bz2 # noqa
@@ -66,10 +52,6 @@ def is_bz2_supported():
return _bz2_support
-def allowed_archive(path):
- return False if not path else any(path.endswith(t) for t in ALLOWED_ARCHIVE_TYPES)
-
-
def _system_untar(archive_file, remove_archive_file=False):
"""Returns path to unarchived tar file.
Untars archive via system tar.
@@ -78,7 +60,7 @@ def _system_untar(archive_file, remove_archive_file=False):
archive_file (str): absolute path to the archive to be extracted.
Can be one of .tar(.[gz|bz2|xz|Z]) or .(tgz|tbz|tbz2|txz).
"""
- archive_file_no_ext = strip_extension(archive_file)
+ archive_file_no_ext = llnl.url.strip_extension(archive_file)
outfile = os.path.basename(archive_file_no_ext)
if archive_file_no_ext == archive_file:
# the archive file has no extension. Tar on Windows cannot untar onto itself
@@ -114,7 +96,7 @@ def _bunzip2(archive_file):
def _py_bunzip(archive_file):
"""Returns path to decompressed file.
Decompresses bz2 compressed archives/files via python's bz2 module"""
- decompressed_file = os.path.basename(strip_compression_extension(archive_file, "bz2"))
+ decompressed_file = os.path.basename(llnl.url.strip_compression_extension(archive_file, "bz2"))
working_dir = os.getcwd()
archive_out = os.path.join(working_dir, decompressed_file)
f_bz = bz2.BZ2File(archive_file, mode="rb")
@@ -128,7 +110,7 @@ def _system_bunzip(archive_file):
"""Returns path to decompressed file.
Decompresses bz2 compressed archives/files via system bzip2 utility"""
compressed_file_name = os.path.basename(archive_file)
- decompressed_file = os.path.basename(strip_compression_extension(archive_file, "bz2"))
+ decompressed_file = os.path.basename(llnl.url.strip_compression_extension(archive_file, "bz2"))
working_dir = os.getcwd()
archive_out = os.path.join(working_dir, decompressed_file)
copy_path = os.path.join(working_dir, compressed_file_name)
@@ -158,7 +140,7 @@ def _gunzip(archive_file):
def _py_gunzip(archive_file):
"""Returns path to gunzip'd file
Decompresses `.gz` compressed archives via python's gzip module"""
- decompressed_file = os.path.basename(strip_compression_extension(archive_file, "gz"))
+ decompressed_file = os.path.basename(llnl.url.strip_compression_extension(archive_file, "gz"))
working_dir = os.getcwd()
destination_abspath = os.path.join(working_dir, decompressed_file)
f_in = gzip.open(archive_file, "rb")
@@ -171,7 +153,7 @@ def _py_gunzip(archive_file):
def _system_gunzip(archive_file):
"""Returns path to gunzip'd file
Decompresses `.gz` compressed files via system gzip"""
- archive_file_no_ext = strip_compression_extension(archive_file)
+ archive_file_no_ext = llnl.url.strip_compression_extension(archive_file)
if archive_file_no_ext == archive_file:
# the zip file has no extension. On Unix gunzip cannot unzip onto itself
archive_file = archive_file + ".gz"
@@ -196,7 +178,7 @@ def _unzip(archive_file):
Args:
archive_file (str): absolute path of the file to be decompressed
"""
- extracted_file = os.path.basename(strip_extension(archive_file, "zip"))
+ extracted_file = os.path.basename(llnl.url.strip_extension(archive_file, extension="zip"))
if sys.platform == "win32":
return _system_untar(archive_file)
else:
@@ -259,7 +241,7 @@ def _win_compressed_tarball_handler(decompressor):
def _py_lzma(archive_file):
"""Returns path to decompressed .xz files
Decompress lzma compressed .xz files via python lzma module"""
- decompressed_file = os.path.basename(strip_compression_extension(archive_file, "xz"))
+ decompressed_file = os.path.basename(llnl.url.strip_compression_extension(archive_file, "xz"))
archive_out = os.path.join(os.getcwd(), decompressed_file)
with open(archive_out, "wb") as ar:
with lzma.open(archive_file) as lar:
@@ -272,7 +254,7 @@ def _xz(archive_file):
Decompress lzma compressed .xz files via xz command line
tool.
"""
- decompressed_file = os.path.basename(strip_extension(archive_file, "xz"))
+ decompressed_file = os.path.basename(llnl.url.strip_extension(archive_file, extension="xz"))
working_dir = os.getcwd()
destination_abspath = os.path.join(working_dir, decompressed_file)
compressed_file = os.path.basename(archive_file)
@@ -297,13 +279,13 @@ def _system_7zip(archive_file):
Args:
archive_file (str): absolute path of file to be unarchived
"""
- outfile = os.path.basename(strip_compression_extension(archive_file))
+ outfile = os.path.basename(llnl.url.strip_compression_extension(archive_file))
_7z = which("7z")
if not _7z:
raise CommandNotFoundError(
"7z unavailable,\
unable to extract %s files. 7z can be installed via Spack"
- % extension_from_path(archive_file)
+ % llnl.url.extension_from_path(archive_file)
)
_7z.add_default_arg("e")
_7z(archive_file)
@@ -318,7 +300,7 @@ def decompressor_for(path, extension=None):
if not extension:
extension = extension_from_file(path, decompress=True)
- if not allowed_archive(extension):
+ if not llnl.url.allowed_archive(extension):
raise CommandNotFoundError(
"Cannot extract archive, \
unrecognized file extension: '%s'"
@@ -394,7 +376,7 @@ def decompressor_for_win(extension):
path (str): path of the archive file requiring decompression
extension (str): archive extension used to select a decompression strategy
"""
- extension = expand_contracted_extension(extension)
+ extension = llnl.url.expand_contracted_extension(extension)
# Windows native tar can handle .zip extensions, use standard
# unzip method
if re.match(r"zip$", extension):
@@ -415,7 +397,7 @@ def decompressor_for_win(extension):
# python based decompression strategy
# Expand extension from contracted extension i.e. tar.gz from .tgz
# no-op on non contracted extensions
- compression_extension = compression_ext_from_compressed_archive(extension)
+ compression_extension = llnl.url.compression_ext_from_compressed_archive(extension)
decompressor = _determine_py_decomp_archive_strategy(compression_extension)
if not decompressor:
raise SpackError(
@@ -657,7 +639,7 @@ def extension_from_stream(stream, decompress=False):
"Cannot derive file extension from magic number;"
" falling back to regex path parsing."
)
- return extension_from_path(stream.name)
+ return llnl.url.extension_from_path(stream.name)
resultant_ext = suffix_ext if not prefix_ext else ".".join([prefix_ext, suffix_ext])
tty.debug("File extension %s successfully derived by magic number." % resultant_ext)
return resultant_ext
@@ -693,114 +675,11 @@ def extension_from_file(file, decompress=False):
if ext and ext.startswith("tar."):
suf = ext.split(".")[1]
abbr = "t" + suf
- if check_extension(file, abbr):
+ if llnl.url.has_extension(file, abbr):
return abbr
if not ext:
# If unable to parse extension from stream,
# attempt to fall back to string parsing
- ext = extension_from_path(file)
+ ext = llnl.url.extension_from_path(file)
return ext
return None
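+
+# Illustration (hypothetical file names, assuming the magic-number probe
+# identifies a tar.bz2 stream): contracted names report the contraction,
+# expanded names the full extension:
+#
+#     extension_from_file("foo.tbz2", decompress=True)     # -> "tbz2"
+#     extension_from_file("foo.tar.bz2", decompress=True)  # -> "tar.bz2"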
-
-
-def extension_from_path(path):
- """Returns the allowed archive extension for a path.
- If path does not include a valid archive extension
- (see`spack.util.compression.ALLOWED_ARCHIVE_TYPES`) return None
- """
- if path is None:
- raise ValueError("Can't call extension() on None")
-
- for t in ALLOWED_ARCHIVE_TYPES:
- if check_extension(path, t):
- return t
- return None
-
-
-def strip_compression_extension(path, ext=None):
- """Returns path with last supported (can be combined with tar) or
- provided archive extension stripped"""
- path_ext = extension_from_path(path)
- if path_ext:
- path = expand_contracted_extension_in_path(path)
- exts_to_check = EXTS
- if ext:
- exts_to_check = [ext]
- for ext_check in exts_to_check:
- mod_path = check_and_remove_ext(path, ext_check)
- if mod_path != path:
- return mod_path
- return path
-
-
-def strip_extension(path, ext=None):
- """Returns the part of a path that does not include extension.
- If ext is given, only attempts to remove that extension. If no
- extension given, attempts to strip any valid extension from path"""
- if ext:
- return check_and_remove_ext(path, ext)
- for t in ALLOWED_ARCHIVE_TYPES:
- mod_path = check_and_remove_ext(path, t)
- if mod_path != path:
- return mod_path
- return path
-
-
-def check_extension(path, ext):
- """Returns true if extension is present in path
- false otherwise"""
- # Strip sourceforge suffix.
- prefix, _ = spath.find_sourceforge_suffix(path)
- if not ext.startswith(r"\."):
- ext = r"\.%s$" % ext
- if re.search(ext, prefix):
- return True
- return False
-
-
-def reg_remove_ext(path, ext):
- """Returns path with ext remove via regex"""
- if path and ext:
- suffix = r"\.%s$" % ext
- return re.sub(suffix, "", path)
- return path
-
-
-def check_and_remove_ext(path, ext):
- """Returns path with extension removed if extension
- is present in path. Otherwise just returns path"""
- if check_extension(path, ext):
- return reg_remove_ext(path, ext)
- return path
-
-
-def _substitute_extension(path, old_ext, new_ext):
- """Returns path with old_ext replaced with new_ext.
- old_ext and new_ext can be extension strings or regexs"""
- return re.sub(rf"{old_ext}", rf"{new_ext}", path)
-
-
-def expand_contracted_extension_in_path(path, ext=None):
- """Returns path with any contraction extension (i.e. tgz) expanded
- (i.e. tar.gz). If ext is specified, only attempt to expand that extension"""
- if not ext:
- ext = extension_from_path(path)
- expanded_ext = expand_contracted_extension(ext)
- if expanded_ext != ext:
- return _substitute_extension(path, ext, expanded_ext)
- return path
-
-
-def expand_contracted_extension(extension):
- """Return expanded version of contracted extension
- i.e. .tgz -> .tar.gz, no op on non contracted extensions"""
- extension = extension.strip(".")
- return CONTRACTION_MAP.get(extension, extension)
-
-
-def compression_ext_from_compressed_archive(extension):
- """Returns compression extension for a compressed archive"""
- extension = expand_contracted_extension(extension)
- for ext in [*EXTS]:
- if ext in extension:
- return ext
diff --git a/lib/spack/spack/util/gcs.py b/lib/spack/spack/util/gcs.py
index 856fe73001..4e997df52b 100644
--- a/lib/spack/spack/util/gcs.py
+++ b/lib/spack/spack/util/gcs.py
@@ -10,6 +10,10 @@ integrate GCS Blob storage with spack buildcache.
import os
import sys
+import urllib.parse
+import urllib.response
+from urllib.error import URLError
+from urllib.request import BaseHandler
import llnl.util.tty as tty
@@ -222,3 +226,21 @@ class GCSBlob:
}
return headers
+
+
+def gcs_open(req, *args, **kwargs):
+ """Open a reader stream to a blob object on GCS"""
+ url = urllib.parse.urlparse(req.get_full_url())
+ gcsblob = GCSBlob(url)
+
+ if not gcsblob.exists():
+ raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
+ stream = gcsblob.get_blob_byte_stream()
+ headers = gcsblob.get_blob_headers()
+
+ return urllib.response.addinfourl(stream, headers, url)
+
+
+class GCSHandler(BaseHandler):
+ def gs_open(self, req):
+ return gcs_open(req)
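+
+
+# Usage sketch (illustrative, not part of this change): urllib resolves
+# ``gs_open`` through BaseHandler's ``<scheme>_open`` naming convention,
+# so registering the handler makes gs:// URLs openable:
+#
+#     opener = urllib.request.build_opener(GCSHandler())
+#     response = opener.open("gs://my-bucket/my-blob")  # hypothetical URL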
diff --git a/lib/spack/spack/util/path.py b/lib/spack/spack/util/path.py
index ef6fb883c7..3dc0ea676c 100644
--- a/lib/spack/spack/util/path.py
+++ b/lib/spack/spack/util/path.py
@@ -109,15 +109,6 @@ def win_exe_ext():
return ".exe"
-def find_sourceforge_suffix(path):
- """find and match sourceforge filepath components
- Return match object"""
- match = re.search(r"(.*(?:sourceforge\.net|sf\.net)/.*)(/download)$", path)
- if match:
- return match.groups()
- return path, ""
-
-
def path_to_os_path(*pths):
"""
Takes an arbitrary number of positional parameters
diff --git a/lib/spack/spack/util/s3.py b/lib/spack/spack/util/s3.py
index c4d53d86b6..796c49a8c8 100644
--- a/lib/spack/spack/util/s3.py
+++ b/lib/spack/spack/util/s3.py
@@ -3,10 +3,13 @@
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import os
+import urllib.error
import urllib.parse
+import urllib.request
+import urllib.response
+from io import BufferedReader, BytesIO, IOBase
from typing import Any, Dict, Tuple
-import spack
import spack.config
#: Map (mirror name, method) tuples to s3 client instances.
@@ -114,4 +117,72 @@ def get_mirror_s3_connection_info(mirror, method):
if endpoint_url:
s3_client_args["endpoint_url"] = _parse_s3_endpoint_url(endpoint_url)
- return (s3_connection, s3_client_args)
+ return s3_connection, s3_client_args
+
+
+# NOTE(opadron): Works around an issue in boto where its StreamingBody
+# implementation is missing several APIs expected from IOBase. These missing
+# APIs prevent the streams returned by boto from being passed as-is along to
+# urllib.
+#
+# https://github.com/boto/botocore/issues/879
+# https://github.com/python/cpython/pull/3249
+class WrapStream(BufferedReader):
+ def __init__(self, raw):
+ # In botocore >=1.23.47, StreamingBody inherits from IOBase, so we
+ # only add missing attributes in older versions.
+ # https://github.com/boto/botocore/commit/a624815eabac50442ed7404f3c4f2664cd0aa784
+ if not isinstance(raw, IOBase):
+ raw.readable = lambda: True
+ raw.writable = lambda: False
+ raw.seekable = lambda: False
+ raw.closed = False
+ raw.flush = lambda: None
+ super().__init__(raw)
+
+ def detach(self):
+ self.raw = None
+
+ def read(self, *args, **kwargs):
+ return self.raw.read(*args, **kwargs)
+
+ def __getattr__(self, key):
+ return getattr(self.raw, key)
+
+
+def _s3_open(url, method="GET"):
+ parsed = urllib.parse.urlparse(url)
+ s3 = get_s3_session(url, method="fetch")
+
+ bucket = parsed.netloc
+ key = parsed.path
+
+ if key.startswith("/"):
+ key = key[1:]
+
+ if method not in ("GET", "HEAD"):
+ raise urllib.error.URLError(
+ "Only GET and HEAD verbs are currently supported for the s3:// scheme"
+ )
+
+ try:
+ if method == "GET":
+ obj = s3.get_object(Bucket=bucket, Key=key)
+ # NOTE(opadron): Apply workaround here (see above)
+ stream = WrapStream(obj["Body"])
+ elif method == "HEAD":
+ obj = s3.head_object(Bucket=bucket, Key=key)
+ stream = BytesIO()
+ except s3.ClientError as e:
+ raise urllib.error.URLError(e) from e
+
+ headers = obj["ResponseMetadata"]["HTTPHeaders"]
+
+ return url, headers, stream
+
+
+class UrllibS3Handler(urllib.request.BaseHandler):
+ def s3_open(self, req):
+ orig_url = req.get_full_url()
+ url, headers, stream = _s3_open(orig_url, method=req.get_method())
+ return urllib.response.addinfourl(stream, headers, url)
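+
+
+# Usage sketch (illustrative): as with the GCS handler, ``s3_open`` is found
+# via the ``<scheme>_open`` convention. A HEAD request returns headers with
+# an empty body stream:
+#
+#     opener = urllib.request.build_opener(UrllibS3Handler())
+#     req = urllib.request.Request("s3://my-bucket/my-key", method="HEAD")
+#     headers = opener.open(req).info()  # hypothetical bucket/key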
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index 22309ba87f..79ad39ebd7 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -21,23 +21,17 @@ from typing import IO, Optional
from urllib.error import HTTPError, URLError
from urllib.request import HTTPSHandler, Request, build_opener
-import llnl.util.lang
-import llnl.util.tty as tty
+import llnl.url
+from llnl.util import lang, tty
from llnl.util.filesystem import mkdirp, rename, working_dir
-import spack
import spack.config
import spack.error
-import spack.gcs_handler
-import spack.s3_handler
-import spack.url
-import spack.util.crypto
-import spack.util.gcs as gcs_util
-import spack.util.s3 as s3_util
import spack.util.url as url_util
-from spack.util.compression import ALLOWED_ARCHIVE_TYPES
-from spack.util.executable import CommandNotFoundError, which
-from spack.util.path import convert_to_posix_path
+
+from .executable import CommandNotFoundError, which
+from .gcs import GCSBlob, GCSBucket, GCSHandler
+from .s3 import UrllibS3Handler, get_s3_session
class DetailedHTTPError(HTTPError):
@@ -66,8 +60,8 @@ class SpackHTTPDefaultErrorHandler(urllib.request.HTTPDefaultErrorHandler):
def _urlopen():
- s3 = spack.s3_handler.UrllibS3Handler()
- gcs = spack.gcs_handler.GCSHandler()
+ s3 = UrllibS3Handler()
+ gcs = GCSHandler()
error_handler = SpackHTTPDefaultErrorHandler()
# One opener with HTTPS ssl enabled
@@ -90,7 +84,7 @@ def _urlopen():
#: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
-urlopen = llnl.util.lang.Singleton(_urlopen)
+urlopen = lang.Singleton(_urlopen)
#: User-Agent used in Request objects
SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)
@@ -190,14 +184,14 @@ def push_to_url(local_file_path, remote_path, keep_original=True, extra_args=Non
while remote_path.startswith("/"):
remote_path = remote_path[1:]
- s3 = s3_util.get_s3_session(remote_url, method="push")
+ s3 = get_s3_session(remote_url, method="push")
s3.upload_file(local_file_path, remote_url.netloc, remote_path, ExtraArgs=extra_args)
if not keep_original:
os.remove(local_file_path)
elif remote_url.scheme == "gs":
- gcs = gcs_util.GCSBlob(remote_url)
+ gcs = GCSBlob(remote_url)
gcs.upload_to_blob(local_file_path)
if not keep_original:
os.remove(local_file_path)
@@ -427,7 +421,7 @@ def remove_url(url, recursive=False):
if url.scheme == "s3":
# Try to find a mirror for potential connection information
- s3 = s3_util.get_s3_session(url, method="push")
+ s3 = get_s3_session(url, method="push")
bucket = url.netloc
if recursive:
# Because list_objects_v2 can only return up to 1000 items
@@ -460,10 +454,10 @@ def remove_url(url, recursive=False):
elif url.scheme == "gs":
if recursive:
- bucket = gcs_util.GCSBucket(url)
+ bucket = GCSBucket(url)
bucket.destroy(recursive=recursive)
else:
- blob = gcs_util.GCSBlob(url)
+ blob = GCSBlob(url)
blob.delete_blob()
return
@@ -538,14 +532,14 @@ def list_url(url, recursive=False):
]
if url.scheme == "s3":
- s3 = s3_util.get_s3_session(url, method="fetch")
+ s3 = get_s3_session(url, method="fetch")
if recursive:
return list(_iter_s3_prefix(s3, url))
return list(set(key.split("/", 1)[0] for key in _iter_s3_prefix(s3, url)))
elif url.scheme == "gs":
- gcs = gcs_util.GCSBucket(url)
+ gcs = GCSBucket(url)
return gcs.get_all_blobs(recursive=recursive)
@@ -636,7 +630,7 @@ def spider(root_urls, depth=0, concurrency=32):
links.add(abs_link)
# Skip stuff that looks like an archive
- if any(raw_link.endswith(s) for s in ALLOWED_ARCHIVE_TYPES):
+ if any(raw_link.endswith(s) for s in llnl.url.ALLOWED_ARCHIVE_TYPES):
continue
# Skip already-visited links
@@ -696,7 +690,7 @@ def spider(root_urls, depth=0, concurrency=32):
current_depth, depth, len(spider_args)
)
)
- results = tp.map(llnl.util.lang.star(_spider), spider_args)
+ results = tp.map(lang.star(_spider), spider_args)
spider_args = []
collect = current_depth < depth
for sub_pages, sub_links, sub_spider_args in results:
@@ -713,123 +707,6 @@ def spider(root_urls, depth=0, concurrency=32):
return pages, links
-def find_versions_of_archive(
- archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
-):
- """Scrape web pages for new versions of a tarball. This function prefers URLs in the
- following order: links found on the scraped page that match a url generated by the
- reference package, found and in the archive_urls list, found and derived from those
- in the archive_urls list, and if none are found for a version then the item in the
- archive_urls list is included for the version.
-
- Args:
- archive_urls (str or list or tuple): URL or sequence of URLs for
- different versions of a package. Typically these are just the
- tarballs from the package file itself. By default, this searches
- the parent directories of archives.
- list_url (str or None): URL for a listing of archives.
- Spack will scrape these pages for download links that look
- like the archive URL.
- list_depth (int): max depth to follow links on list_url pages.
- Defaults to 0.
- concurrency (int): maximum number of concurrent requests
- reference_package (spack.package_base.PackageBase or None): a spack package
- used as a reference for url detection. Uses the url_for_version
- method on the package to produce reference urls which, if found,
- are preferred.
- """
- if not isinstance(archive_urls, (list, tuple)):
- archive_urls = [archive_urls]
-
- # Generate a list of list_urls based on archive urls and any
- # explicitly listed list_url in the package
- list_urls = set()
- if list_url is not None:
- list_urls.add(list_url)
- for aurl in archive_urls:
- list_urls |= spack.url.find_list_urls(aurl)
-
- # Add '/' to the end of the URL. Some web servers require this.
- additional_list_urls = set()
- for lurl in list_urls:
- if not lurl.endswith("/"):
- additional_list_urls.add(lurl + "/")
- list_urls |= additional_list_urls
-
- # Grab some web pages to scrape.
- pages, links = spider(list_urls, depth=list_depth, concurrency=concurrency)
-
- # Scrape them for archive URLs
- regexes = []
- for aurl in archive_urls:
- # This creates a regex from the URL with a capture group for
- # the version part of the URL. The capture group is converted
- # to a generic wildcard, so we can use this to extract things
- # on a page that look like archive URLs.
- url_regex = spack.url.wildcard_version(aurl)
-
- # We'll be a bit more liberal and just look for the archive
- # part, not the full path.
- # this is a URL so it is a posixpath even on Windows
- url_regex = PurePosixPath(url_regex).name
-
- # We need to add a / to the beginning of the regex to prevent
- # Spack from picking up similarly named packages like:
- # https://cran.r-project.org/src/contrib/pls_2.6-0.tar.gz
- # https://cran.r-project.org/src/contrib/enpls_5.7.tar.gz
- # https://cran.r-project.org/src/contrib/autopls_1.3.tar.gz
- # https://cran.r-project.org/src/contrib/matrixpls_1.0.4.tar.gz
- url_regex = "/" + url_regex
-
- # We need to add a $ anchor to the end of the regex to prevent
- # Spack from picking up signature files like:
- # .asc
- # .md5
- # .sha256
- # .sig
- # However, SourceForge downloads still need to end in '/download'.
- url_regex += r"(\/download)?"
- # PyPI adds #sha256=... to the end of the URL
- url_regex += "(#sha256=.*)?"
- url_regex += "$"
-
- regexes.append(url_regex)
-
- # Build a dict version -> URL from any links that match the wildcards.
- # Walk through archive_url links first.
- # Any conflicting versions will be overwritten by the list_url links.
- versions = {}
- matched = set()
- for url in sorted(links):
- url = convert_to_posix_path(url)
- if any(re.search(r, url) for r in regexes):
- try:
- ver = spack.url.parse_version(url)
- if ver in matched:
- continue
- versions[ver] = url
- # prevent this version from getting overwritten
- if reference_package is not None:
- if url == reference_package.url_for_version(ver):
- matched.add(ver)
- else:
- extrapolated_urls = [
- spack.url.substitute_version(u, ver) for u in archive_urls
- ]
- if url in extrapolated_urls:
- matched.add(ver)
- except spack.url.UndetectableVersionError:
- continue
-
- for url in archive_urls:
- url = convert_to_posix_path(url)
- ver = spack.url.parse_version(url)
- if ver not in versions:
- versions[ver] = url
-
- return versions
-
-
def get_header(headers, header_name):
"""Looks up a dict of headers for the given header value.
diff --git a/var/spack/repos/builtin/packages/protobuf/package.py b/var/spack/repos/builtin/packages/protobuf/package.py
index a1a9a8e2d1..9a4ed84058 100644
--- a/var/spack/repos/builtin/packages/protobuf/package.py
+++ b/var/spack/repos/builtin/packages/protobuf/package.py
@@ -3,7 +3,7 @@
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
-import spack.util.web
+import spack.url
from spack.package import *
@@ -120,9 +120,7 @@ class Protobuf(CMakePackage):
return dict(
map(
lambda u: (u, self.url_for_version(u)),
- spack.util.web.find_versions_of_archive(
- self.all_urls, self.list_url, self.list_depth
- ),
+ spack.url.find_versions_of_archive(self.all_urls, self.list_url, self.list_depth),
)
)