1 files changed, 459 insertions, 0 deletions
diff --git a/lib/spack/llnl/url.py b/lib/spack/llnl/url.py
new file mode 100644
index 0000000000..40e7606506
--- /dev/null
+++ b/lib/spack/llnl/url.py
@@ -0,0 +1,459 @@
+# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+"""URL primitives that just require Python standard library."""
+import itertools
+import os.path
+import re
+from typing import Optional, Set, Tuple
+from urllib.parse import urlsplit, urlunsplit
+
+# Archive extensions allowed in Spack
+PREFIX_EXTENSIONS = ("tar", "TAR")
+EXTENSIONS = ("gz", "bz2", "xz", "Z")
+NO_TAR_EXTENSIONS = ("zip", "tgz", "tbz2", "tbz", "txz")
+
+# Add PREFIX_EXTENSIONS and EXTENSIONS last so that .tar.gz is matched *before* .tar or .gz
+ALLOWED_ARCHIVE_TYPES = (
+    tuple(".".join(ext) for ext in itertools.product(PREFIX_EXTENSIONS, EXTENSIONS))
+    + PREFIX_EXTENSIONS
+    + EXTENSIONS
+    + NO_TAR_EXTENSIONS
+)
+CONTRACTION_MAP = {"tgz": "tar.gz", "txz": "tar.xz", "tbz": "tar.bz2", "tbz2": "tar.bz2"}
+
+
+def find_list_urls(url: str) -> Set[str]:
+    r"""Find good list URLs for the supplied URL.
+
+    By default, returns the dirname of the archive path.
+
+    Provides special treatment for the following websites, which have a
+    unique list URL different from the dirname of the download URL:
+
+    =========  =======================================================
+    GitHub     https://github.com/<repo>/<name>/releases
+    GitLab     https://gitlab.\*/<repo>/<name>/tags
+    BitBucket  https://bitbucket.org/<repo>/<name>/downloads/?tab=tags
+    CRAN       https://\*.r-project.org/src/contrib/Archive/<name>
+    PyPI       https://pypi.org/simple/<name>/
+    LuaRocks   https://luarocks.org/modules/<repo>/<name>
+    =========  =======================================================
+
+    Note: this function is called by `spack versions`, `spack checksum`,
+    and `spack create`, but not by `spack fetch` or `spack install`.
+
+    Parameters:
+        url (str): The download URL for the package
+
+    Returns:
+        set: One or more list URLs for the package
+    """
+
+    url_types = [
+        # GitHub
+        # e.g. https://github.com/llnl/callpath/archive/v1.0.1.tar.gz
+        (r"(.*github\.com/[^/]+/[^/]+)", lambda m: m.group(1) + "/releases"),
+        # GitLab API endpoint
+        # e.g. https://gitlab.dkrz.de/api/v4/projects/k202009%2Flibaec/repository/archive.tar.gz?sha=v1.0.2
+        (
+            r"(.*gitlab[^/]+)/api/v4/projects/([^/]+)%2F([^/]+)",
+            lambda m: m.group(1) + "/" + m.group(2) + "/" + m.group(3) + "/tags",
+        ),
+        # GitLab non-API endpoint
+        # e.g. https://gitlab.dkrz.de/k202009/libaec/uploads/631e85bcf877c2dcaca9b2e6d6526339/libaec-1.0.0.tar.gz
+        (r"(.*gitlab[^/]+/(?!api/v4/projects)[^/]+/[^/]+)", lambda m: m.group(1) + "/tags"),
+        # BitBucket
+        # e.g. https://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2
+        (r"(.*bitbucket.org/[^/]+/[^/]+)", lambda m: m.group(1) + "/downloads/?tab=tags"),
+        # CRAN
+        # e.g. https://cran.r-project.org/src/contrib/Rcpp_0.12.9.tar.gz
+        # e.g. https://cloud.r-project.org/src/contrib/rgl_0.98.1.tar.gz
+        (
+            r"(.*\.r-project\.org/src/contrib)/([^_]+)",
+            lambda m: m.group(1) + "/Archive/" + m.group(2),
+        ),
+        # PyPI
+        # e.g. https://pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
+        # e.g. https://www.pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
+        # e.g. https://pypi.org/packages/source/n/numpy/numpy-1.19.4.zip
+        # e.g. https://pypi.python.org/packages/source/n/numpy/numpy-1.19.4.zip
+        # e.g. https://files.pythonhosted.org/packages/source/n/numpy/numpy-1.19.4.zip
+        # e.g. https://pypi.io/packages/py2.py3/o/opencensus-context/opencensus_context-0.1.1-py2.py3-none-any.whl
+        (
+            r"(?:pypi|pythonhosted)[^/]+/packages/[^/]+/./([^/]+)",
+            lambda m: "https://pypi.org/simple/" + m.group(1) + "/",
+        ),
+        # LuaRocks
+        # e.g. https://luarocks.org/manifests/gvvaughan/lpeg-1.0.2-1.src.rock
+        # e.g. https://luarocks.org/manifests/openresty/lua-cjson-2.1.0-1.src.rock
+        (
+            r"luarocks[^/]+/(?:modules|manifests)/(?P<org>[^/]+)/"
+            + r"(?P<name>.+?)-[0-9.-]*\.src\.rock",
+            lambda m: "https://luarocks.org/modules/"
+            + m.group("org")
+            + "/"
+            + m.group("name")
+            + "/",
+        ),
+    ]
+
+    list_urls = {os.path.dirname(url)}
+
+    for pattern, fun in url_types:
+        match = re.search(pattern, url)
+        if match:
+            list_urls.add(fun(match))
+
+    return list_urls
+
+
+def strip_query_and_fragment(url: str) -> Tuple[str, str]:
+    """Strips query and fragment from a url, then returns the base url and the suffix.
+
+    Args:
+        url: URL to be stripped
+
+    Raises:
+        ValueError: when there is any error parsing the URL
+    """
+    components = urlsplit(url)
+    stripped = components[:3] + (None, None)
+
+    query, frag = components[3:5]
+    suffix = ""
+    if query:
+        suffix += "?" + query
+    if frag:
+        suffix += "#" + frag
+
+    return urlunsplit(stripped), suffix
+
+
+SOURCEFORGE_RE = re.compile(r"(.*(?:sourceforge\.net|sf\.net)/.*)(/download)$")
+
+
+def split_url_on_sourceforge_suffix(url: str) -> Tuple[str, ...]:
+    """If the input is a sourceforge URL, returns base URL and "/download" suffix. Otherwise,
+    returns the input URL and an empty string.
+    """
+    match = SOURCEFORGE_RE.search(url)
+    if match is not None:
+        return match.groups()
+    return url, ""
+
+
+def has_extension(path_or_url: str, ext: str) -> bool:
+    """Returns true if the extension in input is present in path, false otherwise."""
+    prefix, _ = split_url_on_sourceforge_suffix(path_or_url)
+    if not ext.startswith(r"\."):
+        ext = rf"\.{ext}$"
+
+    if re.search(ext, prefix):
+        return True
+    return False
+
+
+def extension_from_path(path_or_url: Optional[str]) -> Optional[str]:
+    """Tries to match an allowed archive extension to the input. Returns the first match,
+    or None if no match was found.
+
+    Raises:
+        ValueError: if the input is None
+    """
+    if path_or_url is None:
+        raise ValueError("Can't call extension() on None")
+
+    for t in ALLOWED_ARCHIVE_TYPES:
+        if has_extension(path_or_url, t):
+            return t
+    return None
+
+
+def remove_extension(path_or_url: str, *, extension: str) -> str:
+    """Returns the input with the extension removed"""
+    suffix = rf"\.{extension}$"
+    return re.sub(suffix, "", path_or_url)
+
+
+def check_and_remove_ext(path: str, *, extension: str) -> str:
+    """Returns the input path with the extension removed, if the extension is present in path.
+    Otherwise, returns the input unchanged.
+    """
+    if not has_extension(path, extension):
+        return path
+    path, _ = split_url_on_sourceforge_suffix(path)
+    return remove_extension(path, extension=extension)
+
+
+def strip_extension(path_or_url: str, *, extension: Optional[str] = None) -> str:
+    """If a path contains the extension in input, returns the path stripped of the extension.
+    Otherwise, returns the input path.
+
+    If extension is None, attempts to strip any allowed extension from path.
+    """
+    if extension is None:
+        for t in ALLOWED_ARCHIVE_TYPES:
+            if has_extension(path_or_url, ext=t):
+                extension = t
+                break
+        else:
+            return path_or_url
+
+    return check_and_remove_ext(path_or_url, extension=extension)
+
+
+def split_url_extension(url: str) -> Tuple[str, ...]:
+    """Some URLs have a query string, e.g.:
+
+    1. https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7.tgz?raw=true
+    2. http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin.tar.gz
+    3. https://gitlab.kitware.com/vtk/vtk/repository/archive.tar.bz2?ref=v7.0.0
+
+    In (1), the query string needs to be stripped to get at the
+    extension, but in (2) & (3), the filename is IN a single final query
+    argument.
+
+    This strips the URL into three pieces: ``prefix``, ``ext``, and ``suffix``.
+    The suffix contains anything that was stripped off the URL to
+    get at the file extension.  In (1), it will be ``'?raw=true'``, but
+    in (2), it will be empty. In (3) the suffix is a parameter that follows
+    after the file extension, e.g.:
+
+    1. ``('https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7', '.tgz', '?raw=true')``
+    2. ``('http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin', '.tar.gz', None)``
+    3. ``('https://gitlab.kitware.com/vtk/vtk/repository/archive', '.tar.bz2', '?ref=v7.0.0')``
+    """
+    # Strip off sourceforge download suffix.
+    # e.g. https://sourceforge.net/projects/glew/files/glew/2.0.0/glew-2.0.0.tgz/download
+    prefix, suffix = split_url_on_sourceforge_suffix(url)
+
+    ext = extension_from_path(prefix)
+    if ext is not None:
+        prefix = strip_extension(prefix)
+        return prefix, ext, suffix
+
+    try:
+        prefix, suf = strip_query_and_fragment(prefix)
+    except ValueError:
+        # FIXME: tty.debug("Got error parsing path %s" % path)
+        # Ignore URL parse errors here
+        return url, ""
+
+    ext = extension_from_path(prefix)
+    prefix = strip_extension(prefix)
+    suffix = suf + suffix
+    if ext is None:
+        ext = ""
+
+    return prefix, ext, suffix
+
+
+def strip_version_suffixes(path_or_url: str) -> str:
+    """Some tarballs contain extraneous information after the version:
+
+    * ``bowtie2-2.2.5-source``
+    * ``libevent-2.0.21-stable``
+    * ``cuda_8.0.44_linux.run``
+
+    These strings are not part of the version number and should be ignored.
+    This function strips those suffixes off and returns the remaining string.
+    The goal is that the version is always the last thing in ``path``:
+
+    * ``bowtie2-2.2.5``
+    * ``libevent-2.0.21``
+    * ``cuda_8.0.44``
+
+    Args:
+        path_or_url: The filename or URL for the package
+
+    Returns:
+        The ``path`` with any extraneous suffixes removed
+    """
+    # NOTE: This could be done with complicated regexes in parse_version_offset
+    # NOTE: The problem is that we would have to add these regexes to the end
+    # NOTE: of every single version regex. Easier to just strip them off
+    # NOTE: permanently
+
+    suffix_regexes = [
+        # Download type
+        r"[Ii]nstall",
+        r"all",
+        r"code",
+        r"[Ss]ources?",
+        r"file",
+        r"full",
+        r"single",
+        r"with[a-zA-Z_-]+",
+        r"rock",
+        r"src(_0)?",
+        r"public",
+        r"bin",
+        r"binary",
+        r"run",
+        r"[Uu]niversal",
+        r"jar",
+        r"complete",
+        r"dynamic",
+        r"oss",
+        r"gem",
+        r"tar",
+        r"sh",
+        # Download version
+        r"release",
+        r"bin",
+        r"stable",
+        r"[Ff]inal",
+        r"rel",
+        r"orig",
+        r"dist",
+        r"\+",
+        # License
+        r"gpl",
+        # Arch
+        # Needs to come before and after OS, appears in both orders
+        r"ia32",
+        r"intel",
+        r"amd64",
+        r"linux64",
+        r"x64",
+        r"64bit",
+        r"x86[_-]64",
+        r"i586_64",
+        r"x86",
+        r"i[36]86",
+        r"ppc64(le)?",
+        r"armv?(7l|6l|64)",
+        # Other
+        r"cpp",
+        r"gtk",
+        r"incubating",
+        # OS
+        r"[Ll]inux(_64)?",
+        r"LINUX",
+        r"[Uu]ni?x",
+        r"[Ss]un[Oo][Ss]",
+        r"[Mm]ac[Oo][Ss][Xx]?",
+        r"[Oo][Ss][Xx]",
+        r"[Dd]arwin(64)?",
+        r"[Aa]pple",
+        r"[Ww]indows",
+        r"[Ww]in(64|32)?",
+        r"[Cc]ygwin(64|32)?",
+        r"[Mm]ingw",
+        r"centos",
+        # Arch
+        # Needs to come before and after OS, appears in both orders
+        r"ia32",
+        r"intel",
+        r"amd64",
+        r"linux64",
+        r"x64",
+        r"64bit",
+        r"x86[_-]64",
+        r"i586_64",
+        r"x86",
+        r"i[36]86",
+        r"ppc64(le)?",
+        r"armv?(7l|6l|64)?",
+        # PyPI
+        r"[._-]py[23].*\.whl",
+        r"[._-]cp[23].*\.whl",
+        r"[._-]win.*\.exe",
+    ]
+
+    for regex in suffix_regexes:
+        # Remove the suffix from the end of the path
+        # This may be done multiple times
+        path_or_url = re.sub(r"[._-]?" + regex + "$", "", path_or_url)
+
+    return path_or_url
+
+
+def expand_contracted_extension(extension: str) -> str:
+    """Returns the expanded version of a known contracted extension.
+
+    This function maps extensions like ".tgz" to ".tar.gz". On unknown extensions,
+    return the input unmodified.
+    """
+    extension = extension.strip(".")
+    return CONTRACTION_MAP.get(extension, extension)
+
+
+def expand_contracted_extension_in_path(
+    path_or_url: str, *, extension: Optional[str] = None
+) -> str:
+    """Returns the input path or URL with any contraction extension expanded.
+
+    Args:
+        path_or_url: path or URL to be expanded
+        extension: if specified, only attempt to expand that extension
+    """
+    extension = extension or extension_from_path(path_or_url)
+    if extension is None:
+        return path_or_url
+
+    expanded = expand_contracted_extension(extension)
+    if expanded != extension:
+        return re.sub(rf"{extension}", rf"{expanded}", path_or_url)
+    return path_or_url
+
+
+def compression_ext_from_compressed_archive(extension: str) -> Optional[str]:
+    """Returns compression extension for a compressed archive"""
+    extension = expand_contracted_extension(extension)
+    for ext in [*EXTENSIONS]:
+        if ext in extension:
+            return ext
+    return None
+
+
+def strip_compression_extension(path_or_url: str, ext: Optional[str] = None) -> str:
+    """Strips the compression extension from the input, and returns it. For instance,
+    "foo.tgz" becomes "foo.tar".
+
+    If no extension is given, try a default list of extensions.
+
+    Args:
+        path_or_url: input to be stripped
+        ext: if given, extension to be stripped
+    """
+    if not extension_from_path(path_or_url):
+        return path_or_url
+
+    expanded_path = expand_contracted_extension_in_path(path_or_url)
+    candidates = [ext] if ext is not None else EXTENSIONS
+    for current_extension in candidates:
+        modified_path = check_and_remove_ext(expanded_path, extension=current_extension)
+        if modified_path != expanded_path:
+            return modified_path
+    return expanded_path
+
+
+def allowed_archive(path_or_url: str) -> bool:
+    """Returns true if the input is a valid archive, False otherwise."""
+    return (
+        False if not path_or_url else any(path_or_url.endswith(t) for t in ALLOWED_ARCHIVE_TYPES)
+    )
+
+
+def determine_url_file_extension(path: str) -> str:
+    """This returns the type of archive a URL refers to.  This is
+    sometimes confusing because of URLs like:
+
+    (1) https://github.com/petdance/ack/tarball/1.93_02
+
+    Where the URL doesn't actually contain the filename.  We need
+    to know what type it is so that we can appropriately name files
+    in mirrors.
+    """
+    match = re.search(r"github.com/.+/(zip|tar)ball/", path)
+    if match:
+        if match.group(1) == "zip":
+            return "zip"
+        elif match.group(1) == "tar":
+            return "tar.gz"
+
+    prefix, ext, suffix = split_url_extension(path)
+    return ext