From fb9e5fcc4f5307deaf10fcd571ebea68188d859c Mon Sep 17 00:00:00 2001 From: Massimiliano Culpo Date: Fri, 15 Sep 2023 15:43:23 +0200 Subject: Group primitive url/path handling functions together (#40028) --- lib/spack/llnl/url.py | 459 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 459 insertions(+) create mode 100644 lib/spack/llnl/url.py (limited to 'lib/spack/llnl') diff --git a/lib/spack/llnl/url.py b/lib/spack/llnl/url.py new file mode 100644 index 0000000000..40e7606506 --- /dev/null +++ b/lib/spack/llnl/url.py @@ -0,0 +1,459 @@ +# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) +"""URL primitives that just require Python standard library.""" +import itertools +import os.path +import re +from typing import Optional, Set, Tuple +from urllib.parse import urlsplit, urlunsplit + +# Archive extensions allowed in Spack +PREFIX_EXTENSIONS = ("tar", "TAR") +EXTENSIONS = ("gz", "bz2", "xz", "Z") +NO_TAR_EXTENSIONS = ("zip", "tgz", "tbz2", "tbz", "txz") + +# Add PREFIX_EXTENSIONS and EXTENSIONS last so that .tar.gz is matched *before* .tar or .gz +ALLOWED_ARCHIVE_TYPES = ( + tuple(".".join(ext) for ext in itertools.product(PREFIX_EXTENSIONS, EXTENSIONS)) + + PREFIX_EXTENSIONS + + EXTENSIONS + + NO_TAR_EXTENSIONS +) +CONTRACTION_MAP = {"tgz": "tar.gz", "txz": "tar.xz", "tbz": "tar.bz2", "tbz2": "tar.bz2"} + + +def find_list_urls(url: str) -> Set[str]: + r"""Find good list URLs for the supplied URL. + + By default, returns the dirname of the archive path. + + Provides special treatment for the following websites, which have a + unique list URL different from the dirname of the download URL: + + ========= ======================================================= + GitHub https://github.com///releases + GitLab https://gitlab.\*///tags + BitBucket https://bitbucket.org///downloads/?tab=tags + CRAN https://\*.r-project.org/src/contrib/Archive/ + PyPI https://pypi.org/simple// + LuaRocks https://luarocks.org/modules// + ========= ======================================================= + + Note: this function is called by `spack versions`, `spack checksum`, + and `spack create`, but not by `spack fetch` or `spack install`. + + Parameters: + url (str): The download URL for the package + + Returns: + set: One or more list URLs for the package + """ + + url_types = [ + # GitHub + # e.g. https://github.com/llnl/callpath/archive/v1.0.1.tar.gz + (r"(.*github\.com/[^/]+/[^/]+)", lambda m: m.group(1) + "/releases"), + # GitLab API endpoint + # e.g. https://gitlab.dkrz.de/api/v4/projects/k202009%2Flibaec/repository/archive.tar.gz?sha=v1.0.2 + ( + r"(.*gitlab[^/]+)/api/v4/projects/([^/]+)%2F([^/]+)", + lambda m: m.group(1) + "/" + m.group(2) + "/" + m.group(3) + "/tags", + ), + # GitLab non-API endpoint + # e.g. https://gitlab.dkrz.de/k202009/libaec/uploads/631e85bcf877c2dcaca9b2e6d6526339/libaec-1.0.0.tar.gz + (r"(.*gitlab[^/]+/(?!api/v4/projects)[^/]+/[^/]+)", lambda m: m.group(1) + "/tags"), + # BitBucket + # e.g. https://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2 + (r"(.*bitbucket.org/[^/]+/[^/]+)", lambda m: m.group(1) + "/downloads/?tab=tags"), + # CRAN + # e.g. https://cran.r-project.org/src/contrib/Rcpp_0.12.9.tar.gz + # e.g. https://cloud.r-project.org/src/contrib/rgl_0.98.1.tar.gz + ( + r"(.*\.r-project\.org/src/contrib)/([^_]+)", + lambda m: m.group(1) + "/Archive/" + m.group(2), + ), + # PyPI + # e.g. https://pypi.io/packages/source/n/numpy/numpy-1.19.4.zip + # e.g. https://www.pypi.io/packages/source/n/numpy/numpy-1.19.4.zip + # e.g. https://pypi.org/packages/source/n/numpy/numpy-1.19.4.zip + # e.g. https://pypi.python.org/packages/source/n/numpy/numpy-1.19.4.zip + # e.g. https://files.pythonhosted.org/packages/source/n/numpy/numpy-1.19.4.zip + # e.g. https://pypi.io/packages/py2.py3/o/opencensus-context/opencensus_context-0.1.1-py2.py3-none-any.whl + ( + r"(?:pypi|pythonhosted)[^/]+/packages/[^/]+/./([^/]+)", + lambda m: "https://pypi.org/simple/" + m.group(1) + "/", + ), + # LuaRocks + # e.g. https://luarocks.org/manifests/gvvaughan/lpeg-1.0.2-1.src.rock + # e.g. https://luarocks.org/manifests/openresty/lua-cjson-2.1.0-1.src.rock + ( + r"luarocks[^/]+/(?:modules|manifests)/(?P[^/]+)/" + + r"(?P.+?)-[0-9.-]*\.src\.rock", + lambda m: "https://luarocks.org/modules/" + + m.group("org") + + "/" + + m.group("name") + + "/", + ), + ] + + list_urls = {os.path.dirname(url)} + + for pattern, fun in url_types: + match = re.search(pattern, url) + if match: + list_urls.add(fun(match)) + + return list_urls + + +def strip_query_and_fragment(url: str) -> Tuple[str, str]: + """Strips query and fragment from a url, then returns the base url and the suffix. + + Args: + url: URL to be stripped + + Raises: + ValueError: when there is any error parsing the URL + """ + components = urlsplit(url) + stripped = components[:3] + (None, None) + + query, frag = components[3:5] + suffix = "" + if query: + suffix += "?" + query + if frag: + suffix += "#" + frag + + return urlunsplit(stripped), suffix + + +SOURCEFORGE_RE = re.compile(r"(.*(?:sourceforge\.net|sf\.net)/.*)(/download)$") + + +def split_url_on_sourceforge_suffix(url: str) -> Tuple[str, ...]: + """If the input is a sourceforge URL, returns base URL and "/download" suffix. Otherwise, + returns the input URL and an empty string. + """ + match = SOURCEFORGE_RE.search(url) + if match is not None: + return match.groups() + return url, "" + + +def has_extension(path_or_url: str, ext: str) -> bool: + """Returns true if the extension in input is present in path, false otherwise.""" + prefix, _ = split_url_on_sourceforge_suffix(path_or_url) + if not ext.startswith(r"\."): + ext = rf"\.{ext}$" + + if re.search(ext, prefix): + return True + return False + + +def extension_from_path(path_or_url: Optional[str]) -> Optional[str]: + """Tries to match an allowed archive extension to the input. Returns the first match, + or None if no match was found. + + Raises: + ValueError: if the input is None + """ + if path_or_url is None: + raise ValueError("Can't call extension() on None") + + for t in ALLOWED_ARCHIVE_TYPES: + if has_extension(path_or_url, t): + return t + return None + + +def remove_extension(path_or_url: str, *, extension: str) -> str: + """Returns the input with the extension removed""" + suffix = rf"\.{extension}$" + return re.sub(suffix, "", path_or_url) + + +def check_and_remove_ext(path: str, *, extension: str) -> str: + """Returns the input path with the extension removed, if the extension is present in path. + Otherwise, returns the input unchanged. + """ + if not has_extension(path, extension): + return path + path, _ = split_url_on_sourceforge_suffix(path) + return remove_extension(path, extension=extension) + + +def strip_extension(path_or_url: str, *, extension: Optional[str] = None) -> str: + """If a path contains the extension in input, returns the path stripped of the extension. + Otherwise, returns the input path. + + If extension is None, attempts to strip any allowed extension from path. + """ + if extension is None: + for t in ALLOWED_ARCHIVE_TYPES: + if has_extension(path_or_url, ext=t): + extension = t + break + else: + return path_or_url + + return check_and_remove_ext(path_or_url, extension=extension) + + +def split_url_extension(url: str) -> Tuple[str, ...]: + """Some URLs have a query string, e.g.: + + 1. https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7.tgz?raw=true + 2. http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin.tar.gz + 3. https://gitlab.kitware.com/vtk/vtk/repository/archive.tar.bz2?ref=v7.0.0 + + In (1), the query string needs to be stripped to get at the + extension, but in (2) & (3), the filename is IN a single final query + argument. + + This strips the URL into three pieces: ``prefix``, ``ext``, and ``suffix``. + The suffix contains anything that was stripped off the URL to + get at the file extension. In (1), it will be ``'?raw=true'``, but + in (2), it will be empty. In (3) the suffix is a parameter that follows + after the file extension, e.g.: + + 1. ``('https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7', '.tgz', '?raw=true')`` + 2. ``('http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin', '.tar.gz', None)`` + 3. ``('https://gitlab.kitware.com/vtk/vtk/repository/archive', '.tar.bz2', '?ref=v7.0.0')`` + """ + # Strip off sourceforge download suffix. + # e.g. https://sourceforge.net/projects/glew/files/glew/2.0.0/glew-2.0.0.tgz/download + prefix, suffix = split_url_on_sourceforge_suffix(url) + + ext = extension_from_path(prefix) + if ext is not None: + prefix = strip_extension(prefix) + return prefix, ext, suffix + + try: + prefix, suf = strip_query_and_fragment(prefix) + except ValueError: + # FIXME: tty.debug("Got error parsing path %s" % path) + # Ignore URL parse errors here + return url, "" + + ext = extension_from_path(prefix) + prefix = strip_extension(prefix) + suffix = suf + suffix + if ext is None: + ext = "" + + return prefix, ext, suffix + + +def strip_version_suffixes(path_or_url: str) -> str: + """Some tarballs contain extraneous information after the version: + + * ``bowtie2-2.2.5-source`` + * ``libevent-2.0.21-stable`` + * ``cuda_8.0.44_linux.run`` + + These strings are not part of the version number and should be ignored. + This function strips those suffixes off and returns the remaining string. + The goal is that the version is always the last thing in ``path``: + + * ``bowtie2-2.2.5`` + * ``libevent-2.0.21`` + * ``cuda_8.0.44`` + + Args: + path_or_url: The filename or URL for the package + + Returns: + The ``path`` with any extraneous suffixes removed + """ + # NOTE: This could be done with complicated regexes in parse_version_offset + # NOTE: The problem is that we would have to add these regexes to the end + # NOTE: of every single version regex. Easier to just strip them off + # NOTE: permanently + + suffix_regexes = [ + # Download type + r"[Ii]nstall", + r"all", + r"code", + r"[Ss]ources?", + r"file", + r"full", + r"single", + r"with[a-zA-Z_-]+", + r"rock", + r"src(_0)?", + r"public", + r"bin", + r"binary", + r"run", + r"[Uu]niversal", + r"jar", + r"complete", + r"dynamic", + r"oss", + r"gem", + r"tar", + r"sh", + # Download version + r"release", + r"bin", + r"stable", + r"[Ff]inal", + r"rel", + r"orig", + r"dist", + r"\+", + # License + r"gpl", + # Arch + # Needs to come before and after OS, appears in both orders + r"ia32", + r"intel", + r"amd64", + r"linux64", + r"x64", + r"64bit", + r"x86[_-]64", + r"i586_64", + r"x86", + r"i[36]86", + r"ppc64(le)?", + r"armv?(7l|6l|64)", + # Other + r"cpp", + r"gtk", + r"incubating", + # OS + r"[Ll]inux(_64)?", + r"LINUX", + r"[Uu]ni?x", + r"[Ss]un[Oo][Ss]", + r"[Mm]ac[Oo][Ss][Xx]?", + r"[Oo][Ss][Xx]", + r"[Dd]arwin(64)?", + r"[Aa]pple", + r"[Ww]indows", + r"[Ww]in(64|32)?", + r"[Cc]ygwin(64|32)?", + r"[Mm]ingw", + r"centos", + # Arch + # Needs to come before and after OS, appears in both orders + r"ia32", + r"intel", + r"amd64", + r"linux64", + r"x64", + r"64bit", + r"x86[_-]64", + r"i586_64", + r"x86", + r"i[36]86", + r"ppc64(le)?", + r"armv?(7l|6l|64)?", + # PyPI + r"[._-]py[23].*\.whl", + r"[._-]cp[23].*\.whl", + r"[._-]win.*\.exe", + ] + + for regex in suffix_regexes: + # Remove the suffix from the end of the path + # This may be done multiple times + path_or_url = re.sub(r"[._-]?" + regex + "$", "", path_or_url) + + return path_or_url + + +def expand_contracted_extension(extension: str) -> str: + """Returns the expanded version of a known contracted extension. + + This function maps extensions like ".tgz" to ".tar.gz". On unknown extensions, + return the input unmodified. + """ + extension = extension.strip(".") + return CONTRACTION_MAP.get(extension, extension) + + +def expand_contracted_extension_in_path( + path_or_url: str, *, extension: Optional[str] = None +) -> str: + """Returns the input path or URL with any contraction extension expanded. + + Args: + path_or_url: path or URL to be expanded + extension: if specified, only attempt to expand that extension + """ + extension = extension or extension_from_path(path_or_url) + if extension is None: + return path_or_url + + expanded = expand_contracted_extension(extension) + if expanded != extension: + return re.sub(rf"{extension}", rf"{expanded}", path_or_url) + return path_or_url + + +def compression_ext_from_compressed_archive(extension: str) -> Optional[str]: + """Returns compression extension for a compressed archive""" + extension = expand_contracted_extension(extension) + for ext in [*EXTENSIONS]: + if ext in extension: + return ext + return None + + +def strip_compression_extension(path_or_url: str, ext: Optional[str] = None) -> str: + """Strips the compression extension from the input, and returns it. For instance, + "foo.tgz" becomes "foo.tar". + + If no extension is given, try a default list of extensions. + + Args: + path_or_url: input to be stripped + ext: if given, extension to be stripped + """ + if not extension_from_path(path_or_url): + return path_or_url + + expanded_path = expand_contracted_extension_in_path(path_or_url) + candidates = [ext] if ext is not None else EXTENSIONS + for current_extension in candidates: + modified_path = check_and_remove_ext(expanded_path, extension=current_extension) + if modified_path != expanded_path: + return modified_path + return expanded_path + + +def allowed_archive(path_or_url: str) -> bool: + """Returns true if the input is a valid archive, False otherwise.""" + return ( + False if not path_or_url else any(path_or_url.endswith(t) for t in ALLOWED_ARCHIVE_TYPES) + ) + + +def determine_url_file_extension(path: str) -> str: + """This returns the type of archive a URL refers to. This is + sometimes confusing because of URLs like: + + (1) https://github.com/petdance/ack/tarball/1.93_02 + + Where the URL doesn't actually contain the filename. We need + to know what type it is so that we can appropriately name files + in mirrors. + """ + match = re.search(r"github.com/.+/(zip|tar)ball/", path) + if match: + if match.group(1) == "zip": + return "zip" + elif match.group(1) == "tar": + return "tar.gz" + + prefix, ext, suffix = split_url_extension(path) + return ext -- cgit v1.2.3-60-g2f50