summaryrefslogtreecommitdiff
path: root/lib/spack/llnl/url.py
diff options
context:
space:
mode:
Diffstat (limited to 'lib/spack/llnl/url.py')
-rw-r--r--lib/spack/llnl/url.py459
1 files changed, 459 insertions, 0 deletions
diff --git a/lib/spack/llnl/url.py b/lib/spack/llnl/url.py
new file mode 100644
index 0000000000..40e7606506
--- /dev/null
+++ b/lib/spack/llnl/url.py
@@ -0,0 +1,459 @@
+# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+"""URL primitives that just require Python standard library."""
+import itertools
+import os.path
+import re
+from typing import Optional, Set, Tuple
+from urllib.parse import urlsplit, urlunsplit
+
# Archive extensions allowed in Spack
PREFIX_EXTENSIONS = ("tar", "TAR")
EXTENSIONS = ("gz", "bz2", "xz", "Z")
NO_TAR_EXTENSIONS = ("zip", "tgz", "tbz2", "tbz", "txz")

# The compound "<prefix>.<compression>" forms are listed first, so that scanning this tuple
# in order tries e.g. "tar.gz" *before* the bare "tar" or "gz" entries.
ALLOWED_ARCHIVE_TYPES = (
    *(f"{prefix}.{ext}" for prefix in PREFIX_EXTENSIONS for ext in EXTENSIONS),
    *PREFIX_EXTENSIONS,
    *EXTENSIONS,
    *NO_TAR_EXTENSIONS,
)

# Contracted single-token extensions and the two-part extension each one stands for
CONTRACTION_MAP = {
    "tgz": "tar.gz",
    "txz": "tar.xz",
    "tbz": "tar.bz2",
    "tbz2": "tar.bz2",
}
+
+
def find_list_urls(url: str) -> Set[str]:
    r"""Find good list URLs for the supplied URL.

    By default, returns the dirname of the archive path.

    Provides special treatment for the following websites, which have a
    unique list URL different from the dirname of the download URL:

    =========  =======================================================
    GitHub     https://github.com/<repo>/<name>/releases
    GitLab     https://gitlab.\*/<repo>/<name>/tags
    BitBucket  https://bitbucket.org/<repo>/<name>/downloads/?tab=tags
    CRAN       https://\*.r-project.org/src/contrib/Archive/<name>
    PyPI       https://pypi.org/simple/<name>/
    LuaRocks   https://luarocks.org/modules/<repo>/<name>
    =========  =======================================================

    Note: this function is called by `spack versions`, `spack checksum`,
    and `spack create`, but not by `spack fetch` or `spack install`.

    Parameters:
        url (str): The download URL for the package

    Returns:
        set: One or more list URLs for the package
    """

    # Each entry pairs a regex, searched anywhere in the URL, with a function that turns
    # the resulting match object into the list URL for that website.
    url_types = [
        # GitHub
        # e.g. https://github.com/llnl/callpath/archive/v1.0.1.tar.gz
        (r"(.*github\.com/[^/]+/[^/]+)", lambda m: m.group(1) + "/releases"),
        # GitLab API endpoint
        # e.g. https://gitlab.dkrz.de/api/v4/projects/k202009%2Flibaec/repository/archive.tar.gz?sha=v1.0.2
        (
            r"(.*gitlab[^/]+)/api/v4/projects/([^/]+)%2F([^/]+)",
            lambda m: m.group(1) + "/" + m.group(2) + "/" + m.group(3) + "/tags",
        ),
        # GitLab non-API endpoint
        # e.g. https://gitlab.dkrz.de/k202009/libaec/uploads/631e85bcf877c2dcaca9b2e6d6526339/libaec-1.0.0.tar.gz
        (r"(.*gitlab[^/]+/(?!api/v4/projects)[^/]+/[^/]+)", lambda m: m.group(1) + "/tags"),
        # BitBucket
        # e.g. https://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2
        # The dot is escaped so that only a literal "bitbucket.org" host matches.
        (r"(.*bitbucket\.org/[^/]+/[^/]+)", lambda m: m.group(1) + "/downloads/?tab=tags"),
        # CRAN
        # e.g. https://cran.r-project.org/src/contrib/Rcpp_0.12.9.tar.gz
        # e.g. https://cloud.r-project.org/src/contrib/rgl_0.98.1.tar.gz
        (
            r"(.*\.r-project\.org/src/contrib)/([^_]+)",
            lambda m: m.group(1) + "/Archive/" + m.group(2),
        ),
        # PyPI
        # e.g. https://pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
        # e.g. https://www.pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
        # e.g. https://pypi.org/packages/source/n/numpy/numpy-1.19.4.zip
        # e.g. https://pypi.python.org/packages/source/n/numpy/numpy-1.19.4.zip
        # e.g. https://files.pythonhosted.org/packages/source/n/numpy/numpy-1.19.4.zip
        # e.g. https://pypi.io/packages/py2.py3/o/opencensus-context/opencensus_context-0.1.1-py2.py3-none-any.whl
        (
            r"(?:pypi|pythonhosted)[^/]+/packages/[^/]+/./([^/]+)",
            lambda m: "https://pypi.org/simple/" + m.group(1) + "/",
        ),
        # LuaRocks
        # e.g. https://luarocks.org/manifests/gvvaughan/lpeg-1.0.2-1.src.rock
        # e.g. https://luarocks.org/manifests/openresty/lua-cjson-2.1.0-1.src.rock
        (
            r"luarocks[^/]+/(?:modules|manifests)/(?P<org>[^/]+)/"
            + r"(?P<name>.+?)-[0-9.-]*\.src\.rock",
            lambda m: "https://luarocks.org/modules/"
            + m.group("org")
            + "/"
            + m.group("name")
            + "/",
        ),
    ]

    # The dirname of the download URL is always a candidate
    list_urls = {os.path.dirname(url)}

    for pattern, fun in url_types:
        match = re.search(pattern, url)
        if match:
            list_urls.add(fun(match))

    return list_urls
+
+
def strip_query_and_fragment(url: str) -> Tuple[str, str]:
    """Splits a URL into its base (scheme, location and path) and the trailing
    query/fragment text.

    Args:
        url: URL to be stripped

    Returns:
        A pair ``(base, suffix)``. ``suffix`` is the query (prefixed with ``?``) followed
        by the fragment (prefixed with ``#``); parts that are empty are omitted, so the
        suffix may be the empty string.

    Raises:
        ValueError: when there is any error parsing the URL
    """
    parts = urlsplit(url)
    query, fragment = parts[3], parts[4]

    # Re-attach the delimiter only to the pieces that are actually present
    trailing = "".join(
        marker + value for marker, value in (("?", query), ("#", fragment)) if value
    )

    base = urlunsplit(parts[:3] + (None, None))
    return base, trailing
+
+
# Matches sourceforge.net / sf.net URLs that end in "/download"; group 1 is everything
# before that suffix, group 2 is the suffix itself.
SOURCEFORGE_RE = re.compile(r"(.*(?:sourceforge\.net|sf\.net)/.*)(/download)$")


def split_url_on_sourceforge_suffix(url: str) -> Tuple[str, ...]:
    """Separates the trailing "/download" from a sourceforge URL.

    Returns the pair (base URL, "/download") when the input matches ``SOURCEFORGE_RE``,
    and (input URL, "") otherwise.
    """
    found = SOURCEFORGE_RE.search(url)
    return (url, "") if found is None else found.groups()
+
+
def has_extension(path_or_url: str, ext: str) -> bool:
    r"""Tells whether the input carries the given extension.

    A sourceforge "/download" suffix is dropped before the check.

    Args:
        path_or_url: path or URL to be inspected
        ext: extension to look for. If it starts with an escaped dot (``\.``) it is used
            as a regular expression as-is; otherwise it is turned into a pattern that
            matches a dot, followed by ``ext``, at the end of the input.
    """
    base = split_url_on_sourceforge_suffix(path_or_url)[0]
    pattern = ext if ext.startswith(r"\.") else rf"\.{ext}$"
    return re.search(pattern, base) is not None
+
+
def extension_from_path(path_or_url: Optional[str]) -> Optional[str]:
    """Looks up which allowed archive extension the input ends with.

    The entries of ``ALLOWED_ARCHIVE_TYPES`` are tried in order and the first one for
    which ``has_extension`` succeeds is returned.

    Returns:
        The matching extension, or None if none of the allowed extensions matches.

    Raises:
        ValueError: if the input is None
    """
    if path_or_url is None:
        raise ValueError("Can't call extension() on None")

    candidates = (t for t in ALLOWED_ARCHIVE_TYPES if has_extension(path_or_url, t))
    return next(candidates, None)
+
+
def remove_extension(path_or_url: str, *, extension: str) -> str:
    """Returns the input with the extension removed.

    Only a literal ``.<extension>`` at the end of the input is removed; if the input does
    not end that way, it is returned unchanged.

    Args:
        path_or_url: path or URL to be stripped
        extension: extension to remove, without the leading dot (e.g. ``"tar.gz"``)
    """
    # Escape the extension: it is literal text, and the dot in e.g. "tar.gz" must not act
    # as a regex wildcard (which would also strip ".tarXgz").
    suffix = rf"\.{re.escape(extension)}$"
    return re.sub(suffix, "", path_or_url)
+
+
def check_and_remove_ext(path: str, *, extension: str) -> str:
    """Removes the extension from the input path, provided the path has that extension.

    When the extension is present, a sourceforge "/download" suffix is dropped as well,
    before the extension is removed. When it is not present, the input is returned
    unchanged.
    """
    if has_extension(path, extension):
        base, _ = split_url_on_sourceforge_suffix(path)
        return remove_extension(base, extension=extension)
    return path
+
+
def strip_extension(path_or_url: str, *, extension: Optional[str] = None) -> str:
    """Returns the input stripped of the given extension, or the input itself if it does
    not have that extension.

    If extension is None, the first entry of ``ALLOWED_ARCHIVE_TYPES`` that the input has
    is stripped; if there is no such entry the input is returned as is.
    """
    if extension is None:
        extension = next(
            (t for t in ALLOWED_ARCHIVE_TYPES if has_extension(path_or_url, ext=t)), None
        )
        if extension is None:
            # None of the allowed extensions is present: nothing to strip
            return path_or_url

    return check_and_remove_ext(path_or_url, extension=extension)
+
+
def split_url_extension(url: str) -> Tuple[str, str, str]:
    """Some URLs have a query string, e.g.:

    1. https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7.tgz?raw=true
    2. http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin.tar.gz
    3. https://gitlab.kitware.com/vtk/vtk/repository/archive.tar.bz2?ref=v7.0.0

    In (1), the query string needs to be stripped to get at the
    extension, but in (2) & (3), the filename is IN a single final query
    argument.

    This splits the URL into three pieces: ``prefix``, ``ext``, and ``suffix``.
    The extension is reported without a leading dot. The suffix contains anything
    that was stripped off the URL to get at the file extension. In (1), it will be
    ``'?raw=true'``, but in (2), it will be empty. In (3) the suffix is a parameter
    that follows after the file extension, e.g.:

    1. ``('https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7', 'tgz', '?raw=true')``
    2. ``('http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin', 'tar.gz', '')``
    3. ``('https://gitlab.kitware.com/vtk/vtk/repository/archive', 'tar.bz2', '?ref=v7.0.0')``

    If no allowed extension is found, ``ext`` is the empty string. If the URL cannot be
    parsed, the result is ``(url, '', '')``.
    """
    # Strip off sourceforge download suffix.
    # e.g. https://sourceforge.net/projects/glew/files/glew/2.0.0/glew-2.0.0.tgz/download
    prefix, suffix = split_url_on_sourceforge_suffix(url)

    # First try the URL as it is: this covers the case where the file name is at the very
    # end, possibly inside the query string
    ext = extension_from_path(prefix)
    if ext is not None:
        prefix = strip_extension(prefix)
        return prefix, ext, suffix

    try:
        prefix, suf = strip_query_and_fragment(prefix)
    except ValueError:
        # FIXME: tty.debug("Got error parsing path %s" % path)
        # Ignore URL parse errors here. Always return three items, like every other
        # exit of this function, so that callers can unpack the result uniformly.
        return url, "", ""

    # Retry with query and fragment moved to the suffix
    ext = extension_from_path(prefix)
    prefix = strip_extension(prefix)
    suffix = suf + suffix
    if ext is None:
        ext = ""

    return prefix, ext, suffix
+
+
def strip_version_suffixes(path_or_url: str) -> str:
    """Removes extraneous trailing information that follows the version in a name.

    Some tarballs contain extra tokens after the version:

    * ``bowtie2-2.2.5-source``
    * ``libevent-2.0.21-stable``
    * ``cuda_8.0.44_linux.run``

    These tokens are not part of the version number. This function strips them off,
    with the goal that the version is always the last thing in the result:

    * ``bowtie2-2.2.5``
    * ``libevent-2.0.21``
    * ``cuda_8.0.44``

    Args:
        path_or_url: The filename or URL for the package

    Returns:
        The input with any extraneous suffixes removed
    """
    # NOTE: This could be done with complicated regexes in parse_version_offset
    # NOTE: The problem is that we would have to add these regexes to the end
    # NOTE: of every single version regex. Easier to just strip them off
    # NOTE: permanently

    # The patterns are applied one after the other, in this exact order, each on the
    # output of the previous one. Repeated entries are therefore meaningful: they give a
    # token a second chance to be stripped once what followed it has been removed.
    trailing_patterns = (
        # Download type
        r"[Ii]nstall",
        r"all",
        r"code",
        r"[Ss]ources?",
        r"file",
        r"full",
        r"single",
        r"with[a-zA-Z_-]+",
        r"rock",
        r"src(_0)?",
        r"public",
        r"bin",
        r"binary",
        r"run",
        r"[Uu]niversal",
        r"jar",
        r"complete",
        r"dynamic",
        r"oss",
        r"gem",
        r"tar",
        r"sh",
        # Download version
        r"release",
        r"bin",
        r"stable",
        r"[Ff]inal",
        r"rel",
        r"orig",
        r"dist",
        r"\+",
        # License
        r"gpl",
        # Arch
        # Needs to come before and after OS, appears in both orders
        r"ia32",
        r"intel",
        r"amd64",
        r"linux64",
        r"x64",
        r"64bit",
        r"x86[_-]64",
        r"i586_64",
        r"x86",
        r"i[36]86",
        r"ppc64(le)?",
        r"armv?(7l|6l|64)",
        # Other
        r"cpp",
        r"gtk",
        r"incubating",
        # OS
        r"[Ll]inux(_64)?",
        r"LINUX",
        r"[Uu]ni?x",
        r"[Ss]un[Oo][Ss]",
        r"[Mm]ac[Oo][Ss][Xx]?",
        r"[Oo][Ss][Xx]",
        r"[Dd]arwin(64)?",
        r"[Aa]pple",
        r"[Ww]indows",
        r"[Ww]in(64|32)?",
        r"[Cc]ygwin(64|32)?",
        r"[Mm]ingw",
        r"centos",
        # Arch
        # Needs to come before and after OS, appears in both orders
        r"ia32",
        r"intel",
        r"amd64",
        r"linux64",
        r"x64",
        r"64bit",
        r"x86[_-]64",
        r"i586_64",
        r"x86",
        r"i[36]86",
        r"ppc64(le)?",
        r"armv?(7l|6l|64)?",
        # PyPI
        r"[._-]py[23].*\.whl",
        r"[._-]cp[23].*\.whl",
        r"[._-]win.*\.exe",
    )

    stripped = path_or_url
    for token in trailing_patterns:
        # Drop the token, and an optional separator in front of it, from the end
        stripped = re.sub(rf"[._-]?{token}$", "", stripped)

    return stripped
+
+
def expand_contracted_extension(extension: str) -> str:
    """Returns the expanded version of a known contracted extension.

    Leading and trailing dots are removed from the input first. The remaining text is
    looked up in ``CONTRACTION_MAP``, so that e.g. "tgz" and ".tgz" both become "tar.gz".
    Extensions that are not in the map are returned with the dots removed, but otherwise
    unmodified.
    """
    bare = extension.strip(".")
    if bare in CONTRACTION_MAP:
        return CONTRACTION_MAP[bare]
    return bare
+
+
def expand_contracted_extension_in_path(
    path_or_url: str, *, extension: Optional[str] = None
) -> str:
    """Returns the input path or URL with any contraction extension expanded.

    For instance, with the "tgz" extension, "foo.tgz" becomes "foo.tar.gz". If the
    extension has no expansion, the input is returned unmodified.

    Args:
        path_or_url: path or URL to be expanded
        extension: if specified, only attempt to expand that extension (with or without
            a leading dot). Otherwise the extension is detected from the input.
    """
    extension = extension or extension_from_path(path_or_url)
    if extension is None:
        return path_or_url

    # Compare like with like: the expansion is always computed on the dot-less form
    contracted = extension.strip(".")
    expanded = expand_contracted_extension(contracted)
    if expanded == contracted:
        return path_or_url

    # Rewrite the extension only where it is used as an extension: right after a dot and
    # not as the beginning of a longer token. The text is escaped, since it is literal.
    # Anything that follows it (e.g. "/download" or a query string) is left in place.
    pattern = rf"\.{re.escape(contracted)}(?![A-Za-z0-9])"
    return re.sub(pattern, lambda _match: f".{expanded}", path_or_url)
+
+
def compression_ext_from_compressed_archive(extension: str) -> Optional[str]:
    """Returns compression extension for a compressed archive.

    The input is expanded first, if it is a contracted extension. The result is the first
    entry of ``EXTENSIONS`` that occurs in the expanded extension, or None if no entry
    does.
    """
    expanded = expand_contracted_extension(extension)
    return next((candidate for candidate in EXTENSIONS if candidate in expanded), None)
+
+
def strip_compression_extension(path_or_url: str, ext: Optional[str] = None) -> str:
    """Returns the input without its compression extension. For instance,
    "foo.tgz" becomes "foo.tar".

    Inputs without an allowed archive extension are returned unchanged. Otherwise any
    contracted extension is expanded first, then the first candidate extension that is
    present gets removed.

    Args:
        path_or_url: input to be stripped
        ext: if given, the only extension to try. If not given, the entries of
            ``EXTENSIONS`` are tried in order.
    """
    if not extension_from_path(path_or_url):
        return path_or_url

    expanded_path = expand_contracted_extension_in_path(path_or_url)
    candidates = EXTENSIONS if ext is None else (ext,)
    for candidate in candidates:
        stripped = check_and_remove_ext(expanded_path, extension=candidate)
        if stripped != expanded_path:
            return stripped

    # No candidate was present: only the expansion, if any, has been applied
    return expanded_path
+
+
def allowed_archive(path_or_url: str) -> bool:
    """Returns True if the input ends with one of the entries in
    ``ALLOWED_ARCHIVE_TYPES``, False otherwise. Empty input is never a valid archive.
    """
    if not path_or_url:
        return False
    # str.endswith accepts a tuple of suffixes and checks all of them in one call
    return path_or_url.endswith(ALLOWED_ARCHIVE_TYPES)
+
+
def determine_url_file_extension(path: str) -> str:
    """This returns the type of archive a URL refers to. This is
    sometimes confusing because of URLs like:

    (1) https://github.com/petdance/ack/tarball/1.93_02

    Where the URL doesn't actually contain the filename. We need
    to know what type it is so that we can appropriately name files
    in mirrors.

    GitHub "zipball" URLs are reported as "zip" and "tarball" URLs as "tar.gz". For any
    other input, the result is the extension computed by ``split_url_extension``.
    """
    # The dot is escaped, so that only a literal "github.com" is recognized
    match = re.search(r"github\.com/.+/(zip|tar)ball/", path)
    if match:
        # The group can only be "zip" or "tar"
        return "zip" if match.group(1) == "zip" else "tar.gz"

    # Only the extension, i.e. the second item of the result, is of interest here
    return split_url_extension(path)[1]