summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorHarmen Stoppels <me@harmenstoppels.nl>2024-09-19 12:29:56 +0200
committerGitHub <noreply@github.com>2024-09-19 12:29:56 +0200
commitd0b736607b8ce08c1a44550f20a50439497e3187 (patch)
treee6b454fe9bf8a71201f3e618ebe4d9676e0bf2e5 /lib
parentfe5d7881f5a5ec72e2ffbeccc647968dee44214e (diff)
downloadspack-d0b736607b8ce08c1a44550f20a50439497e3187.tar.gz
spack-d0b736607b8ce08c1a44550f20a50439497e3187.tar.bz2
spack-d0b736607b8ce08c1a44550f20a50439497e3187.tar.xz
spack-d0b736607b8ce08c1a44550f20a50439497e3187.zip
spack.util.url: fix join breakage in python 3.12.6 (#46453)
Diffstat (limited to 'lib')
-rw-r--r--lib/spack/spack/test/util/util_url.py208
-rw-r--r--lib/spack/spack/util/url.py194
2 files changed, 84 insertions, 318 deletions
diff --git a/lib/spack/spack/test/util/util_url.py b/lib/spack/spack/test/util/util_url.py
index befaaef1cd..0666d7a28f 100644
--- a/lib/spack/spack/test/util/util_url.py
+++ b/lib/spack/spack/test/util/util_url.py
@@ -8,6 +8,8 @@ import os
import os.path
import urllib.parse
+import pytest
+
import spack.util.path
import spack.util.url as url_util
@@ -45,155 +47,63 @@ def test_relative_path_to_file_url(tmpdir):
assert os.path.samefile(roundtrip, path)
-def test_url_join_local_paths():
- # Resolve local link against page URL
-
- # wrong:
- assert (
- url_util.join("s3://bucket/index.html", "../other-bucket/document.txt")
- == "s3://bucket/other-bucket/document.txt"
- )
-
- # correct - need to specify resolve_href=True:
- assert (
- url_util.join("s3://bucket/index.html", "../other-bucket/document.txt", resolve_href=True)
- == "s3://other-bucket/document.txt"
- )
-
- # same as above: make sure several components are joined together correctly
- assert (
- url_util.join(
- # with resolve_href=True, first arg is the base url; can not be
- # broken up
- "s3://bucket/index.html",
- # with resolve_href=True, remaining arguments are the components of
- # the local href that needs to be resolved
- "..",
- "other-bucket",
- "document.txt",
- resolve_href=True,
- )
- == "s3://other-bucket/document.txt"
- )
-
- # Append local path components to prefix URL
-
- # wrong:
- assert (
- url_util.join("https://mirror.spack.io/build_cache", "my-package", resolve_href=True)
- == "https://mirror.spack.io/my-package"
- )
-
- # correct - Need to specify resolve_href=False:
- assert (
- url_util.join("https://mirror.spack.io/build_cache", "my-package", resolve_href=False)
- == "https://mirror.spack.io/build_cache/my-package"
- )
-
- # same as above; make sure resolve_href=False is default
- assert (
- url_util.join("https://mirror.spack.io/build_cache", "my-package")
- == "https://mirror.spack.io/build_cache/my-package"
- )
-
- # same as above: make sure several components are joined together correctly
- assert (
- url_util.join(
- # with resolve_href=False, first arg is just a prefix. No
- # resolution is done. So, there should be no difference between
- # join('/a/b/c', 'd/e'),
- # join('/a/b', 'c', 'd/e'),
- # join('/a', 'b/c', 'd', 'e'), etc.
- "https://mirror.spack.io",
- "build_cache",
- "my-package",
- )
- == "https://mirror.spack.io/build_cache/my-package"
- )
-
- # For s3:// URLs, the "netloc" (bucket) is considered part of the path.
- # Make sure join() can cross bucket boundaries in this case.
- args = ["s3://bucket/a/b", "new-bucket", "c"]
- assert url_util.join(*args) == "s3://bucket/a/b/new-bucket/c"
-
- args.insert(1, "..")
- assert url_util.join(*args) == "s3://bucket/a/new-bucket/c"
-
- args.insert(1, "..")
- assert url_util.join(*args) == "s3://bucket/new-bucket/c"
-
- # new-bucket is now the "netloc" (bucket name)
- args.insert(1, "..")
- assert url_util.join(*args) == "s3://new-bucket/c"
-
-
-def test_url_join_absolute_paths():
- # Handling absolute path components is a little tricky. To this end, we
- # distinguish "absolute path components", from the more-familiar concept of
- # "absolute paths" as they are understood for local filesystem paths.
- #
- # - All absolute paths are absolute path components. Joining a URL with
- # these components has the effect of completely replacing the path of the
- # URL with the absolute path. These components do not specify a URL
- # scheme, so the scheme of the URL procuced when joining them depend on
- # those provided by components that came before it (file:// assumed if no
- # such scheme is provided).
-
- # For eaxmple:
- p = "/path/to/resource"
- # ...is an absolute path
-
- # http:// URL
- assert url_util.join("http://example.com/a/b/c", p) == "http://example.com/path/to/resource"
-
- # s3:// URL
- # also notice how the netloc is treated as part of the path for s3:// URLs
- assert url_util.join("s3://example.com/a/b/c", p) == "s3://path/to/resource"
-
- # - URL components that specify a scheme are always absolute path
- # components. Joining a base URL with these components effectively
- # discards the base URL and "resets" the joining logic starting at the
- # component in question and using it as the new base URL.
-
- # For eaxmple:
- p = "http://example.com/path/to"
- # ...is an http:// URL
-
- join_result = url_util.join(p, "resource")
- assert join_result == "http://example.com/path/to/resource"
-
- # works as if everything before the http:// URL was left out
- assert url_util.join("literally", "does", "not", "matter", p, "resource") == join_result
-
- assert url_util.join("file:///a/b/c", "./d") == "file:///a/b/c/d"
-
- # Finally, resolve_href should have no effect for how absolute path
- # components are handled because local hrefs can not be absolute path
- # components.
- args = [
- "s3://does",
- "not",
- "matter",
- "http://example.com",
- "also",
- "does",
- "not",
- "matter",
- "/path",
- ]
-
- expected = "http://example.com/path"
- assert url_util.join(*args, resolve_href=True) == expected
- assert url_util.join(*args, resolve_href=False) == expected
-
- # resolve_href only matters for the local path components at the end of the
- # argument list.
- args[-1] = "/path/to/page"
- args.extend(("..", "..", "resource"))
-
- assert url_util.join(*args, resolve_href=True) == "http://example.com/resource"
-
- assert url_util.join(*args, resolve_href=False) == "http://example.com/path/resource"
+@pytest.mark.parametrize("resolve_href", [True, False])
+@pytest.mark.parametrize("scheme", ["http", "s3", "gs", "file"])
+def test_url_join_absolute(scheme, resolve_href):
+ """Test that joining a URL with an absolute path works the same for schemes we care about, and
+ whether we work in web browser mode or not."""
+ netloc = "" if scheme == "file" else "example.com"
+ a1 = url_util.join(f"{scheme}://{netloc}/a/b/c", "/d/e/f", resolve_href=resolve_href)
+ a2 = url_util.join(f"{scheme}://{netloc}/a/b/c", "/d", "e", "f", resolve_href=resolve_href)
+ assert a1 == a2 == f"{scheme}://{netloc}/d/e/f"
+
+ b1 = url_util.join(f"{scheme}://{netloc}/a", "https://b.com/b", resolve_href=resolve_href)
+ b2 = url_util.join(f"{scheme}://{netloc}/a", "https://b.com", "b", resolve_href=resolve_href)
+ assert b1 == b2 == "https://b.com/b"
+
+
+@pytest.mark.parametrize("scheme", ["http", "s3", "gs"])
+def test_url_join_up(scheme):
+ """Test that the netloc component is preserved when going .. up in the path."""
+ a1 = url_util.join(f"{scheme}://netloc/a/b.html", "c", resolve_href=True)
+ assert a1 == f"{scheme}://netloc/a/c"
+ b1 = url_util.join(f"{scheme}://netloc/a/b.html", "../c", resolve_href=True)
+ b2 = url_util.join(f"{scheme}://netloc/a/b.html", "..", "c", resolve_href=True)
+ assert b1 == b2 == f"{scheme}://netloc/c"
+ c1 = url_util.join(f"{scheme}://netloc/a/b.html", "../../c", resolve_href=True)
+ c2 = url_util.join(f"{scheme}://netloc/a/b.html", "..", "..", "c", resolve_href=True)
+ assert c1 == c2 == f"{scheme}://netloc/c"
+
+ d1 = url_util.join(f"{scheme}://netloc/a/b", "c", resolve_href=False)
+ assert d1 == f"{scheme}://netloc/a/b/c"
+ d2 = url_util.join(f"{scheme}://netloc/a/b", "../c", resolve_href=False)
+ d3 = url_util.join(f"{scheme}://netloc/a/b", "..", "c", resolve_href=False)
+ assert d2 == d3 == f"{scheme}://netloc/a/c"
+ e1 = url_util.join(f"{scheme}://netloc/a/b", "../../c", resolve_href=False)
+ e2 = url_util.join(f"{scheme}://netloc/a/b", "..", "..", "c", resolve_href=False)
+ assert e1 == e2 == f"{scheme}://netloc/c"
+ f1 = url_util.join(f"{scheme}://netloc/a/b", "../../../c", resolve_href=False)
+ f2 = url_util.join(f"{scheme}://netloc/a/b", "..", "..", "..", "c", resolve_href=False)
+ assert f1 == f2 == f"{scheme}://netloc/c"
+
+
+@pytest.mark.parametrize("scheme", ["http", "https", "ftp", "s3", "gs", "file"])
+def test_url_join_resolve_href(scheme):
+ """test that `resolve_href=True` behaves like a web browser at the base page, and
+ `resolve_href=False` behaves like joining paths in a file system at the base directory."""
+ # these are equivalent because of the trailing /
+ netloc = "" if scheme == "file" else "netloc"
+ a1 = url_util.join(f"{scheme}://{netloc}/my/path/", "other/path", resolve_href=True)
+ a2 = url_util.join(f"{scheme}://{netloc}/my/path/", "other", "path", resolve_href=True)
+ assert a1 == a2 == f"{scheme}://{netloc}/my/path/other/path"
+ b1 = url_util.join(f"{scheme}://{netloc}/my/path", "other/path", resolve_href=False)
+ b2 = url_util.join(f"{scheme}://{netloc}/my/path", "other", "path", resolve_href=False)
+ assert b1 == b2 == f"{scheme}://{netloc}/my/path/other/path"
+
+ # this is like a web browser: relative to /my.
+ c1 = url_util.join(f"{scheme}://{netloc}/my/path", "other/path", resolve_href=True)
+ c2 = url_util.join(f"{scheme}://{netloc}/my/path", "other", "path", resolve_href=True)
+ assert c1 == c2 == f"{scheme}://{netloc}/my/other/path"
def test_default_download_name():
diff --git a/lib/spack/spack/util/url.py b/lib/spack/spack/util/url.py
index 64671f1d95..982e192a4c 100644
--- a/lib/spack/spack/util/url.py
+++ b/lib/spack/spack/util/url.py
@@ -7,17 +7,13 @@
Utility functions for parsing, formatting, and manipulating URLs.
"""
-import itertools
import os
import posixpath
import re
-import sys
import urllib.parse
import urllib.request
from typing import Optional
-from llnl.path import convert_to_posix_path
-
from spack.util.path import sanitize_filename
@@ -29,26 +25,6 @@ def validate_scheme(scheme):
return scheme in ("file", "http", "https", "ftp", "s3", "gs", "ssh", "git")
-def _split_all(path):
- """Split path into its atomic components.
-
- Returns the shortest list, L, of strings such that posixpath.join(*L) ==
- path and posixpath.split(element) == ('', element) for every element in L
- except possibly the first. This first element may possibly have the value
- of '/'.
- """
- result = []
- a = path
- old_a = None
- while a != old_a:
- (old_a, (a, b)) = a, posixpath.split(a)
-
- if a or b:
- result.insert(0, b or "/")
-
- return result
-
-
def local_file_path(url):
"""Get a local file path from a url.
@@ -92,151 +68,31 @@ def format(parsed_url):
return parsed_url.geturl()
-def join(base_url, path, *extra, **kwargs):
- """Joins a base URL with one or more local URL path components
-
- If resolve_href is True, treat the base URL as though it where the locator
- of a web page, and the remaining URL path components as though they formed
- a relative URL to be resolved against it (i.e.: as in posixpath.join(...)).
- The result is an absolute URL to the resource to which a user's browser
- would navigate if they clicked on a link with an "href" attribute equal to
- the relative URL.
-
- If resolve_href is False (default), then the URL path components are joined
- as in posixpath.join().
-
- Note: file:// URL path components are not canonicalized as part of this
- operation. To canonicalize, pass the joined url to format().
-
- Examples:
- base_url = 's3://bucket/index.html'
- body = fetch_body(prefix)
- link = get_href(body) # link == '../other-bucket/document.txt'
-
- # wrong - link is a local URL that needs to be resolved against base_url
- spack.util.url.join(base_url, link)
- 's3://bucket/other_bucket/document.txt'
-
- # correct - resolve local URL against base_url
- spack.util.url.join(base_url, link, resolve_href=True)
- 's3://other_bucket/document.txt'
-
- prefix = 'https://mirror.spack.io/build_cache'
-
- # wrong - prefix is just a URL prefix
- spack.util.url.join(prefix, 'my-package', resolve_href=True)
- 'https://mirror.spack.io/my-package'
-
- # correct - simply append additional URL path components
- spack.util.url.join(prefix, 'my-package', resolve_href=False) # default
- 'https://mirror.spack.io/build_cache/my-package'
-
- # For canonicalizing file:// URLs, take care to explicitly differentiate
- # between absolute and relative join components.
- """
- paths = [
- (x) if isinstance(x, str) else x.geturl() for x in itertools.chain((base_url, path), extra)
- ]
-
- paths = [convert_to_posix_path(x) for x in paths]
- n = len(paths)
- last_abs_component = None
- scheme = ""
- for i in range(n - 1, -1, -1):
- obj = urllib.parse.urlparse(paths[i], scheme="", allow_fragments=False)
-
- scheme = obj.scheme
-
- # in either case the component is absolute
- if scheme or obj.path.startswith("/"):
- if not scheme:
- # Without a scheme, we have to go back looking for the
- # next-last component that specifies a scheme.
- for j in range(i - 1, -1, -1):
- obj = urllib.parse.urlparse(paths[j], scheme="", allow_fragments=False)
-
- if obj.scheme:
- paths[i] = "{SM}://{NL}{PATH}".format(
- SM=obj.scheme,
- NL=((obj.netloc + "/") if obj.scheme != "s3" else ""),
- PATH=paths[i][1:],
- )
- break
-
- last_abs_component = i
- break
-
- if last_abs_component is not None:
- paths = paths[last_abs_component:]
- if len(paths) == 1:
- result = urllib.parse.urlparse(paths[0], scheme="file", allow_fragments=False)
-
- # another subtlety: If the last argument to join() is an absolute
- # file:// URL component with a relative path, the relative path
- # needs to be resolved.
- if result.scheme == "file" and result.netloc:
- result = urllib.parse.ParseResult(
- scheme=result.scheme,
- netloc="",
- path=posixpath.abspath(result.netloc + result.path),
- params=result.params,
- query=result.query,
- fragment=None,
- )
-
- return result.geturl()
-
- return _join(*paths, **kwargs)
-
-
-def _join(base_url, path, *extra, **kwargs):
- base_url = urllib.parse.urlparse(base_url)
- resolve_href = kwargs.get("resolve_href", False)
-
- (scheme, netloc, base_path, params, query, _) = base_url
- scheme = scheme.lower()
-
- path_tokens = [
- part
- for part in itertools.chain(
- _split_all(path),
- itertools.chain.from_iterable(_split_all(extra_path) for extra_path in extra),
- )
- if part and part != "/"
- ]
-
- base_path_args = ["/fake-root"]
- if scheme == "s3":
- if netloc:
- base_path_args.append(netloc)
-
- if base_path.startswith("/"):
- base_path = base_path[1:]
-
- base_path_args.append(base_path)
-
- if resolve_href:
- new_base_path, _ = posixpath.split(posixpath.join(*base_path_args))
- base_path_args = [new_base_path]
-
- base_path_args.extend(path_tokens)
- base_path = posixpath.relpath(posixpath.join(*base_path_args), "/fake-root")
-
- if scheme == "s3":
- path_tokens = [part for part in _split_all(base_path) if part and part != "/"]
-
- if path_tokens:
- netloc = path_tokens.pop(0)
- base_path = posixpath.join("", *path_tokens)
-
- if sys.platform == "win32":
- base_path = convert_to_posix_path(base_path)
-
- return format(
- urllib.parse.ParseResult(
- scheme=scheme, netloc=netloc, path=base_path, params=params, query=query, fragment=None
- )
- )
+def join(base: str, *components: str, resolve_href: bool = False, **kwargs) -> str:
+ """Convenience wrapper around ``urllib.parse.urljoin``, with a few differences:
+ 1. By default resolve_href=False, which makes the function like os.path.join: for example
+ https://example.com/a/b + c/d = https://example.com/a/b/c/d. If resolve_href=True, the
+ behavior is how a browser would resolve the URL: https://example.com/a/c/d.
+ 2. s3:// and gs:// URLs are joined like http:// URLs.
+ 3. It accepts multiple components for convenience. Note that components[1:] are treated as
+ literal path components and appended to components[0] separated by slashes."""
+ # Ensure a trailing slash in the path component of the base URL to get os.path.join-like
+ # behavior instead of web browser behavior.
+ if not resolve_href:
+ parsed = urllib.parse.urlparse(base)
+ if not parsed.path.endswith("/"):
+ base = parsed._replace(path=f"{parsed.path}/").geturl()
+ uses_netloc = urllib.parse.uses_netloc
+ uses_relative = urllib.parse.uses_relative
+ try:
+ # NOTE: we temporarily modify urllib internals so s3 and gs schemes are treated like http.
+ # This is non-portable, and may be forward incompatible with future cpython versions.
+ urllib.parse.uses_netloc = [*uses_netloc, "s3", "gs"]
+ urllib.parse.uses_relative = [*uses_relative, "s3", "gs"]
+ return urllib.parse.urljoin(base, "/".join(components), **kwargs)
+ finally:
+ urllib.parse.uses_netloc = uses_netloc
+ urllib.parse.uses_relative = uses_relative
def default_download_filename(url: str) -> str: