summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOmar Padron <omar.padron@kitware.com>2019-11-01 06:42:43 -0400
committerTodd Gamblin <tgamblin@llnl.gov>2019-11-01 03:50:47 -0700
commit01a0d554f595bc43bc5295d28248b992a53aa5f7 (patch)
tree12125d45fa0694e014349b145980c8f70c46c297
parent2a9d6b9fbf5a2694dccd77fd75bb6517f6430dc1 (diff)
downloadspack-01a0d554f595bc43bc5295d28248b992a53aa5f7.tar.gz
spack-01a0d554f595bc43bc5295d28248b992a53aa5f7.tar.bz2
spack-01a0d554f595bc43bc5295d28248b992a53aa5f7.tar.xz
spack-01a0d554f595bc43bc5295d28248b992a53aa5f7.zip
bugfix: spack.util.url.join() now handles absolute paths correctly (#13488)
* fix issue where spack.util.url.join() failed to correctly handle absolute path components * add url util tests
-rw-r--r--lib/spack/spack/test/util/util_url.py303
-rw-r--r--lib/spack/spack/util/url.py77
-rw-r--r--lib/spack/spack/util/web.py1
3 files changed, 379 insertions, 2 deletions
diff --git a/lib/spack/spack/test/util/util_url.py b/lib/spack/spack/test/util/util_url.py
new file mode 100644
index 0000000000..24b40ac63c
--- /dev/null
+++ b/lib/spack/spack/test/util/util_url.py
@@ -0,0 +1,303 @@
+# Copyright 2013-2019 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+"""Test Spack's URL handling utility functions."""
+import os
+import os.path
+import spack.util.url as url_util
+
+
+def test_url_parse():
+ parsed = url_util.parse('/path/to/resource')
+ assert(parsed.scheme == 'file')
+ assert(parsed.netloc == '')
+ assert(parsed.path == '/path/to/resource')
+
+ parsed = url_util.parse('/path/to/resource', scheme='fake')
+ assert(parsed.scheme == 'fake')
+ assert(parsed.netloc == '')
+ assert(parsed.path == '/path/to/resource')
+
+ parsed = url_util.parse('file:///path/to/resource')
+ assert(parsed.scheme == 'file')
+ assert(parsed.netloc == '')
+ assert(parsed.path == '/path/to/resource')
+
+ parsed = url_util.parse('file:///path/to/resource', scheme='fake')
+ assert(parsed.scheme == 'file')
+ assert(parsed.netloc == '')
+ assert(parsed.path == '/path/to/resource')
+
+ parsed = url_util.parse('file://path/to/resource')
+ assert(parsed.scheme == 'file')
+ assert(parsed.netloc == '')
+ expected = os.path.abspath(os.path.join('path', 'to', 'resource'))
+ assert(parsed.path == expected)
+
+ parsed = url_util.parse('https://path/to/resource')
+ assert(parsed.scheme == 'https')
+ assert(parsed.netloc == 'path')
+ assert(parsed.path == '/to/resource')
+
+ spack_root = os.path.abspath(os.environ['SPACK_ROOT'])
+ parsed = url_util.parse('$spack')
+ assert(parsed.scheme == 'file')
+ assert(parsed.netloc == '')
+ assert(parsed.path == spack_root)
+
+ parsed = url_util.parse('/a/b/c/$spack')
+ assert(parsed.scheme == 'file')
+ assert(parsed.netloc == '')
+ expected = os.path.abspath(os.path.join(
+ '/', 'a', 'b', 'c', './' + spack_root))
+ assert(parsed.path == expected)
+
+
+def test_url_local_file_path():
+ spack_root = os.path.abspath(os.environ['SPACK_ROOT'])
+
+ lfp = url_util.local_file_path('/a/b/c.txt')
+ assert(lfp == '/a/b/c.txt')
+
+ lfp = url_util.local_file_path('file:///a/b/c.txt')
+ assert(lfp == '/a/b/c.txt')
+
+ lfp = url_util.local_file_path('file://a/b/c.txt')
+ expected = os.path.abspath(os.path.join('a', 'b', 'c.txt'))
+ assert(lfp == expected)
+
+ lfp = url_util.local_file_path('$spack/a/b/c.txt')
+ expected = os.path.abspath(os.path.join(spack_root, 'a', 'b', 'c.txt'))
+ assert(lfp == expected)
+
+ lfp = url_util.local_file_path('file:///$spack/a/b/c.txt')
+ expected = os.path.abspath(os.path.join(spack_root, 'a', 'b', 'c.txt'))
+ assert(lfp == expected)
+
+ lfp = url_util.local_file_path('file://$spack/a/b/c.txt')
+ expected = os.path.abspath(os.path.join(spack_root, 'a', 'b', 'c.txt'))
+ assert(lfp == expected)
+
+ # not a file:// URL - so no local file path
+ lfp = url_util.local_file_path('http:///a/b/c.txt')
+ assert(lfp is None)
+
+ lfp = url_util.local_file_path('http://a/b/c.txt')
+ assert(lfp is None)
+
+ lfp = url_util.local_file_path('http:///$spack/a/b/c.txt')
+ assert(lfp is None)
+
+ lfp = url_util.local_file_path('http://$spack/a/b/c.txt')
+ assert(lfp is None)
+
+
+def test_url_join_local_paths():
+ # Resolve local link against page URL
+
+ # wrong:
+ assert(
+ url_util.join(
+ 's3://bucket/index.html',
+ '../other-bucket/document.txt')
+ ==
+ 's3://bucket/other-bucket/document.txt')
+
+ # correct - need to specify resolve_href=True:
+ assert(
+ url_util.join(
+ 's3://bucket/index.html',
+ '../other-bucket/document.txt',
+ resolve_href=True)
+ ==
+ 's3://other-bucket/document.txt')
+
+ # same as above: make sure several components are joined together correctly
+ assert(
+ url_util.join(
+ # with resolve_href=True, first arg is the base url; can not be
+ # broken up
+ 's3://bucket/index.html',
+
+ # with resolve_href=True, remaining arguments are the components of
+ # the local href that needs to be resolved
+ '..', 'other-bucket', 'document.txt',
+ resolve_href=True)
+ ==
+ 's3://other-bucket/document.txt')
+
+ # Append local path components to prefix URL
+
+ # wrong:
+ assert(
+ url_util.join(
+ 'https://mirror.spack.io/build_cache',
+ 'my-package',
+ resolve_href=True)
+ ==
+ 'https://mirror.spack.io/my-package')
+
+ # correct - Need to specify resolve_href=False:
+ assert(
+ url_util.join(
+ 'https://mirror.spack.io/build_cache',
+ 'my-package',
+ resolve_href=False)
+ ==
+ 'https://mirror.spack.io/build_cache/my-package')
+
+ # same as above; make sure resolve_href=False is default
+ assert(
+ url_util.join(
+ 'https://mirror.spack.io/build_cache',
+ 'my-package')
+ ==
+ 'https://mirror.spack.io/build_cache/my-package')
+
+ # same as above: make sure several components are joined together correctly
+ assert(
+ url_util.join(
+ # with resolve_href=False, first arg is just a prefix. No
+ # resolution is done. So, there should be no difference between
+ # join('/a/b/c', 'd/e'),
+ # join('/a/b', 'c', 'd/e'),
+ # join('/a', 'b/c', 'd', 'e'), etc.
+ 'https://mirror.spack.io',
+ 'build_cache',
+ 'my-package')
+ ==
+ 'https://mirror.spack.io/build_cache/my-package')
+
+ # file:// URL path components are *NOT* canonicalized
+ spack_root = os.path.abspath(os.environ['SPACK_ROOT'])
+
+ join_result = url_util.join('/a/b/c', '$spack')
+ assert(join_result == 'file:///a/b/c/$spack') # not canonicalized
+ format_result = url_util.format(join_result)
+ # canonicalize by hand
+ expected = url_util.format(os.path.abspath(os.path.join(
+ '/', 'a', 'b', 'c', '.' + spack_root)))
+ assert(format_result == expected)
+
+ # see test_url_join_absolute_paths() for more on absolute path components
+ join_result = url_util.join('/a/b/c', '/$spack')
+ assert(join_result == 'file:///$spack') # not canonicalized
+ format_result = url_util.format(join_result)
+ expected = url_util.format(spack_root)
+ assert(format_result == expected)
+
+ # For s3:// URLs, the "netloc" (bucket) is considered part of the path.
+ # Make sure join() can cross bucket boundaries in this case.
+ args = ['s3://bucket/a/b', 'new-bucket', 'c']
+ assert(url_util.join(*args) == 's3://bucket/a/b/new-bucket/c')
+
+ args.insert(1, '..')
+ assert(url_util.join(*args) == 's3://bucket/a/new-bucket/c')
+
+ args.insert(1, '..')
+ assert(url_util.join(*args) == 's3://bucket/new-bucket/c')
+
+ # new-bucket is now the "netloc" (bucket name)
+ args.insert(1, '..')
+ assert(url_util.join(*args) == 's3://new-bucket/c')
+
+
+def test_url_join_absolute_paths():
+ # Handling absolute path components is a little tricky. To this end, we
+ # distinguish "absolute path components", from the more-familiar concept of
+ # "absolute paths" as they are understood for local filesystem paths.
+ #
+ # - All absolute paths are absolute path components. Joining a URL with
+ # these components has the effect of completely replacing the path of the
+ # URL with the absolute path. These components do not specify a URL
+# scheme, so the scheme of the URL produced when joining them depends on
+ # those provided by components that came before it (file:// assumed if no
+ # such scheme is provided).
+
+ # For example:
+ p = '/path/to/resource'
+ # ...is an absolute path
+
+ # http:// URL
+ assert(
+ url_util.join('http://example.com/a/b/c', p)
+ == 'http://example.com/path/to/resource')
+
+ # s3:// URL
+ # also notice how the netloc is treated as part of the path for s3:// URLs
+ assert(
+ url_util.join('s3://example.com/a/b/c', p)
+ == 's3://path/to/resource')
+
+ # - URL components that specify a scheme are always absolute path
+ # components. Joining a base URL with these components effectively
+ # discards the base URL and "resets" the joining logic starting at the
+ # component in question and using it as the new base URL.
+
+ # For example:
+ p = 'http://example.com/path/to'
+ # ...is an http:// URL
+
+ join_result = url_util.join(p, 'resource')
+ assert(join_result == 'http://example.com/path/to/resource')
+
+ # works as if everything before the http:// URL was left out
+ assert(
+ url_util.join(
+ 'literally', 'does', 'not', 'matter',
+ p, 'resource')
+ == join_result)
+
+ # It's important to keep in mind that this logic applies even if the
+ # component's path is not an absolute path!
+
+ # For example:
+ p = './d'
+ # ...is *NOT* an absolute path
+ # ...is also *NOT* an absolute path component
+
+ u = 'file://./d'
+ # ...is a URL
+ # The path of this URL is *NOT* an absolute path
+ # HOWEVER, the URL, itself, *is* an absolute path component
+
+ # (We just need...
+ cwd = os.getcwd()
+ # ...to work out what resource it points to)
+
+ # So, even though parse() assumes "file://" URL, the scheme is still
+ # significant in URL path components passed to join(), even if the base
+ # is a file:// URL.
+
+ path_join_result = 'file:///a/b/c/d'
+ assert(url_util.join('/a/b/c', p) == path_join_result)
+ assert(url_util.join('file:///a/b/c', p) == path_join_result)
+
+ url_join_result = 'file://{CWD}/d'.format(CWD=cwd)
+ assert(url_util.join('/a/b/c', u) == url_join_result)
+ assert(url_util.join('file:///a/b/c', u) == url_join_result)
+
+ # Finally, resolve_href should have no effect for how absolute path
+ # components are handled because local hrefs can not be absolute path
+ # components.
+ args = ['s3://does', 'not', 'matter',
+ 'http://example.com',
+ 'also', 'does', 'not', 'matter',
+ '/path']
+
+ expected = 'http://example.com/path'
+ assert(url_util.join(*args, resolve_href=True) == expected)
+ assert(url_util.join(*args, resolve_href=False) == expected)
+
+ # resolve_href only matters for the local path components at the end of the
+ # argument list.
+ args[-1] = '/path/to/page'
+ args.extend(('..', '..', 'resource'))
+
+ assert(url_util.join(*args, resolve_href=True) ==
+ 'http://example.com/resource')
+
+ assert(url_util.join(*args, resolve_href=False) ==
+ 'http://example.com/path/resource')
diff --git a/lib/spack/spack/util/url.py b/lib/spack/spack/util/url.py
index 6b2786f244..7ac12e7b81 100644
--- a/lib/spack/spack/util/url.py
+++ b/lib/spack/spack/util/url.py
@@ -51,7 +51,7 @@ def local_file_path(url):
def parse(url, scheme='file'):
- """Parse a mirror url.
+ """Parse a url.
For file:// URLs, the netloc and path components are concatenated and
passed through spack.util.path.canonicalize_path().
@@ -105,6 +105,9 @@ def join(base_url, path, *extra, **kwargs):
If resolve_href is False (default), then the URL path components are joined
as in os.path.join().
+ Note: file:// URL path components are not canonicalized as part of this
+ operation. To canonicalize, pass the joined url to format().
+
Examples:
base_url = 's3://bucket/index.html'
body = fetch_body(prefix)
@@ -127,7 +130,79 @@ def join(base_url, path, *extra, **kwargs):
# correct - simply append additional URL path components
spack.util.url.join(prefix, 'my-package', resolve_href=False) # default
'https://mirror.spack.io/build_cache/my-package'
+
+ # For canonicalizing file:// URLs, take care to explicitly differentiate
+ # between absolute and relative join components.
+
+ # '$spack' is not an absolute path component
+ join_result = spack.util.url.join('/a/b/c', '$spack') ; join_result
+ 'file:///a/b/c/$spack'
+ spack.util.url.format(join_result)
+ 'file:///a/b/c/opt/spack'
+
+ # '/$spack' *is* an absolute path component
+ join_result = spack.util.url.join('/a/b/c', '/$spack') ; join_result
+ 'file:///$spack'
+ spack.util.url.format(join_result)
+ 'file:///opt/spack'
"""
+ paths = [
+ (x if isinstance(x, string_types) else x.geturl())
+ for x in itertools.chain((base_url, path), extra)]
+ n = len(paths)
+ last_abs_component = None
+ scheme = None
+ for i in range(n - 1, -1, -1):
+ obj = urllib_parse.urlparse(
+ paths[i], scheme=None, allow_fragments=False)
+
+ scheme = obj.scheme
+
+ # in either case the component is absolute
+ if scheme is not None or obj.path.startswith('/'):
+ if scheme is None:
+ # Without a scheme, we have to go back looking for the
+ # next-last component that specifies a scheme.
+ for j in range(i - 1, -1, -1):
+ obj = urllib_parse.urlparse(
+ paths[j], scheme=None, allow_fragments=False)
+
+ if obj.scheme:
+ paths[i] = '{SM}://{NL}{PATH}'.format(
+ SM=obj.scheme,
+ NL=(
+ (obj.netloc + '/')
+ if obj.scheme != 's3' else ''),
+ PATH=paths[i][1:])
+ break
+
+ last_abs_component = i
+ break
+
+ if last_abs_component is not None:
+ paths = paths[last_abs_component:]
+ if len(paths) == 1:
+ result = urllib_parse.urlparse(
+ paths[0], scheme='file', allow_fragments=False)
+
+ # another subtlety: If the last argument to join() is an absolute
+ # file:// URL component with a relative path, the relative path
+ # needs to be resolved.
+ if result.scheme == 'file' and result.netloc:
+ result = urllib_parse.ParseResult(
+ scheme=result.scheme,
+ netloc='',
+ path=os.path.abspath(result.netloc + result.path),
+ params=result.params,
+ query=result.query,
+ fragment=None)
+
+ return result.geturl()
+
+ return _join(*paths, **kwargs)
+
+
+def _join(base_url, path, *extra, **kwargs):
base_url = parse(base_url)
resolve_href = kwargs.get('resolve_href', False)
diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py
index abf549cc89..f2afe769c6 100644
--- a/lib/spack/spack/util/web.py
+++ b/lib/spack/spack/util/web.py
@@ -477,7 +477,6 @@ def spider(root, depth=0):
performance over a sequential fetch.
"""
-
root = url_util.parse(root)
pages, links = _spider(root, set(), root, 0, depth, False)
return pages, links