From 01a0d554f595bc43bc5295d28248b992a53aa5f7 Mon Sep 17 00:00:00 2001 From: Omar Padron Date: Fri, 1 Nov 2019 06:42:43 -0400 Subject: bugfix: spack.util.url.join() now handles absolute paths correctly (#13488) * fix issue where spack.util.url.join() failed to correctly handle absolute path components * add url util tests --- lib/spack/spack/test/util/util_url.py | 303 ++++++++++++++++++++++++++++++++++ lib/spack/spack/util/url.py | 77 ++++++++- lib/spack/spack/util/web.py | 1 - 3 files changed, 379 insertions(+), 2 deletions(-) create mode 100644 lib/spack/spack/test/util/util_url.py diff --git a/lib/spack/spack/test/util/util_url.py b/lib/spack/spack/test/util/util_url.py new file mode 100644 index 0000000000..24b40ac63c --- /dev/null +++ b/lib/spack/spack/test/util/util_url.py @@ -0,0 +1,303 @@ +# Copyright 2013-2019 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + +"""Test Spack's URL handling utility functions.""" +import os +import os.path +import spack.util.url as url_util + + +def test_url_parse(): + parsed = url_util.parse('/path/to/resource') + assert(parsed.scheme == 'file') + assert(parsed.netloc == '') + assert(parsed.path == '/path/to/resource') + + parsed = url_util.parse('/path/to/resource', scheme='fake') + assert(parsed.scheme == 'fake') + assert(parsed.netloc == '') + assert(parsed.path == '/path/to/resource') + + parsed = url_util.parse('file:///path/to/resource') + assert(parsed.scheme == 'file') + assert(parsed.netloc == '') + assert(parsed.path == '/path/to/resource') + + parsed = url_util.parse('file:///path/to/resource', scheme='fake') + assert(parsed.scheme == 'file') + assert(parsed.netloc == '') + assert(parsed.path == '/path/to/resource') + + parsed = url_util.parse('file://path/to/resource') + assert(parsed.scheme == 'file') + assert(parsed.netloc == '') + expected = os.path.abspath(os.path.join('path', 'to', 'resource')) + assert(parsed.path == expected) + + parsed = url_util.parse('https://path/to/resource') + assert(parsed.scheme == 'https') + assert(parsed.netloc == 'path') + assert(parsed.path == '/to/resource') + + spack_root = os.path.abspath(os.environ['SPACK_ROOT']) + parsed = url_util.parse('$spack') + assert(parsed.scheme == 'file') + assert(parsed.netloc == '') + assert(parsed.path == spack_root) + + parsed = url_util.parse('/a/b/c/$spack') + assert(parsed.scheme == 'file') + assert(parsed.netloc == '') + expected = os.path.abspath(os.path.join( + '/', 'a', 'b', 'c', './' + spack_root)) + assert(parsed.path == expected) + + +def test_url_local_file_path(): + spack_root = os.path.abspath(os.environ['SPACK_ROOT']) + + lfp = url_util.local_file_path('/a/b/c.txt') + assert(lfp == '/a/b/c.txt') + + lfp = url_util.local_file_path('file:///a/b/c.txt') + assert(lfp == '/a/b/c.txt') + + lfp = url_util.local_file_path('file://a/b/c.txt') + expected = os.path.abspath(os.path.join('a', 'b', 'c.txt')) + assert(lfp == expected) + + lfp = url_util.local_file_path('$spack/a/b/c.txt') + expected = os.path.abspath(os.path.join(spack_root, 'a', 'b', 'c.txt')) + assert(lfp == expected) + + lfp = url_util.local_file_path('file:///$spack/a/b/c.txt') + expected = os.path.abspath(os.path.join(spack_root, 'a', 'b', 'c.txt')) + assert(lfp == expected) + + lfp = url_util.local_file_path('file://$spack/a/b/c.txt') + expected = os.path.abspath(os.path.join(spack_root, 'a', 'b', 'c.txt')) + assert(lfp == expected) + + # not a file:// URL - so no local file path + lfp = url_util.local_file_path('http:///a/b/c.txt') + assert(lfp is None) + + lfp = url_util.local_file_path('http://a/b/c.txt') + assert(lfp is None) + + lfp = url_util.local_file_path('http:///$spack/a/b/c.txt') + assert(lfp is None) + + lfp = url_util.local_file_path('http://$spack/a/b/c.txt') + assert(lfp is None) + + +def test_url_join_local_paths(): + # Resolve local link against page URL + + # wrong: + assert( + url_util.join( + 's3://bucket/index.html', + '../other-bucket/document.txt') + == + 's3://bucket/other-bucket/document.txt') + + # correct - need to specify resolve_href=True: + assert( + url_util.join( + 's3://bucket/index.html', + '../other-bucket/document.txt', + resolve_href=True) + == + 's3://other-bucket/document.txt') + + # same as above: make sure several components are joined together correctly + assert( + url_util.join( + # with resolve_href=True, first arg is the base url; can not be + # broken up + 's3://bucket/index.html', + + # with resolve_href=True, remaining arguments are the components of + # the local href that needs to be resolved + '..', 'other-bucket', 'document.txt', + resolve_href=True) + == + 's3://other-bucket/document.txt') + + # Append local path components to prefix URL + + # wrong: + assert( + url_util.join( + 'https://mirror.spack.io/build_cache', + 'my-package', + resolve_href=True) + == + 'https://mirror.spack.io/my-package') + + # correct - Need to specify resolve_href=False: + assert( + url_util.join( + 'https://mirror.spack.io/build_cache', + 'my-package', + resolve_href=False) + == + 'https://mirror.spack.io/build_cache/my-package') + + # same as above; make sure resolve_href=False is default + assert( + url_util.join( + 'https://mirror.spack.io/build_cache', + 'my-package') + == + 'https://mirror.spack.io/build_cache/my-package') + + # same as above: make sure several components are joined together correctly + assert( + url_util.join( + # with resolve_href=False, first arg is just a prefix. No + # resolution is done. So, there should be no difference between + # join('/a/b/c', 'd/e'), + # join('/a/b', 'c', 'd/e'), + # join('/a', 'b/c', 'd', 'e'), etc. + 'https://mirror.spack.io', + 'build_cache', + 'my-package') + == + 'https://mirror.spack.io/build_cache/my-package') + + # file:// URL path components are *NOT* canonicalized + spack_root = os.path.abspath(os.environ['SPACK_ROOT']) + + join_result = url_util.join('/a/b/c', '$spack') + assert(join_result == 'file:///a/b/c/$spack') # not canonicalized + format_result = url_util.format(join_result) + # canoncalize by hand + expected = url_util.format(os.path.abspath(os.path.join( + '/', 'a', 'b', 'c', '.' + spack_root))) + assert(format_result == expected) + + # see test_url_join_absolute_paths() for more on absolute path components + join_result = url_util.join('/a/b/c', '/$spack') + assert(join_result == 'file:///$spack') # not canonicalized + format_result = url_util.format(join_result) + expected = url_util.format(spack_root) + assert(format_result == expected) + + # For s3:// URLs, the "netloc" (bucket) is considered part of the path. + # Make sure join() can cross bucket boundaries in this case. + args = ['s3://bucket/a/b', 'new-bucket', 'c'] + assert(url_util.join(*args) == 's3://bucket/a/b/new-bucket/c') + + args.insert(1, '..') + assert(url_util.join(*args) == 's3://bucket/a/new-bucket/c') + + args.insert(1, '..') + assert(url_util.join(*args) == 's3://bucket/new-bucket/c') + + # new-bucket is now the "netloc" (bucket name) + args.insert(1, '..') + assert(url_util.join(*args) == 's3://new-bucket/c') + + +def test_url_join_absolute_paths(): + # Handling absolute path components is a little tricky. To this end, we + # distinguish "absolute path components", from the more-familiar concept of + # "absolute paths" as they are understood for local filesystem paths. + # + # - All absolute paths are absolute path components. Joining a URL with + # these components has the effect of completely replacing the path of the + # URL with the absolute path. These components do not specify a URL + # scheme, so the scheme of the URL procuced when joining them depend on + # those provided by components that came before it (file:// assumed if no + # such scheme is provided). + + # For eaxmple: + p = '/path/to/resource' + # ...is an absolute path + + # http:// URL + assert( + url_util.join('http://example.com/a/b/c', p) + == 'http://example.com/path/to/resource') + + # s3:// URL + # also notice how the netloc is treated as part of the path for s3:// URLs + assert( + url_util.join('s3://example.com/a/b/c', p) + == 's3://path/to/resource') + + # - URL components that specify a scheme are always absolute path + # components. Joining a base URL with these components effectively + # discards the base URL and "resets" the joining logic starting at the + # component in question and using it as the new base URL. + + # For eaxmple: + p = 'http://example.com/path/to' + # ...is an http:// URL + + join_result = url_util.join(p, 'resource') + assert(join_result == 'http://example.com/path/to/resource') + + # works as if everything before the http:// URL was left out + assert( + url_util.join( + 'literally', 'does', 'not', 'matter', + p, 'resource') + == join_result) + + # It's important to keep in mind that this logic applies even if the + # component's path is not an absolute path! + + # For eaxmple: + p = './d' + # ...is *NOT* an absolute path + # ...is also *NOT* an absolute path component + + u = 'file://./d' + # ...is a URL + # The path of this URL is *NOT* an absolute path + # HOWEVER, the URL, itself, *is* an absolute path component + + # (We just need... + cwd = os.getcwd() + # ...to work out what resource it points to) + + # So, even though parse() assumes "file://" URL, the scheme is still + # significant in URL path components passed to join(), even if the base + # is a file:// URL. + + path_join_result = 'file:///a/b/c/d' + assert(url_util.join('/a/b/c', p) == path_join_result) + assert(url_util.join('file:///a/b/c', p) == path_join_result) + + url_join_result = 'file://{CWD}/d'.format(CWD=cwd) + assert(url_util.join('/a/b/c', u) == url_join_result) + assert(url_util.join('file:///a/b/c', u) == url_join_result) + + # Finally, resolve_href should have no effect for how absolute path + # components are handled because local hrefs can not be absolute path + # components. + args = ['s3://does', 'not', 'matter', + 'http://example.com', + 'also', 'does', 'not', 'matter', + '/path'] + + expected = 'http://example.com/path' + assert(url_util.join(*args, resolve_href=True) == expected) + assert(url_util.join(*args, resolve_href=False) == expected) + + # resolve_href only matters for the local path components at the end of the + # argument list. + args[-1] = '/path/to/page' + args.extend(('..', '..', 'resource')) + + assert(url_util.join(*args, resolve_href=True) == + 'http://example.com/resource') + + assert(url_util.join(*args, resolve_href=False) == + 'http://example.com/path/resource') diff --git a/lib/spack/spack/util/url.py b/lib/spack/spack/util/url.py index 6b2786f244..7ac12e7b81 100644 --- a/lib/spack/spack/util/url.py +++ b/lib/spack/spack/util/url.py @@ -51,7 +51,7 @@ def local_file_path(url): def parse(url, scheme='file'): - """Parse a mirror url. + """Parse a url. For file:// URLs, the netloc and path components are concatenated and passed through spack.util.path.canoncalize_path(). @@ -105,6 +105,9 @@ def join(base_url, path, *extra, **kwargs): If resolve_href is False (default), then the URL path components are joined as in os.path.join(). + Note: file:// URL path components are not canonicalized as part of this + operation. To canonicalize, pass the joined url to format(). + Examples: base_url = 's3://bucket/index.html' body = fetch_body(prefix) @@ -127,7 +130,79 @@ def join(base_url, path, *extra, **kwargs): # correct - simply append additional URL path components spack.util.url.join(prefix, 'my-package', resolve_href=False) # default 'https://mirror.spack.io/build_cache/my-package' + + # For canonicalizing file:// URLs, take care to explicitly differentiate + # between absolute and relative join components. + + # '$spack' is not an absolute path component + join_result = spack.util.url.join('/a/b/c', '$spack') ; join_result + 'file:///a/b/c/$spack' + spack.util.url.format(join_result) + 'file:///a/b/c/opt/spack' + + # '/$spack' *is* an absolute path component + join_result = spack.util.url.join('/a/b/c', '/$spack') ; join_result + 'file:///$spack' + spack.util.url.format(join_result) + 'file:///opt/spack' """ + paths = [ + (x if isinstance(x, string_types) else x.geturl()) + for x in itertools.chain((base_url, path), extra)] + n = len(paths) + last_abs_component = None + scheme = None + for i in range(n - 1, -1, -1): + obj = urllib_parse.urlparse( + paths[i], scheme=None, allow_fragments=False) + + scheme = obj.scheme + + # in either case the component is absolute + if scheme is not None or obj.path.startswith('/'): + if scheme is None: + # Without a scheme, we have to go back looking for the + # next-last component that specifies a scheme. + for j in range(i - 1, -1, -1): + obj = urllib_parse.urlparse( + paths[j], scheme=None, allow_fragments=False) + + if obj.scheme: + paths[i] = '{SM}://{NL}{PATH}'.format( + SM=obj.scheme, + NL=( + (obj.netloc + '/') + if obj.scheme != 's3' else ''), + PATH=paths[i][1:]) + break + + last_abs_component = i + break + + if last_abs_component is not None: + paths = paths[last_abs_component:] + if len(paths) == 1: + result = urllib_parse.urlparse( + paths[0], scheme='file', allow_fragments=False) + + # another subtlety: If the last argument to join() is an absolute + # file:// URL component with a relative path, the relative path + # needs to be resolved. + if result.scheme == 'file' and result.netloc: + result = urllib_parse.ParseResult( + scheme=result.scheme, + netloc='', + path=os.path.abspath(result.netloc + result.path), + params=result.params, + query=result.query, + fragment=None) + + return result.geturl() + + return _join(*paths, **kwargs) + + +def _join(base_url, path, *extra, **kwargs): base_url = parse(base_url) resolve_href = kwargs.get('resolve_href', False) diff --git a/lib/spack/spack/util/web.py b/lib/spack/spack/util/web.py index abf549cc89..f2afe769c6 100644 --- a/lib/spack/spack/util/web.py +++ b/lib/spack/spack/util/web.py @@ -477,7 +477,6 @@ def spider(root, depth=0): performance over a sequential fetch. """ - root = url_util.parse(root) pages, links = _spider(root, set(), root, 0, depth, False) return pages, links -- cgit v1.2.3-70-g09d2