From 0fff219aa4483cfbed6bb947b13a340b46e6aa71 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Mon, 24 Jul 2023 12:30:47 +0200 Subject: Fix broken `sanitize_file_path` (#38926) The sanitization function is completely bogus as it tries to replace / on unix after ... splitting on it. The way it's implemented is very questionable: the input is a file name, not a path. It doesn't make sense to interpret the input as a path and then make the components valid -- you'll interpret / in a filename as a dir separator. It also fails to deal with path components that contain just unsupported characters (resulting in empty component). The correct way to deal with this is to have a function that takes a potential file name and replaces unsupported characters. I'm not going to fix the other issues on Windows, such as reserved file names, but left a note, and hope that @johnwparent can fix that separately. (Obviously we wouldn't have this problem at all if we just fixed the filename in a safe way instead of trying to derive something from the url; we could use the content digest when available for example) --- lib/spack/spack/stage.py | 3 +-- lib/spack/spack/test/util/path.py | 14 ++++++-------- lib/spack/spack/test/util/util_url.py | 14 ++++++++++++++ lib/spack/spack/test/web.py | 1 + lib/spack/spack/util/path.py | 34 ++++++++++------------------------ lib/spack/spack/util/url.py | 21 ++++++++++++++++++++- 6 files changed, 52 insertions(+), 35 deletions(-) (limited to 'lib') diff --git a/lib/spack/spack/stage.py b/lib/spack/spack/stage.py index 032e4407bf..446e3a4a0e 100644 --- a/lib/spack/spack/stage.py +++ b/lib/spack/spack/stage.py @@ -387,8 +387,7 @@ class Stage: expanded = True if isinstance(self.default_fetcher, fs.URLFetchStrategy): expanded = self.default_fetcher.expand_archive - clean_url = os.path.basename(sup.sanitize_file_path(self.default_fetcher.url)) - fnames.append(clean_url) + fnames.append(url_util.default_download_filename(self.default_fetcher.url)) if self.mirror_paths: fnames.extend(os.path.basename(x) for x in self.mirror_paths) diff --git a/lib/spack/spack/test/util/path.py b/lib/spack/spack/test/util/path.py index 3949620e1a..9eba4c42fa 100644 --- a/lib/spack/spack/test/util/path.py +++ b/lib/spack/spack/test/util/path.py @@ -29,15 +29,13 @@ fixed_lines = [ ] -def test_sanitze_file_path(tmpdir): - """Test filtering illegal characters out of potential file paths""" - # *nix illegal files characters are '/' and none others - illegal_file_path = str(tmpdir) + "//" + "abcdefghi.txt" +def test_sanitize_filename(): + """Test filtering illegal characters out of potential filenames""" + sanitized = sup.sanitize_filename("""acd/?e:f"g|h*i.\0txt""") if sys.platform == "win32": - # Windows has a larger set of illegal characters - illegal_file_path = os.path.join(tmpdir, 'acd?e:f"g|h*i.txt') - real_path = sup.sanitize_file_path(illegal_file_path) - assert real_path == os.path.join(str(tmpdir), "abcdefghi.txt") + assert sanitized == "a_b_cd__e_f_g_h_i._txt" + else: + assert sanitized == """acd_?e:f"g|h*i._txt""" # This class pertains to path string padding manipulation specifically diff --git a/lib/spack/spack/test/util/util_url.py b/lib/spack/spack/test/util/util_url.py index 582393a022..451f4a4383 100644 --- a/lib/spack/spack/test/util/util_url.py +++ b/lib/spack/spack/test/util/util_url.py @@ -10,6 +10,7 @@ import urllib.parse import pytest +import spack.util.path import spack.util.url as url_util @@ -278,3 +279,16 @@ def test_git_url_parse(url, parts): url_util.parse_git_url(url) else: assert parts == url_util.parse_git_url(url) + + +def test_default_download_name(): + url = "https://example.com:1234/path/to/file.txt;params?abc=def#file=blob.tar" + filename = url_util.default_download_filename(url) + assert filename == spack.util.path.sanitize_filename(filename) + + +def test_default_download_name_dot_dot(): + """Avoid that downloaded files get names computed as ., .. or any hidden file.""" + assert url_util.default_download_filename("https://example.com/.") == "_" + assert url_util.default_download_filename("https://example.com/..") == "_." + assert url_util.default_download_filename("https://example.com/.abcdef") == "_abcdef" diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py index 6d451ed20a..89023875df 100644 --- a/lib/spack/spack/test/web.py +++ b/lib/spack/spack/test/web.py @@ -12,6 +12,7 @@ import llnl.util.tty as tty import spack.config import spack.mirror import spack.paths +import spack.util.path import spack.util.s3 import spack.util.url as url_util import spack.util.web diff --git a/lib/spack/spack/util/path.py b/lib/spack/spack/util/path.py index f894f6f3d8..ef6fb883c7 100644 --- a/lib/spack/spack/util/path.py +++ b/lib/spack/spack/util/path.py @@ -132,40 +132,26 @@ def path_to_os_path(*pths): return ret_pths -def sanitize_file_path(pth): +def sanitize_filename(filename: str) -> str: """ - Formats strings to contain only characters that can - be used to generate legal file paths. + Replaces unsupported characters (for the host) in a filename with underscores. Criteria for legal files based on https://en.wikipedia.org/wiki/Filename#Comparison_of_filename_limitations Args: - pth: string containing path to be created - on the host filesystem + filename: string containing filename to be created on the host filesystem Return: - sanitized string that can legally be made into a path + filename that can be created on the host filesystem """ - # on unix, splitting path by seperators will remove - # instances of illegal characters on join - pth_cmpnts = pth.split(os.path.sep) + if sys.platform != "win32": + # Only disallow null bytes and directory separators. + return re.sub("[\0/]", "_", filename) - if sys.platform == "win32": - drive_match = r"[a-zA-Z]:" - is_abs = bool(re.match(drive_match, pth_cmpnts[0])) - drive = pth_cmpnts[0] + os.path.sep if is_abs else "" - pth_cmpnts = pth_cmpnts[1:] if drive else pth_cmpnts - illegal_chars = r'[<>?:"|*\\]' - else: - drive = "/" if not pth_cmpnts[0] else "" - illegal_chars = r"[/]" - - pth = [] - for cmp in pth_cmpnts: - san_cmp = re.sub(illegal_chars, "", cmp) - pth.append(san_cmp) - return drive + os.path.join(*pth) + # On Windows, things are more involved. + # NOTE: this is incomplete, missing reserved names + return re.sub(r'[\x00-\x1F\x7F"*/:<>?\\|]', "_", filename) def system_path_filter(_func=None, arg_slice=None): diff --git a/lib/spack/spack/util/url.py b/lib/spack/spack/util/url.py index cacee8f1fb..2dbbe6b7b4 100644 --- a/lib/spack/spack/util/url.py +++ b/lib/spack/spack/util/url.py @@ -15,7 +15,7 @@ import sys import urllib.parse import urllib.request -from spack.util.path import convert_to_posix_path +from spack.util.path import convert_to_posix_path, sanitize_filename def validate_scheme(scheme): @@ -294,3 +294,22 @@ def parse_git_url(url): raise ValueError("bad port in git url: %s" % url) return (scheme, user, hostname, port, path) + + +def default_download_filename(url: str) -> str: + """This method computes a default file name for a given URL. + Note that it makes no request, so this is not the same as the + option curl -O, which uses the remote file name from the response + header.""" + parsed_url = urllib.parse.urlparse(url) + # Only use the last path component + params + query + fragment + name = urllib.parse.urlunparse( + parsed_url._replace(scheme="", netloc="", path=posixpath.basename(parsed_url.path)) + ) + valid_name = sanitize_filename(name) + + # Don't download to hidden files please + if valid_name[0] == ".": + valid_name = "_" + valid_name[1:] + + return valid_name -- cgit v1.2.3-60-g2f50