Fix broken `sanitize_file_path` (#38926)

The sanitization function is completely bogus as it tries to replace / on unix after ... splitting on it. The way it's implemented is very questionable: the input is a file name, not a path. It doesn't make sense to interpret the input as a path and then make the components valid -- you'll interpret / in a filename as a dir separator. It also fails to deal with path components that contain just unsupported characters (resulting in empty component). The correct way to deal with this is to have a function that takes a potential file name and replaces unsupported characters. I'm not going to fix the other issues on Windows, such as reserved file names, but left a note, and hope that @johnwparent can fix that separately. (Obviously we wouldn't have this problem at all if we just fixed the filename in a safe way instead of trying to derive something from the url; we could use the content digest when available for example)
author: Harmen Stoppels <harmenstoppels@gmail.com> 2023-07-24 12:30:47 +0200
committer: GitHub <noreply@github.com> 2023-07-24 12:30:47 +0200
commit: 0fff219aa4483cfbed6bb947b13a340b46e6aa71 (patch)
tree: aa417bec0cd2337da601c68f95c59f8c6771472d
parent: ac3c0a434702d5ce378fd8a826032582297fe79f (diff)
download: spack-0fff219aa4483cfbed6bb947b13a340b46e6aa71.tar.gz
spack-0fff219aa4483cfbed6bb947b13a340b46e6aa71.tar.bz2
spack-0fff219aa4483cfbed6bb947b13a340b46e6aa71.tar.xz
spack-0fff219aa4483cfbed6bb947b13a340b46e6aa71.zip
6 files changed, 52 insertions, 35 deletions
diff --git a/lib/spack/spack/stage.py b/lib/spack/spack/stage.py
index 032e4407bf..446e3a4a0e 100644
--- a/lib/spack/spack/stage.py
+++ b/lib/spack/spack/stage.py
@@ -387,8 +387,7 @@ class Stage:
         expanded = True
         if isinstance(self.default_fetcher, fs.URLFetchStrategy):
             expanded = self.default_fetcher.expand_archive
-            clean_url = os.path.basename(sup.sanitize_file_path(self.default_fetcher.url))
-            fnames.append(clean_url)
+            fnames.append(url_util.default_download_filename(self.default_fetcher.url))
 
         if self.mirror_paths:
             fnames.extend(os.path.basename(x) for x in self.mirror_paths)
diff --git a/lib/spack/spack/test/util/path.py b/lib/spack/spack/test/util/path.py
index 3949620e1a..9eba4c42fa 100644
--- a/lib/spack/spack/test/util/path.py
+++ b/lib/spack/spack/test/util/path.py
@@ -29,15 +29,13 @@ fixed_lines = [
 ]
 
 
-def test_sanitze_file_path(tmpdir):
-    """Test filtering illegal characters out of potential file paths"""
-    # *nix illegal files characters are '/' and none others
-    illegal_file_path = str(tmpdir) + "//" + "abcdefghi.txt"
+def test_sanitize_filename():
+    """Test filtering illegal characters out of potential filenames"""
+    sanitized = sup.sanitize_filename("""a<b>cd/?e:f"g|h*i.\0txt""")
     if sys.platform == "win32":
-        # Windows has a larger set of illegal characters
-        illegal_file_path = os.path.join(tmpdir, 'a<b>cd?e:f"g|h*i.txt')
-    real_path = sup.sanitize_file_path(illegal_file_path)
-    assert real_path == os.path.join(str(tmpdir), "abcdefghi.txt")
+        assert sanitized == "a_b_cd__e_f_g_h_i._txt"
+    else:
+        assert sanitized == """a<b>cd_?e:f"g|h*i._txt"""
 
 
 # This class pertains to path string padding manipulation specifically
diff --git a/lib/spack/spack/test/util/util_url.py b/lib/spack/spack/test/util/util_url.py
index 582393a022..451f4a4383 100644
--- a/lib/spack/spack/test/util/util_url.py
+++ b/lib/spack/spack/test/util/util_url.py
@@ -10,6 +10,7 @@ import urllib.parse
 
 import pytest
 
+import spack.util.path
 import spack.util.url as url_util
 
 
@@ -278,3 +279,16 @@ def test_git_url_parse(url, parts):
             url_util.parse_git_url(url)
     else:
         assert parts == url_util.parse_git_url(url)
+
+
+def test_default_download_name():
+    url = "https://example.com:1234/path/to/file.txt;params?abc=def#file=blob.tar"
+    filename = url_util.default_download_filename(url)
+    assert filename == spack.util.path.sanitize_filename(filename)
+
+
+def test_default_download_name_dot_dot():
+    """Avoid that downloaded files get names computed as ., .. or any hidden file."""
+    assert url_util.default_download_filename("https://example.com/.") == "_"
+    assert url_util.default_download_filename("https://example.com/..") == "_."
+    assert url_util.default_download_filename("https://example.com/.abcdef") == "_abcdef"
diff --git a/lib/spack/spack/test/web.py b/lib/spack/spack/test/web.py
index 6d451ed20a..89023875df 100644
--- a/lib/spack/spack/test/web.py
+++ b/lib/spack/spack/test/web.py
@@ -12,6 +12,7 @@ import llnl.util.tty as tty
 import spack.config
 import spack.mirror
 import spack.paths
+import spack.util.path
 import spack.util.s3
 import spack.util.url as url_util
 import spack.util.web
diff --git a/lib/spack/spack/util/path.py b/lib/spack/spack/util/path.py
index f894f6f3d8..ef6fb883c7 100644
--- a/lib/spack/spack/util/path.py
+++ b/lib/spack/spack/util/path.py
@@ -132,40 +132,26 @@ def path_to_os_path(*pths):
     return ret_pths
 
 
-def sanitize_file_path(pth):
+def sanitize_filename(filename: str) -> str:
     """
-    Formats strings to contain only characters that can
-    be used to generate legal file paths.
+    Replaces unsupported characters (for the host) in a filename with underscores.
 
     Criteria for legal files based on
     https://en.wikipedia.org/wiki/Filename#Comparison_of_filename_limitations
 
     Args:
-        pth: string containing path to be created
-            on the host filesystem
+        filename: string containing filename to be created on the host filesystem
 
     Return:
-        sanitized string that can legally be made into a path
+        filename that can be created on the host filesystem
     """
-    # on unix, splitting path by seperators will remove
-    # instances of illegal characters on join
-    pth_cmpnts = pth.split(os.path.sep)
+    if sys.platform != "win32":
+        # Only disallow null bytes and directory separators.
+        return re.sub("[\0/]", "_", filename)
 
-    if sys.platform == "win32":
-        drive_match = r"[a-zA-Z]:"
-        is_abs = bool(re.match(drive_match, pth_cmpnts[0]))
-        drive = pth_cmpnts[0] + os.path.sep if is_abs else ""
-        pth_cmpnts = pth_cmpnts[1:] if drive else pth_cmpnts
-        illegal_chars = r'[<>?:"|*\\]'
-    else:
-        drive = "/" if not pth_cmpnts[0] else ""
-        illegal_chars = r"[/]"
-
-    pth = []
-    for cmp in pth_cmpnts:
-        san_cmp = re.sub(illegal_chars, "", cmp)
-        pth.append(san_cmp)
-    return drive + os.path.join(*pth)
+    # On Windows, things are more involved.
+    # NOTE: this is incomplete, missing reserved names
+    return re.sub(r'[\x00-\x1F\x7F"*/:<>?\\|]', "_", filename)
 
 
 def system_path_filter(_func=None, arg_slice=None):
diff --git a/lib/spack/spack/util/url.py b/lib/spack/spack/util/url.py
index cacee8f1fb..2dbbe6b7b4 100644
--- a/lib/spack/spack/util/url.py
+++ b/lib/spack/spack/util/url.py
@@ -15,7 +15,7 @@ import sys
 import urllib.parse
 import urllib.request
 
-from spack.util.path import convert_to_posix_path
+from spack.util.path import convert_to_posix_path, sanitize_filename
 
 
 def validate_scheme(scheme):
@@ -294,3 +294,22 @@ def parse_git_url(url):
             raise ValueError("bad port in git url: %s" % url)
 
     return (scheme, user, hostname, port, path)
+
+
+def default_download_filename(url: str) -> str:
+    """This method computes a default file name for a given URL.
+    Note that it makes no request, so this is not the same as the
+    option curl -O, which uses the remote file name from the response
+    header."""
+    parsed_url = urllib.parse.urlparse(url)
+    # Only use the last path component + params + query + fragment
+    name = urllib.parse.urlunparse(
+        parsed_url._replace(scheme="", netloc="", path=posixpath.basename(parsed_url.path))
+    )
+    valid_name = sanitize_filename(name)
+
+    # Don't download to hidden files please
+    if valid_name[0] == ".":
+        valid_name = "_" + valid_name[1:]
+
+    return valid_name
author	Harmen Stoppels <harmenstoppels@gmail.com>	2023-07-24 12:30:47 +0200
committer	GitHub <noreply@github.com>	2023-07-24 12:30:47 +0200
commit	0fff219aa4483cfbed6bb947b13a340b46e6aa71 (patch)
tree	aa417bec0cd2337da601c68f95c59f8c6771472d
parent	ac3c0a434702d5ce378fd8a826032582297fe79f (diff)
download	spack-0fff219aa4483cfbed6bb947b13a340b46e6aa71.tar.gz spack-0fff219aa4483cfbed6bb947b13a340b46e6aa71.tar.bz2 spack-0fff219aa4483cfbed6bb947b13a340b46e6aa71.tar.xz spack-0fff219aa4483cfbed6bb947b13a340b46e6aa71.zip