summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author Harmen Stoppels <me@harmenstoppels.nl> 2024-11-11 10:43:23 +0100
committer GitHub <noreply@github.com> 2024-11-11 09:43:23 +0000
commit a9e60749964e976382fc87bf5e8b9a0d40b816d2 (patch)
tree b1043399b654fac9eb03ab302d30b69f72905d64 /lib
parent 30db7644495c8a52f8bbf81af564a33f1373fb31 (diff)
download spack-a9e60749964e976382fc87bf5e8b9a0d40b816d2.tar.gz
spack-a9e60749964e976382fc87bf5e8b9a0d40b816d2.tar.bz2
spack-a9e60749964e976382fc87bf5e8b9a0d40b816d2.tar.xz
spack-a9e60749964e976382fc87bf5e8b9a0d40b816d2.zip
filesystem.py find: return directories and improve performance (#47537)
Diffstat (limited to 'lib')
-rw-r--r-- lib/spack/llnl/util/filesystem.py 84
-rw-r--r-- lib/spack/spack/test/llnl/util/filesystem.py 22
2 files changed, 49 insertions, 57 deletions
diff --git a/lib/spack/llnl/util/filesystem.py b/lib/spack/llnl/util/filesystem.py
index 83cbe45104..a876a76c27 100644
--- a/lib/spack/llnl/util/filesystem.py
+++ b/lib/spack/llnl/util/filesystem.py
@@ -1693,11 +1693,11 @@ def find(
recursive: bool = True,
max_depth: Optional[int] = None,
) -> List[str]:
- """Finds all non-directory files matching the patterns from ``files`` starting from ``root``.
- This function returns a deterministic result for the same input and directory structure when
- run multiple times. Symlinked directories are followed, and unique directories are searched
- only once. Each matching file is returned only once at lowest depth in case multiple paths
- exist due to symlinked directories.
+ """Finds all files matching the patterns from ``files`` starting from ``root``. This function
+ returns a deterministic result for the same input and directory structure when run multiple
+ times. Symlinked directories are followed, and unique directories are searched only once. Each
+ matching file is returned only once at lowest depth in case multiple paths exist due to
+ symlinked directories.
Accepts any glob characters accepted by fnmatch:
@@ -1830,54 +1830,58 @@ def _find_max_depth(
# Use glob.glob for complex patterns.
for pattern_name, pattern in complex_patterns.items():
matched_paths[pattern_name].extend(
- path
- for path in glob.glob(os.path.join(curr_dir, pattern))
- if not os.path.isdir(path)
+ path for path in glob.glob(os.path.join(curr_dir, pattern))
)
+ # List of subdirectories by path and (inode, device) tuple
+ subdirs: List[Tuple[str, Tuple[int, int]]] = []
+
with dir_iter:
- ordered_entries = sorted(dir_iter, key=lambda x: x.name)
- for dir_entry in ordered_entries:
+ for dir_entry in dir_iter:
+
+ # Match filename only patterns
+ if filename_only_patterns:
+ m = regex.match(os.path.normcase(dir_entry.name))
+ if m:
+ for pattern_name in filename_only_patterns:
+ if m.group(pattern_name):
+ matched_paths[pattern_name].append(dir_entry.path)
+ break
+
+ # Collect subdirectories
+ if depth >= max_depth:
+ continue
+
try:
- it_is_a_dir = dir_entry.is_dir(follow_symlinks=True)
+ if not dir_entry.is_dir(follow_symlinks=True):
+ continue
+ if sys.platform == "win32":
+ # Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we have
+ # to call os.stat
+ stat_info = os.stat(dir_entry.path, follow_symlinks=True)
+ else:
+ stat_info = dir_entry.stat(follow_symlinks=True)
except OSError as e:
# Possible permission issue, or a symlink that cannot be resolved (ELOOP).
_log_file_access_issue(e, dir_entry.path)
continue
- if it_is_a_dir:
- if depth >= max_depth:
- continue
- try:
- # The stat should be performed in a try/except block. We repeat that here
- # vs. moving to the above block because we only want to call `stat` if we
- # haven't exceeded our max_depth
- if sys.platform == "win32":
- # Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we
- # have to call os.stat
- stat_info = os.stat(dir_entry.path, follow_symlinks=True)
- else:
- stat_info = dir_entry.stat(follow_symlinks=True)
- except OSError as e:
- _log_file_access_issue(e, dir_entry.path)
- continue
+ subdirs.append((dir_entry.path, _file_id(stat_info)))
- dir_id = _file_id(stat_info)
- if dir_id not in visited_dirs:
- dir_queue.appendleft((depth + 1, dir_entry.path))
- visited_dirs.add(dir_id)
- elif filename_only_patterns:
- m = regex.match(os.path.normcase(dir_entry.name))
- if not m:
- continue
- for pattern_name in filename_only_patterns:
- if m.group(pattern_name):
- matched_paths[pattern_name].append(dir_entry.path)
- break
+ # Enqueue subdirectories in a deterministic order
+ if subdirs:
+ subdirs.sort(key=lambda s: os.path.basename(s[0]))
+ for subdir, subdir_id in subdirs:
+ if subdir_id not in visited_dirs:
+ dir_queue.appendleft((depth + 1, subdir))
+ visited_dirs.add(subdir_id)
+ # Sort the matched paths for deterministic output
+ for paths in matched_paths.values():
+ paths.sort()
all_matching_paths = [path for paths in matched_paths.values() for path in paths]
- # we only dedupe files if we have any complex patterns, since only they can match the same file
+ # We only dedupe files if we have any complex patterns, since only they can match the same file
# multiple times
return _dedupe_files(all_matching_paths) if complex_patterns else all_matching_paths
diff --git a/lib/spack/spack/test/llnl/util/filesystem.py b/lib/spack/spack/test/llnl/util/filesystem.py
index fd801295f4..1a32e5707c 100644
--- a/lib/spack/spack/test/llnl/util/filesystem.py
+++ b/lib/spack/spack/test/llnl/util/filesystem.py
@@ -1130,16 +1130,16 @@ def complex_dir_structure(request, tmpdir):
<root>/
l1-d1/
l2-d1/
- l3-s1 -> l1-d2 # points to directory above l2-d1
l3-d2/
l4-f1
- l3-s3 -> l1-d1 # cyclic link
l3-d4/
l4-f2
+ l3-s1 -> l1-d2 # points to directory above l2-d1
+ l3-s3 -> l1-d1 # cyclic link
l1-d2/
- l2-f1
l2-d2/
l3-f3
+ l2-f1
l2-s3 -> l2-d2
l1-s3 -> l3-d4 # a link that "skips" a directory level
l1-s4 -> l2-s3 # a link to a link to a dir
@@ -1155,7 +1155,7 @@ def complex_dir_structure(request, tmpdir):
l3_d2 = l2_d1.join("l3-d2").ensure(dir=True)
l3_d4 = l2_d1.join("l3-d4").ensure(dir=True)
l1_d2 = tmpdir.join("l1-d2").ensure(dir=True)
- l2_d2 = l1_d2.join("l1-d2").ensure(dir=True)
+ l2_d2 = l1_d2.join("l2-d2").ensure(dir=True)
if use_junctions:
link_fn = llnl.util.symlink._windows_create_junction
@@ -1216,7 +1216,7 @@ def test_find_max_depth_multiple_and_repeated_entry_points(complex_dir_structure
def test_multiple_patterns(complex_dir_structure):
root, _ = complex_dir_structure
- paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*", "*/*"])
+ paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*-f*", "*/*-f*"])
# There shouldn't be duplicate results with multiple, overlapping patterns
assert len(set(paths)) == len(paths)
# All files should be found
@@ -1249,15 +1249,3 @@ def test_find_input_types(tmp_path: pathlib.Path):
with pytest.raises(TypeError):
fs.find(1, "file.txt") # type: ignore
-
-
-def test_find_only_finds_files(tmp_path: pathlib.Path):
- """ensure that find only returns files even at max_depth"""
- (tmp_path / "subdir").mkdir()
- (tmp_path / "subdir" / "dir").mkdir()
- (tmp_path / "subdir" / "file.txt").write_text("")
- assert (
- fs.find(tmp_path, "*", max_depth=1)
- == fs.find(tmp_path, "*/*", max_depth=1)
- == [str(tmp_path / "subdir" / "file.txt")]
- )