diff options
author | Harmen Stoppels <me@harmenstoppels.nl> | 2024-11-11 10:43:23 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-11-11 09:43:23 +0000 |
commit | a9e60749964e976382fc87bf5e8b9a0d40b816d2 (patch) | |
tree | b1043399b654fac9eb03ab302d30b69f72905d64 /lib | |
parent | 30db7644495c8a52f8bbf81af564a33f1373fb31 (diff) | |
download | spack-a9e60749964e976382fc87bf5e8b9a0d40b816d2.tar.gz spack-a9e60749964e976382fc87bf5e8b9a0d40b816d2.tar.bz2 spack-a9e60749964e976382fc87bf5e8b9a0d40b816d2.tar.xz spack-a9e60749964e976382fc87bf5e8b9a0d40b816d2.zip |
filesystem.py find: return directories and improve performance (#47537)
Diffstat (limited to 'lib')
-rw-r--r-- | lib/spack/llnl/util/filesystem.py | 84 | ||||
-rw-r--r-- | lib/spack/spack/test/llnl/util/filesystem.py | 22 |
2 files changed, 49 insertions, 57 deletions
diff --git a/lib/spack/llnl/util/filesystem.py b/lib/spack/llnl/util/filesystem.py index 83cbe45104..a876a76c27 100644 --- a/lib/spack/llnl/util/filesystem.py +++ b/lib/spack/llnl/util/filesystem.py @@ -1693,11 +1693,11 @@ def find( recursive: bool = True, max_depth: Optional[int] = None, ) -> List[str]: - """Finds all non-directory files matching the patterns from ``files`` starting from ``root``. - This function returns a deterministic result for the same input and directory structure when - run multiple times. Symlinked directories are followed, and unique directories are searched - only once. Each matching file is returned only once at lowest depth in case multiple paths - exist due to symlinked directories. + """Finds all files matching the patterns from ``files`` starting from ``root``. This function + returns a deterministic result for the same input and directory structure when run multiple + times. Symlinked directories are followed, and unique directories are searched only once. Each + matching file is returned only once at lowest depth in case multiple paths exist due to + symlinked directories. Accepts any glob characters accepted by fnmatch: @@ -1830,54 +1830,58 @@ def _find_max_depth( # Use glob.glob for complex patterns. for pattern_name, pattern in complex_patterns.items(): matched_paths[pattern_name].extend( - path - for path in glob.glob(os.path.join(curr_dir, pattern)) - if not os.path.isdir(path) + path for path in glob.glob(os.path.join(curr_dir, pattern)) ) + # List of subdirectories by path and (inode, device) tuple + subdirs: List[Tuple[str, Tuple[int, int]]] = [] + with dir_iter: - ordered_entries = sorted(dir_iter, key=lambda x: x.name) - for dir_entry in ordered_entries: + for dir_entry in dir_iter: + + # Match filename only patterns + if filename_only_patterns: + m = regex.match(os.path.normcase(dir_entry.name)) + if m: + for pattern_name in filename_only_patterns: + if m.group(pattern_name): + matched_paths[pattern_name].append(dir_entry.path) + break + + # Collect subdirectories + if depth >= max_depth: + continue + try: - it_is_a_dir = dir_entry.is_dir(follow_symlinks=True) + if not dir_entry.is_dir(follow_symlinks=True): + continue + if sys.platform == "win32": + # Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we have + # to call os.stat + stat_info = os.stat(dir_entry.path, follow_symlinks=True) + else: + stat_info = dir_entry.stat(follow_symlinks=True) except OSError as e: # Possible permission issue, or a symlink that cannot be resolved (ELOOP). _log_file_access_issue(e, dir_entry.path) continue - if it_is_a_dir: - if depth >= max_depth: - continue - try: - # The stat should be performed in a try/except block. We repeat that here - # vs. moving to the above block because we only want to call `stat` if we - # haven't exceeded our max_depth - if sys.platform == "win32": - # Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we - # have to call os.stat - stat_info = os.stat(dir_entry.path, follow_symlinks=True) - else: - stat_info = dir_entry.stat(follow_symlinks=True) - except OSError as e: - _log_file_access_issue(e, dir_entry.path) - continue + subdirs.append((dir_entry.path, _file_id(stat_info))) - dir_id = _file_id(stat_info) - if dir_id not in visited_dirs: - dir_queue.appendleft((depth + 1, dir_entry.path)) - visited_dirs.add(dir_id) - elif filename_only_patterns: - m = regex.match(os.path.normcase(dir_entry.name)) - if not m: - continue - for pattern_name in filename_only_patterns: - if m.group(pattern_name): - matched_paths[pattern_name].append(dir_entry.path) - break + # Enqueue subdirectories in a deterministic order + if subdirs: + subdirs.sort(key=lambda s: os.path.basename(s[0])) + for subdir, subdir_id in subdirs: + if subdir_id not in visited_dirs: + dir_queue.appendleft((depth + 1, subdir)) + visited_dirs.add(subdir_id) + # Sort the matched paths for deterministic output + for paths in matched_paths.values(): + paths.sort() all_matching_paths = [path for paths in matched_paths.values() for path in paths] - # we only dedupe files if we have any complex patterns, since only they can match the same file + # We only dedupe files if we have any complex patterns, since only they can match the same file # multiple times return _dedupe_files(all_matching_paths) if complex_patterns else all_matching_paths diff --git a/lib/spack/spack/test/llnl/util/filesystem.py b/lib/spack/spack/test/llnl/util/filesystem.py index fd801295f4..1a32e5707c 100644 --- a/lib/spack/spack/test/llnl/util/filesystem.py +++ b/lib/spack/spack/test/llnl/util/filesystem.py @@ -1130,16 +1130,16 @@ def complex_dir_structure(request, tmpdir): <root>/ l1-d1/ l2-d1/ - l3-s1 -> l1-d2 # points to directory above l2-d1 l3-d2/ l4-f1 - l3-s3 -> l1-d1 # cyclic link l3-d4/ l4-f2 + l3-s1 -> l1-d2 # points to directory above l2-d1 + l3-s3 -> l1-d1 # cyclic link l1-d2/ - l2-f1 l2-d2/ l3-f3 + l2-f1 l2-s3 -> l2-d2 l1-s3 -> l3-d4 # a link that "skips" a directory level l1-s4 -> l2-s3 # a link to a link to a dir @@ -1155,7 +1155,7 @@ def complex_dir_structure(request, tmpdir): l3_d2 = l2_d1.join("l3-d2").ensure(dir=True) l3_d4 = l2_d1.join("l3-d4").ensure(dir=True) l1_d2 = tmpdir.join("l1-d2").ensure(dir=True) - l2_d2 = l1_d2.join("l1-d2").ensure(dir=True) + l2_d2 = l1_d2.join("l2-d2").ensure(dir=True) if use_junctions: link_fn = llnl.util.symlink._windows_create_junction @@ -1216,7 +1216,7 @@ def test_find_max_depth_multiple_and_repeated_entry_points(complex_dir_structure def test_multiple_patterns(complex_dir_structure): root, _ = complex_dir_structure - paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*", "*/*"]) + paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*-f*", "*/*-f*"]) # There shouldn't be duplicate results with multiple, overlapping patterns assert len(set(paths)) == len(paths) # All files should be found @@ -1249,15 +1249,3 @@ def test_find_input_types(tmp_path: pathlib.Path): with pytest.raises(TypeError): fs.find(1, "file.txt") # type: ignore - - -def test_find_only_finds_files(tmp_path: pathlib.Path): - """ensure that find only returns files even at max_depth""" - (tmp_path / "subdir").mkdir() - (tmp_path / "subdir" / "dir").mkdir() - (tmp_path / "subdir" / "file.txt").write_text("") - assert ( - fs.find(tmp_path, "*", max_depth=1) - == fs.find(tmp_path, "*/*", max_depth=1) - == [str(tmp_path / "subdir" / "file.txt")] - ) |