diff options
author | Harmen Stoppels <harmenstoppels@gmail.com> | 2022-10-26 10:41:31 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-10-26 10:41:31 +0200 |
commit | b92bdf8c7202429d785e8c867bd83d92df1f3f4a (patch) | |
tree | 3973c91c562708f849adedafe4074d020121c8a7 | |
parent | a2520e80c0b2bee62cc4d57090ded2422b96263a (diff) | |
download | spack-b92bdf8c7202429d785e8c867bd83d92df1f3f4a.tar.gz spack-b92bdf8c7202429d785e8c867bd83d92df1f3f4a.tar.bz2 spack-b92bdf8c7202429d785e8c867bd83d92df1f3f4a.tar.xz spack-b92bdf8c7202429d785e8c867bd83d92df1f3f4a.zip |
Relocation regex single pass (#33496)
Instead of looping over multiple regexes and the entire text file
contents, create a giant regex with all literal prefixes and do a single
pass over files to detect prefixes. Not only is a single pass faster,
it's also likely that the regex is compiled better, given that most
prefixes share a common ... prefix.
-rw-r--r-- | lib/spack/spack/binary_distribution.py | 20 | ||||
-rw-r--r-- | lib/spack/spack/relocate.py | 8 | ||||
-rw-r--r-- | lib/spack/spack/test/relocate.py | 29 |
3 files changed, 36 insertions, 21 deletions
diff --git a/lib/spack/spack/binary_distribution.py b/lib/spack/spack/binary_distribution.py index d22572d8a4..7c62c58cb1 100644 --- a/lib/spack/spack/binary_distribution.py +++ b/lib/spack/spack/binary_distribution.py @@ -42,7 +42,7 @@ import spack.util.spack_yaml as syaml import spack.util.url as url_util import spack.util.web as web_util from spack.caches import misc_cache_location -from spack.relocate import utf8_path_to_binary_regex +from spack.relocate import utf8_paths_to_single_binary_regex from spack.spec import Spec from spack.stage import Stage @@ -694,13 +694,10 @@ class BuildManifestVisitor(BaseDirectoryVisitor): return False -def file_matches_any_binary_regex(path, regexes): +def file_matches(path, regex): with open(path, "rb") as f: contents = f.read() - for regex in regexes: - if regex.search(contents): - return True - return False + return bool(regex.search(contents)) def get_buildfile_manifest(spec): @@ -734,8 +731,8 @@ def get_buildfile_manifest(spec): prefixes.append(spack.hooks.sbang.sbang_install_path()) prefixes.append(str(spack.store.layout.root)) - # Create a list regexes matching collected prefixes - compiled_prefixes = [utf8_path_to_binary_regex(prefix) for prefix in prefixes] + # Create a giant regex that matches all prefixes + regex = utf8_paths_to_single_binary_regex(prefixes) # Symlinks. @@ -767,10 +764,9 @@ def get_buildfile_manifest(spec): data["binary_to_relocate_fullpath"].append(abs_path) continue - elif relocate.needs_text_relocation(m_type, m_subtype): - if file_matches_any_binary_regex(abs_path, compiled_prefixes): - data["text_to_relocate"].append(rel_path) - continue + elif relocate.needs_text_relocation(m_type, m_subtype) and file_matches(abs_path, regex): + data["text_to_relocate"].append(rel_path) + continue data["other"].append(abs_path) diff --git a/lib/spack/spack/relocate.py b/lib/spack/spack/relocate.py index 667e8de4e0..0d7525eb05 100644 --- a/lib/spack/spack/relocate.py +++ b/lib/spack/spack/relocate.py @@ -747,9 +747,13 @@ def relocate_links(links, orig_layout_root, orig_install_prefix, new_install_pre def utf8_path_to_binary_regex(prefix): """Create a (binary) regex that matches the input path in utf8""" prefix_bytes = re.escape(prefix).encode("utf-8") - prefix_rexp = re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)%s([\\w\\-_/]*)" % prefix_bytes) + return re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)%s([\\w\\-_/]*)" % prefix_bytes) - return prefix_rexp + +def utf8_paths_to_single_binary_regex(prefixes): + """Create a (binary) regex that matches any input path in utf8""" + all_prefixes = b"|".join(re.escape(prefix).encode("utf-8") for prefix in prefixes) + return re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)(%s)([\\w\\-_/]*)" % all_prefixes) def unsafe_relocate_text(files, prefixes, concurrency=32): diff --git a/lib/spack/spack/test/relocate.py b/lib/spack/spack/test/relocate.py index 42e392c1ba..8e8819ca48 100644 --- a/lib/spack/spack/test/relocate.py +++ b/lib/spack/spack/test/relocate.py @@ -20,7 +20,7 @@ import spack.spec import spack.store import spack.tengine import spack.util.executable -from spack.relocate import utf8_path_to_binary_regex +from spack.relocate import utf8_path_to_binary_regex, utf8_paths_to_single_binary_regex pytestmark = pytest.mark.skipif(sys.platform == "win32", reason="Tests fail on Windows") @@ -480,9 +480,24 @@ def test_fixup_macos_rpaths(make_dylib, make_object_file): def test_text_relocation_regex_is_safe(): - assert ( - utf8_path_to_binary_regex("/[a-z]/") - .search(b"This does not match /a/, but this does: /[a-z]/.") - .group(0) - == b"/[a-z]/" - ) + # Test whether prefix regex is properly escaped + string = b"This does not match /a/, but this does: /[a-z]/." + assert utf8_path_to_binary_regex("/[a-z]/").search(string).group(0) == b"/[a-z]/" + + +def test_utf8_paths_to_single_binary_regex(): + regex = utf8_paths_to_single_binary_regex(["/first/path", "/second/path", "/safe/[a-z]"]) + # Match nothing + assert not regex.search(b"text /neither/first/path text /the/second/path text") + + # Match first + string = b"contains both /first/path/subdir and /second/path/sub" + assert regex.search(string).group(0) == b"/first/path/subdir" + + # Match second + string = b"contains both /not/first/path/subdir but /second/path/subdir" + assert regex.search(string).group(0) == b"/second/path/subdir" + + # Match "unsafe" dir name + string = b"don't match /safe/a/path but do match /safe/[a-z]/file" + assert regex.search(string).group(0) == b"/safe/[a-z]/file" |