# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)

"""This module contains pure-Python classes and functions for replacing
paths inside text files and binaries."""

import re
from collections import OrderedDict
from typing import Dict, Union

import spack.error

Prefix = Union[str, bytes]


def encode_path(p: Prefix) -> bytes:
    return p if isinstance(p, bytes) else p.encode("utf-8")


def _prefix_to_prefix_as_bytes(prefix_to_prefix) -> Dict[bytes, bytes]:
    return OrderedDict((encode_path(k), encode_path(v)) for (k, v) in prefix_to_prefix.items())


def utf8_path_to_binary_regex(prefix: str):
    """Create a binary regex that matches the input path in utf8"""
    prefix_bytes = re.escape(prefix).encode("utf-8")
    return re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)%s([\\w\\-_/]*)" % prefix_bytes)


def _byte_strings_to_single_binary_regex(prefixes):
    all_prefixes = b"|".join(re.escape(p) for p in prefixes)
    return re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)(%s)([\\w\\-_/]*)" % all_prefixes)


def utf8_paths_to_single_binary_regex(prefixes):
    """Create a (binary) regex that matches any input path in utf8"""
    return _byte_strings_to_single_binary_regex(p.encode("utf-8") for p in prefixes)


def filter_identity_mappings(prefix_to_prefix):
    """Drop mappings that are not changed."""
    # NOTE: we don't guard against the following case:
    # [/abc/def -> /abc/def, /abc -> /x] *will* be simplified to
    # [/abc -> /x], meaning that after this simplification /abc/def will be
    # mapped to /x/def instead of /abc/def. This should not be a problem.
    return OrderedDict((k, v) for (k, v) in prefix_to_prefix.items() if k != v)
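
# Illustrative sketch (comments only, hypothetical mapping): the identity
# entry is dropped, so /abc/def would later be rewritten through the
# remaining /abc -> /x rule, exactly the case described in the NOTE above:
#
#   _mapping = OrderedDict([(b"/abc/def", b"/abc/def"), (b"/abc", b"/x")])
#   # filter_identity_mappings(_mapping) == OrderedDict([(b"/abc", b"/x")])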


class PrefixReplacer:
    """Base class for applying a prefix to prefix map
    to a list of binaries or text files.
    Child classes implement _apply_to_file to do the
    actual work, which is different when it comes to
    binaries and text files."""

    def __init__(self, prefix_to_prefix: Dict[bytes, bytes]):
        """
        Arguments:

            prefix_to_prefix (OrderedDict):

                An ordered mapping from prefix to prefix. The order is
                relevant to support substring fallbacks, for example
                [("/first/sub", "/x"), ("/first", "/y")] will ensure
                /first/sub is matched and replaced before /first.
        """
        self.prefix_to_prefix = filter_identity_mappings(prefix_to_prefix)

    @property
    def is_noop(self) -> bool:
        """Returns true when the prefix to prefix map
        is mapping everything to the same location (identity)
        or there are no prefixes to replace."""
        return not self.prefix_to_prefix

    def apply(self, filenames: list):
        """Returns a list of files that were modified"""
        changed_files = []
        if self.is_noop:
            return []
        for filename in filenames:
            if self.apply_to_filename(filename):
                changed_files.append(filename)
        return changed_files

    def apply_to_filename(self, filename):
        if self.is_noop:
            return False
        with open(filename, "rb+") as f:
            return self.apply_to_file(f)

    def apply_to_file(self, f):
        if self.is_noop:
            return False
        return self._apply_to_file(f)


class TextFilePrefixReplacer(PrefixReplacer):
    """This class applies prefix to prefix mappings for relocation
    on text files.

    Note that UTF-8 encoding is assumed."""

    def __init__(self, prefix_to_prefix: Dict[bytes, bytes]):
        """
        prefix_to_prefix (OrderedDict): ordered mapping where the keys are
            bytes representing the old prefixes and the values are the new ones.
        """
        super().__init__(prefix_to_prefix)
        # Single regex for all paths.
        self.regex = _byte_strings_to_single_binary_regex(self.prefix_to_prefix.keys())

    @classmethod
    def from_strings_or_bytes(
        cls, prefix_to_prefix: Dict[Prefix, Prefix]
    ) -> "TextFilePrefixReplacer":
        """Create a TextFilePrefixReplacer from an ordered prefix to prefix map."""
        return cls(_prefix_to_prefix_as_bytes(prefix_to_prefix))

    def _apply_to_file(self, f):
        """Text replacement implementation simply reads the entire file
        in memory and applies the combined regex."""
        replacement = lambda m: m.group(1) + self.prefix_to_prefix[m.group(2)] + m.group(3)
        data = f.read()
        new_data = re.sub(self.regex, replacement, data)
        # In CPython, re.sub returns the very same object when nothing was
        # replaced, so an identity check is enough to detect the no-op case.
        if id(data) == id(new_data):
            return False
        f.seek(0)
        f.write(new_data)
        f.truncate()
        return True
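
# Illustrative sketch (comments only, hypothetical paths): relocating text
# files from an old install prefix to a new one; apply() returns the list of
# files that were actually changed:
#
#   _replacer = TextFilePrefixReplacer.from_strings_or_bytes(
#       OrderedDict([("/old/prefix", "/new/prefix")])
#   )
#   _changed = _replacer.apply(["/new/prefix/bin/example-script"])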


class BinaryFilePrefixReplacer(PrefixReplacer):
    def __init__(self, prefix_to_prefix, suffix_safety_size=7):
        """
        prefix_to_prefix (OrderedDict): ordered mapping where the keys are
            bytes representing the old prefixes and the values are the new ones
        suffix_safety_size (int): for null-terminated strings, the number of
            suffix bytes that should remain in place to avoid aliasing issues
        """
        assert suffix_safety_size >= 0
        super().__init__(prefix_to_prefix)
        self.suffix_safety_size = suffix_safety_size
        self.regex = self.binary_text_regex(self.prefix_to_prefix.keys(), suffix_safety_size)

    @classmethod
    def binary_text_regex(cls, binary_prefixes, suffix_safety_size=7):
        """
        Create a regex that looks for exact matches of prefixes, and also tries to
        match a C-string type null terminator in a small lookahead window.

        Arguments:
            binary_prefixes (list): List of byte strings of prefixes to match
            suffix_safety_size (int): Size of the lookahead for the null-terminated string.

        Returns: compiled regex
        """
        return re.compile(
            b"("
            + b"|".join(re.escape(p) for p in binary_prefixes)
            + b")([^\0]{0,%d}\0)?" % suffix_safety_size
        )
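
    # Illustrative sketch (comments only, hypothetical prefix b"/old"): with
    # suffix_safety_size=7, searching b"/old/lib\x00rest" finds the null byte in
    # the 8-byte window, so group(1) == b"/old" and group(2) == b"/lib\x00";
    # searching b"/old" followed by 20 non-null bytes leaves group(2) unset,
    # i.e. no terminator was found within the window.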

    @classmethod
    def from_strings_or_bytes(
        cls, prefix_to_prefix: Dict[Prefix, Prefix], suffix_safety_size: int = 7
    ) -> "BinaryFilePrefixReplacer":
        """Create a BinaryFilePrefixReplacer from an ordered prefix to prefix map.

        Arguments:
            prefix_to_prefix (OrderedDict): Ordered mapping of prefix to prefix.
            suffix_safety_size (int): Number of bytes to retain at the end of a C-string
                to avoid binary string-aliasing issues.
        """
        return cls(_prefix_to_prefix_as_bytes(prefix_to_prefix), suffix_safety_size)

    def _apply_to_file(self, f):
        """
        Given a file opened in rb+ mode, apply the string replacements as
        specified by an ordered dictionary of prefix to prefix mappings. This
        method takes special care of null-terminated C-strings. C-string constants
        are problematic because compilers and linkers optimize readonly strings for
        space by aliasing those that share a common suffix (only suffix since all
        of them are null terminated). See https://github.com/spack/spack/pull/31739
        and https://github.com/spack/spack/pull/32253 for details. Our logic matches
        the original prefix with a ``suffix_safety_size + 1`` lookahead for null bytes.
        If no null terminator is found, we simply pad with leading /, assuming that
        it's a long C-string; the full C-string after replacement has a large suffix
        in common with its original value.
        If there *is* a null terminator we can do the same as long as the replacement
        has a sufficiently long common suffix with the original prefix.
        As a last resort when the replacement does not have a long enough common suffix,
        we can try to shorten the string, but this only works if the new length is
        sufficiently short (typically the case when going from large padding -> normal path).
        If the replacement string is longer, or all of the above fails, we error out.

        Arguments:
            f: file opened in rb+ mode

        Returns:
            bool: True if file was modified
        """
        assert f.tell() == 0

        # We *could* read binary data in chunks to avoid loading all in memory,
        # but it's nasty to deal with matches across boundaries, so let's stick to
        # something simple.

        # Nothing has been rewritten yet; flip to True once a replacement is written.
        modified = False

        for match in self.regex.finditer(f.read()):
            # The matching prefix (old) and its replacement (new)
            old = match.group(1)
            new = self.prefix_to_prefix[old]

            # Did we find a trailing null within an N + 1 byte window after the prefix?
            null_terminated = match.end(0) > match.end(1)

            # Suffix string length, excluding the null byte
            # Only makes sense if null_terminated
            suffix_strlen = match.end(0) - match.end(1) - 1

            # How many bytes are we shrinking our string?
            bytes_shorter = len(old) - len(new)

            # We can't make strings larger.
            if bytes_shorter < 0:
                raise CannotGrowString(old, new)

            # If we don't know whether this is a null terminated C-string (we're looking
            # only N + 1 bytes ahead), or if it is and we have a common suffix, we can
            # simply pad with leading dir separators.
            elif (
                not null_terminated
                or suffix_strlen >= self.suffix_safety_size  # == is enough, but let's be defensive
                or old[-self.suffix_safety_size + suffix_strlen :]
                == new[-self.suffix_safety_size + suffix_strlen :]
            ):
                replacement = b"/" * bytes_shorter + new

            # If it *was* null terminated, all that matters is that we can leave N bytes
            # of old suffix in place. Note that > is required since we also insert an
            # additional null terminator.
            elif bytes_shorter > self.suffix_safety_size:
                replacement = new + match.group(2)  # includes the trailing null

            # Otherwise... we can't :(
            else:
                raise CannotShrinkCString(old, new, match.group()[:-1])

            f.seek(match.start())
            f.write(replacement)
            modified = True

        return modified
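
    # Worked example (comments only, hypothetical values): replacing
    # old = b"/spack/padded-prefix" (20 bytes) with new = b"/opt/new" (8 bytes)
    # shrinks the string by 12 bytes, so the in-place replacement becomes
    # b"/" * 12 + b"/opt/new"; the overall length, and therefore every offset
    # that follows in the binary, stays the same.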


class BinaryStringReplacementError(spack.error.SpackError):
    def __init__(self, file_path, old_len, new_len):
        """The size of the file changed after binary path substitution

        Args:
            file_path (str): file with changing size
            old_len (int): original length of the file
            new_len (int): length of the file after substitution
        """
        super().__init__(
            "Doing a binary string replacement in %s failed.\n"
            "The size of the file changed from %s to %s\n"
            "when it should have remanined the same." % (file_path, old_len, new_len)
        )


class BinaryTextReplaceError(spack.error.SpackError):
    def __init__(self, msg):
        msg += (
            " To fix this, compile with more padding "
            "(config:install_tree:padded_length), or install to a shorter prefix."
        )
        super().__init__(msg)


class CannotGrowString(BinaryTextReplaceError):
    def __init__(self, old, new):
        msg = "Cannot replace {!r} with {!r} because the new prefix is longer.".format(old, new)
        super().__init__(msg)


class CannotShrinkCString(BinaryTextReplaceError):
    def __init__(self, old, new, full_old_string):
        # Interpolate the binary string as-is to avoid issues with invalid
        # unicode, which would make for a really bad user experience: an error
        # inside an error. We have no clue whether this is actually a real
        # C-string, nor what encoding it has.
        msg = "Cannot replace {!r} with {!r} in the C-string {!r}.".format(
            old, new, full_old_string
        )
        super().__init__(msg)