diff options
author | Harmen Stoppels <harmenstoppels@gmail.com> | 2021-10-12 09:05:11 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-10-12 00:05:11 -0700 |
commit | 0c0831861c57b747d915731ef3ad214346a9dcbe (patch) | |
tree | 9fc03780ccfdcc1d743b7fb38b4c650aee0fee47 | |
parent | 580a20c2436110f6b10ae3a62242c7b01f24b9a9 (diff) | |
download | spack-0c0831861c57b747d915731ef3ad214346a9dcbe.tar.gz spack-0c0831861c57b747d915731ef3ad214346a9dcbe.tar.bz2 spack-0c0831861c57b747d915731ef3ad214346a9dcbe.tar.xz spack-0c0831861c57b747d915731ef3ad214346a9dcbe.zip |
Avoid quadratic complexity in log parser (#26568)
TL;DR: there are matching groups trying to match 1 or more occurrences of
something. We don't use the matching group. Therefore it's sufficient to test
for 1 occurrence. This reduces quadratic complexity to linear time.
---
When parsing logs of an mpich build, I'm getting a 4 minute (!!) wait
with 16 threads for regexes to run:
```
In [1]: %time p.parse("mpich.log")
Wall time: 4min 14s
```
That's really unacceptably slow...
After some digging, it seems a few regexes tend to have `O(n^2)` scaling
where `n` is the string / log line length. I don't think they *necessarily*
should scale like that, but it seems that way. The common pattern is this
```
([^:]+): error
```
which matches `: error` literally, and then one or more non-colons before that. So
for a log line like this:
```
abcdefghijklmnopqrstuvwxyz: error etc etc
```
Any of these are potential group matches when using `search` in Python:
```
abcdefghijklmnopqrstuvwxyz
bcdefghijklmnopqrstuvwxyz
cdefghijklmnopqrstuvwxyz
⋮
yz
z
```
but clearly the capture group should return the longest match.
My hypothesis is that Python has a very bad implementation of `search`
that somehow considers all of these, even though it can be implemented
in linear time by scanning for `: error` first, and then greedily expanding
the longest possible `[^:]+` match to the left. If Python indeed considers
all possible matches, then with `n` matches of length `1 .. n` you
see the `O(n^2)` slowness (I verified this by replacing + with {1,k}
and doubling `k`, it doubles the execution time indeed).
This PR fixes this by removing the `+`, so effectively changing the
O(n^2) into an O(n) worst case.
The reason we are fine with dropping `+` is that we don't use the
capture group anywhere, so we just ensure `:: error` is not a match
but `x: error` is.
After going from O(n^2) to O(n), the 15MB mpich build log is parsed
in `1.288s`, so about 200x faster.
Just to be sure I've also updated `^CMake Error.*:` to `^CMake Error`,
so that it does not match with all the possible `:`'s in the line.
Another option is to use `.*?` there to make it quit scanning as soon as
possible, but what line that starts with `CMake Error` does not have
a colon is really a false positive...
-rw-r--r-- | lib/spack/external/ctest_log_parser.py | 83 |
1 files changed, 21 insertions, 62 deletions
diff --git a/lib/spack/external/ctest_log_parser.py b/lib/spack/external/ctest_log_parser.py index 4e80a2b688..61cc6df029 100644 --- a/lib/spack/external/ctest_log_parser.py +++ b/lib/spack/external/ctest_log_parser.py @@ -77,47 +77,7 @@ from contextlib import contextmanager from six import StringIO from six import string_types -class prefilter(object): - """Make regular expressions faster with a simple prefiltering predicate. - - Some regular expressions seem to be much more costly than others. In - most cases, we can evaluate a simple precondition, e.g.:: - - lambda x: "error" in x - - to avoid evaluating expensive regexes on all lines in a file. This - can reduce parse time for large files by orders of magnitude when - evaluating lots of expressions. - - A ``prefilter`` object is designed to act like a regex,, but - ``search`` and ``match`` check the precondition before bothering to - evaluate the regular expression. - - Note that ``match`` and ``search`` just return ``True`` and ``False`` - at the moment. Make them return a ``MatchObject`` or ``None`` if it - becomes necessary. 
- """ - def __init__(self, precondition, *patterns): - self.patterns = [re.compile(p) for p in patterns] - self.pre = precondition - self.pattern = "\n ".join( - ('MERGED:',) + patterns) - - def search(self, text): - return self.pre(text) and any(p.search(text) for p in self.patterns) - - def match(self, text): - return self.pre(text) and any(p.match(text) for p in self.patterns) - - _error_matches = [ - prefilter( - lambda x: any(s in x for s in ( - 'Error:', 'error', 'undefined reference', 'multiply defined')), - "([^:]+): error[ \\t]*[0-9]+[ \\t]*:", - "([^:]+): (Error:|error|undefined reference|multiply defined)", - "([^ :]+) ?: (error|fatal error|catastrophic error)", - "([^:]+)\\(([^\\)]+)\\) ?: (error|fatal error|catastrophic error)"), "^FAIL: ", "^FATAL: ", "^failed ", @@ -127,6 +87,8 @@ _error_matches = [ "^[Ss]egmentation [Vv]iolation", "^[Ss]egmentation [Ff]ault", ":.*[Pp]ermission [Dd]enied", + "[^ :]:[0-9]+: [^ \\t]", + "[^:]: error[ \\t]*[0-9]+[ \\t]*:", "^Error ([0-9]+):", "^Fatal", "^[Ee]rror: ", @@ -136,6 +98,9 @@ _error_matches = [ "^cc[^C]*CC: ERROR File = ([^,]+), Line = ([0-9]+)", "^ld([^:])*:([ \\t])*ERROR([^:])*:", "^ild:([ \\t])*\\(undefined symbol\\)", + "[^ :] : (error|fatal error|catastrophic error)", + "[^:]: (Error:|error|undefined reference|multiply defined)", + "[^:]\\([^\\)]+\\) ?: (error|fatal error|catastrophic error)", "^fatal error C[0-9]+:", ": syntax error ", "^collect2: ld returned 1 exit status", @@ -144,7 +109,7 @@ _error_matches = [ "^Unresolved:", "Undefined symbol", "^Undefined[ \\t]+first referenced", - "^CMake Error.*:", + "^CMake Error", ":[ \\t]cannot find", ":[ \\t]can't find", ": \\*\\*\\* No rule to make target [`'].*\\'. 
Stop", @@ -180,6 +145,7 @@ _error_exceptions = [ "instantiated from ", "candidates are:", ": warning", + ": WARNING", ": \\(Warning\\)", ": note", " ok", @@ -187,38 +153,32 @@ _error_exceptions = [ "makefile:", "Makefile:", ":[ \\t]+Where:", - "([^ :]+):([0-9]+): Warning", + "[^ :]:[0-9]+: Warning", "------ Build started: .* ------", ] #: Regexes to match file/line numbers in error/warning messages _warning_matches = [ - prefilter( - lambda x: 'warning' in x, - "([^ :]+):([0-9]+): warning:", - "([^:]+): warning ([0-9]+):", - "([^:]+): warning[ \\t]*[0-9]+[ \\t]*:", - "([^ :]+) : warning", - "([^:]+): warning"), - prefilter( - lambda x: 'note:' in x, - "^([^ :]+):([0-9]+): note:"), - prefilter( - lambda x: any(s in x for s in ('Warning', 'Warnung')), - "^(Warning|Warnung) ([0-9]+):", - "^(Warning|Warnung)[ :]", - "^cxx: Warning:", - "([^ :]+):([0-9]+): (Warning|Warnung)", - "^CMake Warning.*:"), - "file: .* has no symbols", + "[^ :]:[0-9]+: warning:", + "[^ :]:[0-9]+: note:", "^cc[^C]*CC: WARNING File = ([^,]+), Line = ([0-9]+)", "^ld([^:])*:([ \\t])*WARNING([^:])*:", + "[^:]: warning [0-9]+:", "^\"[^\"]+\", line [0-9]+: [Ww](arning|arnung)", + "[^:]: warning[ \\t]*[0-9]+[ \\t]*:", + "^(Warning|Warnung) ([0-9]+):", + "^(Warning|Warnung)[ :]", "WARNING: ", + "[^ :] : warning", + "[^:]: warning", "\", line [0-9]+\\.[0-9]+: [0-9]+-[0-9]+ \\([WI]\\)", + "^cxx: Warning:", + "file: .* has no symbols", + "[^ :]:[0-9]+: (Warning|Warnung)", "\\([0-9]*\\): remark #[0-9]*", "\".*\", line [0-9]+: remark\\([0-9]*\\):", "cc-[0-9]* CC: REMARK File = .*, Line = [0-9]*", + "^CMake Warning", "^\\[WARNING\\]", ] @@ -349,8 +309,7 @@ def _profile_match(matches, exceptions, line, match_times, exc_times): def _parse(lines, offset, profile): def compile(regex_array): - return [regex if isinstance(regex, prefilter) else re.compile(regex) - for regex in regex_array] + return [re.compile(regex) for regex in regex_array] error_matches = compile(_error_matches) error_exceptions = 
compile(_error_exceptions) |