diff options
author | Rich Felker <dalias@aerifal.cx> | 2012-04-28 18:05:29 -0400 |
---|---|---|
committer | Rich Felker <dalias@aerifal.cx> | 2012-04-28 18:05:29 -0400 |
commit | 45b38550eec7de580d790440154791c67cae8475 (patch) | |
tree | 93005ff72589f5f0ea2989eb08b383d35198b695 /src/regex | |
parent | a1b442335d6382b0dc0363fdd3a5604493b6a9c7 (diff) | |
download | musl-45b38550eec7de580d790440154791c67cae8475.tar.gz musl-45b38550eec7de580d790440154791c67cae8475.tar.bz2 musl-45b38550eec7de580d790440154791c67cae8475.tar.xz musl-45b38550eec7de580d790440154791c67cae8475.zip |
new fnmatch implementation
unlike the old one, this one's algorithm does not suffer from
potential stack overflow issues or pathologically bad performance on
certain patterns. instead of backtracking, it uses a matching
algorithm which I have not seen before (unsure whether I invented or
re-invented it) that runs in O(1) space and O(nm) time. it may be
possible to improve the time to O(n), but not without significantly
greater complexity.
Diffstat (limited to 'src/regex')
-rw-r--r-- | src/regex/fnmatch.c | 404 |
1 files changed, 273 insertions, 131 deletions
diff --git a/src/regex/fnmatch.c b/src/regex/fnmatch.c index 81f0c7bd..ffd3ea0d 100644 --- a/src/regex/fnmatch.c +++ b/src/regex/fnmatch.c @@ -1,157 +1,299 @@ -#include <fnmatch.h> -#include <wctype.h> +/* + * An implementation of what I call the "Sea of Stars" algorithm for + * POSIX fnmatch(). The basic idea is that we factor the pattern into + * a head component (which we match first and can reject without ever + * measuring the length of the string), an optional tail component + * (which only exists if the pattern contains at least one star), and + * an optional "sea of stars", a set of star-separated components + * between the head and tail. After the head and tail matches have + * been removed from the input string, the components in the "sea of + * stars" are matched sequentially by searching for their first + * occurrence past the end of the previous match. + * + * - Rich Felker, April 2012 + */ + #include <string.h> -#include <wchar.h> +#include <fnmatch.h> #include <stdlib.h> -#include <limits.h> +#include <wchar.h> +#include <wctype.h> -static int next(const char **s) +#define END -1 +#define UNMATCHABLE -2 +#define BRACKET -3 +#define QUESTION -4 +#define STAR -5 + +static int str_next(const char *str, size_t n, size_t *step) { - wchar_t c; - int l = mbtowc(&c, *s, MB_LEN_MAX); - /* hack to allow literal matches of invalid byte sequences */ - if (l < 0) return (unsigned char)*(*s)++ - 0x100; - *s += l; - return c; + if (!n) { + *step = 0; + return 0; + } + if (str[0] >= 128U) { + wchar_t wc; + int k = mbtowc(&wc, str, n); + if (k<0) { + *step = 1; + return -1; + } + *step = k; + return wc; + } + *step = 1; + return str[0]; } -#define BRACKET_ERROR -0x100 -#define BRACKET_NOCHAR -0x101 - -static int bracket_next(const char **s) +static int pat_next(const char *pat, size_t m, size_t *step, int flags) { - int c; - int type; - if (**s == '[') { - type = *(*s+1); - if (type == '.' || type == '=') { - *s += 2; - c = next(s); - if (c <= 0) return BRACKET_ERROR; - if (**s == type && *(*s+1) == ']') { - *s += 2; - return c; + int esc = 0; + if (!m || !*pat) { + *step = 0; + return END; + } + *step = 1; + if (pat[0]=='\\' && !(flags & FNM_NOESCAPE)) { + *step = 2; + pat++; + esc = 1; + goto escaped; + } + if (pat[0]=='[') { + size_t k = 1; + if (k<m) if (pat[k] == '^' || pat[k] == '!') k++; + if (k<m) if (pat[k] == ']') k++; + for (; k<m && pat[k] && pat[k]!=']'; k++) { + if (k+1<m && pat[k+1] && pat[k]=='[' && (pat[k+1]==':' || pat[k+1]=='.' || pat[k+1]=='=')) { + int z = pat[k+1]; + k+=2; + if (k<m && pat[k]) k++; + while (k<m && pat[k] && (pat[k-1]!=z || pat[k]!=']')) k++; + if (k==m || !pat[k]) break; } - for (; **s && (**s != type || *(*s+1) != ']'); (*s)++); - if (!**s) return BRACKET_ERROR; - *s += 2; - return BRACKET_NOCHAR; } + if (k==m || !pat[k]) { + *step = 1; + return '['; + } + *step = k+1; + return BRACKET; + } + if (pat[0] == '*') + return STAR; + if (pat[0] == '?') + return QUESTION; +escaped: + if (pat[0] >= 128U) { + wchar_t wc; + int k = mbtowc(&wc, pat, m); + if (k<0) { + *step = 0; + return UNMATCHABLE; + } + *step = k + esc; + return wc; } - c = next(s); - if (c <= 0) return BRACKET_ERROR; - return c; + return pat[0]; } -#define __FNM_CONT 0x8000 +static int match_bracket(const char *p, int k) +{ + wchar_t wc; + int inv = 0; + p++; + if (*p=='^' || *p=='!') { + inv = 1; + p++; + } + if (*p==']') { + if (k==']') return !inv; + p++; + } else if (*p=='-') { + if (k=='-') return !inv; + p++; + } + wc = p[-1]; + for (; *p != ']'; p++) { + if (p[0]=='-' && p[1]!=']') { + wchar_t wc2; + int l = mbtowc(&wc2, p+1, 4); + if (l < 0) return 0; + if (wc<=wc2 && (unsigned)k-wc <= wc2-wc) return !inv; + p += l-1; + continue; + } + if (p[0]=='[' && (p[1]==':' || p[1]=='.' || p[1]=='=')) { + const char *p0 = p+2; + int z = p[1]; + p+=3; + while (p[-1]!=z || p[0]!=']') p++; + if (z == ':' && p-1-p0 < 16) { + char buf[16]; + memcpy(buf, p0, p-1-p0); + buf[p-1-p0] = 0; + if (iswctype(k, wctype(buf))) return !inv; + } + continue; + } + if (*p < 128U) { + wc = (unsigned char)*p; + } else { + int l = mbtowc(&wc, p, 4); + if (l < 0) return 0; + p += l-1; + } + if (wc==k) return !inv; + } + return inv; +} -int fnmatch(const char *p, const char *s, int flags) +static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n, int flags) { - int c, d, k; - int not; - int match; - int first; - int no_slash = (flags & FNM_PATHNAME) ? '/' : 0; - int no_period = (flags & FNM_PERIOD) && !(flags & __FNM_CONT) ? '.' : 0x100; - const char *p1; - - flags |= __FNM_CONT; - - while ((c = *p++)) { - switch (c) { - case '?': - k = next(&s); - if (!k || k == no_period || k == no_slash) - return FNM_NOMATCH; + const char *p, *ptail, *endpat; + const char *s, *stail, *endstr; + size_t pinc, sinc, tailcnt=0; + int c, k; + + if (flags & FNM_PERIOD) { + if (*str == '.' && *pat != '.') + return FNM_NOMATCH; + } + for (;;) { + switch ((c = pat_next(pat, m, &pinc, flags))) { + case UNMATCHABLE: + return FNM_NOMATCH; + case STAR: + pat++; + m--; break; - case '\\': - if (!(flags & FNM_NOESCAPE)) { - c = *p++; - goto literal; + default: + k = str_next(str, n, &sinc); + if (k <= 0) + return (c==END) ? 0 : FNM_NOMATCH; + str += sinc; + n -= sinc; + if (c == BRACKET) { + if (!match_bracket(pat, k)) + return FNM_NOMATCH; + } else if (c != QUESTION && k != c) { + return FNM_NOMATCH; } - if (*s++ != c) return FNM_NOMATCH; + pat+=pinc; + m-=pinc; + continue; + } + break; + } + + /* Compute real pat length if it was initially unknown/-1 */ + m = strnlen(pat, m); + endpat = pat + m; + + /* Find the last * in pat and count chars needed after it */ + for (p=ptail=pat; p<endpat; p+=pinc) { + switch (pat_next(p, endpat-p, &pinc, flags)) { + case UNMATCHABLE: + return FNM_NOMATCH; + case STAR: + tailcnt=0; + ptail = p+1; break; - case '*': - for (; *p == '*'; p++); - if (*p && !*s) return FNM_NOMATCH; - if (*s == no_period) + default: + tailcnt++; + break; + } + } + + /* Past this point we need not check for UNMATCHABLE in pat, + * because all of pat has already been parsed once. */ + + /* Compute real str length if it was initially unknown/-1 */ + n = strnlen(str, n); + endstr = str + n; + if (n < tailcnt) return FNM_NOMATCH; + + /* Find the final tailcnt chars of str, accounting for UTF-8. + * On illegal sequences we may get it wrong, but in that case + * we necessarily have a matching failure anyway. */ + for (s=endstr; s>str && tailcnt; tailcnt--) { + if (s[-1] < 128U) s--; + else while ((unsigned char)*--s-0x80U<0x40 && s>str); + } + if (tailcnt) return FNM_NOMATCH; + stail = s; + + /* Check that the pat and str tails match */ + p = ptail; + for (;;) { + c = pat_next(p, endpat-p, &pinc, flags); + p += pinc; + if ((k = str_next(s, endstr-s, &sinc)) <= 0) { + if (c != END) return FNM_NOMATCH; + break; + } + s += sinc; + if (c == BRACKET) { + if (!match_bracket(p-pinc, k)) return FNM_NOMATCH; - if (!*p && (!no_slash || !strchr(s, no_slash))) - return 0; - for (; *s; s++) - if (!fnmatch(p, s, flags)) - return 0; - else if (*s == no_slash) - break; + } else if (c != QUESTION && k != c) { return FNM_NOMATCH; - case '[': - p1 = p-1; - not = (*p == '!' || *p == '^'); - if (not) p++; - k = next(&s); - if (!k || k == no_slash || k == no_period) - return FNM_NOMATCH; - match = 0; - first = 1; - for (;;) { - if (!*p) goto literal_bracket; - if (*p == ']' && !first) break; - first = 0; - if (*p == '[' && *(p+1) == ':') { - const char *z; - p += 2; - for (z=p; *z && (*z != ':' || *(z+1) != ']'); z++); - if (!*z || z-p > 32) { /* FIXME: symbolic const? */ - return FNM_NOMATCH; - } else { - char class[33]; - memcpy(class, p, z-p); - class[z-p] = 0; - if (iswctype(k, wctype(class))) - match = 1; - } - p = z+2; - continue; - } - c = bracket_next(&p); - if (c == BRACKET_ERROR) { -literal_bracket: - match = (k=='['); - p = p1; - not = 0; - break; - } - if (c == BRACKET_NOCHAR) - continue; - if (*p == '-' && *(p+1) != ']') { - p++; - d = bracket_next(&p); - if (d == BRACKET_ERROR) - goto literal_bracket; - if (d == BRACKET_NOCHAR) - continue; - if (k >= c && k <= d) - match = 1; - continue; - } - if (k == c) match = 1; + } + } + + /* We're all done with the tails now, so throw them out */ + endstr = stail; + endpat = ptail; + + /* Match pattern components until there are none left */ + while (pat<endpat) { + p = pat; + s = str; + for (;;) { + c = pat_next(p, endpat-p, &pinc, flags); + p += pinc; + /* Encountering * completes/commits a component */ + if (c == STAR) { + pat = p; + str = s; + break; } - p++; - if (not == match) + k = str_next(s, endstr-s, &sinc); + if (!k) return FNM_NOMATCH; - break; - default: - literal: - if (*s++ != c) - return FNM_NOMATCH; - if (c == no_slash && (flags & FNM_PERIOD)) { - no_period = '.'; - continue; + if (c == BRACKET) { + if (!match_bracket(p-pinc, k)) + break; + } else if (c != QUESTION && k != c) { + break; } - break; + s += sinc; } - no_period = 0x100; + if (c == STAR) continue; + /* If we failed, advance str, by 1 char if it's a valid + * char, or past all invalid bytes otherwise. */ + k = str_next(str, endstr-str, &sinc); + if (k > 0) str += sinc; + else for (str++; str_next(str, endstr-str, &sinc)<0; str++); } - if (*s) return FNM_NOMATCH; + return 0; } + +int fnmatch(const char *pat, const char *str, int flags) +{ + const char *s, *p; + size_t inc; + int c; + if (flags & FNM_PATHNAME) for (;;) { + for (s=str; *s && *s!='/'; s++); + for (p=pat; (c=pat_next(p, -1, &inc, flags))!=END && c!='/'; p+=inc); + if (*s && *p!=*s) return FNM_NOMATCH; + if (fnmatch_internal(pat, p-pat, str, s-str, flags)) + return FNM_NOMATCH; + if (!*s && c==END) return 0; + str = s+1; + pat = p+1; + } + return fnmatch_internal(pat, -1, str, -1, flags); +} |