diff options
author | Szabolcs Nagy <nsz@port70.net> | 2016-11-24 01:44:49 +0100 |
---|---|---|
committer | Rich Felker <dalias@aerifal.cx> | 2016-12-16 23:26:29 -0500 |
commit | 7a4c25d78030b3a43ed5c8dd1a456f73cb990f44 (patch) | |
tree | e4855beab38f5f84fd4b5d35f88748bdae78b454 | |
parent | adfe682eb0d77c6afc751f5e407d9da39623c24e (diff) | |
download | musl-7a4c25d78030b3a43ed5c8dd1a456f73cb990f44.tar.gz musl-7a4c25d78030b3a43ed5c8dd1a456f73cb990f44.tar.bz2 musl-7a4c25d78030b3a43ed5c8dd1a456f73cb990f44.tar.xz musl-7a4c25d78030b3a43ed5c8dd1a456f73cb990f44.zip |
handle ^ and $ in BRE subexpression start and end as anchors
In BRE, ^ is an anchor at the beginning of an expression, optionally
it may be an anchor at the beginning of a subexpression and must be
treated as a literal otherwise.
Previously musl treated ^ in subexpressions as literal, but at least
glibc and gnu sed treats it as an anchor and that's the more useful
behaviour: it can always be escaped to get back the literal meaning.
Same for $ at the end of a subexpression.
Portable BRE should not rely on this, but there are sed commands in
build scripts which do.
This changes the meaning of the BREs:
\(^a\)
\(a\|^b\)
\(a$\)
\(a$\|b\)
-rw-r--r-- | src/regex/regcomp.c | 21 |
1 files changed, 12 insertions, 9 deletions
diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c index 65f2fd0b..5a7b53a7 100644 --- a/src/regex/regcomp.c +++ b/src/regex/regcomp.c @@ -401,8 +401,8 @@ typedef struct { tre_ast_node_t *n; /* Position in the regexp pattern after a parse function returns. */ const char *s; - /* The first character of the regexp. */ - const char *re; + /* The first character of the last subexpression parsed. */ + const char *start; /* Current submatch ID. */ int submatch_id; /* Current position (number of literal). */ @@ -876,14 +876,14 @@ static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s) break; case '^': /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */ - if (!ere && s != ctx->re) + if (!ere && s != ctx->start) goto parse_literal; node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1); s++; break; case '$': - /* '$' is special everywhere in EREs, and in the end of the string in BREs. */ - if (!ere && s[1]) + /* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */ + if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|'))) goto parse_literal; node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1); s++; @@ -944,7 +944,7 @@ static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) { tre_ast_node_t *nbranch=0, *nunion=0; int ere = ctx->cflags & REG_EXTENDED; - const char *s = ctx->re; + const char *s = ctx->start; int subid = 0; int depth = 0; reg_errcode_t err; @@ -962,6 +962,7 @@ static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) s++; depth++; nbranch = nunion = 0; + ctx->start = s; continue; } if ((!ere && *s == '\\' && s[1] == ')') || @@ -994,8 +995,8 @@ static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) if (*s=='\\') s++; - /* handle ^* at the start of a complete BRE. */ - if (!ere && s==ctx->re+1 && s[-1]=='^') + /* handle ^* at the start of a BRE. */ + if (!ere && s==ctx->start+1 && s[-1]=='^') break; /* extension: multiple consecutive *+?{,} is unspecified, @@ -1038,8 +1039,10 @@ static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) if (c == '\\' && s[1] == '|') { s+=2; + ctx->start = s; } else if (c == '|') { s++; + ctx->start = s; } else { if (c == '\\') { if (!depth) return REG_EPAREN; @@ -2705,7 +2708,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags) memset(&parse_ctx, 0, sizeof(parse_ctx)); parse_ctx.mem = mem; parse_ctx.stack = stack; - parse_ctx.re = regex; + parse_ctx.start = regex; parse_ctx.cflags = cflags; parse_ctx.max_backref = -1; errcode = tre_parse(&parse_ctx); |