summaryrefslogtreecommitdiff
path: root/src/regex
diff options
context:
space:
mode:
authorRich Felker <dalias@aerifal.cx>2012-05-07 14:50:49 -0400
committerRich Felker <dalias@aerifal.cx>2012-05-07 14:50:49 -0400
commitd7a90b35b9b5336a3e730e09d56f1c53f4e946ee (patch)
tree9352610f23666ee1e0427d90b7ab12aeb8469592 /src/regex
parent2d3e2a7fc13296bb682b9a48b81f1c7130a94bde (diff)
downloadmusl-d7a90b35b9b5336a3e730e09d56f1c53f4e946ee.tar.gz
musl-d7a90b35b9b5336a3e730e09d56f1c53f4e946ee.tar.bz2
musl-d7a90b35b9b5336a3e730e09d56f1c53f4e946ee.tar.xz
musl-d7a90b35b9b5336a3e730e09d56f1c53f4e946ee.zip
fix ugly bugs in TRE regex parser
1. * in BRE is not special at the beginning of the regex or a subexpression. this broke ncurses' build scripts. 2. \\( in BRE is a literal \ followed by a literal (, not a literal \ followed by a subexpression opener. 3. the ^ in \\(^ in BRE is a literal ^ only at the beginning of the entire BRE. POSIX allows treating it as an anchor at the beginning of a subexpression, but TRE's code for checking if it was at the beginning of a subexpression was wrong, and fixing it for the sake of supporting a non-portable usage was too much trouble when just removing this non-portable behavior was much easier. this patch also moved lots of the ugly logic for empty atom checking out of the default/literal case and into new cases for the relevant characters. this should make parsing faster and make the code smaller. if nothing else it's a lot more readable/logical. at some point i'd like to revisit and overhaul lots of this code...
Diffstat (limited to 'src/regex')
-rw-r--r--src/regex/regcomp.c91
1 files changed, 31 insertions, 60 deletions
diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c
index f8ebe409..fa79e2e5 100644
--- a/src/regex/regcomp.c
+++ b/src/regex/regcomp.c
@@ -961,6 +961,8 @@ tre_parse(tre_parse_ctx_t *ctx)
tre_stack_t *stack = ctx->stack;
int bottom = tre_stack_num_objects(stack);
int depth = 0;
+ wchar_t wc;
+ int clen;
if (!ctx->nofirstsub)
{
@@ -1155,10 +1157,9 @@ tre_parse(tre_parse_ctx_t *ctx)
{
case CHAR_LPAREN: /* parenthesized subexpression */
- if (ctx->cflags & REG_EXTENDED
- || (ctx->re > ctx->re_start
- && *(ctx->re - 1) == CHAR_BACKSLASH))
+ if (ctx->cflags & REG_EXTENDED)
{
+ lparen:
depth++;
{
ctx->re++;
@@ -1174,25 +1175,6 @@ tre_parse(tre_parse_ctx_t *ctx)
goto parse_literal;
break;
- case CHAR_RPAREN: /* end of current subexpression */
- if ((ctx->cflags & REG_EXTENDED && depth > 0)
- || (ctx->re > ctx->re_start
- && *(ctx->re - 1) == CHAR_BACKSLASH))
- {
- /* We were expecting an atom, but instead the current
- subexpression was closed. POSIX leaves the meaning of
- this to be implementation-defined. We interpret this as
- an empty expression (which matches an empty string). */
- result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
- if (result == NULL)
- return REG_ESPACE;
- if (!(ctx->cflags & REG_EXTENDED))
- ctx->re--;
- }
- else
- goto parse_literal;
- break;
-
case CHAR_LBRACKET: /* bracket expression */
ctx->re++;
status = tre_parse_bracket(ctx, &result);
@@ -1203,13 +1185,14 @@ tre_parse(tre_parse_ctx_t *ctx)
case CHAR_BACKSLASH:
/* If this is "\(" or "\)" chew off the backslash and
try again. */
- if (!(ctx->cflags & REG_EXTENDED)
- && (*(ctx->re + 1) == CHAR_LPAREN
- || *(ctx->re + 1) == CHAR_RPAREN))
+ if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
{
ctx->re++;
- STACK_PUSHX(stack, int, PARSE_ATOM);
- break;
+ goto lparen;
+ }
+ if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
+ {
+ goto empty_atom;
}
/* If a macro is used, parse the expanded macro recursively. */
@@ -1369,12 +1352,9 @@ tre_parse(tre_parse_ctx_t *ctx)
break;
case CHAR_CARET: /* beginning of line assertion */
- /* '^' has a special meaning everywhere in EREs, and in the
- beginning of the RE and after \( is BREs. */
+ /* '^' has a special meaning everywhere in EREs, and at
+ beginning of BRE. */
if (ctx->cflags & REG_EXTENDED
- || (ctx->re - 2 >= ctx->re_start
- && *(ctx->re - 2) == CHAR_BACKSLASH
- && *(ctx->re - 1) == CHAR_LPAREN)
|| ctx->re == ctx->re_start)
{
result = tre_ast_new_literal(ctx->mem, ASSERTION,
@@ -1389,10 +1369,8 @@ tre_parse(tre_parse_ctx_t *ctx)
case CHAR_DOLLAR: /* end of line assertion. */
/* '$' is special everywhere in EREs, and in the end of the
- string and before \) is BREs. */
+ string in BREs. */
if (ctx->cflags & REG_EXTENDED
- || (*(ctx->re + 1) == CHAR_BACKSLASH
- && *(ctx->re + 2) == CHAR_RPAREN)
|| !*(ctx->re + 1))
{
result = tre_ast_new_literal(ctx->mem, ASSERTION,
@@ -1405,34 +1383,27 @@ tre_parse(tre_parse_ctx_t *ctx)
goto parse_literal;
break;
+ case CHAR_RPAREN:
+ if (!depth)
+ goto parse_literal;
+ case CHAR_STAR:
+ case CHAR_PIPE:
+ case CHAR_LBRACE:
+ case CHAR_PLUS:
+ case CHAR_QUESTIONMARK:
+ if (!(ctx->cflags & REG_EXTENDED))
+ goto parse_literal;
+
+ empty_atom:
+ result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
+ if (!result)
+ return REG_ESPACE;
+ break;
+
default:
parse_literal:
- /* We are expecting an atom. If the subexpression (or the whole
- regexp ends here, we interpret it as an empty expression
- (which matches an empty string). */
- if (
- (!*ctx->re
- || *ctx->re == CHAR_STAR
- || (ctx->cflags & REG_EXTENDED
- && (*ctx->re == CHAR_PIPE
- || *ctx->re == CHAR_LBRACE
- || *ctx->re == CHAR_PLUS
- || *ctx->re == CHAR_QUESTIONMARK))
- /* Test for "\)" in BRE mode. */
- || (!(ctx->cflags & REG_EXTENDED)
- && !*(ctx->re + 1)
- && *ctx->re == CHAR_BACKSLASH
- && *(ctx->re + 1) == CHAR_LBRACE)))
- {
- result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
- if (!result)
- return REG_ESPACE;
- break;
- }
-
- wchar_t wc;
- int clen = mbtowc(&wc, ctx->re, -1);
+ clen = mbtowc(&wc, ctx->re, -1);
if (clen<0) clen=1, wc=WEOF;
/* Note that we can't use an tre_isalpha() test here, since there