diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/regex/regcomp.c | 1715 |
1 files changed, 634 insertions, 1081 deletions
diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c index d9076275..4cdaa1ea 100644 --- a/src/regex/regcomp.c +++ b/src/regex/regcomp.c @@ -34,6 +34,7 @@ #include <regex.h> #include <limits.h> #include <stdint.h> +#include <ctype.h> #include "tre.h" @@ -135,108 +136,88 @@ typedef struct { tre_ast_node_t *right; } tre_union_t; -static tre_ast_node_t * -tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size); - -static tre_ast_node_t * -tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position); - -static tre_ast_node_t * -tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, - int minimal); - -static tre_ast_node_t * -tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right); - -static tre_ast_node_t * -tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, - tre_ast_node_t *right); - static tre_ast_node_t * -tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size) +tre_ast_new_node(tre_mem_t mem, int type, void *obj) { - tre_ast_node_t *node; - - node = tre_mem_calloc(mem, sizeof(*node)); - if (!node) - return NULL; - node->obj = tre_mem_calloc(mem, size); - if (!node->obj) - return NULL; - node->type = type; - node->nullable = -1; - node->submatch_id = -1; - - return node; + tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node); + if (!node || !obj) + return 0; + node->obj = obj; + node->type = type; + node->nullable = -1; + node->submatch_id = -1; + return node; } static tre_ast_node_t * tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position) { - tre_ast_node_t *node; - tre_literal_t *lit; - - node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t)); - if (!node) - return NULL; - lit = node->obj; - lit->code_min = code_min; - lit->code_max = code_max; - lit->position = position; - - return node; + tre_ast_node_t *node; + tre_literal_t *lit; + + lit = tre_mem_calloc(mem, sizeof *lit); + node = tre_ast_new_node(mem, LITERAL, lit); + if (!node) + return 0; + lit->code_min = code_min; + lit->code_max = code_max; + lit->position = position; + return node; } static tre_ast_node_t * -tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, - int minimal) +tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal) { - tre_ast_node_t *node; - tre_iteration_t *iter; - - node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t)); - if (!node) - return NULL; - iter = node->obj; - iter->arg = arg; - iter->min = min; - iter->max = max; - iter->minimal = minimal; - node->num_submatches = arg->num_submatches; - - return node; + tre_ast_node_t *node; + tre_iteration_t *iter; + + iter = tre_mem_calloc(mem, sizeof *iter); + node = tre_ast_new_node(mem, ITERATION, iter); + if (!node) + return 0; + iter->arg = arg; + iter->min = min; + iter->max = max; + iter->minimal = minimal; + node->num_submatches = arg->num_submatches; + return node; } static tre_ast_node_t * tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right) { - tre_ast_node_t *node; - - node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t)); - if (node == NULL) - return NULL; - ((tre_union_t *)node->obj)->left = left; - ((tre_union_t *)node->obj)->right = right; - node->num_submatches = left->num_submatches + right->num_submatches; - - return node; + tre_ast_node_t *node; + tre_union_t *un; + + if (!left) + return right; + un = tre_mem_calloc(mem, sizeof *un); + node = tre_ast_new_node(mem, UNION, un); + if (!node || !right) + return 0; + un->left = left; + un->right = right; + node->num_submatches = left->num_submatches + right->num_submatches; + return node; } static tre_ast_node_t * -tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, - tre_ast_node_t *right) +tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right) { - tre_ast_node_t *node; - - node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t)); - if (node == NULL) - return NULL; - ((tre_catenation_t *)node->obj)->left = left; - ((tre_catenation_t *)node->obj)->right = right; - node->num_submatches = left->num_submatches + right->num_submatches; - - return node; + tre_ast_node_t *node; + tre_catenation_t *cat; + + if (!left) + return right; + cat = tre_mem_calloc(mem, sizeof *cat); + node = tre_ast_new_node(mem, CATENATION, cat); + if (!node) + return 0; + cat->left = left; + cat->right = right; + node->num_submatches = left->num_submatches + right->num_submatches; + return node; } @@ -412,1077 +393,654 @@ define_popf(voidptr, void *) /* Parse context. */ typedef struct { - /* Memory allocator. The AST is allocated using this. */ - tre_mem_t mem; - /* Stack used for keeping track of regexp syntax. */ - tre_stack_t *stack; - /* The parse result. */ - tre_ast_node_t *result; - /* The regexp to parse and its length. */ - const char *re; - /* The first character of the entire regexp. */ - const char *re_start; - /* Current submatch ID. */ - int submatch_id; - /* Current position (number of literal). */ - int position; - /* The highest back reference or -1 if none seen so far. */ - int max_backref; - /* This flag is set if the regexp uses approximate matching. */ - int have_approx; - /* Compilation flags. */ - int cflags; - /* If this flag is set the top-level submatch is not captured. */ - int nofirstsub; + /* Memory allocator. The AST is allocated using this. */ + tre_mem_t mem; + /* Stack used for keeping track of regexp syntax. */ + tre_stack_t *stack; + /* The parsed node after a parse function returns. */ + tre_ast_node_t *n; + /* Position in the regexp pattern after a parse function returns. */ + const char *s; + /* The first character of the regexp. */ + const char *re; + /* Current submatch ID. */ + int submatch_id; + /* Current position (number of literal). */ + int position; + /* The highest back reference or -1 if none seen so far. */ + int max_backref; + /* Compilation flags. */ + int cflags; } tre_parse_ctx_t; -/* Parses a wide character regexp pattern into a syntax tree. This parser - handles both syntaxes (BRE and ERE), including the TRE extensions. */ -static reg_errcode_t -tre_parse(tre_parse_ctx_t *ctx); - - -/* - This parser is just a simple recursive descent parser for POSIX.2 - regexps. The parser supports both the obsolete default syntax and - the "extended" syntax, and some nonstandard extensions. -*/ - -/* Characters with special meanings in regexp syntax. */ -#define CHAR_PIPE '|' -#define CHAR_LPAREN '(' -#define CHAR_RPAREN ')' -#define CHAR_LBRACE '{' -#define CHAR_RBRACE '}' -#define CHAR_LBRACKET '[' -#define CHAR_RBRACKET ']' -#define CHAR_MINUS '-' -#define CHAR_STAR '*' -#define CHAR_QUESTIONMARK '?' -#define CHAR_PLUS '+' -#define CHAR_PERIOD '.' -#define CHAR_COLON ':' -#define CHAR_EQUAL '=' -#define CHAR_COMMA ',' -#define CHAR_CARET '^' -#define CHAR_DOLLAR '$' -#define CHAR_BACKSLASH '\\' -#define CHAR_HASH '#' -#define CHAR_TILDE '~' - - /* Some macros for expanding \w, \s, etc. */ -static const struct tre_macro_struct { - const char c; - const char *expansion; -} tre_macros[] = - { {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, - {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, - {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, - {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, - { 0, NULL } - }; - +static const struct { + char c; + const char *expansion; +} tre_macros[] = { + {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, + {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, + {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, + {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, + { 0, 0 } +}; /* Expands a macro delimited by `regex' and `regex_end' to `buf', which must have at least `len' items. Sets buf[0] to zero if the there is no match in `tre_macros'. */ -static const char * -tre_expand_macro(const char *regex) +static const char *tre_expand_macro(const char *s) { - int i; - - if (!*regex) - return 0; - - for (i = 0; tre_macros[i].expansion && tre_macros[i].c != *regex; i++); - return tre_macros[i].expansion; + int i; + for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++); + return tre_macros[i].expansion; } -static reg_errcode_t -tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, - tre_ast_node_t ***items) +static int +tre_compare_lit(const void *a, const void *b) { - reg_errcode_t status; - tre_ast_node_t **array = *items; - /* Allocate more space if necessary. */ - if (*i >= *max_i) - { - tre_ast_node_t **new_items; - /* If the array is already 1024 items large, give up -- there's - probably an error in the regexp (e.g. not a '\0' terminated - string and missing ']') */ - if (*max_i > 1024) - return REG_ESPACE; - *max_i *= 2; - new_items = xrealloc(array, sizeof(*array) * *max_i); - if (new_items == NULL) - return REG_ESPACE; - *items = array = new_items; - } - array[*i] = tre_ast_new_literal(mem, min, max, -1); - status = array[*i] == NULL ? REG_ESPACE : REG_OK; - (*i)++; - return status; + const tre_literal_t *const *la = a; + const tre_literal_t *const *lb = b; + /* assumes the range of valid code_min is < INT_MAX */ + return la[0]->code_min - lb[0]->code_min; } +struct literals { + tre_mem_t mem; + tre_literal_t **a; + int len; + int cap; +}; -static int -tre_compare_items(const void *a, const void *b) +static tre_literal_t *tre_new_lit(struct literals *p) { - const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a; - const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b; - tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj; - int a_min = l_a->code_min, b_min = l_b->code_min; - - if (a_min < b_min) - return -1; - else if (a_min > b_min) - return 1; - else - return 0; + tre_literal_t **a; + if (p->len >= p->cap) { + if (p->cap >= 1<<15) + return 0; + p->cap *= 2; + a = xrealloc(p->a, p->cap * sizeof *p->a); + if (!a) + return 0; + p->a = a; + } + a = p->a + p->len++; + *a = tre_mem_calloc(p->mem, sizeof **a); + return *a; } -/* Maximum number of character classes that can occur in a negated bracket - expression. */ -#define MAX_NEG_CLASSES 64 - -/* Maximum length of character class names. */ -#define MAX_CLASS_NAME - -static reg_errcode_t -tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, - tre_ctype_t neg_classes[], int *num_neg_classes, - tre_ast_node_t ***items, int *num_items, - int *items_size) +static int add_icase_literals(struct literals *ls, int min, int max) { - const char *re = ctx->re; - reg_errcode_t status = REG_OK; - tre_ctype_t class = (tre_ctype_t)0; - int i = *num_items; - int max_i = *items_size; - int skip; - - /* Build an array of the items in the bracket expression. */ - while (status == REG_OK) - { - skip = 0; - if (!*re) - { - status = REG_EBRACK; - } - else if (*re == CHAR_RBRACKET && re > ctx->re) - { - re++; - break; + tre_literal_t *lit; + int b, e, c; + for (c=min; c<=max; ) { + /* assumes islower(c) and isupper(c) are exclusive + and toupper(c)!=c if islower(c). + multiple opposite case characters are not supported */ + if (tre_islower(c)) { + b = e = tre_toupper(c); + for (c++, e++; c<=max; c++, e++) + if (tre_toupper(c) != e) break; + } else if (tre_isupper(c)) { + b = e = tre_tolower(c); + for (c++, e++; c<=max; c++, e++) + if (tre_tolower(c) != e) break; + } else { + c++; + continue; + } + lit = tre_new_lit(ls); + if (!lit) + return -1; + lit->code_min = b; + lit->code_max = e-1; + lit->position = -1; } - else - { - tre_cint_t min = 0, max = 0; - wchar_t wc; - int clen = mbtowc(&wc, re, -1); + return 0; +} - if (clen<0) clen=1, wc=WEOF; - class = (tre_ctype_t)0; - if (*(re + clen) == CHAR_MINUS && *(re + clen + 1) != CHAR_RBRACKET) - { - min = wc; - re += clen+1; - clen = mbtowc(&wc, re, -1); - if (clen<0) clen=1, wc=WEOF; - max = wc; - re += clen; - /* XXX - Should use collation order instead of encoding values - in character ranges. */ - if (min > max) - status = REG_ERANGE; - } - else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD) - status = REG_ECOLLATE; - else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL) - status = REG_ECOLLATE; - else if (*re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON) - { - char tmp_str[64]; - const char *endptr = re + 2; - int len; - while (*endptr && *endptr != CHAR_COLON) - endptr++; - if (*endptr) - { - len = MIN(endptr - re - 2, 63); - strncpy(tmp_str, re + 2, len); - tmp_str[len] = '\0'; - class = tre_ctype(tmp_str); - if (!class) - status = REG_ECTYPE; - re = endptr + 2; - } - else - status = REG_ECTYPE; - min = 0; - max = TRE_CHAR_MAX; - } - else - { - if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET - && ctx->re != re) - /* Two ranges are not allowed to share and endpoint. */ - status = REG_ERANGE; - min = max = wc; - re += clen; - } +/* Maximum number of character classes in a negated bracket expression. */ +#define MAX_NEG_CLASSES 64 - if (status != REG_OK) - break; +struct neg { + int negate; + int len; + tre_ctype_t a[MAX_NEG_CLASSES]; +}; - if (class && negate) - if (*num_neg_classes >= MAX_NEG_CLASSES) - status = REG_ESPACE; - else - neg_classes[(*num_neg_classes)++] = class; - else if (!skip) - { - status = tre_new_item(ctx->mem, min, max, &i, &max_i, items); - if (status != REG_OK) - break; - ((tre_literal_t*)((*items)[i-1])->obj)->class = class; - } +// TODO: parse bracket into a set of non-overlapping [lo,hi] ranges - /* Add opposite-case counterpoints if REG_ICASE is present. - This is broken if there are more than two "same" characters. */ - if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip) - { - tre_cint_t cmin, ccurr; +/* +bracket grammar: +Bracket = '[' List ']' | '[^' List ']' +List = Term | List Term +Term = Char | Range | Chclass | Eqclass +Range = Char '-' Char | Char '-' '-' +Char = Coll | coll_single +Meta = ']' | '-' +Coll = '[.' coll_single '.]' | '[.' coll_multi '.]' | '[.' Meta '.]' +Eqclass = '[=' coll_single '=]' | '[=' coll_multi '=]' +Chclass = '[:' class ':]' + +coll_single is a single char collating element but it can be + '-' only at the beginning or end of a List and + ']' only at the beginning of a List and + '^' anywhere except after the openning '[' +*/ - while (min <= max) - { - if (tre_islower(min)) - { - cmin = ccurr = tre_toupper(min++); - while (tre_islower(min) && tre_toupper(min) == ccurr + 1 - && min <= max) - ccurr = tre_toupper(min++); - status = tre_new_item(ctx->mem, cmin, ccurr, - &i, &max_i, items); - } - else if (tre_isupper(min)) - { - cmin = ccurr = tre_tolower(min++); - while (tre_isupper(min) && tre_tolower(min) == ccurr + 1 - && min <= max) - ccurr = tre_tolower(min++); - status = tre_new_item(ctx->mem, cmin, ccurr, - &i, &max_i, items); - } - else min++; - if (status != REG_OK) - break; +static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg) +{ + const char *start = s; + tre_ctype_t class; + int min, max; + wchar_t wc; + int len; + + for (;;) { + class = 0; + len = mbtowc(&wc, s, -1); + if (len <= 0) + return *s ? REG_BADPAT : REG_EBRACK; + if (*s == ']' && s != start) { + ctx->s = s+1; + return REG_OK; + } + if (*s == '-' && s != start && s[1] != ']' && + /* extension: [a-z--@] is accepted as [a-z]|[--@] */ + (s[1] != '-' || s[2] == ']')) + return REG_ERANGE; + if (*s == '[' && (s[1] == '.' || s[1] == '=')) + /* collating symbols and equivalence classes are not supported */ + return REG_ECOLLATE; + if (*s == '[' && s[1] == ':') { + char tmp[CHARCLASS_NAME_MAX+1]; + s += 2; + for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) { + if (s[len] == ':') { + memcpy(tmp, s, len); + tmp[len] = 0; + class = tre_ctype(tmp); + break; + } + } + if (!class || s[len+1] != ']') + return REG_ECTYPE; + min = 0; + max = TRE_CHAR_MAX; + s += len+2; + } else { + min = max = wc; + s += len; + if (*s == '-' && s[1] != ']') { + s++; + len = mbtowc(&wc, s, -1); + max = wc; + /* XXX - Should use collation order instead of + encoding values in character ranges. */ + if (len <= 0 || min > max) + return REG_ERANGE; + s += len; + } + } + + if (class && neg->negate) { + if (neg->len >= MAX_NEG_CLASSES) + return REG_ESPACE; + neg->a[neg->len++] = class; + } else { + tre_literal_t *lit = tre_new_lit(ls); + if (!lit) + return REG_ESPACE; + lit->code_min = min; + lit->code_max = max; + lit->class = class; + lit->position = -1; + + /* Add opposite-case codepoints if REG_ICASE is present. + It seems that POSIX requires that bracket negation + should happen before case-folding, but most practical + implementations do it the other way around. Changing + the order would need efficient representation of + case-fold ranges and bracket range sets even with + simple patterns so this is ok for now. */ + if (ctx->cflags & REG_ICASE && !class) + if (add_icase_literals(ls, min, max)) + return REG_ESPACE; } - if (status != REG_OK) - break; - } } - } - *num_items = i; - *items_size = max_i; - ctx->re = re; - return status; } -static reg_errcode_t -tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) +static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s) { - tre_ast_node_t *node = NULL; - int negate = 0; - reg_errcode_t status = REG_OK; - tre_ast_node_t **items, *u, *n; - int i = 0, j, max_i = 32, curr_max, curr_min; - tre_ctype_t neg_classes[MAX_NEG_CLASSES]; - int num_neg_classes = 0; - - /* Start off with an array of `max_i' elements. */ - items = xmalloc(sizeof(*items) * max_i); - if (items == NULL) - return REG_ESPACE; - - if (*ctx->re == CHAR_CARET) - { - negate = 1; - ctx->re++; - } - - status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes, - &items, &i, &max_i); - - if (status != REG_OK) - goto parse_bracket_done; - - /* Sort the array if we need to negate it. */ - if (negate) - qsort(items, (unsigned)i, sizeof(*items), tre_compare_items); - - curr_max = curr_min = 0; - /* Build a union of the items in the array, negated if necessary. */ - for (j = 0; j < i && status == REG_OK; j++) - { - int min, max; - tre_literal_t *l = items[j]->obj; - min = l->code_min; - max = l->code_max; - - if (negate) - { - if (min < curr_max) - { - /* Overlap. */ - curr_max = MAX(max + 1, curr_max); - l = NULL; - } - else - { - /* No overlap. */ - curr_max = min - 1; - if (curr_max >= curr_min) - { - l->code_min = curr_min; - l->code_max = curr_max; + int i, max, min, negmax, negmin; + tre_ast_node_t *node = 0, *n; + tre_ctype_t *nc = 0; + tre_literal_t *lit; + struct literals ls; + struct neg neg; + reg_errcode_t err; + + ls.mem = ctx->mem; + ls.len = 0; + ls.cap = 32; + ls.a = xmalloc(ls.cap * sizeof *ls.a); + if (!ls.a) + return REG_ESPACE; + neg.len = 0; + neg.negate = *s == '^'; + if (neg.negate) + s++; + + err = parse_bracket_terms(ctx, s, &ls, &neg); + if (err != REG_OK) + goto parse_bracket_done; + + if (neg.negate) { + /* Sort the array if we need to negate it. */ + qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit); + /* extra lit for the last negated range */ + lit = tre_new_lit(&ls); + if (!lit) { + err = REG_ESPACE; + goto parse_bracket_done; } - else - { - l = NULL; + lit->code_min = TRE_CHAR_MAX+1; + lit->code_max = TRE_CHAR_MAX+1; + lit->position = -1; + /* negated classes */ + if (neg.len) { + nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a); + if (!nc) { + err = REG_ESPACE; + goto parse_bracket_done; + } + memcpy(nc, neg.a, neg.len*sizeof *neg.a); + nc[neg.len] = 0; } - curr_min = curr_max = max + 1; - } } - if (l != NULL) - { - int k; - l->position = ctx->position; - if (num_neg_classes > 0) - { - l->neg_classes = tre_mem_alloc(ctx->mem, - (sizeof(*l->neg_classes) - * (num_neg_classes + 1))); - if (l->neg_classes == NULL) - { - status = REG_ESPACE; - break; + /* Build a union of the items in the array, negated if necessary. */ + negmax = negmin = 0; + for (i = 0; i < ls.len; i++) { + lit = ls.a[i]; + min = lit->code_min; + max = lit->code_max; + if (neg.negate) { + if (min <= negmin) { + /* Overlap. */ + negmin = MAX(max + 1, negmin); + continue; + } + negmax = min - 1; + lit->code_min = negmin; + lit->code_max = negmax; + negmin = max + 1; } - for (k = 0; k < num_neg_classes; k++) - l->neg_classes[k] = neg_classes[k]; - l->neg_classes[k] = (tre_ctype_t)0; - } - else - l->neg_classes = NULL; - if (node == NULL) - node = items[j]; - else - { - u = tre_ast_new_union(ctx->mem, node, items[j]); - if (u == NULL) - status = REG_ESPACE; - node = u; - } - } - } - - if (status != REG_OK) - goto parse_bracket_done; - - if (negate) - { - int k; - n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position); - if (n == NULL) - status = REG_ESPACE; - else - { - tre_literal_t *l = n->obj; - if (num_neg_classes > 0) - { - l->neg_classes = tre_mem_alloc(ctx->mem, - (sizeof(*l->neg_classes) - * (num_neg_classes + 1))); - if (l->neg_classes == NULL) - { - status = REG_ESPACE; - goto parse_bracket_done; + lit->position = ctx->position; + lit->neg_classes = nc; + n = tre_ast_new_node(ctx->mem, LITERAL, lit); + node = tre_ast_new_union(ctx->mem, node, n); + if (!node) { + err = REG_ESPACE; + break; } - for (k = 0; k < num_neg_classes; k++) - l->neg_classes[k] = neg_classes[k]; - l->neg_classes[k] = (tre_ctype_t)0; - } - else - l->neg_classes = NULL; - if (node == NULL) - node = n; - else - { - u = tre_ast_new_union(ctx->mem, node, n); - if (u == NULL) - status = REG_ESPACE; - node = u; - } } - } - - if (status != REG_OK) - goto parse_bracket_done; -#ifdef TRE_DEBUG - tre_ast_print(node); -#endif /* TRE_DEBUG */ - - parse_bracket_done: - xfree(items); - ctx->position++; - *result = node; - return status; +parse_bracket_done: + xfree(ls.a); + ctx->position++; + ctx->n = node; + return err; } - -/* Parses a positive decimal integer. Returns -1 if the string does not - contain a valid number. */ -static int -tre_parse_int(const char **regex) +static const char *parse_dup_count(const char *s, int *n) { - int num = -1; - const char *r = *regex; - while (*r-'0'<10U) - { - if (num < 0) - num = 0; - num = num * 10 + *r - '0'; - r++; - } - *regex = r; - return num; + *n = -1; + if (!isdigit(*s)) + return s; + *n = 0; + for (;;) { + *n = 10 * *n + (*s - '0'); + s++; + if (!isdigit(*s) || *n > RE_DUP_MAX) + break; + } + return s; } - -static reg_errcode_t -tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) +static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s) { - int min, max; - const char *r = ctx->re; - int minimal = 0; - - /* Parse number (minimum repetition count). */ - min = -1; - if (*r >= '0' && *r <= '9') { - min = tre_parse_int(&r); - } - - /* Parse comma and second number (maximum repetition count). */ - max = min; - if (*r == CHAR_COMMA) - { - r++; - max = tre_parse_int(&r); - } - - /* Check that the repeat counts are sane. */ - if ((max >= 0 && min > max) || max > RE_DUP_MAX) - return REG_BADBR; - - /* Missing }. */ - if (!*r) - return REG_EBRACE; - - /* Empty contents of {}. */ - if (r == ctx->re) - return REG_BADBR; - - /* Parse the ending '}' or '\}'.*/ - if (ctx->cflags & REG_EXTENDED) - { - if (*r != CHAR_RBRACE) - return REG_BADBR; - r++; - } - else - { - if (*r != CHAR_BACKSLASH || *(r + 1) != CHAR_RBRACE) - return REG_BADBR; - r += 2; - } - - /* Create the AST node(s). */ - if (min == 0 && max == 0) - { - *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); - if (*result == NULL) - return REG_ESPACE; - } - else - { - if (min < 0 && max < 0) - /* Only approximate parameters set, no repetitions. */ - min = max = 1; - - *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal); - if (!*result) - return REG_ESPACE; - } - - ctx->re = r; - return REG_OK; + int min, max; + + s = parse_dup_count(s, &min); + if (*s == ',') + s = parse_dup_count(s+1, &max); + else + max = min; + + if ( + (max < min && max >= 0) || + max > RE_DUP_MAX || + min > RE_DUP_MAX || + min < 0 || + (!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') || + *s++ != '}' + ) + return REG_BADBR; + + if (min == 0 && max == 0) + ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + else + ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0); + if (!ctx->n) + return REG_ESPACE; + ctx->s = s; + return REG_OK; } -typedef enum { - PARSE_RE = 0, - PARSE_ATOM, - PARSE_MARK_FOR_SUBMATCH, - PARSE_BRANCH, - PARSE_PIECE, - PARSE_CATENATION, - PARSE_POST_CATENATION, - PARSE_UNION, - PARSE_POST_UNION, - PARSE_POSTFIX, - PARSE_RESTORE_CFLAGS -} tre_parse_re_stack_symbol_t; - - -static reg_errcode_t -tre_parse(tre_parse_ctx_t *ctx) +static int hexval(unsigned c) { - tre_ast_node_t *result = NULL; - tre_parse_re_stack_symbol_t symbol; - reg_errcode_t status = REG_OK; - tre_stack_t *stack = ctx->stack; - int bottom = tre_stack_num_objects(stack); - int depth = 0; - wchar_t wc; - int clen; - - if (!ctx->nofirstsub) - { - STACK_PUSH(stack, int, ctx->submatch_id); - STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH); - ctx->submatch_id++; - } - STACK_PUSH(stack, int, PARSE_RE); - ctx->re_start = ctx->re; - - - /* The following is basically just a recursive descent parser. I use - an explicit stack instead of recursive functions mostly because of - two reasons: compatibility with systems which have an overflowable - call stack, and efficiency (both in lines of code and speed). */ - while (tre_stack_num_objects(stack) > bottom && status == REG_OK) - { - if (status != REG_OK) - break; - symbol = tre_stack_pop_int(stack); - switch (symbol) - { - case PARSE_RE: - /* Parse a full regexp. A regexp is one or more branches, - separated by the union operator `|'. */ - if (ctx->cflags & REG_EXTENDED) - STACK_PUSHX(stack, int, PARSE_UNION); - STACK_PUSHX(stack, int, PARSE_BRANCH); - break; - - case PARSE_BRANCH: - /* Parse a branch. A branch is one or more pieces, concatenated. - A piece is an atom possibly followed by a postfix operator. */ - STACK_PUSHX(stack, int, PARSE_CATENATION); - STACK_PUSHX(stack, int, PARSE_PIECE); - break; - - case PARSE_PIECE: - /* Parse a piece. A piece is an atom possibly followed by one - or more postfix operators. */ - STACK_PUSHX(stack, int, PARSE_POSTFIX); - STACK_PUSHX(stack, int, PARSE_ATOM); - break; - - case PARSE_CATENATION: - /* If the expression has not ended, parse another piece. */ - { - tre_char_t c; - if (!*ctx->re) - break; - c = *ctx->re; - if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) - break; - if ((ctx->cflags & REG_EXTENDED - && c == CHAR_RPAREN && depth > 0) - || (!(ctx->cflags & REG_EXTENDED) - && (c == CHAR_BACKSLASH - && *(ctx->re + 1) == CHAR_RPAREN))) - { - if (!(ctx->cflags & REG_EXTENDED) && depth == 0) - status = REG_EPAREN; - depth--; - if (!(ctx->cflags & REG_EXTENDED)) - ctx->re += 2; - break; - } - - { - /* Default case, left associative concatenation. */ - STACK_PUSHX(stack, int, PARSE_CATENATION); - STACK_PUSHX(stack, voidptr, result); - STACK_PUSHX(stack, int, PARSE_POST_CATENATION); - STACK_PUSHX(stack, int, PARSE_PIECE); - } - break; - } - - case PARSE_POST_CATENATION: - { - tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); - tre_ast_node_t *tmp_node; - tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); - if (!tmp_node) - return REG_ESPACE; - result = tmp_node; - break; - } - - case PARSE_UNION: - switch (*ctx->re) - { - case CHAR_PIPE: - STACK_PUSHX(stack, int, PARSE_UNION); - STACK_PUSHX(stack, voidptr, result); - STACK_PUSHX(stack, int, PARSE_POST_UNION); - STACK_PUSHX(stack, int, PARSE_BRANCH); - ctx->re++; - break; - - case CHAR_RPAREN: - ctx->re++; - break; - - default: - break; - } - break; - - case PARSE_POST_UNION: - { - tre_ast_node_t *tmp_node; - tre_ast_node_t *tree = tre_stack_pop_voidptr(stack); - tmp_node = tre_ast_new_union(ctx->mem, tree, result); - if (!tmp_node) - return REG_ESPACE; - result = tmp_node; - break; - } - - case PARSE_POSTFIX: - /* Parse postfix operators. */ - switch (*ctx->re) - { - case CHAR_PLUS: - case CHAR_QUESTIONMARK: - if (!(ctx->cflags & REG_EXTENDED)) - break; - /*FALLTHROUGH*/ - case CHAR_STAR: - { - tre_ast_node_t *tmp_node; - int minimal = 0; - int rep_min = 0; - int rep_max = -1; - - if (*ctx->re == CHAR_PLUS) - rep_min = 1; - if (*ctx->re == CHAR_QUESTIONMARK) - rep_max = 1; - - ctx->re++; - tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max, - minimal); - if (tmp_node == NULL) - return REG_ESPACE; - result = tmp_node; - STACK_PUSHX(stack, int, PARSE_POSTFIX); - } - break; - - case CHAR_BACKSLASH: - /* "\{" is special without REG_EXTENDED */ - if (!(ctx->cflags & REG_EXTENDED) - && *(ctx->re + 1) == CHAR_LBRACE) - { - ctx->re++; - goto parse_brace; - } - else - break; - - case CHAR_LBRACE: - /* "{" is literal without REG_EXTENDED */ - if (!(ctx->cflags & REG_EXTENDED)) - break; - - parse_brace: - ctx->re++; + if (c-'0'<10) return c-'0'; + c |= 32; + if (c-'a'<6) return c-'a'+10; + return -1; +} - status = tre_parse_bound(ctx, &result); - if (status != REG_OK) - return status; - STACK_PUSHX(stack, int, PARSE_POSTFIX); - break; - } - break; +static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid) +{ + if (node->submatch_id >= 0) { + tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + if (!n) + return REG_ESPACE; + n = tre_ast_new_catenation(ctx->mem, n, node); + if (!n) + return REG_ESPACE; + n->num_submatches = node->num_submatches; + node = n; + } + node->submatch_id = subid; + node->num_submatches++; + ctx->n = node; + return REG_OK; +} - case PARSE_ATOM: - /* Parse an atom. An atom is a regular expression enclosed in `()', - an empty set of `()', a bracket expression, `.', `^', `$', - a `\' followed by a character, or a single character. */ +/* +BRE grammar: +Regex = Branch | '^' | '$' | '^$' | '^' Branch | Branch '$' | '^' Branch '$' +Branch = Atom | Branch Atom +Atom = char | quoted_char | '.' | Bracket | Atom Dup | '\(' Branch '\)' | back_ref +Dup = '*' | '\{' Count '\}' | '\{' Count ',\}' | '\{' Count ',' Count '\}' - switch (*ctx->re) - { - case CHAR_LPAREN: /* parenthesized subexpression */ +(leading ^ and trailing $ in a sub expr may be an anchor or literal as well) - if (ctx->cflags & REG_EXTENDED) - { - lparen: - depth++; - { - ctx->re++; - /* First parse a whole RE, then mark the resulting tree - for submatching. */ - STACK_PUSHX(stack, int, ctx->submatch_id); - STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH); - STACK_PUSHX(stack, int, PARSE_RE); - ctx->submatch_id++; - } - } - else - goto parse_literal; - break; +ERE grammar: +Regex = Branch | Regex '|' Branch +Branch = Atom | Branch Atom +Atom = char | quoted_char | '.' | Bracket | Atom Dup | '(' Regex ')' | '^' | '$' +Dup = '*' | '+' | '?' | '{' Count '}' | '{' Count ',}' | '{' Count ',' Count '}' - case CHAR_LBRACKET: /* bracket expression */ - ctx->re++; - status = tre_parse_bracket(ctx, &result); - if (status != REG_OK) - return status; - break; +(a*+?, ^*, $+, \X, {, (|a) are unspecified) +*/ - case CHAR_BACKSLASH: - /* If this is "\(" or "\)" chew off the backslash and - try again. */ - if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN) - { - ctx->re++; - goto lparen; - } - if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_RPAREN) - { - goto empty_atom; +static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s) +{ + int len, ere = ctx->cflags & REG_EXTENDED; + const char *p; + tre_ast_node_t *node; + wchar_t wc; + switch (*s) { + case '[': + return parse_bracket(ctx, s+1); + case '\\': + p = tre_expand_macro(s+1); + if (p) { + /* assume \X expansion is a single atom */ + reg_errcode_t err = parse_atom(ctx, p); + ctx->s = s+2; + return err; } - - /* If a macro is used, parse the expanded macro recursively. */ - { - const char *buf = tre_expand_macro(ctx->re + 1); - if (buf) - { - tre_parse_ctx_t subctx; - memcpy(&subctx, ctx, sizeof(subctx)); - subctx.re = buf; - subctx.nofirstsub = 1; - status = tre_parse(&subctx); - if (status != REG_OK) - return status; - ctx->re += 2; - ctx->position = subctx.position; - result = subctx.result; - break; - } - } - - if (!ctx->re[1]) - /* Trailing backslash. */ - return REG_EESCAPE; - - ctx->re++; - switch (*ctx->re) - { + /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */ + switch (*++s) { + case 0: + return REG_EESCAPE; case 'b': - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_WB, -1); - ctx->re++; - break; + node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1); + break; case 'B': - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_WB_NEG, -1); - ctx->re++; - break; + node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1); + break; case '<': - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_BOW, -1); - ctx->re++; - break; + node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1); + break; case '>': - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_EOW, -1); - ctx->re++; - break; + node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1); + break; case 'x': - ctx->re++; - if (ctx->re[0] != CHAR_LBRACE) - { - /* 8 bit hex char. */ - char tmp[3] = {0, 0, 0}; - long val; - - if (tre_isxdigit(ctx->re[0])) - { - tmp[0] = (char)ctx->re[0]; - ctx->re++; + s++; + int i, v = 0, c; + len = 2; + if (*s == '{') { + len = 8; + s++; } - if (tre_isxdigit(ctx->re[0])) - { - tmp[1] = (char)ctx->re[0]; - ctx->re++; + for (i=0; i<len && v<0x110000; i++) { + c = hexval(s[i]); + if (c < 0) break; + v = 16*v + c; } - val = strtol(tmp, NULL, 16); - result = tre_ast_new_literal(ctx->mem, (int)val, - (int)val, ctx->position); - ctx->position++; - break; - } - else if (*ctx->re) - { - /* Wide char. */ - char tmp[32]; - long val; - int i = 0; - ctx->re++; - while (*ctx->re && i < sizeof tmp) - { - if (ctx->re[0] == CHAR_RBRACE) - break; - if (tre_isxdigit(ctx->re[0])) - { - tmp[i] = (char)ctx->re[0]; - i++; - ctx->re++; - continue; - } - return REG_EBRACE; + s += i; + if (len == 8) { + if (*s != '}') + return REG_EBRACE; + s++; } - ctx->re++; - tmp[i] = 0; - val = strtol(tmp, NULL, 16); - result = tre_ast_new_literal(ctx->mem, (int)val, (int)val, - ctx->position); - ctx->position++; - break; - } - /*FALLTHROUGH*/ - + node = tre_ast_new_literal(ctx->mem, v, v, ctx->position); + ctx->position++; + s--; + break; default: - if (tre_isdigit(*ctx->re)) - { - /* Back reference. */ - int val = *ctx->re - '0'; - result = tre_ast_new_literal(ctx->mem, BACKREF, val, - ctx->position); - if (result == NULL) - return REG_ESPACE; - ctx->position++; - ctx->max_backref = MAX(val, ctx->max_backref); - ctx->re++; - } - else - { - /* Escaped character. */ - result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, - ctx->position); - ctx->position++; - ctx->re++; - } - break; - } - if (result == NULL) - return REG_ESPACE; - break; - - case CHAR_PERIOD: /* the any-symbol */ - if (ctx->cflags & REG_NEWLINE) - { - tre_ast_node_t *tmp1; - tre_ast_node_t *tmp2; - tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1, - ctx->position); - if (!tmp1) - return REG_ESPACE; - tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX, - ctx->position + 1); - if (!tmp2) - return REG_ESPACE; - result = tre_ast_new_union(ctx->mem, tmp1, tmp2); - if (!result) - return REG_ESPACE; - ctx->position += 2; - } - else - { - result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, - ctx->position); - if (!result) - return REG_ESPACE; - ctx->position++; + if (isdigit(*s)) { + /* back reference */ + int val = *s - '0'; + node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position); + ctx->max_backref = MAX(val, ctx->max_backref); + } else { + /* extension: accept unknown escaped char + as a literal */ + node = tre_ast_new_literal(ctx->mem, *s, *s, ctx->position); + } + ctx->position++; } - ctx->re++; - break; - - case CHAR_CARET: /* beginning of line assertion */ - /* '^' has a special meaning everywhere in EREs, and at - beginning of BRE. */ - if (ctx->cflags & REG_EXTENDED - || ctx->re == ctx->re_start) - { - if (!(ctx->cflags & REG_EXTENDED)) - STACK_PUSHX(stack, int, PARSE_CATENATION); - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_BOL, -1); - if (result == NULL) - return REG_ESPACE; - ctx->re++; + s++; + break; + case '.': + if (ctx->cflags & REG_NEWLINE) { + tre_ast_node_t *tmp1, *tmp2; + tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++); + tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++); + if (tmp1 && tmp2) + node = tre_ast_new_union(ctx->mem, tmp1, tmp2); + else + node = 0; + } else { + node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++); } - else - goto parse_literal; - break; - - case CHAR_DOLLAR: /* end of line assertion. */ - /* '$' is special everywhere in EREs, and in the end of the - string in BREs. */ - if (ctx->cflags & REG_EXTENDED - || !*(ctx->re + 1)) - { - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_EOL, -1); - if (result == NULL) - return REG_ESPACE; - ctx->re++; + s++; + break; + case '^': + /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */ + if (!ere && s != ctx->re) + goto parse_literal; + node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1); + s++; + break; + case '$': + /* '$' is special everywhere in EREs, and in the end of the string in BREs. */ + if (!ere && s[1]) + goto parse_literal; + node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1); + s++; + break; + case '*': + case '|': + case '{': + case '+': + case '?': + if (!ere) + goto parse_literal; + case 0: + node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + break; + default: +parse_literal: + len = mbtowc(&wc, s, -1); + if (len < 0) + return REG_BADPAT; + if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) { + tre_ast_node_t *tmp1, *tmp2; + /* multiple opposite case characters are not supported */ + tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position); + tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position); + if (tmp1 && tmp2) + node = tre_ast_new_union(ctx->mem, tmp1, tmp2); + else + node = 0; + } else { + node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position); } - else - goto parse_literal; - break; - - case CHAR_RPAREN: - if (!depth) - goto parse_literal; - case CHAR_STAR: - case CHAR_PIPE: - case CHAR_LBRACE: - case CHAR_PLUS: - case CHAR_QUESTIONMARK: - if (!(ctx->cflags & REG_EXTENDED)) - goto parse_literal; - - case 0: - empty_atom: - result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); - if (!result) + ctx->position++; + s += len; + break; + } + if (!node) return REG_ESPACE; - break; + ctx->n = node; + ctx->s = s; + return REG_OK; +} - default: - parse_literal: +#define PUSHPTR(err, s, v) do { \ + if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \ + return err; \ +} while(0) - clen = mbtowc(&wc, ctx->re, -1); - if (clen<0) clen=1, wc=WEOF; +#define PUSHINT(err, s, v) do { \ + if ((err = tre_stack_push_int(s, v)) != REG_OK) \ + return err; \ +} while(0) - /* Note that we can't use an tre_isalpha() test here, since there - may be characters which are alphabetic but neither upper or - lower case. */ - if (ctx->cflags & REG_ICASE - && (tre_isupper(wc) || tre_islower(wc))) - { - tre_ast_node_t *tmp1; - tre_ast_node_t *tmp2; - - /* XXX - Can there be more than one opposite-case - counterpoints for some character in some locale? Or - more than two characters which all should be regarded - the same character if case is ignored? If yes, there - does not seem to be a portable way to detect it. I guess - that at least for multi-character collating elements there - could be several opposite-case counterpoints, but they - cannot be supported portably anyway. */ - tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), - tre_toupper(wc), - ctx->position); - if (!tmp1) - return REG_ESPACE; - tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), - tre_tolower(wc), - ctx->position); - if (!tmp2) - return REG_ESPACE; - result = tre_ast_new_union(ctx->mem, tmp1, tmp2); - if (!result) - return REG_ESPACE; +static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) +{ + tre_ast_node_t *nbranch=0, *nunion=0; + int ere = ctx->cflags & REG_EXTENDED; + const char *s = ctx->re; + int subid = 0; + int depth = 0; + reg_errcode_t err; + tre_stack_t *stack = ctx->stack; + + PUSHINT(err, stack, subid++); + for (;;) { + if ((!ere && *s == '\\' && s[1] == '(') || + (ere && *s == '(')) { + PUSHPTR(err, stack, nunion); + PUSHPTR(err, stack, nbranch); + PUSHINT(err, stack, subid++); + s++; + if (!ere) + s++; + depth++; + nbranch = nunion = 0; + continue; } - else - { - result = tre_ast_new_literal(ctx->mem, wc, wc, - ctx->position); - if (!result) - return REG_ESPACE; + if ((!ere && *s == '\\' && s[1] == ')') || + (ere && *s == ')' && depth)) { + ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + if (!ctx->n) + return REG_ESPACE; + } else { + err = parse_atom(ctx, s); + if (err != REG_OK) + return err; + s = ctx->s; } - ctx->position++; - ctx->re += clen; - break; - } - break; - - case PARSE_MARK_FOR_SUBMATCH: - { - int submatch_id = tre_stack_pop_int(stack); - if (result->submatch_id >= 0) - { - tre_ast_node_t *n, *tmp_node; - n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); - if (n == NULL) - return REG_ESPACE; - tmp_node = tre_ast_new_catenation(ctx->mem, n, result); - if (tmp_node == NULL) - return REG_ESPACE; - tmp_node->num_submatches = result->num_submatches; - result = tmp_node; - } - result->submatch_id = submatch_id; - result->num_submatches++; - break; - } - - case PARSE_RESTORE_CFLAGS: - ctx->cflags = tre_stack_pop_int(stack); - break; + parse_iter: + /* extension: repetitions are accepted after an empty node + eg. (+), ^*, a$?, a|{2} */ + switch (*s) { + case '+': + case '?': + if (!ere) + break; + /* fallthrough */ + case '*':; + int min=0, max=-1; + if (*s == '+') + min = 1; + if (*s == '?') + max = 1; + s++; + ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0); + if (!ctx->n) + return REG_ESPACE; + /* extension: multiple consecutive *+?{,} is unspecified, + but (a+)+ has to be supported so accepting a++ makes + sense, note however that the RE_DUP_MAX limit can be + circumvented: (a{255}){255} uses a lot of memory.. */ + goto parse_iter; + case '\\': + if (ere || s[1] != '{') + break; + s++; + goto parse_brace; + case '{': + if (!ere) + break; + parse_brace: + err = parse_dup(ctx, s+1); + if (err != REG_OK) + return err; + s = ctx->s; + goto parse_iter; + } - default: - assert(0); - break; + nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n); + if ((ere && *s == '|') || + (ere && *s == ')' && depth) || + (!ere && *s == '\\' && s[1] == ')') || + !*s) { + /* extension: empty branch is unspecified (), (|a), (a|) + here they are not rejected but match on empty string */ + int c = *s; + nunion = tre_ast_new_union(ctx->mem, nunion, nbranch); + nbranch = 0; + if (c != '|') { + if (c == '\\') { + if (!depth) return REG_EPAREN; + s+=2; + } else if (c == ')') + s++; + depth--; + err = marksub(ctx, nunion, tre_stack_pop_int(stack)); + if (err != REG_OK) + return err; + if (!c && depth<0) { + ctx->submatch_id = subid; + return REG_OK; + } + if (!c || depth<0) + return REG_EPAREN; + nbranch = tre_stack_pop_voidptr(stack); + nunion = tre_stack_pop_voidptr(stack); + goto parse_iter; + } + s++; + } } - } - - /* Check for missing closing parentheses. */ - if (depth > 0) - return REG_EPAREN; - - if (status == REG_OK) - ctx->result = result; - - return status; } - /*********************************************************************** from tre-compile.c ***********************************************************************/ @@ -3122,12 +2680,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags) if (errcode != REG_OK) ERROR_EXIT(errcode); preg->re_nsub = parse_ctx.submatch_id - 1; - tree = parse_ctx.result; - - /* Back references and approximate matching cannot currently be used - in the same regexp. */ - if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx) - ERROR_EXIT(REG_BADPAT); + tree = parse_ctx.n; #ifdef TRE_DEBUG tre_ast_print(tree); @@ -3142,7 +2695,7 @@ regcomp(regex_t *restrict preg, const char *restrict regex, int cflags) if (tnfa == NULL) ERROR_EXIT(REG_ESPACE); tnfa->have_backrefs = parse_ctx.max_backref >= 0; - tnfa->have_approx = parse_ctx.have_approx; + tnfa->have_approx = 0; tnfa->num_submatches = parse_ctx.submatch_id; /* Set up tags for submatch addressing. If REG_NOSUB is set and the |