diff options
Diffstat (limited to 'lib/tre-match-backtrack.c')
-rw-r--r-- | lib/tre-match-backtrack.c | 666 |
1 files changed, 666 insertions, 0 deletions
diff --git a/lib/tre-match-backtrack.c b/lib/tre-match-backtrack.c new file mode 100644 index 0000000000000..eab13a7bc8ab7 --- /dev/null +++ b/lib/tre-match-backtrack.c @@ -0,0 +1,666 @@ +/* + tre-match-backtrack.c - TRE backtracking regex matching engine + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +/* + This matcher is for regexps that use back referencing. Regexp matching + with back referencing is an NP-complete problem on the number of back + references. The easiest way to match them is to use a backtracking + routine which basically goes through all possible paths in the TNFA + and chooses the one which results in the best (leftmost and longest) + match. This can be spectacularly expensive and may run out of stack + space, but there really is no better known generic algorithm. Quoting + Henry Spencer from comp.compilers: + <URL: http://compilers.iecc.com/comparch/article/93-03-102> + + POSIX.2 REs require longest match, which is really exciting to + implement since the obsolete ("basic") variant also includes + \<digit>. I haven't found a better way of tackling this than doing + a preliminary match using a DFA (or simulation) on a modified RE + that just replicates subREs for \<digit>, and then doing a + backtracking match to determine whether the subRE matches were + right. This can be rather slow, but I console myself with the + thought that people who use \<digit> deserve very slow execution. + (Pun unintentional but very appropriate.) + +*/ + + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif /* HAVE_CONFIG_H */ + +#ifdef TRE_USE_ALLOCA +/* AIX requires this to be the first thing in the file. */ +#ifndef __GNUC__ +# if HAVE_ALLOCA_H +# include <alloca.h> +# else +# ifdef _AIX + #pragma alloca +# else +# ifndef alloca /* predefined by HP cc +Olibcalls */ +char *alloca (); +# endif +# endif +# endif +#endif +#endif /* TRE_USE_ALLOCA */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#ifdef HAVE_WCHAR_H +#include <wchar.h> +#endif /* HAVE_WCHAR_H */ +#ifdef HAVE_WCTYPE_H +#include <wctype.h> +#endif /* HAVE_WCTYPE_H */ +#ifndef TRE_WCHAR +#include <ctype.h> +#endif /* !TRE_WCHAR */ +#ifdef HAVE_MALLOC_H +#include <malloc.h> +#endif /* HAVE_MALLOC_H */ + +#include "tre-internal.h" +#include "tre-mem.h" +#include "tre-match-utils.h" +#include "tre.h" +#include "xmalloc.h" + +typedef struct { + int pos; + const char *str_byte; +#ifdef TRE_WCHAR + const wchar_t *str_wide; +#endif /* TRE_WCHAR */ + tre_tnfa_transition_t *state; + int state_id; + int next_c; + int *tags; +#ifdef TRE_MBSTATE + mbstate_t mbstate; +#endif /* TRE_MBSTATE */ +} tre_backtrack_item_t; + +typedef struct tre_backtrack_struct { + tre_backtrack_item_t item; + struct tre_backtrack_struct *prev; + struct tre_backtrack_struct *next; +} *tre_backtrack_t; + +#ifdef TRE_WCHAR +#define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide) +#define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide +#else /* !TRE_WCHAR */ +#define BT_STACK_WIDE_IN(_str_wide) +#define BT_STACK_WIDE_OUT +#endif /* !TRE_WCHAR */ + +#ifdef TRE_MBSTATE +#define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate) +#define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate +#else /* !TRE_MBSTATE */ +#define BT_STACK_MBSTATE_IN +#define BT_STACK_MBSTATE_OUT +#endif /* !TRE_MBSTATE */ + + +#ifdef TRE_USE_ALLOCA +#define tre_bt_mem_new tre_mem_newa +#define tre_bt_mem_alloc tre_mem_alloca +#define tre_bt_mem_destroy(obj) do { } while (0) +#else /* !TRE_USE_ALLOCA */ +#define tre_bt_mem_new tre_mem_new +#define tre_bt_mem_alloc tre_mem_alloc +#define tre_bt_mem_destroy tre_mem_destroy +#endif /* !TRE_USE_ALLOCA */ + + +#define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \ + do \ + { \ + int i; \ + if (!stack->next) \ + { \ + tre_backtrack_t s; \ + s = tre_bt_mem_alloc(mem, sizeof(*s)); \ + if (!s) \ + { \ + tre_bt_mem_destroy(mem); \ + if (tags) \ + xfree(tags); \ + if (pmatch) \ + xfree(pmatch); \ + if (states_seen) \ + xfree(states_seen); \ + return REG_ESPACE; \ + } \ + s->prev = stack; \ + s->next = NULL; \ + s->item.tags = tre_bt_mem_alloc(mem, \ + sizeof(*tags) * tnfa->num_tags); \ + if (!s->item.tags) \ + { \ + tre_bt_mem_destroy(mem); \ + if (tags) \ + xfree(tags); \ + if (pmatch) \ + xfree(pmatch); \ + if (states_seen) \ + xfree(states_seen); \ + return REG_ESPACE; \ + } \ + stack->next = s; \ + stack = s; \ + } \ + else \ + stack = stack->next; \ + stack->item.pos = (_pos); \ + stack->item.str_byte = (_str_byte); \ + BT_STACK_WIDE_IN(_str_wide); \ + stack->item.state = (_state); \ + stack->item.state_id = (_state_id); \ + stack->item.next_c = (_next_c); \ + for (i = 0; i < tnfa->num_tags; i++) \ + stack->item.tags[i] = (_tags)[i]; \ + BT_STACK_MBSTATE_IN; \ + } \ + while (/*CONSTCOND*/0) + +#define BT_STACK_POP() \ + do \ + { \ + int i; \ + assert(stack->prev); \ + pos = stack->item.pos; \ + if (type == STR_USER) \ + str_source->rewind(pos + pos_add_next, str_source->context); \ + str_byte = stack->item.str_byte; \ + BT_STACK_WIDE_OUT; \ + state = stack->item.state; \ + next_c = stack->item.next_c; \ + for (i = 0; i < tnfa->num_tags; i++) \ + tags[i] = stack->item.tags[i]; \ + BT_STACK_MBSTATE_OUT; \ + stack = stack->prev; \ + } \ + while (/*CONSTCOND*/0) + +#undef MIN +#define MIN(a, b) ((a) <= (b) ? (a) : (b)) + +reg_errcode_t +tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string, + int len, tre_str_type_t type, int *match_tags, + int eflags, int *match_end_ofs) +{ + /* State variables required by GET_NEXT_WCHAR. */ + tre_char_t prev_c = 0, next_c = 0; + const char *str_byte = string; + int pos = 0; + unsigned int pos_add_next = 1; +#ifdef TRE_WCHAR + const wchar_t *str_wide = string; +#ifdef TRE_MBSTATE + mbstate_t mbstate; +#endif /* TRE_MBSTATE */ +#endif /* TRE_WCHAR */ + int reg_notbol = eflags & REG_NOTBOL; + int reg_noteol = eflags & REG_NOTEOL; + int reg_newline = tnfa->cflags & REG_NEWLINE; + int str_user_end = 0; + + /* These are used to remember the necessary values of the above + variables to return to the position where the current search + started from. */ + int next_c_start; + const char *str_byte_start; + int pos_start = -1; +#ifdef TRE_WCHAR + const wchar_t *str_wide_start; +#endif /* TRE_WCHAR */ +#ifdef TRE_MBSTATE + mbstate_t mbstate_start; +#endif /* TRE_MBSTATE */ + + /* End offset of best match so far, or -1 if no match found yet. */ + int match_eo = -1; + /* Tag arrays. */ + int *next_tags, *tags = NULL; + /* Current TNFA state. */ + tre_tnfa_transition_t *state; + int *states_seen = NULL; + + /* Memory allocator to for allocating the backtracking stack. */ + tre_mem_t mem = tre_bt_mem_new(); + + /* The backtracking stack. */ + tre_backtrack_t stack; + + tre_tnfa_transition_t *trans_i; + regmatch_t *pmatch = NULL; + int ret; + +#ifdef TRE_MBSTATE + memset(&mbstate, '\0', sizeof(mbstate)); +#endif /* TRE_MBSTATE */ + + if (!mem) + return REG_ESPACE; + stack = tre_bt_mem_alloc(mem, sizeof(*stack)); + if (!stack) + { + ret = REG_ESPACE; + goto error_exit; + } + stack->prev = NULL; + stack->next = NULL; + + DPRINT(("tnfa_execute_backtrack, input type %d\n", type)); + DPRINT(("len = %d\n", len)); + +#ifdef TRE_USE_ALLOCA + tags = alloca(sizeof(*tags) * tnfa->num_tags); + pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches); + states_seen = alloca(sizeof(*states_seen) * tnfa->num_states); +#else /* !TRE_USE_ALLOCA */ + if (tnfa->num_tags) + { + tags = xmalloc(sizeof(*tags) * tnfa->num_tags); + if (!tags) + { + ret = REG_ESPACE; + goto error_exit; + } + } + if (tnfa->num_submatches) + { + pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches); + if (!pmatch) + { + ret = REG_ESPACE; + goto error_exit; + } + } + if (tnfa->num_states) + { + states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states); + if (!states_seen) + { + ret = REG_ESPACE; + goto error_exit; + } + } +#endif /* !TRE_USE_ALLOCA */ + + retry: + { + int i; + for (i = 0; i < tnfa->num_tags; i++) + { + tags[i] = -1; + if (match_tags) + match_tags[i] = -1; + } + for (i = 0; i < tnfa->num_states; i++) + states_seen[i] = 0; + } + + state = NULL; + pos = pos_start; + if (type == STR_USER) + str_source->rewind(pos + pos_add_next, str_source->context); + GET_NEXT_WCHAR(); + pos_start = pos; + next_c_start = next_c; + str_byte_start = str_byte; +#ifdef TRE_WCHAR + str_wide_start = str_wide; +#endif /* TRE_WCHAR */ +#ifdef TRE_MBSTATE + mbstate_start = mbstate; +#endif /* TRE_MBSTATE */ + + /* Handle initial states. */ + next_tags = NULL; + for (trans_i = tnfa->initial; trans_i->state; trans_i++) + { + DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c)); + if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions)) + { + DPRINT(("assert failed\n")); + continue; + } + if (state == NULL) + { + /* Start from this state. */ + state = trans_i->state; + next_tags = trans_i->tags; + } + else + { + /* Backtrack to this state. */ + DPRINT(("saving state %d for backtracking\n", trans_i->state_id)); + BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state, + trans_i->state_id, next_c, tags, mbstate); + { + int *tmp = trans_i->tags; + if (tmp) + while (*tmp >= 0) + stack->item.tags[*tmp++] = pos; + } + } + } + + if (next_tags) + for (; *next_tags >= 0; next_tags++) + tags[*next_tags] = pos; + + + DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte)); + DPRINT(("pos:chr/code | state and tags\n")); + DPRINT(("-------------+------------------------------------------------\n")); + + if (state == NULL) + goto backtrack; + + while (/*CONSTCOND*/1) + { + tre_tnfa_transition_t *next_state; + int empty_br_match; + + DPRINT(("start loop\n")); + if (state == tnfa->final) + { + DPRINT((" match found, %d %d\n", match_eo, pos)); + if (match_eo < pos + || (match_eo == pos + && match_tags + && tre_tag_order(tnfa->num_tags, tnfa->tag_directions, + tags, match_tags))) + { + int i; + /* This match wins the previous match. */ + DPRINT((" win previous\n")); + match_eo = pos; + if (match_tags) + for (i = 0; i < tnfa->num_tags; i++) + match_tags[i] = tags[i]; + } + /* Our TNFAs never have transitions leaving from the final state, + so we jump right to backtracking. */ + goto backtrack; + } + +#ifdef TRE_DEBUG + DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c, + state)); + { + int i; + for (i = 0; i < tnfa->num_tags; i++) + DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : "")); + DPRINT(("\n")); + } +#endif /* TRE_DEBUG */ + + /* Go to the next character in the input string. */ + empty_br_match = 0; + trans_i = state; + if (trans_i->state && trans_i->assertions & ASSERT_BACKREF) + { + /* This is a back reference state. All transitions leaving from + this state have the same back reference "assertion". Instead + of reading the next character, we match the back reference. */ + int so, eo, bt = trans_i->u.backref; + int bt_len; + int result; + + DPRINT((" should match back reference %d\n", bt)); + /* Get the substring we need to match against. Remember to + turn off REG_NOSUB temporarily. */ + tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & /*LINTED*/!REG_NOSUB, + tnfa, tags, pos); + so = pmatch[bt].rm_so; + eo = pmatch[bt].rm_eo; + bt_len = eo - so; + +#ifdef TRE_DEBUG + { + int slen; + if (len < 0) + slen = bt_len; + else + slen = MIN(bt_len, len - pos); + + if (type == STR_BYTE) + { + DPRINT((" substring (len %d) is [%d, %d[: '%.*s'\n", + bt_len, so, eo, bt_len, (char*)string + so)); + DPRINT((" current string is '%.*s'\n", slen, str_byte - 1)); + } +#ifdef TRE_WCHAR + else if (type == STR_WIDE) + { + DPRINT((" substring (len %d) is [%d, %d[: '%.*" STRF "'\n", + bt_len, so, eo, bt_len, (wchar_t*)string + so)); + DPRINT((" current string is '%.*" STRF "'\n", + slen, str_wide - 1)); + } +#endif /* TRE_WCHAR */ + } +#endif + + if (len < 0) + { + if (type == STR_USER) + result = str_source->compare((unsigned)so, (unsigned)pos, + (unsigned)bt_len, + str_source->context); +#ifdef TRE_WCHAR + else if (type == STR_WIDE) + result = wcsncmp((const wchar_t*)string + so, str_wide - 1, + (size_t)bt_len); +#endif /* TRE_WCHAR */ + else + result = strncmp((const char*)string + so, str_byte - 1, + (size_t)bt_len); + } + else if (len - pos < bt_len) + result = 1; +#ifdef TRE_WCHAR + else if (type == STR_WIDE) + result = wmemcmp((const wchar_t*)string + so, str_wide - 1, + (size_t)bt_len); +#endif /* TRE_WCHAR */ + else + result = memcmp((const char*)string + so, str_byte - 1, + (size_t)bt_len); + + if (result == 0) + { + /* Back reference matched. Check for infinite loop. */ + if (bt_len == 0) + empty_br_match = 1; + if (empty_br_match && states_seen[trans_i->state_id]) + { + DPRINT((" avoid loop\n")); + goto backtrack; + } + + states_seen[trans_i->state_id] = empty_br_match; + + /* Advance in input string and resync `prev_c', `next_c' + and pos. */ + DPRINT((" back reference matched\n")); + str_byte += bt_len - 1; +#ifdef TRE_WCHAR + str_wide += bt_len - 1; +#endif /* TRE_WCHAR */ + pos += bt_len - 1; + GET_NEXT_WCHAR(); + DPRINT((" pos now %d\n", pos)); + } + else + { + DPRINT((" back reference did not match\n")); + goto backtrack; + } + } + else + { + /* Check for end of string. */ + if (len < 0) + { + if (type == STR_USER) + { + if (str_user_end) + goto backtrack; + } + else if (next_c == L'\0') + goto backtrack; + } + else + { + if (pos >= len) + goto backtrack; + } + + /* Read the next character. */ + GET_NEXT_WCHAR(); + } + + next_state = NULL; + for (trans_i = state; trans_i->state; trans_i++) + { + DPRINT((" transition %d-%d (%c-%c) %d to %d\n", + trans_i->code_min, trans_i->code_max, + trans_i->code_min, trans_i->code_max, + trans_i->assertions, trans_i->state_id)); + if (trans_i->code_min <= (tre_cint_t)prev_c + && trans_i->code_max >= (tre_cint_t)prev_c) + { + if (trans_i->assertions + && (CHECK_ASSERTIONS(trans_i->assertions) + || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags))) + { + DPRINT((" assertion failed\n")); + continue; + } + + if (next_state == NULL) + { + /* First matching transition. */ + DPRINT((" Next state is %d\n", trans_i->state_id)); + next_state = trans_i->state; + next_tags = trans_i->tags; + } + else + { + /* Second matching transition. We may need to backtrack here + to take this transition instead of the first one, so we + push this transition in the backtracking stack so we can + jump back here if needed. */ + DPRINT((" saving state %d for backtracking\n", + trans_i->state_id)); + BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state, + trans_i->state_id, next_c, tags, mbstate); + { + int *tmp; + for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++) + stack->item.tags[*tmp] = pos; + } +#if 0 /* XXX - it's important not to look at all transitions here to keep + the stack small! */ + break; +#endif + } + } + } + + if (next_state != NULL) + { + /* Matching transitions were found. Take the first one. */ + state = next_state; + + /* Update the tag values. */ + if (next_tags) + while (*next_tags >= 0) + tags[*next_tags++] = pos; + } + else + { + backtrack: + /* A matching transition was not found. Try to backtrack. */ + if (stack->prev) + { + DPRINT((" backtracking\n")); + if (stack->item.state->assertions && ASSERT_BACKREF) + { + DPRINT((" states_seen[%d] = 0\n", + stack->item.state_id)); + states_seen[stack->item.state_id] = 0; + } + + BT_STACK_POP(); + } + else if (match_eo < 0) + { + /* Try starting from a later position in the input string. */ + /* Check for end of string. */ + if (len < 0) + { + if (next_c == L'\0') + { + DPRINT(("end of string.\n")); + break; + } + } + else + { + if (pos >= len) + { + DPRINT(("end of string.\n")); + break; + } + } + DPRINT(("restarting from next start position\n")); + next_c = next_c_start; +#ifdef TRE_MBSTATE + mbstate = mbstate_start; +#endif /* TRE_MBSTATE */ + str_byte = str_byte_start; +#ifdef TRE_WCHAR + str_wide = str_wide_start; +#endif /* TRE_WCHAR */ + goto retry; + } + else + { + DPRINT(("finished\n")); + break; + } + } + } + + ret = match_eo >= 0 ? REG_OK : REG_NOMATCH; + *match_end_ofs = match_eo; + + error_exit: + tre_bt_mem_destroy(mem); +#ifndef TRE_USE_ALLOCA + if (tags) + xfree(tags); + if (pmatch) + xfree(pmatch); + if (states_seen) + xfree(states_seen); +#endif /* !TRE_USE_ALLOCA */ + + return ret; +} |