diff options
Diffstat (limited to 'lib/regexec.c')
-rw-r--r-- | lib/regexec.c | 348 |
1 files changed, 348 insertions, 0 deletions
diff --git a/lib/regexec.c b/lib/regexec.c new file mode 100644 index 0000000000000..54779ef4597df --- /dev/null +++ b/lib/regexec.c @@ -0,0 +1,348 @@ +/* + tre_regexec.c - TRE POSIX compatible matching functions (and more). + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif /* HAVE_CONFIG_H */ + +#ifdef TRE_USE_ALLOCA +/* AIX requires this to be the first thing in the file. */ +#ifndef __GNUC__ +# if HAVE_ALLOCA_H +# include <alloca.h> +# else +# ifdef _AIX + #pragma alloca +# else +# ifndef alloca /* predefined by HP cc +Olibcalls */ +char *alloca (); +# endif +# endif +# endif +#endif +#endif /* TRE_USE_ALLOCA */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#ifdef HAVE_WCHAR_H +#include <wchar.h> +#endif /* HAVE_WCHAR_H */ +#ifdef HAVE_WCTYPE_H +#include <wctype.h> +#endif /* HAVE_WCTYPE_H */ +#ifndef TRE_WCHAR +#include <ctype.h> +#endif /* !TRE_WCHAR */ +#ifdef HAVE_MALLOC_H +#include <malloc.h> +#endif /* HAVE_MALLOC_H */ +#include <limits.h> + +#include "tre-internal.h" +#include "tre.h" +#include "xmalloc.h" + + +/* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match + endpoint values. */ +void +tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags, + const tre_tnfa_t *tnfa, int *tags, int match_eo) +{ + tre_submatch_data_t *submatch_data; + unsigned int i, j; + int *parents; + + i = 0; + if (match_eo >= 0 && !(cflags & REG_NOSUB)) + { + /* Construct submatch offsets from the tags. */ + DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo)); + submatch_data = tnfa->submatch_data; + while (i < tnfa->num_submatches && i < nmatch) + { + if (submatch_data[i].so_tag == tnfa->end_tag) + pmatch[i].rm_so = match_eo; + else + pmatch[i].rm_so = tags[submatch_data[i].so_tag]; + + if (submatch_data[i].eo_tag == tnfa->end_tag) + pmatch[i].rm_eo = match_eo; + else + pmatch[i].rm_eo = tags[submatch_data[i].eo_tag]; + + /* If either of the endpoints were not used, this submatch + was not part of the match. */ + if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1) + pmatch[i].rm_so = pmatch[i].rm_eo = -1; + + DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i, + submatch_data[i].so_tag, pmatch[i].rm_so, + submatch_data[i].eo_tag, pmatch[i].rm_eo)); + i++; + } + /* Reset all submatches that are not within all of their parent + submatches. */ + i = 0; + while (i < tnfa->num_submatches && i < nmatch) + { + if (pmatch[i].rm_eo == -1) + assert(pmatch[i].rm_so == -1); + assert(pmatch[i].rm_so <= pmatch[i].rm_eo); + + parents = submatch_data[i].parents; + if (parents != NULL) + for (j = 0; parents[j] >= 0; j++) + { + DPRINT(("pmatch[%d] parent %d\n", i, parents[j])); + if (pmatch[i].rm_so < pmatch[parents[j]].rm_so + || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo) + pmatch[i].rm_so = pmatch[i].rm_eo = -1; + } + i++; + } + } + + while (i < nmatch) + { + pmatch[i].rm_so = -1; + pmatch[i].rm_eo = -1; + i++; + } +} + + +/* + Wrapper functions for POSIX compatible regexp matching. +*/ + +int +tre_have_backrefs(const regex_t *preg) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + return tnfa->have_backrefs; +} + +int +tre_have_approx(const regex_t *preg) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + return tnfa->have_approx; +} + +static int +tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len, + tre_str_type_t type, size_t nmatch, regmatch_t pmatch[], + int eflags) +{ + reg_errcode_t status; + int *tags = NULL, eo; + if (tnfa->num_tags > 0 && nmatch > 0) + { +#ifdef TRE_USE_ALLOCA + tags = alloca(sizeof(*tags) * tnfa->num_tags); +#else /* !TRE_USE_ALLOCA */ + tags = xmalloc(sizeof(*tags) * tnfa->num_tags); +#endif /* !TRE_USE_ALLOCA */ + if (tags == NULL) + return REG_ESPACE; + } + + /* Dispatch to the appropriate matcher. */ + if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER) + { + /* The regex has back references, use the backtracking matcher. */ + if (type == STR_USER) + { + const tre_str_source *source = string; + if (source->rewind == NULL || source->compare == NULL) + /* The backtracking matcher requires rewind and compare + capabilities from the input stream. */ + return REG_BADPAT; + } + status = tre_tnfa_run_backtrack(tnfa, string, (int)len, type, + tags, eflags, &eo); + } +#ifdef TRE_APPROX + else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER) + { + /* The regex uses approximate matching, use the approximate matcher. */ + regamatch_t match; + regaparams_t params; + tre_regaparams_default(¶ms); + params.max_err = 0; + params.max_cost = 0; + status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags, + &match, params, eflags, &eo); + } +#endif /* TRE_APPROX */ + else + { + /* Exact matching, no back references, use the parallel matcher. */ + status = tre_tnfa_run_parallel(tnfa, string, (int)len, type, + tags, eflags, &eo); + } + + if (status == REG_OK) + /* A match was found, so fill the submatch registers. */ + tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo); +#ifndef TRE_USE_ALLOCA + if (tags) + xfree(tags); +#endif /* !TRE_USE_ALLOCA */ + return status; +} + +int +tre_regnexec(const regex_t *preg, const char *str, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS; + + return tre_match(tnfa, str, len, type, nmatch, pmatch, eflags); +} + +int +tre_regexec(const regex_t *preg, const char *str, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + return tre_regnexec(preg, str, (unsigned)-1, nmatch, pmatch, eflags); +} + + +#ifdef TRE_WCHAR + +int +tre_regwnexec(const regex_t *preg, const wchar_t *str, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + return tre_match(tnfa, str, len, STR_WIDE, nmatch, pmatch, eflags); +} + +int +tre_regwexec(const regex_t *preg, const wchar_t *str, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + return tre_regwnexec(preg, str, (unsigned)-1, nmatch, pmatch, eflags); +} + +#endif /* TRE_WCHAR */ + +int +tre_reguexec(const regex_t *preg, const tre_str_source *str, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + return tre_match(tnfa, str, (unsigned)-1, STR_USER, nmatch, pmatch, eflags); +} + + +#ifdef TRE_APPROX + +/* + Wrapper functions for approximate regexp matching. +*/ + +static int +tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len, + tre_str_type_t type, regamatch_t *match, regaparams_t params, + int eflags) +{ + reg_errcode_t status; + int *tags = NULL, eo; + + /* If the regexp does not use approximate matching features, the + maximum cost is zero, and the approximate matcher isn't forced, + use the exact matcher instead. */ + if (params.max_cost == 0 && !tnfa->have_approx + && !(eflags & REG_APPROX_MATCHER)) + return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch, + eflags); + + /* Back references are not supported by the approximate matcher. */ + if (tnfa->have_backrefs) + return REG_BADPAT; + + if (tnfa->num_tags > 0 && match->nmatch > 0) + { +#if TRE_USE_ALLOCA + tags = alloca(sizeof(*tags) * tnfa->num_tags); +#else /* !TRE_USE_ALLOCA */ + tags = xmalloc(sizeof(*tags) * tnfa->num_tags); +#endif /* !TRE_USE_ALLOCA */ + if (tags == NULL) + return REG_ESPACE; + } + status = tre_tnfa_run_approx(tnfa, string, (int)len, type, tags, + match, params, eflags, &eo); + if (status == REG_OK) + tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo); +#ifndef TRE_USE_ALLOCA + if (tags) + xfree(tags); +#endif /* !TRE_USE_ALLOCA */ + return status; +} + +int +tre_reganexec(const regex_t *preg, const char *str, size_t len, + regamatch_t *match, regaparams_t params, int eflags) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS; + + return tre_match_approx(tnfa, str, len, type, match, params, eflags); +} + +int +tre_regaexec(const regex_t *preg, const char *str, + regamatch_t *match, regaparams_t params, int eflags) +{ + return tre_reganexec(preg, str, (unsigned)-1, match, params, eflags); +} + +#ifdef TRE_WCHAR + +int +tre_regawnexec(const regex_t *preg, const wchar_t *str, size_t len, + regamatch_t *match, regaparams_t params, int eflags) +{ + tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD; + return tre_match_approx(tnfa, str, len, STR_WIDE, + match, params, eflags); +} + +int +tre_regawexec(const regex_t *preg, const wchar_t *str, + regamatch_t *match, regaparams_t params, int eflags) +{ + return tre_regawnexec(preg, str, (unsigned)-1, match, params, eflags); +} + +#endif /* TRE_WCHAR */ + +void +tre_regaparams_default(regaparams_t *params) +{ + memset(params, 0, sizeof(*params)); + params->cost_ins = 1; + params->cost_del = 1; + params->cost_subst = 1; + params->max_cost = INT_MAX; + params->max_ins = INT_MAX; + params->max_del = INT_MAX; + params->max_subst = INT_MAX; + params->max_err = INT_MAX; +} + +#endif /* TRE_APPROX */ + +/* EOF */ |