diff options
| author | Daniel C. Sobral <dcs@FreeBSD.org> | 2000-07-31 06:30:37 +0000 |
|---|---|---|
| committer | Daniel C. Sobral <dcs@FreeBSD.org> | 2000-07-31 06:30:37 +0000 |
| commit | c2861446f004ff028acd2c3366c7500bf7538d58 (patch) | |
| tree | 374d245e893e4c27daa9c968ab9091888b8e227a /lib/libc/regex/regcomp.c | |
| parent | 1b139adb86a606c36a71bb5a7698a8c6ca9ff976 (diff) | |
Notes
Diffstat (limited to 'lib/libc/regex/regcomp.c')
| -rw-r--r-- | lib/libc/regex/regcomp.c | 315 |
1 files changed, 313 insertions, 2 deletions
diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c index a4e9e1dced4d..bf92f2c07460 100644 --- a/lib/libc/regex/regcomp.c +++ b/lib/libc/regex/regcomp.c @@ -35,6 +35,8 @@ * SUCH DAMAGE. * * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 + * + * $FreeBSD$ */ #if defined(LIBC_SCCS) && !defined(lint) @@ -122,6 +124,9 @@ static void dofwd __P((struct parse *p, sopno pos, sop value)); static void enlarge __P((struct parse *p, sopno size)); static void stripsnug __P((struct parse *p, struct re_guts *g)); static void findmust __P((struct parse *p, struct re_guts *g)); +static int altoffset __P((sop *scan, int offset, int mccs)); +static void computejumps __P((struct parse *p, struct re_guts *g)); +static void computematchjumps __P((struct parse *p, struct re_guts *g)); static sopno pluscount __P((struct parse *p, struct re_guts *g)); #ifdef __cplusplus @@ -167,6 +172,9 @@ static int never = 0; /* for use in asserts; shuts lint up */ #define never 0 /* some <assert.h>s have bugs too */ #endif +/* Macro used by computejump()/computematchjump() */ +#define MIN(a,b) ((a)<(b)?(a):(b)) + /* - regcomp - interface for parser and compilation = extern int regcomp(regex_t *, const char *, int); @@ -239,6 +247,9 @@ int cflags; g->nbol = 0; g->neol = 0; g->must = NULL; + g->moffset = -1; + g->charjump = NULL; + g->matchjump = NULL; g->mlen = 0; g->nsub = 0; g->ncategories = 1; /* category 0 is "everything else" */ @@ -262,6 +273,17 @@ int cflags; categorize(p, g); stripsnug(p, g); findmust(p, g); + /* only use Boyer-Moore algorithm if the pattern is bigger + * than three characters + */ + if(g->mlen > 3) { + computejumps(p, g); + computematchjumps(p, g); + if(g->matchjump == NULL && g->charjump != NULL) { + free(g->charjump); + g->charjump = NULL; + } + } g->nplus = pluscount(p, g); g->magic = MAGIC2; preg->re_nsub = g->nsub; @@ -1675,13 +1697,23 @@ register struct re_guts *g; register sop s; register char *cp; register sopno i; + int offset; + int cs, mccs; /* avoid making error situations worse */ if (p->error != 0) return; + /* Find out if we can handle OANYOF or not */ + mccs = 0; + for (cs = 0; cs < g->ncsets; cs++) + if (g->sets[cs].multis != NULL) + mccs = 1; + /* find the longest OCHAR sequence in strip */ newlen = 0; + offset = 0; + g->moffset = 0; scan = g->strip + 1; do { s = *scan++; @@ -1697,6 +1729,7 @@ register struct re_guts *g; break; case OQUEST_: /* things that must be skipped */ case OCH_: + offset = altoffset(scan, offset, mccs); scan--; do { scan += OPND(s); @@ -1709,23 +1742,97 @@ register struct re_guts *g; } } while (OP(s) != O_QUEST && OP(s) != O_CH); /* fallthrough */ - default: /* things that break a sequence */ + case OBOW: /* things that break a sequence */ + case OEOW: + case OBOL: + case OEOL: + case O_QUEST: + case O_CH: + case OEND: if (newlen > g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; + if (offset > -1) { + g->moffset += offset; + offset = newlen; + } else + g->moffset = offset; + } else { + if (offset > -1) + offset += newlen; } newlen = 0; break; + case OANY: + if (newlen > g->mlen) { /* ends one */ + start = newstart; + g->mlen = newlen; + if (offset > -1) { + g->moffset += offset; + offset = newlen; + } else + g->moffset = offset; + } else { + if (offset > -1) + offset += newlen; + } + if (offset > -1) + offset++; + newlen = 0; + break; + case OANYOF: /* may or may not invalidate offset */ + /* First, everything as OANY */ + if (newlen > g->mlen) { /* ends one */ + start = newstart; + g->mlen = newlen; + if (offset > -1) { + g->moffset += offset; + offset = newlen; + } else + g->moffset = offset; + } else { + if (offset > -1) + offset += newlen; + } + if (offset > -1) + offset++; + newlen = 0; + /* And, now, if we found out we can't deal with + * it, make offset = -1. + */ + if (mccs) + offset = -1; + break; + default: + /* Anything here makes it impossible or too hard + * to calculate the offset -- so we give up; + * save the last known good offset, in case the + * must sequence doesn't occur later. + */ + if (newlen > g->mlen) { /* ends one */ + start = newstart; + g->mlen = newlen; + if (offset > -1) + g->moffset += offset; + else + g->moffset = offset; + } + offset = -1; + newlen = 0; + break; } } while (OP(s) != OEND); - if (g->mlen == 0) /* there isn't one */ + if (g->mlen == 0) { /* there isn't one */ + g->moffset = -1; return; + } /* turn it into a character string */ g->must = malloc((size_t)g->mlen + 1); if (g->must == NULL) { /* argh; just forget it */ g->mlen = 0; + g->moffset = -1; return; } cp = g->must; @@ -1741,6 +1848,210 @@ register struct re_guts *g; } /* + - altoffset - choose biggest offset among multiple choices + == static int altoffset(sop *scan, int offset, int mccs); + * + * Compute, recursively if necessary, the largest offset among multiple + * re paths. + */ +static int +altoffset(scan, offset, mccs) +sop *scan; +int offset; +int mccs; +{ + int largest; + int try; + sop s; + + /* If we gave up already on offsets, return */ + if (offset == -1) + return -1; + + largest = 0; + try = 0; + s = *scan++; + while (OP(s) != O_QUEST && OP(s) != O_CH) { + switch (OP(s)) { + case OOR1: + if (try > largest) + largest = try; + try = 0; + break; + case OQUEST_: + case OCH_: + try = altoffset(scan, try, mccs); + if (try == -1) + return -1; + scan--; + do { + scan += OPND(s); + s = *scan; + if (OP(s) != O_QUEST && OP(s) != O_CH && + OP(s) != OOR2) + return -1; + } while (OP(s) != O_QUEST && OP(s) != O_CH); + /* We must skip to the next position, or we'll + * leave altoffset() too early. + */ + scan++; + break; + case OANYOF: + if (mccs) + return -1; + case OCHAR: + case OANY: + try++; + case OBOW: + case OEOW: + case OLPAREN: + case ORPAREN: + case OOR2: + break; + default: + try = -1; + break; + } + if (try == -1) + return -1; + s = *scan++; + } + + if (try > largest) + largest = try; + + return largest+offset; +} + +/* + - computejumps - compute char jumps for BM scan + == static void computejumps(register struct parse *p, register struct re_guts *g); + * + * This algorithm assumes g->must exists and is has size greater than + * zero. It's based on the algorithm found on Computer Algorithms by + * Sara Baase. + * + * A char jump is the number of characters one needs to jump based on + * the value of the character from the text that was mismatched. + */ +static void +computejumps(p, g) +struct parse *p; +struct re_guts *g; +{ + int ch; + int mindex; + + /* Avoid making errors worse */ + if (p->error != 0) + return; + + g->charjump = (int*) malloc((NC + 1) * sizeof(int)); + if (g->charjump == NULL) /* Not a fatal error */ + return; + /* Adjust for signed chars, if necessary */ + g->charjump = &g->charjump[-(CHAR_MIN)]; + + /* If the character does not exist in the pattern, the jump + * is equal to the number of characters in the pattern. + */ + for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) + g->charjump[ch] = g->mlen; + + /* If the character does exist, compute the jump that would + * take us to the last character in the pattern equal to it + * (notice that we match right to left, so that last character + * is the first one that would be matched). + */ + for (mindex = 0; mindex < g->mlen; mindex++) + g->charjump[g->must[mindex]] = g->mlen - mindex - 1; +} + +/* + - computematchjumps - compute match jumps for BM scan + == static void computematchjumps(register struct parse *p, register struct re_guts *g); + * + * This algorithm assumes g->must exists and is has size greater than + * zero. It's based on the algorithm found on Computer Algorithms by + * Sara Baase. + * + * A match jump is the number of characters one needs to advance based + * on the already-matched suffix. + * Notice that all values here are minus (g->mlen-1), because of the way + * the search algorithm works. + */ +static void +computematchjumps(p, g) +struct parse *p; +struct re_guts *g; +{ + int mindex; /* General "must" iterator */ + int suffix; /* Keeps track of matching suffix */ + int ssuffix; /* Keeps track of suffixes' suffix */ + int* pmatches; /* pmatches[k] points to the next i + * such that i+1...mlen is a substring + * of k+1...k+mlen-i-1 + */ + + /* Avoid making errors worse */ + if (p->error != 0) + return; + + pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); + if (pmatches == NULL) { + g->matchjump = NULL; + return; + } + + g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); + if (g->matchjump == NULL) /* Not a fatal error */ + return; + + /* Set maximum possible jump for each character in the pattern */ + for (mindex = 0; mindex < g->mlen; mindex++) + g->matchjump[mindex] = 2*g->mlen - mindex - 1; + + /* Compute pmatches[] */ + for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; + mindex--, suffix--) { + pmatches[mindex] = suffix; + + /* If a mismatch is found, interrupting the substring, + * compute the matchjump for that position. If no + * mismatch is found, then a text substring mismatched + * against the suffix will also mismatch against the + * substring. + */ + while (suffix < g->mlen + && g->must[mindex] != g->must[suffix]) { + g->matchjump[suffix] = MIN(g->matchjump[suffix], + g->mlen - mindex - 1); + suffix = pmatches[suffix]; + } + } + + /* Compute the matchjump up to the last substring found to jump + * to the beginning of the largest must pattern prefix matching + * it's own suffix. + */ + for (mindex = 0; mindex <= suffix; mindex++) + g->matchjump[mindex] = MIN(g->matchjump[mindex], + g->mlen + suffix - mindex); + + ssuffix = pmatches[suffix]; + while (suffix < g->mlen) { + while (suffix <= ssuffix && suffix < g->mlen) { + g->matchjump[suffix] = MIN(g->matchjump[suffix], + g->mlen + ssuffix - suffix); + suffix++; + } + ssuffix = pmatches[ssuffix]; + } + + free(pmatches); +} + +/* - pluscount - count + nesting == static sopno pluscount(register struct parse *p, register struct re_guts *g); */ |
