summaryrefslogtreecommitdiff
path: root/lib/libc/regex/regcomp.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/libc/regex/regcomp.c')
-rw-r--r--lib/libc/regex/regcomp.c315
1 files changed, 313 insertions, 2 deletions
diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c
index a4e9e1dced4d..bf92f2c07460 100644
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@@ -35,6 +35,8 @@
* SUCH DAMAGE.
*
* @(#)regcomp.c 8.5 (Berkeley) 3/20/94
+ *
+ * $FreeBSD$
*/
#if defined(LIBC_SCCS) && !defined(lint)
@@ -122,6 +124,9 @@ static void dofwd __P((struct parse *p, sopno pos, sop value));
static void enlarge __P((struct parse *p, sopno size));
static void stripsnug __P((struct parse *p, struct re_guts *g));
static void findmust __P((struct parse *p, struct re_guts *g));
+static int altoffset __P((sop *scan, int offset, int mccs));
+static void computejumps __P((struct parse *p, struct re_guts *g));
+static void computematchjumps __P((struct parse *p, struct re_guts *g));
static sopno pluscount __P((struct parse *p, struct re_guts *g));
#ifdef __cplusplus
@@ -167,6 +172,9 @@ static int never = 0; /* for use in asserts; shuts lint up */
#define never 0 /* some <assert.h>s have bugs too */
#endif
+/* Macro used by computejump()/computematchjump() */
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
/*
- regcomp - interface for parser and compilation
= extern int regcomp(regex_t *, const char *, int);
@@ -239,6 +247,9 @@ int cflags;
g->nbol = 0;
g->neol = 0;
g->must = NULL;
+ g->moffset = -1;
+ g->charjump = NULL;
+ g->matchjump = NULL;
g->mlen = 0;
g->nsub = 0;
g->ncategories = 1; /* category 0 is "everything else" */
@@ -262,6 +273,17 @@ int cflags;
categorize(p, g);
stripsnug(p, g);
findmust(p, g);
+ /* only use Boyer-Moore algorithm if the pattern is bigger
+ * than three characters
+ */
+ if(g->mlen > 3) {
+ computejumps(p, g);
+ computematchjumps(p, g);
+ if(g->matchjump == NULL && g->charjump != NULL) {
+ free(g->charjump);
+ g->charjump = NULL;
+ }
+ }
g->nplus = pluscount(p, g);
g->magic = MAGIC2;
preg->re_nsub = g->nsub;
@@ -1675,13 +1697,23 @@ register struct re_guts *g;
register sop s;
register char *cp;
register sopno i;
+ int offset;
+ int cs, mccs;
/* avoid making error situations worse */
if (p->error != 0)
return;
+ /* Find out if we can handle OANYOF or not */
+ mccs = 0;
+ for (cs = 0; cs < g->ncsets; cs++)
+ if (g->sets[cs].multis != NULL)
+ mccs = 1;
+
/* find the longest OCHAR sequence in strip */
newlen = 0;
+ offset = 0;
+ g->moffset = 0;
scan = g->strip + 1;
do {
s = *scan++;
@@ -1697,6 +1729,7 @@ register struct re_guts *g;
break;
case OQUEST_: /* things that must be skipped */
case OCH_:
+ offset = altoffset(scan, offset, mccs);
scan--;
do {
scan += OPND(s);
@@ -1709,23 +1742,97 @@ register struct re_guts *g;
}
} while (OP(s) != O_QUEST && OP(s) != O_CH);
/* fallthrough */
- default: /* things that break a sequence */
+ case OBOW: /* things that break a sequence */
+ case OEOW:
+ case OBOL:
+ case OEOL:
+ case O_QUEST:
+ case O_CH:
+ case OEND:
if (newlen > g->mlen) { /* ends one */
start = newstart;
g->mlen = newlen;
+ if (offset > -1) {
+ g->moffset += offset;
+ offset = newlen;
+ } else
+ g->moffset = offset;
+ } else {
+ if (offset > -1)
+ offset += newlen;
}
newlen = 0;
break;
+ case OANY:
+ if (newlen > g->mlen) { /* ends one */
+ start = newstart;
+ g->mlen = newlen;
+ if (offset > -1) {
+ g->moffset += offset;
+ offset = newlen;
+ } else
+ g->moffset = offset;
+ } else {
+ if (offset > -1)
+ offset += newlen;
+ }
+ if (offset > -1)
+ offset++;
+ newlen = 0;
+ break;
+ case OANYOF: /* may or may not invalidate offset */
+ /* First, everything as OANY */
+ if (newlen > g->mlen) { /* ends one */
+ start = newstart;
+ g->mlen = newlen;
+ if (offset > -1) {
+ g->moffset += offset;
+ offset = newlen;
+ } else
+ g->moffset = offset;
+ } else {
+ if (offset > -1)
+ offset += newlen;
+ }
+ if (offset > -1)
+ offset++;
+ newlen = 0;
+ /* And, now, if we found out we can't deal with
+ * it, make offset = -1.
+ */
+ if (mccs)
+ offset = -1;
+ break;
+ default:
+ /* Anything here makes it impossible or too hard
+ * to calculate the offset -- so we give up;
+ * save the last known good offset, in case the
+ * must sequence doesn't occur later.
+ */
+ if (newlen > g->mlen) { /* ends one */
+ start = newstart;
+ g->mlen = newlen;
+ if (offset > -1)
+ g->moffset += offset;
+ else
+ g->moffset = offset;
+ }
+ offset = -1;
+ newlen = 0;
+ break;
}
} while (OP(s) != OEND);
- if (g->mlen == 0) /* there isn't one */
+ if (g->mlen == 0) { /* there isn't one */
+ g->moffset = -1;
return;
+ }
/* turn it into a character string */
g->must = malloc((size_t)g->mlen + 1);
if (g->must == NULL) { /* argh; just forget it */
g->mlen = 0;
+ g->moffset = -1;
return;
}
cp = g->must;
@@ -1741,6 +1848,210 @@ register struct re_guts *g;
}
/*
+ - altoffset - choose biggest offset among multiple choices
+ == static int altoffset(sop *scan, int offset, int mccs);
+ *
+ * Compute, recursively if necessary, the largest offset among multiple
+ * re paths.
+ */
+static int
+altoffset(scan, offset, mccs)
+sop *scan;
+int offset;
+int mccs;
+{
+ int largest;
+ int try;
+ sop s;
+
+ /* If we gave up already on offsets, return */
+ if (offset == -1)
+ return -1;
+
+ largest = 0;
+ try = 0;
+ s = *scan++;
+ while (OP(s) != O_QUEST && OP(s) != O_CH) {
+ switch (OP(s)) {
+ case OOR1:
+ if (try > largest)
+ largest = try;
+ try = 0;
+ break;
+ case OQUEST_:
+ case OCH_:
+ try = altoffset(scan, try, mccs);
+ if (try == -1)
+ return -1;
+ scan--;
+ do {
+ scan += OPND(s);
+ s = *scan;
+ if (OP(s) != O_QUEST && OP(s) != O_CH &&
+ OP(s) != OOR2)
+ return -1;
+ } while (OP(s) != O_QUEST && OP(s) != O_CH);
+ /* We must skip to the next position, or we'll
+ * leave altoffset() too early.
+ */
+ scan++;
+ break;
+ case OANYOF:
+ if (mccs)
+ return -1;
+ case OCHAR:
+ case OANY:
+ try++;
+ case OBOW:
+ case OEOW:
+ case OLPAREN:
+ case ORPAREN:
+ case OOR2:
+ break;
+ default:
+ try = -1;
+ break;
+ }
+ if (try == -1)
+ return -1;
+ s = *scan++;
+ }
+
+ if (try > largest)
+ largest = try;
+
+ return largest+offset;
+}
+
+/*
+ - computejumps - compute char jumps for BM scan
+ == static void computejumps(register struct parse *p, register struct re_guts *g);
+ *
+ * This algorithm assumes g->must exists and is has size greater than
+ * zero. It's based on the algorithm found on Computer Algorithms by
+ * Sara Baase.
+ *
+ * A char jump is the number of characters one needs to jump based on
+ * the value of the character from the text that was mismatched.
+ */
+static void
+computejumps(p, g)
+struct parse *p;
+struct re_guts *g;
+{
+ int ch;
+ int mindex;
+
+ /* Avoid making errors worse */
+ if (p->error != 0)
+ return;
+
+ g->charjump = (int*) malloc((NC + 1) * sizeof(int));
+ if (g->charjump == NULL) /* Not a fatal error */
+ return;
+ /* Adjust for signed chars, if necessary */
+ g->charjump = &g->charjump[-(CHAR_MIN)];
+
+ /* If the character does not exist in the pattern, the jump
+ * is equal to the number of characters in the pattern.
+ */
+ for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
+ g->charjump[ch] = g->mlen;
+
+ /* If the character does exist, compute the jump that would
+ * take us to the last character in the pattern equal to it
+ * (notice that we match right to left, so that last character
+ * is the first one that would be matched).
+ */
+ for (mindex = 0; mindex < g->mlen; mindex++)
+ g->charjump[g->must[mindex]] = g->mlen - mindex - 1;
+}
+
+/*
+ - computematchjumps - compute match jumps for BM scan
+ == static void computematchjumps(register struct parse *p, register struct re_guts *g);
+ *
+ * This algorithm assumes g->must exists and is has size greater than
+ * zero. It's based on the algorithm found on Computer Algorithms by
+ * Sara Baase.
+ *
+ * A match jump is the number of characters one needs to advance based
+ * on the already-matched suffix.
+ * Notice that all values here are minus (g->mlen-1), because of the way
+ * the search algorithm works.
+ */
+static void
+computematchjumps(p, g)
+struct parse *p;
+struct re_guts *g;
+{
+ int mindex; /* General "must" iterator */
+ int suffix; /* Keeps track of matching suffix */
+ int ssuffix; /* Keeps track of suffixes' suffix */
+ int* pmatches; /* pmatches[k] points to the next i
+ * such that i+1...mlen is a substring
+ * of k+1...k+mlen-i-1
+ */
+
+ /* Avoid making errors worse */
+ if (p->error != 0)
+ return;
+
+ pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
+ if (pmatches == NULL) {
+ g->matchjump = NULL;
+ return;
+ }
+
+ g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
+ if (g->matchjump == NULL) /* Not a fatal error */
+ return;
+
+ /* Set maximum possible jump for each character in the pattern */
+ for (mindex = 0; mindex < g->mlen; mindex++)
+ g->matchjump[mindex] = 2*g->mlen - mindex - 1;
+
+ /* Compute pmatches[] */
+ for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
+ mindex--, suffix--) {
+ pmatches[mindex] = suffix;
+
+ /* If a mismatch is found, interrupting the substring,
+ * compute the matchjump for that position. If no
+ * mismatch is found, then a text substring mismatched
+ * against the suffix will also mismatch against the
+ * substring.
+ */
+ while (suffix < g->mlen
+ && g->must[mindex] != g->must[suffix]) {
+ g->matchjump[suffix] = MIN(g->matchjump[suffix],
+ g->mlen - mindex - 1);
+ suffix = pmatches[suffix];
+ }
+ }
+
+ /* Compute the matchjump up to the last substring found to jump
+ * to the beginning of the largest must pattern prefix matching
+ * it's own suffix.
+ */
+ for (mindex = 0; mindex <= suffix; mindex++)
+ g->matchjump[mindex] = MIN(g->matchjump[mindex],
+ g->mlen + suffix - mindex);
+
+ ssuffix = pmatches[suffix];
+ while (suffix < g->mlen) {
+ while (suffix <= ssuffix && suffix < g->mlen) {
+ g->matchjump[suffix] = MIN(g->matchjump[suffix],
+ g->mlen + ssuffix - suffix);
+ suffix++;
+ }
+ ssuffix = pmatches[ssuffix];
+ }
+
+ free(pmatches);
+}
+
+/*
- pluscount - count + nesting
== static sopno pluscount(register struct parse *p, register struct re_guts *g);
*/