summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kyle Evans <kevans@FreeBSD.org> 2020-12-05 03:16:05 +0000
committer Kyle Evans <kevans@FreeBSD.org> 2020-12-05 03:16:05 +0000
commit 6b986646d434baa21ae3d74d6a662ad206c7ddbd (patch)
tree 11153e51ca240d2b5256c0f35e6d0f9feeaeca1b
parent ca53e5aedfebcc1b4091b68e01b2d5cae923f85e (diff)
download src-test-6b986646d434baa21ae3d74d6a662ad206c7ddbd.tar.gz
src-test-6b986646d434baa21ae3d74d6a662ad206c7ddbd.zip
libregex: implement \b and \B (word boundary, not word boundary)
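This is the last of the needed GNU expressions before we can unleash bsdgrep by default. \b is effectively a direction-agnostic equivalent of \< and \>: it matches at either the beginning or the end of a word. \B matches every position that is not a word boundary, i.e. every position that is not a transition from a non-word character to a word character or from a word character to a non-word character.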
Notes
Notes: svn path=/head/; revision=368358
-rw-r--r-- contrib/netbsd-tests/lib/libc/regex/data/meta.in 2
-rw-r--r-- lib/libc/regex/engine.c 37
-rw-r--r-- lib/libc/regex/regcomp.c 16
-rw-r--r-- lib/libc/regex/regex2.h 2
-rw-r--r-- lib/libregex/tests/gnuext.in 12
5 files changed, 61 insertions, 8 deletions
diff --git a/contrib/netbsd-tests/lib/libc/regex/data/meta.in b/contrib/netbsd-tests/lib/libc/regex/data/meta.in
index eb24075aea622..b8f14aad8c744 100644
--- a/contrib/netbsd-tests/lib/libc/regex/data/meta.in
+++ b/contrib/netbsd-tests/lib/libc/regex/data/meta.in
@@ -5,7 +5,7 @@ a\*c & a*c a*c
a\\b & a\b a\b
a\\\*b & a\*b a\*b
# Begin FreeBSD
-a\bc &C EESCAPE
+a\bc & abc
# End FreeBSD
a\ &C EESCAPE
a\\bc & a\bc a\bc
diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c
index 79af9a4790b33..bb40018c07e1d 100644
--- a/lib/libc/regex/engine.c
+++ b/lib/libc/regex/engine.c
@@ -118,6 +118,7 @@ static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_
#define BOW (BOL-4)
#define EOW (BOL-5)
#define BADCHAR (BOL-6)
+#define NWBND (BOL-7)
#define NONCHAR(c) ((c) <= OUT)
/* sflags */
#define SBOS 0x0001
@@ -463,6 +464,8 @@ dissect(struct match *m,
case OEOW:
case OBOS:
case OEOS:
+ case OWBND:
+ case ONWBND:
break;
case OANY:
case OANYOF:
@@ -691,6 +694,21 @@ backref(struct match *m,
else
return(NULL);
break;
+ case OWBND:
+ if (ISBOW(m, sp) || ISEOW(m, sp))
+ { /* yes */ }
+ else
+ return(NULL);
+ break;
+ case ONWBND:
+ if (((sp == m->beginp) && !ISWORD(*sp)) ||
+ (sp == m->endp && !ISWORD(*(sp - 1))))
+ { /* yes, beginning/end of subject */ }
+ else if (ISWORD(*(sp - 1)) == ISWORD(*sp))
+ { /* yes, beginning/end of subject */ }
+ else
+ return(NULL);
+ break;
case OBOW:
if (ISBOW(m, sp))
{ /* yes */ }
@@ -916,6 +934,17 @@ walk(struct match *m, const char *start, const char *stop, sopno startst,
st = step(m->g, startst, stopst, st, flagch, st, sflags);
SP("sboweow", st, c);
}
+ if (lastc != OUT && c != OUT &&
+ ISWORD(lastc) == ISWORD(c)) {
+ flagch = NWBND;
+ } else if ((lastc == OUT && !ISWORD(c)) ||
+ (c == OUT && !ISWORD(lastc))) {
+ flagch = NWBND;
+ }
+ if (flagch == NWBND) {
+ st = step(m->g, startst, stopst, st, flagch, st, sflags);
+ SP("snwbnd", st, c);
+ }
/* are we done? */
if (ISSET(st, stopst)) {
@@ -1017,6 +1046,14 @@ step(struct re_guts *g,
if (ch == EOW)
FWD(aft, bef, 1);
break;
+ case OWBND:
+ if (ch == BOW || ch == EOW)
+ FWD(aft, bef, 1);
+ break;
+ case ONWBND:
+ if (ch == NWBND)
+ FWD(aft, aft, 1);
+ break;
case OANY:
if (!NONCHAR(ch))
FWD(aft, bef, 1);
diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c
index fd44fd60cc651..0eb4b44309963 100644
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@@ -486,6 +486,12 @@ p_ere_exp(struct parse *p, struct branchc *bc)
case '\'':
EMIT(OEOS, 0);
break;
+ case 'B':
+ EMIT(ONWBND, 0);
+ break;
+ case 'b':
+ EMIT(OWBND, 0);
+ break;
case 'W':
case 'w':
case 'S':
@@ -845,6 +851,12 @@ p_simp_re(struct parse *p, struct branchc *bc)
case BACKSL|'\'':
EMIT(OEOS, 0);
break;
+ case BACKSL|'B':
+ EMIT(ONWBND, 0);
+ break;
+ case BACKSL|'b':
+ EMIT(OWBND, 0);
+ break;
case BACKSL|'W':
case BACKSL|'w':
case BACKSL|'S':
@@ -1892,6 +1904,8 @@ findmust(struct parse *p, struct re_guts *g)
case OEOL:
case OBOS:
case OEOS:
+ case OWBND:
+ case ONWBND:
case O_QUEST:
case O_CH:
case OEND:
@@ -2043,6 +2057,8 @@ altoffset(sop *scan, int offset)
try++;
case OBOW:
case OEOW:
+ case OWBND:
+ case ONWBND:
case OLPAREN:
case ORPAREN:
case OOR2:
diff --git a/lib/libc/regex/regex2.h b/lib/libc/regex/regex2.h
index 1c41656694f3c..012823d3261a2 100644
--- a/lib/libc/regex/regex2.h
+++ b/lib/libc/regex/regex2.h
@@ -106,6 +106,8 @@ typedef unsigned long sopno;
#define OEOW (20L<<OPSHIFT) /* end word - */
#define OBOS (21L<<OPSHIFT) /* begin subj. - */
#define OEOS (22L<<OPSHIFT) /* end subj. - */
+#define OWBND (23L<<OPSHIFT) /* word bound - */
+#define ONWBND (24L<<OPSHIFT) /* not bound - */
/*
* Structures for [] character-set representation.
diff --git a/lib/libregex/tests/gnuext.in b/lib/libregex/tests/gnuext.in
index ebd052fb8b750..8f49854235a91 100644
--- a/lib/libregex/tests/gnuext.in
+++ b/lib/libregex/tests/gnuext.in
@@ -17,14 +17,12 @@ a\|b\|c b abc a
\s\+ b aSNTb SNT
# Word boundaries (\b, \B, \<, \>, \`, \')
# (is/not boundary, start/end word, start/end subject string)
-# Most of these are disabled for the moment, and will be re-enabled as
-# we become feature complete.
-#\babc\b & <abc> abc
+\babc\b & <abc> abc
\<abc\> & <abc> abc
-#\Babc\B & abc
-#\B[abc]\B & <abc> b
-#\B[abc]+ - <abc> bc
-#\B[abc]\+ b <abc> bc
+\Babc\B & abc
+\B[abc]\B & <abc> b
+\B[abc]+ - <abc> bc
+\B[abc]\+ b <abc> bc
\`abc & abc abc
abc\' & abc abc
\`abc\' & abc abc