libregex: implement \b and \B (word boundary, not word boundary)

This is the last of the GNU extensions needed before we can enable bsdgrep by default. \b is effectively a direction-agnostic equivalent of \< and \>: it matches at either the beginning or the end of a word. \B matches at every position that is not a transition from a non-word character to a word character or from a word character to a non-word character.
author: Kyle Evans <kevans@FreeBSD.org> 2020-12-05 03:16:05 +0000
committer: Kyle Evans <kevans@FreeBSD.org> 2020-12-05 03:16:05 +0000
commit: 6b986646d434baa21ae3d74d6a662ad206c7ddbd (patch)
tree: 11153e51ca240d2b5256c0f35e6d0f9feeaeca1b
parent: ca53e5aedfebcc1b4091b68e01b2d5cae923f85e (diff)
download: src-test-6b986646d434baa21ae3d74d6a662ad206c7ddbd.tar.gz
src-test-6b986646d434baa21ae3d74d6a662ad206c7ddbd.zip
5 files changed, 61 insertions, 8 deletions
diff --git a/contrib/netbsd-tests/lib/libc/regex/data/meta.in b/contrib/netbsd-tests/lib/libc/regex/data/meta.in
index eb24075aea622..b8f14aad8c744 100644
--- a/contrib/netbsd-tests/lib/libc/regex/data/meta.in
+++ b/contrib/netbsd-tests/lib/libc/regex/data/meta.in
@@ -5,7 +5,7 @@ a\*c		&	a*c	a*c
 a\\b		&	a\b	a\b
 a\\\*b		&	a\*b	a\*b
 # Begin FreeBSD
-a\bc		&C	EESCAPE
+a\bc		&	abc
 # End FreeBSD
 a\		&C	EESCAPE
 a\\bc		&	a\bc	a\bc
diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c
index 79af9a4790b33..bb40018c07e1d 100644
--- a/lib/libc/regex/engine.c
+++ b/lib/libc/regex/engine.c
@@ -118,6 +118,7 @@ static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_
 #define	BOW	(BOL-4)
 #define	EOW	(BOL-5)
 #define	BADCHAR	(BOL-6)
+#define	NWBND	(BOL-7)
 #define	NONCHAR(c)	((c) <= OUT)
 /* sflags */
 #define	SBOS	0x0001
@@ -463,6 +464,8 @@ dissect(struct match *m,
 		case OEOW:
 		case OBOS:
 		case OEOS:
+		case OWBND:
+		case ONWBND:
 			break;
 		case OANY:
 		case OANYOF:
@@ -691,6 +694,21 @@ backref(struct match *m,
 			else
 				return(NULL);
 			break;
+		case OWBND:
+			if (ISBOW(m, sp) || ISEOW(m, sp))
+				{ /* yes */ }
+			else
+				return(NULL);
+			break;
+		case ONWBND:
+			if (((sp == m->beginp) && !ISWORD(*sp)) ||
+			    (sp == m->endp && !ISWORD(*(sp - 1))))
+				{ /* yes, beginning/end of subject */ }
+			else if (ISWORD(*(sp - 1)) == ISWORD(*sp))
+				{ /* yes, both sides are the same class: not a word boundary */ }
+			else
+				return(NULL);
+			break;
 		case OBOW:
 			if (ISBOW(m, sp))
 				{ /* yes */ }
@@ -916,6 +934,17 @@ walk(struct match *m, const char *start, const char *stop, sopno startst,
 			st = step(m->g, startst, stopst, st, flagch, st, sflags);
 			SP("sboweow", st, c);
 		}
+		if (lastc != OUT && c != OUT &&
+		    ISWORD(lastc) == ISWORD(c)) {
+			flagch = NWBND;
+		} else if ((lastc == OUT && !ISWORD(c)) ||
+		    (c == OUT && !ISWORD(lastc))) {
+			flagch = NWBND;
+		}
+		if (flagch == NWBND) {
+			st = step(m->g, startst, stopst, st, flagch, st, sflags);
+			SP("snwbnd", st, c);
+		}
 
 		/* are we done? */
 		if (ISSET(st, stopst)) {
@@ -1017,6 +1046,14 @@ step(struct re_guts *g,
 			if (ch == EOW)
 				FWD(aft, bef, 1);
 			break;
+		case OWBND:
+			if (ch == BOW || ch == EOW)
+				FWD(aft, bef, 1);
+			break;
+		case ONWBND:
+			if (ch == NWBND)
+				FWD(aft, aft, 1);
+			break;
 		case OANY:
 			if (!NONCHAR(ch))
 				FWD(aft, bef, 1);
diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c
index fd44fd60cc651..0eb4b44309963 100644
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@@ -486,6 +486,12 @@ p_ere_exp(struct parse *p, struct branchc *bc)
 			case '\'':
 				EMIT(OEOS, 0);
 				break;
+			case 'B':
+				EMIT(ONWBND, 0);
+				break;
+			case 'b':
+				EMIT(OWBND, 0);
+				break;
 			case 'W':
 			case 'w':
 			case 'S':
@@ -845,6 +851,12 @@ p_simp_re(struct parse *p, struct branchc *bc)
 			case BACKSL|'\'':
 				EMIT(OEOS, 0);
 				break;
+			case BACKSL|'B':
+				EMIT(ONWBND, 0);
+				break;
+			case BACKSL|'b':
+				EMIT(OWBND, 0);
+				break;
 			case BACKSL|'W':
 			case BACKSL|'w':
 			case BACKSL|'S':
@@ -1892,6 +1904,8 @@ findmust(struct parse *p, struct re_guts *g)
 		case OEOL:
 		case OBOS:
 		case OEOS:
+		case OWBND:
+		case ONWBND:
 		case O_QUEST:
 		case O_CH:
 		case OEND:
@@ -2043,6 +2057,8 @@ altoffset(sop *scan, int offset)
 			try++;
 		case OBOW:
 		case OEOW:
+		case OWBND:
+		case ONWBND:
 		case OLPAREN:
 		case ORPAREN:
 		case OOR2:
diff --git a/lib/libc/regex/regex2.h b/lib/libc/regex/regex2.h
index 1c41656694f3c..012823d3261a2 100644
--- a/lib/libc/regex/regex2.h
+++ b/lib/libc/regex/regex2.h
@@ -106,6 +106,8 @@ typedef unsigned long sopno;
 #define	OEOW	(20L<<OPSHIFT)	/* end word	-			*/
 #define	OBOS	(21L<<OPSHIFT)	/* begin subj.  -			*/
 #define	OEOS	(22L<<OPSHIFT)	/* end subj.	-			*/
+#define	OWBND	(23L<<OPSHIFT)	/* word bound	-			*/
+#define	ONWBND	(24L<<OPSHIFT)	/* not bound	-			*/
 
 /*
  * Structures for [] character-set representation.
diff --git a/lib/libregex/tests/gnuext.in b/lib/libregex/tests/gnuext.in
index ebd052fb8b750..8f49854235a91 100644
--- a/lib/libregex/tests/gnuext.in
+++ b/lib/libregex/tests/gnuext.in
@@ -17,14 +17,12 @@ a\|b\|c	b	abc	a
 \s\+	b	aSNTb	SNT
 # Word boundaries (\b, \B, \<, \>, \`, \')
 # (is/not boundary, start/end word, start/end subject string)
-# Most of these are disabled for the moment, and will be re-enabled as
-# we become feature complete.
-#\babc\b	&	<abc>	abc
+\babc\b	&	<abc>	abc
 \<abc\> &	<abc>	abc
-#\Babc\B	&	abc
-#\B[abc]\B	&	<abc>	b
-#\B[abc]+	-	<abc>	bc
-#\B[abc]\+	b	<abc>	bc
+\Babc\B	&	abc
+\B[abc]\B	&	<abc>	b
+\B[abc]+	-	<abc>	bc
+\B[abc]\+	b	<abc>	bc
 \`abc	&	abc	abc
 abc\'	&	abc	abc
 \`abc\'	&	abc	abc
author	Kyle Evans <kevans@FreeBSD.org>	2020-12-05 03:16:05 +0000
committer	Kyle Evans <kevans@FreeBSD.org>	2020-12-05 03:16:05 +0000
commit	6b986646d434baa21ae3d74d6a662ad206c7ddbd (patch)
tree	11153e51ca240d2b5256c0f35e6d0f9feeaeca1b
parent	ca53e5aedfebcc1b4091b68e01b2d5cae923f85e (diff)
download	src-test-6b986646d434baa21ae3d74d6a662ad206c7ddbd.tar.gz src-test-6b986646d434baa21ae3d74d6a662ad206c7ddbd.zip