diff options
Diffstat (limited to 'lib/libc/regex/grot')
| -rw-r--r-- | lib/libc/regex/grot/Makefile | 124 | ||||
| -rw-r--r-- | lib/libc/regex/grot/debug.c | 244 | ||||
| -rw-r--r-- | lib/libc/regex/grot/limits.h | 6 | ||||
| -rw-r--r-- | lib/libc/regex/grot/main.c | 512 | ||||
| -rwxr-xr-x | lib/libc/regex/grot/mkh | 77 | ||||
| -rw-r--r-- | lib/libc/regex/grot/split.c | 318 | ||||
| -rw-r--r-- | lib/libc/regex/grot/stdlib.h | 4 | ||||
| -rw-r--r-- | lib/libc/regex/grot/tests | 450 | 
8 files changed, 1735 insertions, 0 deletions
diff --git a/lib/libc/regex/grot/Makefile b/lib/libc/regex/grot/Makefile new file mode 100644 index 000000000000..b7eefb956662 --- /dev/null +++ b/lib/libc/regex/grot/Makefile @@ -0,0 +1,124 @@ +# $FreeBSD$ +# You probably want to take -DREDEBUG out of CFLAGS, and put something like +# -O in, *after* testing (-DREDEBUG strengthens testing by enabling a lot of +# internal assertion checking).  Take -Dconst= out for an ANSI compiler. +# Do not take -DPOSIX_MISTAKE out.  REGCFLAGS isn't important to you (it's +# for my use in some special contexts). + +PATHS= ${.CURDIR}/.. ${.CURDIR}/../../locale ${.CURDIR}/../../../../include +.PATH: ${PATHS} + +CFLAGS+= -DPOSIX_MISTAKE -DREDEBUG $(REGCFLAGS) +.for incpath in ${PATHS} +CFLAGS+= -I${incpath} +.endfor + +# If you have an ANSI compiler, take -o out of MKHFLAGS.  If you want +# the Berkeley __P macro, put -b in. +MKHFLAGS = + +LDFLAGS = + +# If you have an ANSI environment, take limits.h and stdlib.h out of +# HMISSING and take memmove out of SRCMISSING and OBJMISSING. +HMISSING = +SRCMISSING = split.c +OBJMISSING = split.o +H = cclass.h cname.h regex2.h utils.h $(HMISSING) +REGSRC = regcomp.c regerror.c regexec.c regfree.c engine.c +SRC = $(REGSRC) debug.c main.c $(SRCMISSING) + +# Internal stuff, should not need changing. +OBJPRODN = regcomp.o regexec.o regerror.o regfree.o +OBJS = $(OBJPRODN) debug.o main.o $(OBJMISSING) + +# Stuff that matters only if you're trying to lint the package. +LINTFLAGS = -I. 
-Dstatic= -Dconst= -DREDEBUG +LINTC = regcomp.c regexec.c regerror.c regfree.c debug.c main.c $(SRCMISSING) +JUNKLINT =possible pointer alignment|null effect + +.SUFFIXES:	.ih .h +.c.ih: +	./mkh $(MKHFLAGS) -p $< >$@ + +default:	r + +re:	$(OBJS) +	$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) -o $@ + +o:	$(OBJPRODN) + +REGEXHSRC = regex2.h reg*.c +h:	$(REGEXHSRC) +	./mkh $(MKHFLAGS) -i _REGEX_H_ $(REGEXHSRC) >regex.tmp +	cmp -s regex.tmp regex.h 2>/dev/null || cp regex.tmp regex.h +	rm -f regex.tmp + +regcomp.o regexec.o regfree.o debug.o:	utils.h regex.h regex2.h +regcomp.o:	cclass.h cname.h regcomp.ih +regexec.o:	engine.c engine.ih +regerror.o:	regerror.ih +regerror.o:	utils.h +debug.o:	debug.ih +main.o:	main.ih + +r:	re tests +	./re <tests +	./re -el <tests +	./re -er <tests + +ra:	./re tests +	-./re <tests +	-./re -el <tests +	-./re -er <tests + +rx:	./re tests +	./re -x <tests +	./re -x -el <tests +	./re -x -er <tests + +t:	./re tests +	-time ./re <tests +	-time ./re -cs <tests +	-time ./re -el <tests +	-time ./re -cs -el <tests + +e:	echeck re tests +	./re -d <tests | awk -f echeck + +l:	$(LINTC) +	lint $(LINTFLAGS) -h $(LINTC) 2>&1 | egrep -v '$(JUNKLINT)' | tee lint + +fullprint: +	ti README WHATSNEW notes todo | hplist +	ti *.h | hplist +	hplist *.c +	hplist regex.3 regex.7 + +print: +	ti README WHATSNEW notes todo | hplist +	ti *.h | hplist +	hplist reg*.c engine.c + +clean:	tidy +	rm -f *.o *.s *.ih re + +tidy: +	rm -f junk* core dtr regex.tmp lint + +spotless:	clean +	rm -f mkh regex.h + +mkh:	/usr/henry/bin/mkh +	cp $? 
$@ + +DTRH = cclass.h cname.h regex2.h utils.h limits.h stdlib.h +DTR = README WHATSNEW Makefile Makefile.44bsd mkh regex.3 regex.7 tests $(DTRH) $(SRC) +dtr:	$(DTR) +	makedtr $(DTR) >$@ + +cio:	$(DTR) +	cio $(DTR) + +rdf:	$(DTR) +	rcsdiff -c $(DTR) 2>&1 | p diff --git a/lib/libc/regex/grot/debug.c b/lib/libc/regex/grot/debug.c new file mode 100644 index 000000000000..24bad149f684 --- /dev/null +++ b/lib/libc/regex/grot/debug.c @@ -0,0 +1,244 @@ +/* $FreeBSD$ + */ +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <limits.h> +#include <stdlib.h> +#include <sys/types.h> +#include <regex.h> + +#include "utils.h" +#include "regex2.h" +#include "debug.ih" + +/* + - regprint - print a regexp for debugging + == void regprint(regex_t *r, FILE *d); + */ +void +regprint(r, d) +regex_t *r; +FILE *d; +{ +	struct re_guts *g = r->re_g; +	int i; +	int c; +	int last; +	int nincat[NC]; + +	fprintf(d, "%ld states, %d categories", (long)g->nstates, +							g->ncategories); +	fprintf(d, ", first %ld last %ld", (long)g->firststate, +						(long)g->laststate); +	if (g->iflags&USEBOL) +		fprintf(d, ", USEBOL"); +	if (g->iflags&USEEOL) +		fprintf(d, ", USEEOL"); +	if (g->iflags&BAD) +		fprintf(d, ", BAD"); +	if (g->nsub > 0) +		fprintf(d, ", nsub=%ld", (long)g->nsub); +	if (g->must != NULL) +		fprintf(d, ", must(%ld) `%*s'", (long)g->mlen, (int)g->mlen, +								g->must); +	if (g->backrefs) +		fprintf(d, ", backrefs"); +	if (g->nplus > 0) +		fprintf(d, ", nplus %ld", (long)g->nplus); +	fprintf(d, "\n"); +	s_print(g, d); +	for (i = 0; i < g->ncategories; i++) { +		nincat[i] = 0; +		for (c = CHAR_MIN; c <= CHAR_MAX; c++) +			if (g->categories[c] == i) +				nincat[i]++; +	} +	fprintf(d, "cc0#%d", nincat[0]); +	for (i = 1; i < g->ncategories; i++) +		if (nincat[i] == 1) { +			for (c = CHAR_MIN; c <= CHAR_MAX; c++) +				if (g->categories[c] == i) +					break; +			fprintf(d, ", %d=%s", i, regchar(c)); +		} +	fprintf(d, "\n"); +	for (i = 1; i < g->ncategories; i++) +		if 
(nincat[i] != 1) { +			fprintf(d, "cc%d\t", i); +			last = -1; +			for (c = CHAR_MIN; c <= CHAR_MAX+1; c++)	/* +1 does flush */ +				if (c <= CHAR_MAX && g->categories[c] == i) { +					if (last < 0) { +						fprintf(d, "%s", regchar(c)); +						last = c; +					} +				} else { +					if (last >= 0) { +						if (last != c-1) +							fprintf(d, "-%s", +								regchar(c-1)); +						last = -1; +					} +				} +			fprintf(d, "\n"); +		} +} + +/* + - s_print - print the strip for debugging + == static void s_print(struct re_guts *g, FILE *d); + */ +static void +s_print(g, d) +struct re_guts *g; +FILE *d; +{ +	sop *s; +	cset *cs; +	int i; +	int done = 0; +	sop opnd; +	int col = 0; +	int last; +	sopno offset = 2; +#	define	GAP()	{	if (offset % 5 == 0) { \ +					if (col > 40) { \ +						fprintf(d, "\n\t"); \ +						col = 0; \ +					} else { \ +						fprintf(d, " "); \ +						col++; \ +					} \ +				} else \ +					col++; \ +				offset++; \ +			} + +	if (OP(g->strip[0]) != OEND) +		fprintf(d, "missing initial OEND!\n"); +	for (s = &g->strip[1]; !done; s++) { +		opnd = OPND(*s); +		switch (OP(*s)) { +		case OEND: +			fprintf(d, "\n"); +			done = 1; +			break; +		case OCHAR: +			if (strchr("\\|()^$.[+*?{}!<> ", (char)opnd) != NULL) +				fprintf(d, "\\%c", (char)opnd); +			else +				fprintf(d, "%s", regchar((char)opnd)); +			break; +		case OBOL: +			fprintf(d, "^"); +			break; +		case OEOL: +			fprintf(d, "$"); +			break; +		case OBOW: +			fprintf(d, "\\{"); +			break; +		case OEOW: +			fprintf(d, "\\}"); +			break; +		case OANY: +			fprintf(d, "."); +			break; +		case OANYOF: +			fprintf(d, "[(%ld)", (long)opnd); +			cs = &g->sets[opnd]; +			last = -1; +			for (i = 0; i < g->csetsize+1; i++)	/* +1 flushes */ +				if (CHIN(cs, i) && i < g->csetsize) { +					if (last < 0) { +						fprintf(d, "%s", regchar(i)); +						last = i; +					} +				} else { +					if (last >= 0) { +						if (last != i-1) +							fprintf(d, "-%s", +								regchar(i-1)); +						last = -1; +					} +				} +			
fprintf(d, "]"); +			break; +		case OBACK_: +			fprintf(d, "(\\<%ld>", (long)opnd); +			break; +		case O_BACK: +			fprintf(d, "<%ld>\\)", (long)opnd); +			break; +		case OPLUS_: +			fprintf(d, "(+"); +			if (OP(*(s+opnd)) != O_PLUS) +				fprintf(d, "<%ld>", (long)opnd); +			break; +		case O_PLUS: +			if (OP(*(s-opnd)) != OPLUS_) +				fprintf(d, "<%ld>", (long)opnd); +			fprintf(d, "+)"); +			break; +		case OQUEST_: +			fprintf(d, "(?"); +			if (OP(*(s+opnd)) != O_QUEST) +				fprintf(d, "<%ld>", (long)opnd); +			break; +		case O_QUEST: +			if (OP(*(s-opnd)) != OQUEST_) +				fprintf(d, "<%ld>", (long)opnd); +			fprintf(d, "?)"); +			break; +		case OLPAREN: +			fprintf(d, "((<%ld>", (long)opnd); +			break; +		case ORPAREN: +			fprintf(d, "<%ld>))", (long)opnd); +			break; +		case OCH_: +			fprintf(d, "<"); +			if (OP(*(s+opnd)) != OOR2) +				fprintf(d, "<%ld>", (long)opnd); +			break; +		case OOR1: +			if (OP(*(s-opnd)) != OOR1 && OP(*(s-opnd)) != OCH_) +				fprintf(d, "<%ld>", (long)opnd); +			fprintf(d, "|"); +			break; +		case OOR2: +			fprintf(d, "|"); +			if (OP(*(s+opnd)) != OOR2 && OP(*(s+opnd)) != O_CH) +				fprintf(d, "<%ld>", (long)opnd); +			break; +		case O_CH: +			if (OP(*(s-opnd)) != OOR1) +				fprintf(d, "<%ld>", (long)opnd); +			fprintf(d, ">"); +			break; +		default: +			fprintf(d, "!%d(%d)!", OP(*s), opnd); +			break; +		} +		if (!done) +			GAP(); +	} +} + +/* + - regchar - make a character printable + == static char *regchar(int ch); + */ +static char *			/* -> representation */ +regchar(ch) +int ch; +{ +	static char buf[10]; + +	if (isprint(ch) || ch == ' ') +		sprintf(buf, "%c", ch); +	else +		sprintf(buf, "\\%o", ch); +	return(buf); +} diff --git a/lib/libc/regex/grot/limits.h b/lib/libc/regex/grot/limits.h new file mode 100644 index 000000000000..6049fdf2e1b6 --- /dev/null +++ b/lib/libc/regex/grot/limits.h @@ -0,0 +1,6 @@ +/* $FreeBSD$ + */ +#define	_POSIX2_RE_DUP_MAX	255 +#define	CHAR_MIN	(-128) +#define	CHAR_MAX	127 +#define	CHAR_BIT	8 diff 
--git a/lib/libc/regex/grot/main.c b/lib/libc/regex/grot/main.c new file mode 100644 index 000000000000..cb56e48433c9 --- /dev/null +++ b/lib/libc/regex/grot/main.c @@ -0,0 +1,512 @@ +/* $FreeBSD$ + */ +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <regex.h> +#include <assert.h> + +#include "main.ih" + +char *progname; +int debug = 0; +int line = 0; +int status = 0; + +int copts = REG_EXTENDED; +int eopts = 0; +regoff_t startoff = 0; +regoff_t endoff = 0; + + +extern int split(); +extern void regprint(); + +/* + - main - do the simple case, hand off to regress() for regression + */ +main(argc, argv) +int argc; +char *argv[]; +{ +	regex_t re; +#	define	NS	10 +	regmatch_t subs[NS]; +	char erbuf[100]; +	int err; +	size_t len; +	int c; +	int errflg = 0; +	int i; +	extern int optind; +	extern char *optarg; + +	progname = argv[0]; + +	while ((c = getopt(argc, argv, "c:e:S:E:x")) != EOF) +		switch (c) { +		case 'c':	/* compile options */ +			copts = options('c', optarg); +			break; +		case 'e':	/* execute options */ +			eopts = options('e', optarg); +			break; +		case 'S':	/* start offset */ +			startoff = (regoff_t)atoi(optarg); +			break; +		case 'E':	/* end offset */ +			endoff = (regoff_t)atoi(optarg); +			break; +		case 'x':	/* Debugging. 
*/ +			debug++; +			break; +		case '?': +		default: +			errflg++; +			break; +		} +	if (errflg) { +		fprintf(stderr, "usage: %s ", progname); +		fprintf(stderr, "[-c copt][-C][-d] [re]\n"); +		exit(2); +	} + +	if (optind >= argc) { +		regress(stdin); +		exit(status); +	} + +	err = regcomp(&re, argv[optind++], copts); +	if (err) { +		len = regerror(err, &re, erbuf, sizeof(erbuf)); +		fprintf(stderr, "error %s, %d/%d `%s'\n", +			eprint(err), len, sizeof(erbuf), erbuf); +		exit(status); +	} +	regprint(&re, stdout);	 + +	if (optind >= argc) { +		regfree(&re); +		exit(status); +	} + +	if (eopts&REG_STARTEND) { +		subs[0].rm_so = startoff; +		subs[0].rm_eo = strlen(argv[optind]) - endoff; +	} +	err = regexec(&re, argv[optind], (size_t)NS, subs, eopts); +	if (err) { +		len = regerror(err, &re, erbuf, sizeof(erbuf)); +		fprintf(stderr, "error %s, %d/%d `%s'\n", +			eprint(err), len, sizeof(erbuf), erbuf); +		exit(status); +	} +	if (!(copts&REG_NOSUB)) { +		len = (int)(subs[0].rm_eo - subs[0].rm_so); +		if (subs[0].rm_so != -1) { +			if (len != 0) +				printf("match `%.*s'\n", len, +					argv[optind] + subs[0].rm_so); +			else +				printf("match `'@%.1s\n", +					argv[optind] + subs[0].rm_so); +		} +		for (i = 1; i < NS; i++) +			if (subs[i].rm_so != -1) +				printf("(%d) `%.*s'\n", i, +					(int)(subs[i].rm_eo - subs[i].rm_so), +					argv[optind] + subs[i].rm_so); +	} +	exit(status); +} + +/* + - regress - main loop of regression test + == void regress(FILE *in); + */ +void +regress(in) +FILE *in; +{ +	char inbuf[1000]; +#	define	MAXF	10 +	char *f[MAXF]; +	int nf; +	int i; +	char erbuf[100]; +	size_t ne; +	char *badpat = "invalid regular expression"; +#	define	SHORT	10 +	char *bpname = "REG_BADPAT"; +	regex_t re; + +	while (fgets(inbuf, sizeof(inbuf), in) != NULL) { +		line++; +		if (inbuf[0] == '#' || inbuf[0] == '\n') +			continue;			/* NOTE CONTINUE */ +		inbuf[strlen(inbuf)-1] = '\0';	/* get rid of stupid \n */ +		if (debug) +			fprintf(stdout, "%d:\n", line); +		nf = 
split(inbuf, f, MAXF, "\t\t"); +		if (nf < 3) { +			fprintf(stderr, "bad input, line %d\n", line); +			exit(1); +		} +		for (i = 0; i < nf; i++) +			if (strcmp(f[i], "\"\"") == 0) +				f[i] = ""; +		if (nf <= 3) +			f[3] = NULL; +		if (nf <= 4) +			f[4] = NULL; +		try(f[0], f[1], f[2], f[3], f[4], options('c', f[1])); +		if (opt('&', f[1]))	/* try with either type of RE */ +			try(f[0], f[1], f[2], f[3], f[4], +					options('c', f[1]) &~ REG_EXTENDED); +	} + +	ne = regerror(REG_BADPAT, (regex_t *)NULL, erbuf, sizeof(erbuf)); +	if (strcmp(erbuf, badpat) != 0 || ne != strlen(badpat)+1) { +		fprintf(stderr, "end: regerror() test gave `%s' not `%s'\n", +							erbuf, badpat); +		status = 1; +	} +	ne = regerror(REG_BADPAT, (regex_t *)NULL, erbuf, (size_t)SHORT); +	if (strncmp(erbuf, badpat, SHORT-1) != 0 || erbuf[SHORT-1] != '\0' || +						ne != strlen(badpat)+1) { +		fprintf(stderr, "end: regerror() short test gave `%s' not `%.*s'\n", +						erbuf, SHORT-1, badpat); +		status = 1; +	} +	ne = regerror(REG_ITOA|REG_BADPAT, (regex_t *)NULL, erbuf, sizeof(erbuf)); +	if (strcmp(erbuf, bpname) != 0 || ne != strlen(bpname)+1) { +		fprintf(stderr, "end: regerror() ITOA test gave `%s' not `%s'\n", +						erbuf, bpname); +		status = 1; +	} +	re.re_endp = bpname; +	ne = regerror(REG_ATOI, &re, erbuf, sizeof(erbuf)); +	if (atoi(erbuf) != (int)REG_BADPAT) { +		fprintf(stderr, "end: regerror() ATOI test gave `%s' not `%ld'\n", +						erbuf, (long)REG_BADPAT); +		status = 1; +	} else if (ne != strlen(erbuf)+1) { +		fprintf(stderr, "end: regerror() ATOI test len(`%s') = %ld\n", +						erbuf, (long)REG_BADPAT); +		status = 1; +	} +} + +/* + - try - try it, and report on problems + == void try(char *f0, char *f1, char *f2, char *f3, char *f4, int opts); + */ +void +try(f0, f1, f2, f3, f4, opts) +char *f0; +char *f1; +char *f2; +char *f3; +char *f4; +int opts;			/* may not match f1 */ +{ +	regex_t re; +#	define	NSUBS	10 +	regmatch_t subs[NSUBS]; +#	define	NSHOULD	15 +	char 
*should[NSHOULD]; +	int nshould; +	char erbuf[100]; +	int err; +	int len; +	char *type = (opts & REG_EXTENDED) ? "ERE" : "BRE"; +	int i; +	char *grump; +	char f0copy[1000]; +	char f2copy[1000]; + +	strcpy(f0copy, f0); +	re.re_endp = (opts&REG_PEND) ? f0copy + strlen(f0copy) : NULL; +	fixstr(f0copy); +	err = regcomp(&re, f0copy, opts); +	if (err != 0 && (!opt('C', f1) || err != efind(f2))) { +		/* unexpected error or wrong error */ +		len = regerror(err, &re, erbuf, sizeof(erbuf)); +		fprintf(stderr, "%d: %s error %s, %d/%d `%s'\n", +					line, type, eprint(err), len, +					sizeof(erbuf), erbuf); +		status = 1; +	} else if (err == 0 && opt('C', f1)) { +		/* unexpected success */ +		fprintf(stderr, "%d: %s should have given REG_%s\n", +						line, type, f2); +		status = 1; +		err = 1;	/* so we won't try regexec */ +	} + +	if (err != 0) { +		regfree(&re); +		return; +	} + +	strcpy(f2copy, f2); +	fixstr(f2copy); + +	if (options('e', f1)&REG_STARTEND) { +		if (strchr(f2, '(') == NULL || strchr(f2, ')') == NULL) +			fprintf(stderr, "%d: bad STARTEND syntax\n", line); +		subs[0].rm_so = strchr(f2, '(') - f2 + 1; +		subs[0].rm_eo = strchr(f2, ')') - f2; +	} +	err = regexec(&re, f2copy, NSUBS, subs, options('e', f1)); + +	if (err != 0 && (f3 != NULL || err != REG_NOMATCH)) { +		/* unexpected error or wrong error */ +		len = regerror(err, &re, erbuf, sizeof(erbuf)); +		fprintf(stderr, "%d: %s exec error %s, %d/%d `%s'\n", +					line, type, eprint(err), len, +					sizeof(erbuf), erbuf); +		status = 1; +	} else if (err != 0) { +		/* nothing more to check */ +	} else if (f3 == NULL) { +		/* unexpected success */ +		fprintf(stderr, "%d: %s exec should have failed\n", +						line, type); +		status = 1; +		err = 1;		/* just on principle */ +	} else if (opts&REG_NOSUB) { +		/* nothing more to check */ +	} else if ((grump = check(f2, subs[0], f3)) != NULL) { +		fprintf(stderr, "%d: %s %s\n", line, type, grump); +		status = 1; +		err = 1; +	} + +	if (err != 0 || f4 == NULL) { +		regfree(&re); 
+		return; +	} + +	for (i = 1; i < NSHOULD; i++) +		should[i] = NULL; +	nshould = split(f4, should+1, NSHOULD-1, ","); +	if (nshould == 0) { +		nshould = 1; +		should[1] = ""; +	} +	for (i = 1; i < NSUBS; i++) { +		grump = check(f2, subs[i], should[i]); +		if (grump != NULL) { +			fprintf(stderr, "%d: %s $%d %s\n", line, +							type, i, grump); +			status = 1; +			err = 1; +		} +	} + +	regfree(&re); +} + +/* + - options - pick options out of a regression-test string + == int options(int type, char *s); + */ +int +options(type, s) +int type;			/* 'c' compile, 'e' exec */ +char *s; +{ +	char *p; +	int o = (type == 'c') ? copts : eopts; +	char *legal = (type == 'c') ? "bisnmp" : "^$#tl"; + +	for (p = s; *p != '\0'; p++) +		if (strchr(legal, *p) != NULL) +			switch (*p) { +			case 'b': +				o &= ~REG_EXTENDED; +				break; +			case 'i': +				o |= REG_ICASE; +				break; +			case 's': +				o |= REG_NOSUB; +				break; +			case 'n': +				o |= REG_NEWLINE; +				break; +			case 'm': +				o &= ~REG_EXTENDED; +				o |= REG_NOSPEC; +				break; +			case 'p': +				o |= REG_PEND; +				break; +			case '^': +				o |= REG_NOTBOL; +				break; +			case '$': +				o |= REG_NOTEOL; +				break; +			case '#': +				o |= REG_STARTEND; +				break; +			case 't':	/* trace */ +				o |= REG_TRACE; +				break; +			case 'l':	/* force long representation */ +				o |= REG_LARGE; +				break; +			case 'r':	/* force backref use */ +				o |= REG_BACKR; +				break; +			} +	return(o); +} + +/* + - opt - is a particular option in a regression string? 
+ == int opt(int c, char *s); + */ +int				/* predicate */ +opt(c, s) +int c; +char *s; +{ +	return(strchr(s, c) != NULL); +} + +/* + - fixstr - transform magic characters in strings + == void fixstr(char *p); + */ +void +fixstr(p) +char *p; +{ +	if (p == NULL) +		return; + +	for (; *p != '\0'; p++) +		if (*p == 'N') +			*p = '\n'; +		else if (*p == 'T') +			*p = '\t'; +		else if (*p == 'S') +			*p = ' '; +		else if (*p == 'Z') +			*p = '\0'; +} + +/* + - check - check a substring match + == char *check(char *str, regmatch_t sub, char *should); + */ +char *				/* NULL or complaint */ +check(str, sub, should) +char *str; +regmatch_t sub; +char *should; +{ +	int len; +	int shlen; +	char *p; +	static char grump[500]; +	char *at = NULL; + +	if (should != NULL && strcmp(should, "-") == 0) +		should = NULL; +	if (should != NULL && should[0] == '@') { +		at = should + 1; +		should = ""; +	} + +	/* check rm_so and rm_eo for consistency */ +	if (sub.rm_so > sub.rm_eo || (sub.rm_so == -1 && sub.rm_eo != -1) || +				(sub.rm_so != -1 && sub.rm_eo == -1) || +				(sub.rm_so != -1 && sub.rm_so < 0) || +				(sub.rm_eo != -1 && sub.rm_eo < 0) ) { +		sprintf(grump, "start %ld end %ld", (long)sub.rm_so, +							(long)sub.rm_eo); +		return(grump); +	} + +	/* check for no match */ +	if (sub.rm_so == -1 && should == NULL) +		return(NULL); +	if (sub.rm_so == -1) +		return("did not match"); + +	/* check for in range */ +	if (sub.rm_eo > strlen(str)) { +		sprintf(grump, "start %ld end %ld, past end of string", +					(long)sub.rm_so, (long)sub.rm_eo); +		return(grump); +	} + +	len = (int)(sub.rm_eo - sub.rm_so); +	shlen = (int)strlen(should); +	p = str + sub.rm_so; + +	/* check for not supposed to match */ +	if (should == NULL) { +		sprintf(grump, "matched `%.*s'", len, p); +		return(grump); +	} + +	/* check for wrong match */ +	if (len != shlen || strncmp(p, should, (size_t)shlen) != 0) { +		sprintf(grump, "matched `%.*s' instead", len, p); +		return(grump); +	} +	if (shlen > 0) +		
return(NULL); + +	/* check null match in right place */ +	if (at == NULL) +		return(NULL); +	shlen = strlen(at); +	if (shlen == 0) +		shlen = 1;	/* force check for end-of-string */ +	if (strncmp(p, at, shlen) != 0) { +		sprintf(grump, "matched null at `%.20s'", p); +		return(grump); +	} +	return(NULL); +} + +/* + - eprint - convert error number to name + == static char *eprint(int err); + */ +static char * +eprint(err) +int err; +{ +	static char epbuf[100]; +	size_t len; + +	len = regerror(REG_ITOA|err, (regex_t *)NULL, epbuf, sizeof(epbuf)); +	assert(len <= sizeof(epbuf)); +	return(epbuf); +} + +/* + - efind - convert error name to number + == static int efind(char *name); + */ +static int +efind(name) +char *name; +{ +	static char efbuf[100]; +	size_t n; +	regex_t re; + +	sprintf(efbuf, "REG_%s", name); +	assert(strlen(efbuf) < sizeof(efbuf)); +	re.re_endp = efbuf; +	(void) regerror(REG_ATOI, &re, efbuf, sizeof(efbuf)); +	return(atoi(efbuf)); +} diff --git a/lib/libc/regex/grot/mkh b/lib/libc/regex/grot/mkh new file mode 100755 index 000000000000..1deba7983d5c --- /dev/null +++ b/lib/libc/regex/grot/mkh @@ -0,0 +1,77 @@ +#! /bin/sh +# mkh - pull headers out of C source +# $FreeBSD$ +PATH=/bin:/usr/bin ; export PATH + +# egrep pattern to pick out marked lines +egrep='^ =([ 	]|$)' + +# Sed program to process marked lines into lines for the header file. +# The markers have already been removed.  Two things are done here:  removal +# of backslashed newlines, and some fudging of comments.  The first is done +# because -o needs to have prototypes on one line to strip them down. +# Getting comments into the output is tricky; we turn C++-style // comments +# into /* */ comments, after altering any existing */'s to avoid trouble. 
+peel='	/\\$/N +	/\\\n[ 	]*/s///g +	/\/\//s;\*/;* /;g +	/\/\//s;//\(.*\);/*\1 */;' + +for a +do +	case "$a" in +	-o)	# old (pre-function-prototype) compiler +		# add code to comment out argument lists +		peel="$peel +			"'/^\([^#\/][^\/]*[a-zA-Z0-9_)]\)(\(.*\))/s;;\1(/*\2*/);' +		shift +		;; +	-b)	# funny Berkeley __P macro +		peel="$peel +			"'/^\([^#\/][^\/]*[a-zA-Z0-9_)]\)(\(.*\))/s;;\1 __P((\2));' +		shift +		;; +	-s)	# compiler doesn't like `static foo();' +		# add code to get rid of the `static' +		peel="$peel +			"'/^static[ 	][^\/]*[a-zA-Z0-9_)](.*)/s;static.;;' +		shift +		;; +	-p)	# private declarations +		egrep='^ ==([ 	]|$)' +		shift +		;; +	-i)	# wrap in #ifndef, argument is name +		ifndef="$2" +		shift ; shift +		;; +	*)	break +		;; +	esac +done + +if test " $ifndef" != " " +then +	echo "#ifndef $ifndef" +	echo "#define	$ifndef	/* never again */" +fi +echo "/* ========= begin header generated by $0 ========= */" +echo '#ifdef __cplusplus' +echo 'extern "C" {' +echo '#endif' +for f +do +	echo +	echo "/* === $f === */" +	egrep "$egrep" $f | sed 's/^ ==*[ 	]//;s/^ ==*$//' | sed "$peel" +	echo +done +echo '#ifdef __cplusplus' +echo '}' +echo '#endif' +echo "/* ========= end header generated by $0 ========= */" +if test " $ifndef" != " " +then +	echo "#endif" +fi +exit 0 diff --git a/lib/libc/regex/grot/split.c b/lib/libc/regex/grot/split.c new file mode 100644 index 000000000000..e001f39ed5ec --- /dev/null +++ b/lib/libc/regex/grot/split.c @@ -0,0 +1,318 @@ +/* $FreeBSD$ + */ +#include <stdio.h> +#include <string.h> + +/* + - split - divide a string into fields, like awk split() + = int split(char *string, char *fields[], int nfields, char *sep); + */ +int				/* number of fields, including overflow */ +split(string, fields, nfields, sep) +char *string; +char *fields[];			/* list is not NULL-terminated */ +int nfields;			/* number of entries available in fields[] */ +char *sep;			/* "" white, "c" single char, "ab" [ab]+ */ +{ +	char *p = string; +	char c;		
	/* latest character */ +	char sepc = sep[0]; +	char sepc2; +	int fn; +	char **fp = fields; +	char *sepp; +	int trimtrail; + +	/* white space */ +	if (sepc == '\0') { +		while ((c = *p++) == ' ' || c == '\t') +			continue; +		p--; +		trimtrail = 1; +		sep = " \t";	/* note, code below knows this is 2 long */ +		sepc = ' '; +	} else +		trimtrail = 0; +	sepc2 = sep[1];		/* now we can safely pick this up */ + +	/* catch empties */ +	if (*p == '\0') +		return(0); + +	/* single separator */ +	if (sepc2 == '\0') { +		fn = nfields; +		for (;;) { +			*fp++ = p; +			fn--; +			if (fn == 0) +				break; +			while ((c = *p++) != sepc) +				if (c == '\0') +					return(nfields - fn); +			*(p-1) = '\0'; +		} +		/* we have overflowed the fields vector -- just count them */ +		fn = nfields; +		for (;;) { +			while ((c = *p++) != sepc) +				if (c == '\0') +					return(fn); +			fn++; +		} +		/* not reached */ +	} + +	/* two separators */ +	if (sep[2] == '\0') { +		fn = nfields; +		for (;;) { +			*fp++ = p; +			fn--; +			while ((c = *p++) != sepc && c != sepc2) +				if (c == '\0') { +					if (trimtrail && **(fp-1) == '\0') +						fn++; +					return(nfields - fn); +				} +			if (fn == 0) +				break; +			*(p-1) = '\0'; +			while ((c = *p++) == sepc || c == sepc2) +				continue; +			p--; +		} +		/* we have overflowed the fields vector -- just count them */ +		fn = nfields; +		while (c != '\0') { +			while ((c = *p++) == sepc || c == sepc2) +				continue; +			p--; +			fn++; +			while ((c = *p++) != '\0' && c != sepc && c != sepc2) +				continue; +		} +		/* might have to trim trailing white space */ +		if (trimtrail) { +			p--; +			while ((c = *--p) == sepc || c == sepc2) +				continue; +			p++; +			if (*p != '\0') { +				if (fn == nfields+1) +					*p = '\0'; +				fn--; +			} +		} +		return(fn); +	} + +	/* n separators */ +	fn = 0; +	for (;;) { +		if (fn < nfields) +			*fp++ = p; +		fn++; +		for (;;) { +			c = *p++; +			if (c == '\0') +				return(fn); +			sepp = sep; +			while ((sepc = 
*sepp++) != '\0' && sepc != c) +				continue; +			if (sepc != '\0')	/* it was a separator */ +				break; +		} +		if (fn < nfields) +			*(p-1) = '\0'; +		for (;;) { +			c = *p++; +			sepp = sep; +			while ((sepc = *sepp++) != '\0' && sepc != c) +				continue; +			if (sepc == '\0')	/* it wasn't a separator */ +				break; +		} +		p--; +	} + +	/* not reached */ +} + +#ifdef TEST_SPLIT + + +/* + * test program + * pgm		runs regression + * pgm sep	splits stdin lines by sep + * pgm str sep	splits str by sep + * pgm str sep n	splits str by sep n times + */ +int +main(argc, argv) +int argc; +char *argv[]; +{ +	char buf[512]; +	int n; +#	define	MNF	10 +	char *fields[MNF]; + +	if (argc > 4) +		for (n = atoi(argv[3]); n > 0; n--) { +			(void) strcpy(buf, argv[1]); +		} +	else if (argc > 3) +		for (n = atoi(argv[3]); n > 0; n--) { +			(void) strcpy(buf, argv[1]); +			(void) split(buf, fields, MNF, argv[2]); +		} +	else if (argc > 2) +		dosplit(argv[1], argv[2]); +	else if (argc > 1) +		while (fgets(buf, sizeof(buf), stdin) != NULL) { +			buf[strlen(buf)-1] = '\0';	/* stomp newline */ +			dosplit(buf, argv[1]); +		} +	else +		regress(); + +	exit(0); +} + +dosplit(string, seps) +char *string; +char *seps; +{ +#	define	NF	5 +	char *fields[NF]; +	int nf; + +	nf = split(string, fields, NF, seps); +	print(nf, NF, fields); +} + +print(nf, nfp, fields) +int nf; +int nfp; +char *fields[]; +{ +	int fn; +	int bound; + +	bound = (nf > nfp) ? nfp : nf; +	printf("%d:\t", nf); +	for (fn = 0; fn < bound; fn++) +		printf("\"%s\"%s", fields[fn], (fn+1 < nf) ? 
", " : "\n"); +} + +#define	RNF	5		/* some table entries know this */ +struct { +	char *str; +	char *seps; +	int nf; +	char *fi[RNF]; +} tests[] = { +	"",		" ",	0,	{ "" }, +	" ",		" ",	2,	{ "", "" }, +	"x",		" ",	1,	{ "x" }, +	"xy",		" ",	1,	{ "xy" }, +	"x y",		" ",	2,	{ "x", "y" }, +	"abc def  g ",	" ",	5,	{ "abc", "def", "", "g", "" }, +	"  a bcd",	" ",	4,	{ "", "", "a", "bcd" }, +	"a b c d e f",	" ",	6,	{ "a", "b", "c", "d", "e f" }, +	" a b c d ",	" ",	6,	{ "", "a", "b", "c", "d " }, + +	"",		" _",	0,	{ "" }, +	" ",		" _",	2,	{ "", "" }, +	"x",		" _",	1,	{ "x" }, +	"x y",		" _",	2,	{ "x", "y" }, +	"ab _ cd",	" _",	2,	{ "ab", "cd" }, +	" a_b  c ",	" _",	5,	{ "", "a", "b", "c", "" }, +	"a b c_d e f",	" _",	6,	{ "a", "b", "c", "d", "e f" }, +	" a b c d ",	" _",	6,	{ "", "a", "b", "c", "d " }, + +	"",		" _~",	0,	{ "" }, +	" ",		" _~",	2,	{ "", "" }, +	"x",		" _~",	1,	{ "x" }, +	"x y",		" _~",	2,	{ "x", "y" }, +	"ab _~ cd",	" _~",	2,	{ "ab", "cd" }, +	" a_b  c~",	" _~",	5,	{ "", "a", "b", "c", "" }, +	"a b_c d~e f",	" _~",	6,	{ "a", "b", "c", "d", "e f" }, +	"~a b c d ",	" _~",	6,	{ "", "a", "b", "c", "d " }, + +	"",		" _~-",	0,	{ "" }, +	" ",		" _~-",	2,	{ "", "" }, +	"x",		" _~-",	1,	{ "x" }, +	"x y",		" _~-",	2,	{ "x", "y" }, +	"ab _~- cd",	" _~-",	2,	{ "ab", "cd" }, +	" a_b  c~",	" _~-",	5,	{ "", "a", "b", "c", "" }, +	"a b_c-d~e f",	" _~-",	6,	{ "a", "b", "c", "d", "e f" }, +	"~a-b c d ",	" _~-",	6,	{ "", "a", "b", "c", "d " }, + +	"",		"  ",	0,	{ "" }, +	" ",		"  ",	2,	{ "", "" }, +	"x",		"  ",	1,	{ "x" }, +	"xy",		"  ",	1,	{ "xy" }, +	"x y",		"  ",	2,	{ "x", "y" }, +	"abc def  g ",	"  ",	4,	{ "abc", "def", "g", "" }, +	"  a bcd",	"  ",	3,	{ "", "a", "bcd" }, +	"a b c d e f",	"  ",	6,	{ "a", "b", "c", "d", "e f" }, +	" a b c d ",	"  ",	6,	{ "", "a", "b", "c", "d " }, + +	"",		"",	0,	{ "" }, +	" ",		"",	0,	{ "" }, +	"x",		"",	1,	{ "x" }, +	"xy",		"",	1,	{ "xy" }, +	"x y",		"",	2,	{ "x", "y" }, +	"abc def  g ",	"",	3,	{ "abc", "def", "g" }, +	"\t a bcd",	"",	2,	
{ "a", "bcd" }, +	"  a \tb\t c ",	"",	3,	{ "a", "b", "c" }, +	"a b c d e ",	"",	5,	{ "a", "b", "c", "d", "e" }, +	"a b\tc d e f",	"",	6,	{ "a", "b", "c", "d", "e f" }, +	" a b c d e f ",	"",	6,	{ "a", "b", "c", "d", "e f " }, + +	NULL,		NULL,	0,	{ NULL }, +}; + +regress() +{ +	char buf[512]; +	int n; +	char *fields[RNF+1]; +	int nf; +	int i; +	int printit; +	char *f; + +	for (n = 0; tests[n].str != NULL; n++) { +		(void) strcpy(buf, tests[n].str); +		fields[RNF] = NULL; +		nf = split(buf, fields, RNF, tests[n].seps); +		printit = 0; +		if (nf != tests[n].nf) { +			printf("split `%s' by `%s' gave %d fields, not %d\n", +				tests[n].str, tests[n].seps, nf, tests[n].nf); +			printit = 1; +		} else if (fields[RNF] != NULL) { +			printf("split() went beyond array end\n"); +			printit = 1; +		} else { +			for (i = 0; i < nf && i < RNF; i++) { +				f = fields[i]; +				if (f == NULL) +					f = "(NULL)"; +				if (strcmp(f, tests[n].fi[i]) != 0) { +					printf("split `%s' by `%s' field %d is `%s', not `%s'\n", +						tests[n].str, tests[n].seps, +						i, fields[i], tests[n].fi[i]); +					printit = 1; +				} +			} +		} +		if (printit) +			print(nf, RNF, fields); +	} +} +#endif diff --git a/lib/libc/regex/grot/stdlib.h b/lib/libc/regex/grot/stdlib.h new file mode 100644 index 000000000000..7e58a4564423 --- /dev/null +++ b/lib/libc/regex/grot/stdlib.h @@ -0,0 +1,4 @@ +/* $FreeBSD$ + */ +extern char *malloc(); +extern char *realloc(); diff --git a/lib/libc/regex/grot/tests b/lib/libc/regex/grot/tests new file mode 100644 index 000000000000..07e9dfb8539d --- /dev/null +++ b/lib/libc/regex/grot/tests @@ -0,0 +1,450 @@ +# regular expression test set +# $FreeBSD$ +# Lines are at least three fields, separated by one or more tabs.  "" stands +# for an empty field.  First field is an RE.  Second field is flags.  If +# C flag given, regcomp() is expected to fail, and the third field is the +# error name (minus the leading REG_). 
+# +# Otherwise it is expected to succeed, and the third field is the string to +# try matching it against.  If there is no fourth field, the match is +# expected to fail.  If there is a fourth field, it is the substring that +# the RE is expected to match.  If there is a fifth field, it is a comma- +# separated list of what the subexpressions should match, with - indicating +# no match for that one.  In both the fourth and fifth fields, a (sub)field +# starting with @ indicates that the (sub)expression is expected to match +# a null string followed by the stuff after the @; this provides a way to +# test where null strings match.  The character `N' in REs and strings +# is newline, `S' is space, `T' is tab, `Z' is NUL. +# +# The full list of flags: +#	-	placeholder, does nothing +#	b	RE is a BRE, not an ERE +#	&	try it as both an ERE and a BRE +#	C	regcomp() error expected, third field is error name +#	i	REG_ICASE +#	m	("mundane") REG_NOSPEC +#	s	REG_NOSUB (not really testable) +#	n	REG_NEWLINE +#	^	REG_NOTBOL +#	$	REG_NOTEOL +#	#	REG_STARTEND (see below) +#	p	REG_PEND +# +# For REG_STARTEND, the start/end offsets are those of the substring +# enclosed in (). 
+ +# basics +a		&	a	a +abc		&	abc	abc +abc|de		-	abc	abc +a|b|c		-	abc	a + +# parentheses and perversions thereof +a(b)c		-	abc	abc +a\(b\)c		b	abc	abc +a(		C	EPAREN +a(		b	a(	a( +a\(		-	a(	a( +a\(		bC	EPAREN +a\(b		bC	EPAREN +a(b		C	EPAREN +a(b		b	a(b	a(b +# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly) +a)		-	a)	a) +)		-	)	) +# end gagging (in a just world, those *should* give EPAREN) +a)		b	a)	a) +a\)		bC	EPAREN +\)		bC	EPAREN +a()b		-	ab	ab +a\(\)b		b	ab	ab + +# anchoring and REG_NEWLINE +^abc$		&	abc	abc +a^b		-	a^b +a^b		b	a^b	a^b +a$b		-	a$b +a$b		b	a$b	a$b +^		&	abc	@abc +$		&	abc	@ +^$		&	""	@ +$^		-	""	@ +\($\)\(^\)	b	""	@ +# stop retching, those are legitimate (although disgusting) +^^		-	""	@ +$$		-	""	@ +b$		&	abNc +b$		&n	abNc	b +^b$		&	aNbNc +^b$		&n	aNbNc	b +^$		&n	aNNb	@Nb +^$		n	abc +^$		n	abcN	@ +$^		n	aNNb	@Nb +\($\)\(^\)	bn	aNNb	@Nb +^^		n^	aNNb	@Nb +$$		n	aNNb	@NN +^a		^	a +a$		$	a +^a		^n	aNb +^b		^n	aNb	b +a$		$n	bNa +b$		$n	bNa	b +a*(^b$)c*	-	b	b +a*\(^b$\)c*	b	b	b + +# certain syntax errors and non-errors +|		C	EMPTY +|		b	|	| +*		C	BADRPT +*		b	*	* ++		C	BADRPT +?		C	BADRPT +""		&C	EMPTY +()		-	abc	@abc +\(\)		b	abc	@abc +a||b		C	EMPTY +|ab		C	EMPTY +ab|		C	EMPTY +(|a)b		C	EMPTY +(a|)b		C	EMPTY +(*a)		C	BADRPT +(+a)		C	BADRPT +(?a)		C	BADRPT +({1}a)		C	BADRPT +\(\{1\}a\)	bC	BADRPT +(a|*b)		C	BADRPT +(a|+b)		C	BADRPT +(a|?b)		C	BADRPT +(a|{1}b)	C	BADRPT +^*		C	BADRPT +^*		b	*	* +^+		C	BADRPT +^?		
C	BADRPT +^{1}		C	BADRPT +^\{1\}		bC	BADRPT + +# metacharacters, backslashes +a.c		&	abc	abc +a[bc]d		&	abd	abd +a\*c		&	a*c	a*c +a\\b		&	a\b	a\b +a\\\*b		&	a\*b	a\*b +a\bc		&	abc	abc +a\		&C	EESCAPE +a\\bc		&	a\bc	a\bc +\{		bC	BADRPT +# trailing $ is a peculiar special case for the BRE code +a$		&	a	a +a$		&	a$ +a\$		&	a +a\$		&	a$	a$ +a\\$		&	a +a\\$		&	a$ +a\\$		&	a\$ +a\\$		&	a\	a\ + +# back references, ugh +a\(b\)\2c	bC	ESUBREG +a\(b\1\)c	bC	ESUBREG +a\(b*\)c\1d	b	abbcbbd	abbcbbd	bb +a\(b*\)c\1d	b	abbcbd +a\(b*\)c\1d	b	abbcbbbd +^\(.\)\1	b	abc +a\([bc]\)\1d	b	abcdabbd	abbd	b +a\(\([bc]\)\2\)*d	b	abbccd	abbccd +a\(\([bc]\)\2\)*d	b	abbcbd +# actually, this next one probably ought to fail, but the spec is unclear +a\(\(b\)*\2\)*d		b	abbbd	abbbd +# here is a case that no NFA implementation does right +\(ab*\)[ab]*\1	b	ababaaa	ababaaa	a +# check out normal matching in the presence of back refs +\(a\)\1bcd	b	aabcd	aabcd +\(a\)\1bc*d	b	aabcd	aabcd +\(a\)\1bc*d	b	aabd	aabd +\(a\)\1bc*d	b	aabcccd	aabcccd +\(a\)\1bc*[ce]d	b	aabcccd	aabcccd +^\(a\)\1b\(c\)*cd$	b	aabcccd	aabcccd + +# ordinary repetitions +ab*c		&	abc	abc +ab+c		-	abc	abc +ab?c		-	abc	abc +a\(*\)b		b	a*b	a*b +a\(**\)b	b	ab	ab +a\(***\)b	bC	BADRPT +*a		b	*a	*a +**a		b	a	a +***a		bC	BADRPT + +# the dreaded bounded repetitions +{		&	{	{ +{abc		&	{abc	{abc +{1		C	BADRPT +{1}		C	BADRPT +a{b		&	a{b	a{b +a{1}b		-	ab	ab +a\{1\}b		b	ab	ab +a{1,}b		-	ab	ab +a\{1,\}b	b	ab	ab +a{1,2}b		-	aab	aab +a\{1,2\}b	b	aab	aab +a{1		C	EBRACE +a\{1		bC	EBRACE +a{1a		C	EBRACE +a\{1a		bC	EBRACE +a{1a}		C	BADBR +a\{1a\}		bC	BADBR +a{,2}		-	a{,2}	a{,2} +a\{,2\}		bC	BADBR +a{,}		-	a{,}	a{,} +a\{,\}		bC	BADBR +a{1,x}		C	BADBR +a\{1,x\}	bC	BADBR +a{1,x		C	EBRACE +a\{1,x		bC	EBRACE +a{300}		C	BADBR +a\{300\}	bC	BADBR +a{1,0}		C	BADBR +a\{1,0\}	bC	BADBR +ab{0,0}c	-	abcac	ac +ab\{0,0\}c	b	abcac	ac +ab{0,1}c	-	abcac	abc +ab\{0,1\}c	b	abcac	abc +ab{0,3}c	-	abbcac	abbc +ab\{0,3\}c	b	abbcac	abbc +ab{1,1}c	-	acabc	abc +ab\{1,1\}c	b	acabc	abc 
+ab{1,3}c	-	acabc	abc +ab\{1,3\}c	b	acabc	abc +ab{2,2}c	-	abcabbc	abbc +ab\{2,2\}c	b	abcabbc	abbc +ab{2,4}c	-	abcabbc	abbc +ab\{2,4\}c	b	abcabbc	abbc +((a{1,10}){1,10}){1,10}	-	a	a	a,a +((a{1,10}){1,10}){1,10}bb	-	aaaaaaaaaaaaaaaaaaaaaaaaaaaaaabb	aaaaaaaaaaaaaaaaaaaaaaaaaaaaaabb + +# multiple repetitions +a**		&C	BADRPT +a++		C	BADRPT +a??		C	BADRPT +a*+		C	BADRPT +a*?		C	BADRPT +a+*		C	BADRPT +a+?		C	BADRPT +a?*		C	BADRPT +a?+		C	BADRPT +a{1}{1}		C	BADRPT +a*{1}		C	BADRPT +a+{1}		C	BADRPT +a?{1}		C	BADRPT +a{1}*		C	BADRPT +a{1}+		C	BADRPT +a{1}?		C	BADRPT +a*{b}		-	a{b}	a{b} +a\{1\}\{1\}	bC	BADRPT +a*\{1\}		bC	BADRPT +a\{1\}*		bC	BADRPT + +# brackets, and numerous perversions thereof +a[b]c		&	abc	abc +a[ab]c		&	abc	abc +a[^ab]c		&	adc	adc +a[]b]c		&	a]c	a]c +a[[b]c		&	a[c	a[c +a[-b]c		&	a-c	a-c +a[^]b]c		&	adc	adc +a[^-b]c		&	adc	adc +a[b-]c		&	a-c	a-c +a[b		&C	EBRACK +a[]		&C	EBRACK +a[1-3]c		&	a2c	a2c +a[3-1]c		&C	ERANGE +a[1-3-5]c	&C	ERANGE +a[[.-.]--]c	&	a-c	a-c +a[1-		&C	ERANGE +a[[.		&C	EBRACK +a[[.x		&C	EBRACK +a[[.x.		&C	EBRACK +a[[.x.]		
&C	EBRACK +a[[.x.]]	&	ax	ax +a[[.x,.]]	&C	ECOLLATE +a[[.one.]]b	&	a1b	a1b +a[[.notdef.]]b	&C	ECOLLATE +a[[.].]]b	&	a]b	a]b +a[[:alpha:]]c	&	abc	abc +a[[:notdef:]]c	&C	ECTYPE +a[[:		&C	EBRACK +a[[:alpha	&C	EBRACK +a[[:alpha:]	&C	EBRACK +a[[:alpha,:]	&C	ECTYPE +a[[:]:]]b	&C	ECTYPE +a[[:-:]]b	&C	ECTYPE +a[[:alph:]]	&C	ECTYPE +a[[:alphabet:]]	&C	ECTYPE +[[:alnum:]]+	-	-%@a0X-	a0X +[[:alpha:]]+	-	-%@aX0-	aX +[[:blank:]]+	-	aSSTb	SST +[[:cntrl:]]+	-	aNTb	NT +[[:digit:]]+	-	a019b	019 +[[:graph:]]+	-	Sa%bS	a%b +[[:lower:]]+	-	AabC	ab +[[:print:]]+	-	NaSbN	aSb +[[:punct:]]+	-	S%-&T	%-& +[[:space:]]+	-	aSNTb	SNT +[[:upper:]]+	-	aBCd	BC +[[:xdigit:]]+	-	p0f3Cq	0f3C +a[[=b=]]c	&	abc	abc +a[[=		&C	EBRACK +a[[=b		&C	EBRACK +a[[=b=		&C	EBRACK +a[[=b=]		&C	EBRACK +a[[=b,=]]	&C	ECOLLATE +a[[=one=]]b	&	a1b	a1b + +# complexities +a(((b)))c	-	abc	abc +a(b|(c))d	-	abd	abd +a(b*|c)d	-	abbd	abbd +# just gotta have one DFA-buster, of course +a[ab]{20}	-	aaaaabaaaabaaaabaaaab	aaaaabaaaabaaaabaaaab +# and an inline expansion in case somebody gets tricky +a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab]	-	aaaaabaaaabaaaabaaaab	aaaaabaaaabaaaabaaaab +# and in case somebody just slips in an NFA... 
+a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night)	-	aaaaabaaaabaaaabaaaabweeknights	aaaaabaaaabaaaabaaaabweeknights +# fish for anomalies as the number of states passes 32 +12345678901234567890123456789	-	a12345678901234567890123456789b	12345678901234567890123456789 +123456789012345678901234567890	-	a123456789012345678901234567890b	123456789012345678901234567890 +1234567890123456789012345678901	-	a1234567890123456789012345678901b	1234567890123456789012345678901 +12345678901234567890123456789012	-	a12345678901234567890123456789012b	12345678901234567890123456789012 +123456789012345678901234567890123	-	a123456789012345678901234567890123b	123456789012345678901234567890123 +# and one really big one, beyond any plausible word width +1234567890123456789012345678901234567890123456789012345678901234567890	-	a1234567890123456789012345678901234567890123456789012345678901234567890b	1234567890123456789012345678901234567890123456789012345678901234567890 +# fish for problems as brackets go past 8 +[ab][cd][ef][gh][ij][kl][mn]	-	xacegikmoq	acegikm +[ab][cd][ef][gh][ij][kl][mn][op]	-	xacegikmoq	acegikmo +[ab][cd][ef][gh][ij][kl][mn][op][qr]	-	xacegikmoqy	acegikmoq +[ab][cd][ef][gh][ij][kl][mn][op][q]	-	xacegikmoqy	acegikmoq + +# subtleties of matching +abc		&	xabcy	abc +a\(b\)?c\1d	b	acd +aBc		i	Abc	Abc +a[Bc]*d		i	abBCcd	abBCcd +0[[:upper:]]1	&i	0a1	0a1 +0[[:lower:]]1	&i	0A1	0A1 +a[^b]c		&i	abc +a[^b]c		&i	aBc +a[^b]c		&i	adc	adc +[a]b[c]		-	abc	abc +[a]b[a]		-	aba	aba +[abc]b[abc]	-	abc	abc +[abc]b[abd]	-	abd	abd +a(b?c)+d	-	accd	accd +(wee|week)(knights|night)	-	weeknights	weeknights +(we|wee|week|frob)(knights|night|day)	-	weeknights	weeknights +a[bc]d		-	xyzaaabcaababdacd	abd +a[ab]c		-	aaabc	abc +abc		s	abc	abc + +# subexpressions +a(b)(c)d	-	abcd	abcd	b,c +a(((b)))c	-	abc	abc	b,b,b +a(b|(c))d	-	abd	abd	b,- +a(b*|c|e)d	-	abbd	abbd	bb +a(b*|c|e)d	-	acd	acd	c +a(b*|c|e)d	-	ad	ad	@d +a(b?)c		-	abc	abc	b +a(b?)c		-	ac	ac	
@c +a(b+)c		-	abc	abc	b +a(b+)c		-	abbbc	abbbc	bbb +a(b*)c		-	ac	ac	@c +(a|ab)(bc([de]+)f|cde)	-	abcdef	abcdef	a,bcdef,de +# the regression tester only asks for 9 subexpressions +a(b)(c)(d)(e)(f)(g)(h)(i)(j)k	-	abcdefghijk	abcdefghijk	b,c,d,e,f,g,h,i,j +a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l	-	abcdefghijkl	abcdefghijkl	b,c,d,e,f,g,h,i,j,k +a([bc]?)c	-	abc	abc	b +a([bc]?)c	-	ac	ac	@c +a([bc]+)c	-	abc	abc	b +a([bc]+)c	-	abcc	abcc	bc +a([bc]+)bc	-	abcbc	abcbc	bc +a(bb+|b)b	-	abb	abb	b +a(bbb+|bb+|b)b	-	abb	abb	b +a(bbb+|bb+|b)b	-	abbb	abbb	bb +a(bbb+|bb+|b)bb	-	abbb	abbb	b +(.*).*		-	abcdef	abcdef	abcdef +(a*)*		-	bc	@b	@b + +# do we get the right subexpression when it is used more than once? +a(b|c)*d	-	ad	ad	- +a(b|c)*d	-	abcd	abcd	c +a(b|c)+d	-	abd	abd	b +a(b|c)+d	-	abcd	abcd	c +a(b|c?)+d	-	ad	ad	@d +a(b|c?)+d	-	abcd	abcd	@d +a(b|c){0,0}d	-	ad	ad	- +a(b|c){0,1}d	-	ad	ad	- +a(b|c){0,1}d	-	abd	abd	b +a(b|c){0,2}d	-	ad	ad	- +a(b|c){0,2}d	-	abcd	abcd	c +a(b|c){0,}d	-	ad	ad	- +a(b|c){0,}d	-	abcd	abcd	c +a(b|c){1,1}d	-	abd	abd	b +a(b|c){1,1}d	-	acd	acd	c +a(b|c){1,2}d	-	abd	abd	b +a(b|c){1,2}d	-	abcd	abcd	c +a(b|c){1,}d	-	abd	abd	b +a(b|c){1,}d	-	abcd	abcd	c +a(b|c){2,2}d	-	acbd	acbd	b +a(b|c){2,2}d	-	abcd	abcd	c +a(b|c){2,4}d	-	abcd	abcd	c +a(b|c){2,4}d	-	abcbd	abcbd	b +a(b|c){2,4}d	-	abcbcd	abcbcd	c +a(b|c){2,}d	-	abcd	abcd	c +a(b|c){2,}d	-	abcbd	abcbd	b +a(b+|((c)*))+d	-	abd	abd	@d,@d,- +a(b+|((c)*))+d	-	abcd	abcd	@d,@d,- + +# check out the STARTEND option +[abc]		&#	a(b)c	b +[abc]		&#	a(d)c +[abc]		&#	a(bc)d	b +[abc]		&#	a(dc)d	c +.		
&#	a()c +b.*c		&#	b(bc)c	bc +b.*		&#	b(bc)c	bc +.*c		&#	b(bc)c	bc + +# plain strings, with the NOSPEC flag +abc		m	abc	abc +abc		m	xabcy	abc +abc		m	xyz +a*b		m	aba*b	a*b +a*b		m	ab +""		mC	EMPTY + +# cases involving NULs +aZb		&	a	a +aZb		&p	a +aZb		&p#	(aZb)	aZb +aZ*b		&p#	(ab)	ab +a.b		&#	(aZb)	aZb +a.*		&#	(aZb)c	aZb + +# word boundaries (ick) +[[:<:]]a	&	a	a +[[:<:]]a	&	ba +[[:<:]]a	&	-a	a +a[[:>:]]	&	a	a +a[[:>:]]	&	ab +a[[:>:]]	&	a-	a +[[:<:]]a.c[[:>:]]	&	axcd-dayc-dazce-abc	abc +[[:<:]]a.c[[:>:]]	&	axcd-dayc-dazce-abc-q	abc +[[:<:]]a.c[[:>:]]	&	axc-dayc-dazce-abc	axc + +# past problems +(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A])	-	A1	A1 +abcdefghijklmnop	i	abcdefghijklmnop	abcdefghijklmnop +abcdefghijklmnopqrstuv	i	abcdefghijklmnopqrstuv	abcdefghijklmnopqrstuv +(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN])	-	CC11	CC11 +CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a	-	CC11	CC11  | 
