diff options
Diffstat (limited to 'lib/libc/regex/engine.c')
| -rw-r--r-- | lib/libc/regex/engine.c | 132 | 
1 file changed, 92 insertions, 40 deletions
diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c index 9d69c1e98991..e6484ef6b8ba 100644 --- a/lib/libc/regex/engine.c +++ b/lib/libc/regex/engine.c @@ -69,6 +69,17 @@ __FBSDID("$FreeBSD$");  #define	at	lat  #define	match	lmat  #endif +#ifdef MNAMES +#define	matcher	mmatcher +#define	fast	mfast +#define	slow	mslow +#define	dissect	mdissect +#define	backref	mbackref +#define	step	mstep +#define	print	mprint +#define	at	mat +#define	match	mmat +#endif  /* another structure passed up and down to avoid zillions of parameters */  struct match { @@ -85,6 +96,7 @@ struct match {  	states fresh;		/* states for a fresh start */  	states tmp;		/* temporary */  	states empty;		/* empty set of states */ +	mbstate_t mbs;		/* multibyte conversion state */  };  /* ========= begin header generated by ./mkh ========= */ @@ -98,16 +110,15 @@ static char *dissect(struct match *m, char *start, char *stop, sopno startst, so  static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev);  static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst);  static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst); -static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft); -#define	BOL	(OUT+1) -#define	EOL	(BOL+1) -#define	BOLEOL	(BOL+2) -#define	NOTHING	(BOL+3) -#define	BOW	(BOL+4) -#define	EOW	(BOL+5) -#define	CODEMAX	(BOL+5)		/* highest code used */ -#define	NONCHAR(c)	((c) > CHAR_MAX) -#define	NNONCHAR	(CODEMAX-CHAR_MAX) +static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); +#define	BOL	(OUT-1) +#define	EOL	(BOL-1) +#define	BOLEOL	(BOL-2) +#define	NOTHING	(BOL-3) +#define	BOW	(BOL-4) +#define	EOW	(BOL-5) +#define	BADCHAR	(BOL-6) +#define	NONCHAR(c)	((c) <= OUT)  #ifdef REDEBUG  static void print(struct match *m, char *caption, states st, int ch, FILE *d);  #endif @@ -234,6 +245,7 @@ int eflags;  
	SETUP(m->tmp);  	SETUP(m->empty);  	CLEAR(m->empty); +	ZAPSTATE(&m->mbs);  	/* Adjust start according to moffset, to speed things up */  	if (g->moffset > -1) @@ -257,7 +269,8 @@ int eflags;  			if (endp != NULL)  				break;  			assert(m->coldp < m->endp); -			m->coldp++; +			m->coldp += XMBRTOWC(NULL, m->coldp, +			    m->endp - m->coldp, &m->mbs, 0);  		}  		if (nmatch == 1 && !g->backrefs)  			break;		/* no further info needed */ @@ -316,7 +329,9 @@ int eflags;  		/* despite initial appearances, there is no match here */  		NOTE("false alarm"); -		start = m->coldp + 1;	/* recycle starting later */ +		/* recycle starting later */ +		start = m->coldp + XMBRTOWC(NULL, m->coldp, +		    m->endp - m->coldp, &m->mbs, 0);  		assert(start <= stop);  	} @@ -394,7 +409,7 @@ sopno stopst;  			assert(nope);  			break;  		case OCHAR: -			sp++; +			sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);  			break;  		case OBOL:  		case OEOL: @@ -403,7 +418,7 @@ sopno stopst;  			break;  		case OANY:  		case OANYOF: -			sp++; +			sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);  			break;  		case OBACK_:  		case O_BACK: @@ -558,6 +573,7 @@ sopno lev;			/* PLUS nesting level */  	sop s;  	regoff_t offsave;  	cset *cs; +	wint_t wc;  	AT("back", start, stop, startst, stopst);  	sp = start; @@ -567,17 +583,25 @@ sopno lev;			/* PLUS nesting level */  	for (ss = startst; !hard && ss < stopst; ss++)  		switch (OP(s = m->g->strip[ss])) {  		case OCHAR: -			if (sp == stop || *sp++ != (char)OPND(s)) +			if (sp == stop) +				return(NULL); +			sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); +			if (wc != OPND(s))  				return(NULL);  			break;  		case OANY:  			if (sp == stop)  				return(NULL); -			sp++; +			sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); +			if (wc == BADCHAR) +				return (NULL);  			break;  		case OANYOF: +			if (sp == stop) +				return (NULL);  			cs = &m->g->sets[OPND(s)]; -			if (sp == stop || !CHIN(cs, *sp++)) +			sp += XMBRTOWC(&wc, sp, stop - sp, 
&m->mbs, BADCHAR); +			if (wc == BADCHAR || !CHIN(cs, wc))  				return(NULL);  			break;  		case OBOL: @@ -754,11 +778,12 @@ sopno stopst;  	states fresh = m->fresh;  	states tmp = m->tmp;  	char *p = start; -	int c = (start == m->beginp) ? OUT : *(start-1); -	int lastc;		/* previous c */ -	int flagch; +	wint_t c; +	wint_t lastc;		/* previous c */ +	wint_t flagch;  	int i;  	char *coldp;		/* last p after which no match was underway */ +	size_t clen;  	CLEAR(st);  	SET1(st, startst); @@ -766,10 +791,23 @@ sopno stopst;  	ASSIGN(fresh, st);  	SP("start", st, *p);  	coldp = NULL; +	if (start == m->beginp) +		c = OUT; +	else { +		/* +		 * XXX Wrong if the previous character was multi-byte. +		 * Newline never is (in encodings supported by FreeBSD), +		 * so this only breaks the ISWORD tests below. +		 */ +		c = (uch)*(start - 1); +	}  	for (;;) {  		/* next character */  		lastc = c; -		c = (p == m->endp) ? OUT : *p; +		if (p == m->endp) +			c = OUT; +		else +			clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);  		if (EQ(st, fresh))  			coldp = p; @@ -817,13 +855,13 @@ sopno stopst;  		st = step(m->g, startst, stopst, tmp, c, st);  		SP("aft", st, c);  		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); -		p++; +		p += clen;  	}  	assert(coldp != NULL);  	m->coldp = coldp;  	if (ISSET(st, stopst)) -		return(p+1); +		return(p+XMBRTOWC(NULL, p, m->endp - p, &m->mbs, 0));  	else  		return(NULL);  } @@ -845,11 +883,12 @@ sopno stopst;  	states empty = m->empty;  	states tmp = m->tmp;  	char *p = start; -	int c = (start == m->beginp) ? 
OUT : *(start-1); -	int lastc;		/* previous c */ -	int flagch; +	wint_t c; +	wint_t lastc;		/* previous c */ +	wint_t flagch;  	int i;  	char *matchp;		/* last p at which a match ended */ +	size_t clen;  	AT("slow", start, stop, startst, stopst);  	CLEAR(st); @@ -857,10 +896,24 @@ sopno stopst;  	SP("sstart", st, *p);  	st = step(m->g, startst, stopst, st, NOTHING, st);  	matchp = NULL; +	if (start == m->beginp) +		c = OUT; +	else { +		/* +		 * XXX Wrong if the previous character was multi-byte. +		 * Newline never is (in encodings supported by FreeBSD), +		 * so this only breaks the ISWORD tests below. +		 */ +		c = (uch)*(start - 1); +	}  	for (;;) {  		/* next character */  		lastc = c; -		c = (p == m->endp) ? OUT : *p; +		if (p == m->endp) { +			c = OUT; +			clen = 0; +		} else +			clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);  		/* is there an EOL and/or BOL between lastc and c? */  		flagch = '\0'; @@ -908,7 +961,7 @@ sopno stopst;  		st = step(m->g, startst, stopst, tmp, c, st);  		SP("saft", st, c);  		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); -		p++; +		p += clen;  	}  	return(matchp); @@ -919,15 +972,14 @@ sopno stopst;   - step - map set of states reachable before char to set reachable after   == static states step(struct re_guts *g, sopno start, sopno stop, \   ==	states bef, int ch, states aft); - == #define	BOL	(OUT+1) - == #define	EOL	(BOL+1) - == #define	BOLEOL	(BOL+2) - == #define	NOTHING	(BOL+3) - == #define	BOW	(BOL+4) - == #define	EOW	(BOL+5) - == #define	CODEMAX	(BOL+5)		// highest code used - == #define	NONCHAR(c)	((c) > CHAR_MAX) - == #define	NNONCHAR	(CODEMAX-CHAR_MAX) + == #define	BOL	(OUT-1) + == #define	EOL	(BOL-1) + == #define	BOLEOL	(BOL-2) + == #define	NOTHING	(BOL-3) + == #define	BOW	(BOL-4) + == #define	EOW	(BOL-5) + == #define	BADCHAR	(BOL-6) + == #define	NONCHAR(c)	((c) <= OUT)   */  static states  step(g, start, stop, bef, ch, aft) @@ -935,7 +987,7 @@ struct re_guts *g;  sopno start;			/* start state 
within strip */  sopno stop;			/* state after stop state within strip */  states bef;			/* states reachable before */ -int ch;				/* character or NONCHAR code */ +wint_t ch;			/* character or NONCHAR code */  states aft;			/* states already known reachable after */  {  	cset *cs; @@ -953,8 +1005,8 @@ states aft;			/* states already known reachable after */  			break;  		case OCHAR:  			/* only characters can match */ -			assert(!NONCHAR(ch) || ch != (char)OPND(s)); -			if (ch == (char)OPND(s)) +			assert(!NONCHAR(ch) || ch != OPND(s)); +			if (ch == OPND(s))  				FWD(aft, bef, 1);  			break;  		case OBOL:  | 
