diff options
author | Tim J. Robbins <tjr@FreeBSD.org> | 2004-07-04 09:52:08 +0000 |
---|---|---|
committer | Tim J. Robbins <tjr@FreeBSD.org> | 2004-07-04 09:52:08 +0000 |
commit | 6fdbbb54872ec7fac83387296f0165f7ad3400a9 (patch) | |
tree | 70ffe417c9436d51a3807610738f2a5993f7f0fc /gnu/usr.bin/grep/search.c | |
parent | 7a39f4da90d53085484e72b7504a3e53dbc9f051 (diff) |
Notes
Diffstat (limited to 'gnu/usr.bin/grep/search.c')
-rw-r--r-- | gnu/usr.bin/grep/search.c | 723 |
1 files changed, 513 insertions, 210 deletions
diff --git a/gnu/usr.bin/grep/search.c b/gnu/usr.bin/grep/search.c index 9763dba5f5bd9..7bd233fbcaf14 100644 --- a/gnu/usr.bin/grep/search.c +++ b/gnu/usr.bin/grep/search.c @@ -22,54 +22,71 @@ # include <config.h> #endif #include <sys/types.h> +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC +/* We can handle multibyte string. */ +# define MBS_SUPPORT +# include <wchar.h> +# include <wctype.h> +#endif + #include "system.h" #include "grep.h" #include "regex.h" #include "dfa.h" #include "kwset.h" +#include "error.h" +#include "xalloc.h" +#ifdef HAVE_LIBPCRE +# include <pcre.h> +#endif #define NCHAR (UCHAR_MAX + 1) -static void Gcompile PARAMS((char *, size_t)); -static void Ecompile PARAMS((char *, size_t)); -static char *EGexecute PARAMS((char *, size_t, char **)); -static void Fcompile PARAMS((char *, size_t)); -static char *Fexecute PARAMS((char *, size_t, char **)); -static void kwsinit PARAMS((void)); - -/* Here is the matchers vector for the main program. */ -struct matcher matchers[] = { - { "default", Gcompile, EGexecute }, - { "grep", Gcompile, EGexecute }, - { "egrep", Ecompile, EGexecute }, - { "awk", Ecompile, EGexecute }, - { "fgrep", Fcompile, Fexecute }, - { 0, 0, 0 }, -}; - /* For -w, we also consider _ to be word constituent. */ #define WCHAR(C) (ISALNUM(C) || (C) == '_') /* DFA compiled regexp. */ static struct dfa dfa; -/* Regex compiled regexp. */ -static struct re_pattern_buffer regexbuf; +/* The Regex compiled patterns. */ +static struct patterns +{ + /* Regex compiled regexp. */ + struct re_pattern_buffer regexbuf; + struct re_registers regs; /* This is here on account of a BRAIN-DEAD + Q@#%!# library interface in regex.c. */ +} patterns0; + +struct patterns *patterns; +size_t pcount; /* KWset compiled pattern. For Ecompile and Gcompile, we compile a list of strings, at least one of which is known to occur in any string matching the regexp. */ static kwset_t kwset; -/* Last compiled fixed string known to exactly match the regexp. - If kwsexec() returns < lastexact, then we don't need to +/* Number of compiled fixed strings known to exactly match the regexp. + If kwsexec returns < kwset_exact_matches, then we don't need to call the regexp matcher at all. */ -static int lastexact; +static int kwset_exact_matches; + +#if defined(MBS_SUPPORT) +static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); +#endif +static void kwsinit PARAMS ((void)); +static void kwsmusts PARAMS ((void)); +static void Gcompile PARAMS ((char const *, size_t)); +static void Ecompile PARAMS ((char const *, size_t)); +static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int )); +static void Fcompile PARAMS ((char const *, size_t)); +static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int)); +static void Pcompile PARAMS ((char const *, size_t )); +static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); void dfaerror (char const *mesg) { - fatal(mesg, 0); + error (2, 0, mesg); } static void @@ -80,10 +97,10 @@ kwsinit (void) if (match_icase) for (i = 0; i < NCHAR; ++i) - trans[i] = TOLOWER(i); + trans[i] = TOLOWER (i); - if (!(kwset = kwsalloc(match_icase ? trans : (char *) 0))) - fatal("memory exhausted", 0); + if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0))) + error (2, 0, _("memory exhausted")); } /* If the DFA turns out to have some set of fixed strings one of @@ -93,12 +110,12 @@ kwsinit (void) static void kwsmusts (void) { - struct dfamust *dm; - char *err; + struct dfamust const *dm; + char const *err; if (dfa.musts) { - kwsinit(); + kwsinit (); /* First, we compile in the substrings known to be exact matches. The kwset matcher will return the index of the matching string that it chooses. */ @@ -106,9 +123,9 @@ kwsmusts (void) { if (!dm->exact) continue; - ++lastexact; - if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0) - fatal(err, 0); + ++kwset_exact_matches; + if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) + error (2, 0, err); } /* Now, we compile the substrings that will require the use of the regexp matcher. */ @@ -116,24 +133,90 @@ kwsmusts (void) { if (dm->exact) continue; - if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0) - fatal(err, 0); + if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) + error (2, 0, err); + } + if ((err = kwsprep (kwset)) != 0) + error (2, 0, err); + } +} + +#ifdef MBS_SUPPORT +/* This function allocate the array which correspond to "buf". + Then this check multibyte string and mark on the positions which + are not singlebyte character nor the first byte of a multibyte + character. Caller must free the array. */ +static char* +check_multibyte_string(char const *buf, size_t size) +{ + char *mb_properties = malloc(size); + mbstate_t cur_state; + int i; + memset(&cur_state, 0, sizeof(mbstate_t)); + memset(mb_properties, 0, sizeof(char)*size); + for (i = 0; i < size ;) + { + size_t mbclen; + mbclen = mbrlen(buf + i, size - i, &cur_state); + + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) + { + /* An invalid sequence, or a truncated multibyte character. + We treat it as a singlebyte character. */ + mbclen = 1; } - if ((err = kwsprep(kwset)) != 0) - fatal(err, 0); + mb_properties[i] = mbclen; + i += mbclen; } + + return mb_properties; } +#endif static void -Gcompile (char *pattern, size_t size) +Gcompile (char const *pattern, size_t size) { const char *err; + char const *sep; + size_t total = size; + char const *motif = pattern; + + re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); + dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); + + /* For GNU regex compiler we have to pass the patterns separately to detect + errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" + GNU regex should have raise a syntax error. The same for backref, where + the backref should have been local to each pattern. */ + do + { + size_t len; + sep = memchr (motif, '\n', total); + if (sep) + { + len = sep - motif; + sep++; + total -= (len + 1); + } + else + { + len = total; + total = 0; + } + + patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); + if (patterns == NULL) + error (2, errno, _("memory exhausted")); + + patterns[pcount] = patterns0; - re_set_syntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); - dfasyntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); + if ((err = re_compile_pattern (motif, len, + &(patterns[pcount].regexbuf))) != 0) + error (2, 0, err); + pcount++; - if ((err = re_compile_pattern(pattern, size, ®exbuf)) != 0) - fatal(err, 0); + motif = sep; + } while (sep && total != 0); /* In the match_words and match_lines cases, we use a different pattern for the DFA matcher that will quickly throw out cases that won't work. @@ -142,49 +225,42 @@ Gcompile (char *pattern, size_t size) if (match_words || match_lines) { /* In the whole-word case, we use the pattern: - (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$). + \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\). In the whole-line case, we use the pattern: - ^(userpattern)$. - BUG: Using [A-Za-z_] is locale-dependent! - So will use [:alnum:] */ - - char *n = malloc(size + 50); - int i = 0; - - strcpy(n, ""); - - if (match_lines) - strcpy(n, "^\\("); - if (match_words) - strcpy(n, "\\(^\\|[^[:alnum:]_]\\)\\("); - - i = strlen(n); - memcpy(n + i, pattern, size); + ^\(userpattern\)$. */ + + static char const line_beg[] = "^\\("; + static char const line_end[] = "\\)$"; + static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; + static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; + char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); + size_t i; + strcpy (n, match_lines ? line_beg : word_beg); + i = strlen (n); + memcpy (n + i, pattern, size); i += size; - - if (match_words) - strcpy(n + i, "\\)\\([^[:alnum:]_]\\|$\\)"); - if (match_lines) - strcpy(n + i, "\\)$"); - - i += strlen(n + i); - dfacomp(n, i, &dfa, 1); + strcpy (n + i, match_lines ? line_end : word_end); + i += strlen (n + i); + pattern = n; + size = i; } - else - dfacomp(pattern, size, &dfa, 1); - kwsmusts(); + dfacomp (pattern, size, &dfa, 1); + kwsmusts (); } static void -Ecompile (char *pattern, size_t size) +Ecompile (char const *pattern, size_t size) { const char *err; + const char *sep; + size_t total = size; + char const *motif = pattern; - if (strcmp(matcher, "awk") == 0) + if (strcmp (matcher, "awk") == 0) { - re_set_syntax(RE_SYNTAX_AWK); - dfasyntax(RE_SYNTAX_AWK, match_icase, eolbyte); + re_set_syntax (RE_SYNTAX_AWK); + dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte); } else { @@ -192,8 +268,38 @@ Ecompile (char *pattern, size_t size) dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte); } - if ((err = re_compile_pattern(pattern, size, ®exbuf)) != 0) - fatal(err, 0); + /* For GNU regex compiler we have to pass the patterns separately to detect + errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" + GNU regex should have raise a syntax error. The same for backref, where + the backref should have been local to each pattern. */ + do + { + size_t len; + sep = memchr (motif, '\n', total); + if (sep) + { + len = sep - motif; + sep++; + total -= (len + 1); + } + else + { + len = total; + total = 0; + } + + patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); + if (patterns == NULL) + error (2, errno, _("memory exhausted")); + patterns[pcount] = patterns0; + + if ((err = re_compile_pattern (motif, len, + &(patterns[pcount].regexbuf))) != 0) + error (2, 0, err); + pcount++; + + motif = sep; + } while (sep && total != 0); /* In the match_words and match_lines cases, we use a different pattern for the DFA matcher that will quickly throw out cases that won't work. @@ -202,186 +308,236 @@ Ecompile (char *pattern, size_t size) if (match_words || match_lines) { /* In the whole-word case, we use the pattern: - (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$). + (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$). In the whole-line case, we use the pattern: - ^(userpattern)$. - BUG: Using [A-Za-z_] is locale-dependent! - so will use the char class */ - - char *n = malloc(size + 50); - int i = 0; - - strcpy(n, ""); - - if (match_lines) - strcpy(n, "^("); - if (match_words) - strcpy(n, "(^|[^[:alnum:]_])("); - + ^(userpattern)$. */ + + static char const line_beg[] = "^("; + static char const line_end[] = ")$"; + static char const word_beg[] = "(^|[^[:alnum:]_])("; + static char const word_end[] = ")([^[:alnum:]_]|$)"; + char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); + size_t i; + strcpy (n, match_lines ? line_beg : word_beg); i = strlen(n); - memcpy(n + i, pattern, size); + memcpy (n + i, pattern, size); i += size; - - if (match_words) - strcpy(n + i, ")([^[:alnum:]_]|$)"); - if (match_lines) - strcpy(n + i, ")$"); - - i += strlen(n + i); - dfacomp(n, i, &dfa, 1); + strcpy (n + i, match_lines ? line_end : word_end); + i += strlen (n + i); + pattern = n; + size = i; } - else - dfacomp(pattern, size, &dfa, 1); - kwsmusts(); + dfacomp (pattern, size, &dfa, 1); + kwsmusts (); } -static char * -EGexecute (char *buf, size_t size, char **endp) +static size_t +EGexecute (char const *buf, size_t size, size_t *match_size, int exact) { - register char *buflim, *beg, *end, save; + register char const *buflim, *beg, *end; char eol = eolbyte; int backref, start, len; struct kwsmatch kwsm; - static struct re_registers regs; /* This is static on account of a BRAIN-DEAD - Q@#%!# library interface in regex.c. */ + size_t i; +#ifdef MBS_SUPPORT + char *mb_properties = NULL; +#endif /* MBS_SUPPORT */ + +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && kwset) + mb_properties = check_multibyte_string(buf, size); +#endif /* MBS_SUPPORT */ buflim = buf + size; - for (beg = end = buf; end < buflim; beg = end + 1) + for (beg = end = buf; end < buflim; beg = end) { - if (kwset) + if (!exact) { - /* Find a possible match using the KWset matcher. */ - beg = kwsexec(kwset, beg, buflim - beg, &kwsm); - if (!beg) - goto failure; - /* Narrow down to the line containing the candidate, and - run it through DFA. */ - end = memchr(beg, eol, buflim - beg); - if (!end) - end = buflim; - while (beg > buf && beg[-1] != eol) - --beg; - save = *end; - if (kwsm.index < lastexact) - goto success; - if (!dfaexec(&dfa, beg, end, 0, (int *) 0, &backref)) + if (kwset) { - *end = save; - continue; + /* Find a possible match using the KWset matcher. */ + size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); + if (offset == (size_t) -1) + { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + free(mb_properties); +#endif + return (size_t)-1; + } + beg += offset; + /* Narrow down to the line containing the candidate, and + run it through DFA. */ + end = memchr(beg, eol, buflim - beg); + end++; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) + continue; +#endif + while (beg > buf && beg[-1] != eol) + --beg; + if (kwsm.index < kwset_exact_matches) + goto success; + if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) + continue; + } + else + { + /* No good fixed strings; start with DFA. */ + size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); + if (offset == (size_t) -1) + break; + /* Narrow down to the line we've found. */ + beg += offset; + end = memchr (beg, eol, buflim - beg); + end++; + while (beg > buf && beg[-1] != eol) + --beg; } - *end = save; - /* Successful, no backreferences encountered. */ - if (!backref) - goto success; - } - else - { - /* No good fixed strings; start with DFA. */ - save = *buflim; - beg = dfaexec(&dfa, beg, buflim, 0, (int *) 0, &backref); - *buflim = save; - if (!beg) - goto failure; - /* Narrow down to the line we've found. */ - end = memchr(beg, eol, buflim - beg); - if (!end) - end = buflim; - while (beg > buf && beg[-1] != eol) - --beg; /* Successful, no backreferences encountered! */ if (!backref) goto success; } + else + end = beg + size; + /* If we've made it to this point, this means DFA has seen a probable match, and we need to run it through Regex. */ - regexbuf.not_eol = 0; - if ((start = re_search(®exbuf, beg, end - beg, 0, end - beg, ®s)) >= 0) + for (i = 0; i < pcount; i++) { - len = regs.end[0] - start; - if ((!match_lines && !match_words) - || (match_lines && len == end - beg)) - goto success; - /* If -w, check if the match aligns with word boundaries. - We do this iteratively because: - (a) the line may contain more than one occurence of the pattern, and - (b) Several alternatives in the pattern might be valid at a given - point, and we may need to consider a shorter one to find a word - boundary. */ - if (match_words) - while (start >= 0) - { - if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) - && (len == end - beg - || !WCHAR ((unsigned char) beg[start + len]))) - goto success; - if (len > 0) - { - /* Try a shorter length anchored at the same place. */ - --len; - regexbuf.not_eol = 1; - len = re_match(®exbuf, beg, start + len, start, ®s); - } - if (len <= 0) + patterns[i].regexbuf.not_eol = 0; + if (0 <= (start = re_search (&(patterns[i].regexbuf), beg, + end - beg - 1, 0, + end - beg - 1, &(patterns[i].regs)))) + { + len = patterns[i].regs.end[0] - start; + if (exact) + { + *match_size = len; + return start; + } + if ((!match_lines && !match_words) + || (match_lines && len == end - beg - 1)) + goto success; + /* If -w, check if the match aligns with word boundaries. + We do this iteratively because: + (a) the line may contain more than one occurence of the + pattern, and + (b) Several alternatives in the pattern might be valid at a + given point, and we may need to consider a shorter one to + find a word boundary. */ + if (match_words) + while (start >= 0) { - /* Try looking further on. */ - if (start == end - beg) - break; - ++start; - regexbuf.not_eol = 0; - start = re_search(®exbuf, beg, end - beg, - start, end - beg - start, ®s); - len = regs.end[0] - start; + if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) + && (len == end - beg - 1 + || !WCHAR ((unsigned char) beg[start + len]))) + goto success; + if (len > 0) + { + /* Try a shorter length anchored at the same place. */ + --len; + patterns[i].regexbuf.not_eol = 1; + len = re_match (&(patterns[i].regexbuf), beg, + start + len, start, + &(patterns[i].regs)); + } + if (len <= 0) + { + /* Try looking further on. */ + if (start == end - beg - 1) + break; + ++start; + patterns[i].regexbuf.not_eol = 0; + start = re_search (&(patterns[i].regexbuf), beg, + end - beg - 1, + start, end - beg - 1 - start, + &(patterns[i].regs)); + len = patterns[i].regs.end[0] - start; + } } - } - } - } - - failure: - return 0; + } + } /* for Regex patterns. */ + } /* for (beg = end ..) */ +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && mb_properties) + free (mb_properties); +#endif /* MBS_SUPPORT */ + return (size_t) -1; success: - *endp = end < buflim ? end + 1 : end; - return beg; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && mb_properties) + free (mb_properties); +#endif /* MBS_SUPPORT */ + *match_size = end - beg; + return beg - buf; } static void -Fcompile (char *pattern, size_t size) +Fcompile (char const *pattern, size_t size) { - char *beg, *lim, *err; + char const *beg, *lim, *err; - kwsinit(); + kwsinit (); beg = pattern; do { for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim) ; - if ((err = kwsincr(kwset, beg, lim - beg)) != 0) - fatal(err, 0); + if ((err = kwsincr (kwset, beg, lim - beg)) != 0) + error (2, 0, err); if (lim < pattern + size) ++lim; beg = lim; } while (beg < pattern + size); - if ((err = kwsprep(kwset)) != 0) - fatal(err, 0); + if ((err = kwsprep (kwset)) != 0) + error (2, 0, err); } -static char * -Fexecute (char *buf, size_t size, char **endp) +static size_t +Fexecute (char const *buf, size_t size, size_t *match_size, int exact) { - register char *beg, *try, *end; + register char const *beg, *try, *end; register size_t len; char eol = eolbyte; struct kwsmatch kwsmatch; +#ifdef MBS_SUPPORT + char *mb_properties; + if (MB_CUR_MAX > 1) + mb_properties = check_multibyte_string (buf, size); +#endif /* MBS_SUPPORT */ for (beg = buf; beg <= buf + size; ++beg) { - if (!(beg = kwsexec(kwset, beg, buf + size - beg, &kwsmatch))) - return 0; + size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); + if (offset == (size_t) -1) + { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + free(mb_properties); +#endif /* MBS_SUPPORT */ + return offset; + } +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) + continue; /* It is a part of multibyte character. */ +#endif /* MBS_SUPPORT */ + beg += offset; len = kwsmatch.size[0]; + if (exact) + { + *match_size = len; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + free (mb_properties); +#endif /* MBS_SUPPORT */ + return beg - buf; + } if (match_lines) { if (beg > buf && beg[-1] != eol) @@ -391,13 +547,22 @@ Fexecute (char *buf, size_t size, char **endp) goto success; } else if (match_words) - for (try = beg; len && try;) + for (try = beg; len; ) { if (try > buf && WCHAR((unsigned char) try[-1])) break; if (try + len < buf + size && WCHAR((unsigned char) try[len])) { - try = kwsexec(kwset, beg, --len, &kwsmatch); + offset = kwsexec (kwset, beg, --len, &kwsmatch); + if (offset == (size_t) -1) + { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + free (mb_properties); +#endif /* MBS_SUPPORT */ + return offset; + } + try = beg + offset; len = kwsmatch.size[0]; } else @@ -407,15 +572,153 @@ Fexecute (char *buf, size_t size, char **endp) goto success; } - return 0; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + free (mb_properties); +#endif /* MBS_SUPPORT */ + return -1; success: - if ((end = memchr(beg + len, eol, (buf + size) - (beg + len))) != 0) - ++end; - else - end = buf + size; - *endp = end; - while (beg > buf && beg[-1] != '\n') + end = memchr (beg + len, eol, (buf + size) - (beg + len)); + end++; + while (buf < beg && beg[-1] != eol) --beg; - return beg; + *match_size = end - beg; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + free (mb_properties); +#endif /* MBS_SUPPORT */ + return beg - buf; +} + +#if HAVE_LIBPCRE +/* Compiled internal form of a Perl regular expression. */ +static pcre *cre; + +/* Additional information about the pattern. */ +static pcre_extra *extra; +#endif + +static void +Pcompile (char const *pattern, size_t size) +{ +#if !HAVE_LIBPCRE + error (2, 0, _("The -P option is not supported")); +#else + int e; + char const *ep; + char *re = xmalloc (4 * size + 7); + int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0); + char const *patlim = pattern + size; + char *n = re; + char const *p; + char const *pnul; + + /* FIXME: Remove this restriction. */ + if (eolbyte != '\n') + error (2, 0, _("The -P and -z options cannot be combined")); + + *n = '\0'; + if (match_lines) + strcpy (n, "^("); + if (match_words) + strcpy (n, "\\b("); + n += strlen (n); + + /* The PCRE interface doesn't allow NUL bytes in the pattern, so + replace each NUL byte in the pattern with the four characters + "\000", removing a preceding backslash if there are an odd + number of backslashes before the NUL. + + FIXME: This method does not work with some multibyte character + encodings, notably Shift-JIS, where a multibyte character can end + in a backslash byte. */ + for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1) + { + memcpy (n, p, pnul - p); + n += pnul - p; + for (p = pnul; pattern < p && p[-1] == '\\'; p--) + continue; + n -= (pnul - p) & 1; + strcpy (n, "\\000"); + n += 4; + } + + memcpy (n, p, patlim - p); + n += patlim - p; + *n = '\0'; + if (match_words) + strcpy (n, ")\\b"); + if (match_lines) + strcpy (n, ")$"); + + cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ()); + if (!cre) + error (2, 0, ep); + + extra = pcre_study (cre, 0, &ep); + if (ep) + error (2, 0, ep); + + free (re); +#endif +} + +static size_t +Pexecute (char const *buf, size_t size, size_t *match_size, int exact) +{ +#if !HAVE_LIBPCRE + abort (); + return -1; +#else + /* This array must have at least two elements; everything after that + is just for performance improvement in pcre_exec. */ + int sub[300]; + + int e = pcre_exec (cre, extra, buf, size, 0, 0, + sub, sizeof sub / sizeof *sub); + + if (e <= 0) + { + switch (e) + { + case PCRE_ERROR_NOMATCH: + return -1; + + case PCRE_ERROR_NOMEMORY: + error (2, 0, _("Memory exhausted")); + + default: + abort (); + } + } + else + { + /* Narrow down to the line we've found. */ + char const *beg = buf + sub[0]; + char const *end = buf + sub[1]; + char const *buflim = buf + size; + char eol = eolbyte; + if (!exact) + { + end = memchr (end, eol, buflim - end); + end++; + while (buf < beg && beg[-1] != eol) + --beg; + } + + *match_size = end - beg; + return beg - buf; + } +#endif } + +struct matcher const matchers[] = { + { "default", Gcompile, EGexecute }, + { "grep", Gcompile, EGexecute }, + { "egrep", Ecompile, EGexecute }, + { "awk", Ecompile, EGexecute }, + { "fgrep", Fcompile, Fexecute }, + { "perl", Pcompile, Pexecute }, + { "", 0, 0 }, +}; |