summary | refs | log | tree | commit | diff
path: root/gnu/usr.bin/grep/search.c
diff options
context:
space:
mode:
author Tim J. Robbins <tjr@FreeBSD.org> 2004-07-04 09:52:08 +0000
committer Tim J. Robbins <tjr@FreeBSD.org> 2004-07-04 09:52:08 +0000
commit 6fdbbb54872ec7fac83387296f0165f7ad3400a9 (patch)
tree 70ffe417c9436d51a3807610738f2a5993f7f0fc /gnu/usr.bin/grep/search.c
parent 7a39f4da90d53085484e72b7504a3e53dbc9f051 (diff)
Notes
Diffstat (limited to 'gnu/usr.bin/grep/search.c')
-rw-r--r-- gnu/usr.bin/grep/search.c 723
1 files changed, 513 insertions, 210 deletions
diff --git a/gnu/usr.bin/grep/search.c b/gnu/usr.bin/grep/search.c
index 9763dba5f5bd9..7bd233fbcaf14 100644
--- a/gnu/usr.bin/grep/search.c
+++ b/gnu/usr.bin/grep/search.c
@@ -22,54 +22,71 @@
# include <config.h>
#endif
#include <sys/types.h>
+#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
+/* We can handle multibyte strings. */
+# define MBS_SUPPORT
+# include <wchar.h>
+# include <wctype.h>
+#endif
+
#include "system.h"
#include "grep.h"
#include "regex.h"
#include "dfa.h"
#include "kwset.h"
+#include "error.h"
+#include "xalloc.h"
+#ifdef HAVE_LIBPCRE
+# include <pcre.h>
+#endif
#define NCHAR (UCHAR_MAX + 1)
-static void Gcompile PARAMS((char *, size_t));
-static void Ecompile PARAMS((char *, size_t));
-static char *EGexecute PARAMS((char *, size_t, char **));
-static void Fcompile PARAMS((char *, size_t));
-static char *Fexecute PARAMS((char *, size_t, char **));
-static void kwsinit PARAMS((void));
-
-/* Here is the matchers vector for the main program. */
-struct matcher matchers[] = {
- { "default", Gcompile, EGexecute },
- { "grep", Gcompile, EGexecute },
- { "egrep", Ecompile, EGexecute },
- { "awk", Ecompile, EGexecute },
- { "fgrep", Fcompile, Fexecute },
- { 0, 0, 0 },
-};
-
/* For -w, we also consider _ to be word constituent. */
#define WCHAR(C) (ISALNUM(C) || (C) == '_')
/* DFA compiled regexp. */
static struct dfa dfa;
-/* Regex compiled regexp. */
-static struct re_pattern_buffer regexbuf;
+/* The Regex compiled patterns. */
+static struct patterns
+{
+ /* Regex compiled regexp. */
+ struct re_pattern_buffer regexbuf;
+ struct re_registers regs; /* This is here on account of a BRAIN-DEAD
+ Q@#%!# library interface in regex.c. */
+} patterns0;
+
+struct patterns *patterns;
+size_t pcount;
/* KWset compiled pattern. For Ecompile and Gcompile, we compile
a list of strings, at least one of which is known to occur in
any string matching the regexp. */
static kwset_t kwset;
-/* Last compiled fixed string known to exactly match the regexp.
- If kwsexec() returns < lastexact, then we don't need to
+/* Number of compiled fixed strings known to exactly match the regexp.
+ If kwsexec reports a match index < kwset_exact_matches, we don't need to
call the regexp matcher at all. */
-static int lastexact;
+static int kwset_exact_matches;
+
+#if defined(MBS_SUPPORT)
+static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
+#endif
+static void kwsinit PARAMS ((void));
+static void kwsmusts PARAMS ((void));
+static void Gcompile PARAMS ((char const *, size_t));
+static void Ecompile PARAMS ((char const *, size_t));
+static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
+static void Fcompile PARAMS ((char const *, size_t));
+static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
+static void Pcompile PARAMS ((char const *, size_t ));
+static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
void
dfaerror (char const *mesg)
{
- fatal(mesg, 0);
+ error (2, 0, mesg);
}
static void
@@ -80,10 +97,10 @@ kwsinit (void)
if (match_icase)
for (i = 0; i < NCHAR; ++i)
- trans[i] = TOLOWER(i);
+ trans[i] = TOLOWER (i);
- if (!(kwset = kwsalloc(match_icase ? trans : (char *) 0)))
- fatal("memory exhausted", 0);
+ if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
+ error (2, 0, _("memory exhausted"));
}
/* If the DFA turns out to have some set of fixed strings one of
@@ -93,12 +110,12 @@ kwsinit (void)
static void
kwsmusts (void)
{
- struct dfamust *dm;
- char *err;
+ struct dfamust const *dm;
+ char const *err;
if (dfa.musts)
{
- kwsinit();
+ kwsinit ();
/* First, we compile in the substrings known to be exact
matches. The kwset matcher will return the index
of the matching string that it chooses. */
@@ -106,9 +123,9 @@ kwsmusts (void)
{
if (!dm->exact)
continue;
- ++lastexact;
- if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
- fatal(err, 0);
+ ++kwset_exact_matches;
+ if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
+ error (2, 0, err);
}
/* Now, we compile the substrings that will require
the use of the regexp matcher. */
@@ -116,24 +133,90 @@ kwsmusts (void)
{
if (dm->exact)
continue;
- if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
- fatal(err, 0);
+ if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
+ error (2, 0, err);
+ }
+ if ((err = kwsprep (kwset)) != 0)
+ error (2, 0, err);
+ }
+}
+
+#ifdef MBS_SUPPORT
+/* Allocate an array with one entry per byte of "buf" and scan "buf" as a
+ multibyte string. The entry for a single-byte character, or for the first
+ byte of a multibyte character, holds that character's length in bytes;
+ entries for the remaining bytes stay 0. Caller must free the array. */
+static char*
+check_multibyte_string(char const *buf, size_t size)
+{
+ char *mb_properties = malloc(size);
+ mbstate_t cur_state;
+ int i;
+ memset(&cur_state, 0, sizeof(mbstate_t));
+ memset(mb_properties, 0, sizeof(char)*size);
+ for (i = 0; i < size ;)
+ {
+ size_t mbclen;
+ mbclen = mbrlen(buf + i, size - i, &cur_state);
+
+ if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+ {
+ /* An invalid sequence, or a truncated multibyte character.
+ We treat it as a singlebyte character. */
+ mbclen = 1;
}
- if ((err = kwsprep(kwset)) != 0)
- fatal(err, 0);
+ mb_properties[i] = mbclen;
+ i += mbclen;
}
+
+ return mb_properties;
}
+#endif
static void
-Gcompile (char *pattern, size_t size)
+Gcompile (char const *pattern, size_t size)
{
const char *err;
+ char const *sep;
+ size_t total = size;
+ char const *motif = pattern;
+
+ re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
+ dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
+
+ /* For the GNU regex compiler we have to pass the patterns separately to
+ detect errors like "[\nallo\n]\n". The patterns here are "[", "allo" and
+ "]", for which GNU regex should raise a syntax error. The same goes for
+ backrefs, which should be local to each pattern. */
+ do
+ {
+ size_t len;
+ sep = memchr (motif, '\n', total);
+ if (sep)
+ {
+ len = sep - motif;
+ sep++;
+ total -= (len + 1);
+ }
+ else
+ {
+ len = total;
+ total = 0;
+ }
+
+ patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
+ if (patterns == NULL)
+ error (2, errno, _("memory exhausted"));
+
+ patterns[pcount] = patterns0;
- re_set_syntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
- dfasyntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
+ if ((err = re_compile_pattern (motif, len,
+ &(patterns[pcount].regexbuf))) != 0)
+ error (2, 0, err);
+ pcount++;
- if ((err = re_compile_pattern(pattern, size, &regexbuf)) != 0)
- fatal(err, 0);
+ motif = sep;
+ } while (sep && total != 0);
/* In the match_words and match_lines cases, we use a different pattern
for the DFA matcher that will quickly throw out cases that won't work.
@@ -142,49 +225,42 @@ Gcompile (char *pattern, size_t size)
if (match_words || match_lines)
{
/* In the whole-word case, we use the pattern:
- (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
+ \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]\|$\).
In the whole-line case, we use the pattern:
- ^(userpattern)$.
- BUG: Using [A-Za-z_] is locale-dependent!
- So will use [:alnum:] */
-
- char *n = malloc(size + 50);
- int i = 0;
-
- strcpy(n, "");
-
- if (match_lines)
- strcpy(n, "^\\(");
- if (match_words)
- strcpy(n, "\\(^\\|[^[:alnum:]_]\\)\\(");
-
- i = strlen(n);
- memcpy(n + i, pattern, size);
+ ^\(userpattern\)$. */
+
+ static char const line_beg[] = "^\\(";
+ static char const line_end[] = "\\)$";
+ static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
+ static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
+ char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
+ size_t i;
+ strcpy (n, match_lines ? line_beg : word_beg);
+ i = strlen (n);
+ memcpy (n + i, pattern, size);
i += size;
-
- if (match_words)
- strcpy(n + i, "\\)\\([^[:alnum:]_]\\|$\\)");
- if (match_lines)
- strcpy(n + i, "\\)$");
-
- i += strlen(n + i);
- dfacomp(n, i, &dfa, 1);
+ strcpy (n + i, match_lines ? line_end : word_end);
+ i += strlen (n + i);
+ pattern = n;
+ size = i;
}
- else
- dfacomp(pattern, size, &dfa, 1);
- kwsmusts();
+ dfacomp (pattern, size, &dfa, 1);
+ kwsmusts ();
}
static void
-Ecompile (char *pattern, size_t size)
+Ecompile (char const *pattern, size_t size)
{
const char *err;
+ const char *sep;
+ size_t total = size;
+ char const *motif = pattern;
- if (strcmp(matcher, "awk") == 0)
+ if (strcmp (matcher, "awk") == 0)
{
- re_set_syntax(RE_SYNTAX_AWK);
- dfasyntax(RE_SYNTAX_AWK, match_icase, eolbyte);
+ re_set_syntax (RE_SYNTAX_AWK);
+ dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
}
else
{
@@ -192,8 +268,38 @@ Ecompile (char *pattern, size_t size)
dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
}
- if ((err = re_compile_pattern(pattern, size, &regexbuf)) != 0)
- fatal(err, 0);
+ /* For the GNU regex compiler we have to pass the patterns separately to
+ detect errors like "[\nallo\n]\n". The patterns here are "[", "allo" and
+ "]", for which GNU regex should raise a syntax error. The same goes for
+ backrefs, which should be local to each pattern. */
+ do
+ {
+ size_t len;
+ sep = memchr (motif, '\n', total);
+ if (sep)
+ {
+ len = sep - motif;
+ sep++;
+ total -= (len + 1);
+ }
+ else
+ {
+ len = total;
+ total = 0;
+ }
+
+ patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
+ if (patterns == NULL)
+ error (2, errno, _("memory exhausted"));
+ patterns[pcount] = patterns0;
+
+ if ((err = re_compile_pattern (motif, len,
+ &(patterns[pcount].regexbuf))) != 0)
+ error (2, 0, err);
+ pcount++;
+
+ motif = sep;
+ } while (sep && total != 0);
/* In the match_words and match_lines cases, we use a different pattern
for the DFA matcher that will quickly throw out cases that won't work.
@@ -202,186 +308,236 @@ Ecompile (char *pattern, size_t size)
if (match_words || match_lines)
{
/* In the whole-word case, we use the pattern:
- (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
+ (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
In the whole-line case, we use the pattern:
- ^(userpattern)$.
- BUG: Using [A-Za-z_] is locale-dependent!
- so will use the char class */
-
- char *n = malloc(size + 50);
- int i = 0;
-
- strcpy(n, "");
-
- if (match_lines)
- strcpy(n, "^(");
- if (match_words)
- strcpy(n, "(^|[^[:alnum:]_])(");
-
+ ^(userpattern)$. */
+
+ static char const line_beg[] = "^(";
+ static char const line_end[] = ")$";
+ static char const word_beg[] = "(^|[^[:alnum:]_])(";
+ static char const word_end[] = ")([^[:alnum:]_]|$)";
+ char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
+ size_t i;
+ strcpy (n, match_lines ? line_beg : word_beg);
i = strlen(n);
- memcpy(n + i, pattern, size);
+ memcpy (n + i, pattern, size);
i += size;
-
- if (match_words)
- strcpy(n + i, ")([^[:alnum:]_]|$)");
- if (match_lines)
- strcpy(n + i, ")$");
-
- i += strlen(n + i);
- dfacomp(n, i, &dfa, 1);
+ strcpy (n + i, match_lines ? line_end : word_end);
+ i += strlen (n + i);
+ pattern = n;
+ size = i;
}
- else
- dfacomp(pattern, size, &dfa, 1);
- kwsmusts();
+ dfacomp (pattern, size, &dfa, 1);
+ kwsmusts ();
}
-static char *
-EGexecute (char *buf, size_t size, char **endp)
+static size_t
+EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
{
- register char *buflim, *beg, *end, save;
+ register char const *buflim, *beg, *end;
char eol = eolbyte;
int backref, start, len;
struct kwsmatch kwsm;
- static struct re_registers regs; /* This is static on account of a BRAIN-DEAD
- Q@#%!# library interface in regex.c. */
+ size_t i;
+#ifdef MBS_SUPPORT
+ char *mb_properties = NULL;
+#endif /* MBS_SUPPORT */
+
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1 && kwset)
+ mb_properties = check_multibyte_string(buf, size);
+#endif /* MBS_SUPPORT */
buflim = buf + size;
- for (beg = end = buf; end < buflim; beg = end + 1)
+ for (beg = end = buf; end < buflim; beg = end)
{
- if (kwset)
+ if (!exact)
{
- /* Find a possible match using the KWset matcher. */
- beg = kwsexec(kwset, beg, buflim - beg, &kwsm);
- if (!beg)
- goto failure;
- /* Narrow down to the line containing the candidate, and
- run it through DFA. */
- end = memchr(beg, eol, buflim - beg);
- if (!end)
- end = buflim;
- while (beg > buf && beg[-1] != eol)
- --beg;
- save = *end;
- if (kwsm.index < lastexact)
- goto success;
- if (!dfaexec(&dfa, beg, end, 0, (int *) 0, &backref))
+ if (kwset)
{
- *end = save;
- continue;
+ /* Find a possible match using the KWset matcher. */
+ size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
+ if (offset == (size_t) -1)
+ {
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1)
+ free(mb_properties);
+#endif
+ return (size_t)-1;
+ }
+ beg += offset;
+ /* Narrow down to the line containing the candidate, and
+ run it through DFA. */
+ end = memchr(beg, eol, buflim - beg);
+ end++;
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
+ continue;
+#endif
+ while (beg > buf && beg[-1] != eol)
+ --beg;
+ if (kwsm.index < kwset_exact_matches)
+ goto success;
+ if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
+ continue;
+ }
+ else
+ {
+ /* No good fixed strings; start with DFA. */
+ size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
+ if (offset == (size_t) -1)
+ break;
+ /* Narrow down to the line we've found. */
+ beg += offset;
+ end = memchr (beg, eol, buflim - beg);
+ end++;
+ while (beg > buf && beg[-1] != eol)
+ --beg;
}
- *end = save;
- /* Successful, no backreferences encountered. */
- if (!backref)
- goto success;
- }
- else
- {
- /* No good fixed strings; start with DFA. */
- save = *buflim;
- beg = dfaexec(&dfa, beg, buflim, 0, (int *) 0, &backref);
- *buflim = save;
- if (!beg)
- goto failure;
- /* Narrow down to the line we've found. */
- end = memchr(beg, eol, buflim - beg);
- if (!end)
- end = buflim;
- while (beg > buf && beg[-1] != eol)
- --beg;
/* Successful, no backreferences encountered! */
if (!backref)
goto success;
}
+ else
+ end = beg + size;
+
/* If we've made it to this point, this means DFA has seen
a probable match, and we need to run it through Regex. */
- regexbuf.not_eol = 0;
- if ((start = re_search(&regexbuf, beg, end - beg, 0, end - beg, &regs)) >= 0)
+ for (i = 0; i < pcount; i++)
{
- len = regs.end[0] - start;
- if ((!match_lines && !match_words)
- || (match_lines && len == end - beg))
- goto success;
- /* If -w, check if the match aligns with word boundaries.
- We do this iteratively because:
- (a) the line may contain more than one occurence of the pattern, and
- (b) Several alternatives in the pattern might be valid at a given
- point, and we may need to consider a shorter one to find a word
- boundary. */
- if (match_words)
- while (start >= 0)
- {
- if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
- && (len == end - beg
- || !WCHAR ((unsigned char) beg[start + len])))
- goto success;
- if (len > 0)
- {
- /* Try a shorter length anchored at the same place. */
- --len;
- regexbuf.not_eol = 1;
- len = re_match(&regexbuf, beg, start + len, start, &regs);
- }
- if (len <= 0)
+ patterns[i].regexbuf.not_eol = 0;
+ if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
+ end - beg - 1, 0,
+ end - beg - 1, &(patterns[i].regs))))
+ {
+ len = patterns[i].regs.end[0] - start;
+ if (exact)
+ {
+ *match_size = len;
+ return start;
+ }
+ if ((!match_lines && !match_words)
+ || (match_lines && len == end - beg - 1))
+ goto success;
+ /* If -w, check if the match aligns with word boundaries.
+ We do this iteratively because:
+ (a) the line may contain more than one occurrence of the
+ pattern, and
+ (b) Several alternatives in the pattern might be valid at a
+ given point, and we may need to consider a shorter one to
+ find a word boundary. */
+ if (match_words)
+ while (start >= 0)
{
- /* Try looking further on. */
- if (start == end - beg)
- break;
- ++start;
- regexbuf.not_eol = 0;
- start = re_search(&regexbuf, beg, end - beg,
- start, end - beg - start, &regs);
- len = regs.end[0] - start;
+ if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
+ && (len == end - beg - 1
+ || !WCHAR ((unsigned char) beg[start + len])))
+ goto success;
+ if (len > 0)
+ {
+ /* Try a shorter length anchored at the same place. */
+ --len;
+ patterns[i].regexbuf.not_eol = 1;
+ len = re_match (&(patterns[i].regexbuf), beg,
+ start + len, start,
+ &(patterns[i].regs));
+ }
+ if (len <= 0)
+ {
+ /* Try looking further on. */
+ if (start == end - beg - 1)
+ break;
+ ++start;
+ patterns[i].regexbuf.not_eol = 0;
+ start = re_search (&(patterns[i].regexbuf), beg,
+ end - beg - 1,
+ start, end - beg - 1 - start,
+ &(patterns[i].regs));
+ len = patterns[i].regs.end[0] - start;
+ }
}
- }
- }
- }
-
- failure:
- return 0;
+ }
+ } /* for Regex patterns. */
+ } /* for (beg = end ..) */
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1 && mb_properties)
+ free (mb_properties);
+#endif /* MBS_SUPPORT */
+ return (size_t) -1;
success:
- *endp = end < buflim ? end + 1 : end;
- return beg;
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1 && mb_properties)
+ free (mb_properties);
+#endif /* MBS_SUPPORT */
+ *match_size = end - beg;
+ return beg - buf;
}
static void
-Fcompile (char *pattern, size_t size)
+Fcompile (char const *pattern, size_t size)
{
- char *beg, *lim, *err;
+ char const *beg, *lim, *err;
- kwsinit();
+ kwsinit ();
beg = pattern;
do
{
for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
;
- if ((err = kwsincr(kwset, beg, lim - beg)) != 0)
- fatal(err, 0);
+ if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
+ error (2, 0, err);
if (lim < pattern + size)
++lim;
beg = lim;
}
while (beg < pattern + size);
- if ((err = kwsprep(kwset)) != 0)
- fatal(err, 0);
+ if ((err = kwsprep (kwset)) != 0)
+ error (2, 0, err);
}
-static char *
-Fexecute (char *buf, size_t size, char **endp)
+static size_t
+Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
{
- register char *beg, *try, *end;
+ register char const *beg, *try, *end;
register size_t len;
char eol = eolbyte;
struct kwsmatch kwsmatch;
+#ifdef MBS_SUPPORT
+ char *mb_properties;
+ if (MB_CUR_MAX > 1)
+ mb_properties = check_multibyte_string (buf, size);
+#endif /* MBS_SUPPORT */
for (beg = buf; beg <= buf + size; ++beg)
{
- if (!(beg = kwsexec(kwset, beg, buf + size - beg, &kwsmatch)))
- return 0;
+ size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
+ if (offset == (size_t) -1)
+ {
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1)
+ free(mb_properties);
+#endif /* MBS_SUPPORT */
+ return offset;
+ }
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
+ continue; /* It is a part of multibyte character. */
+#endif /* MBS_SUPPORT */
+ beg += offset;
len = kwsmatch.size[0];
+ if (exact)
+ {
+ *match_size = len;
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1)
+ free (mb_properties);
+#endif /* MBS_SUPPORT */
+ return beg - buf;
+ }
if (match_lines)
{
if (beg > buf && beg[-1] != eol)
@@ -391,13 +547,22 @@ Fexecute (char *buf, size_t size, char **endp)
goto success;
}
else if (match_words)
- for (try = beg; len && try;)
+ for (try = beg; len; )
{
if (try > buf && WCHAR((unsigned char) try[-1]))
break;
if (try + len < buf + size && WCHAR((unsigned char) try[len]))
{
- try = kwsexec(kwset, beg, --len, &kwsmatch);
+ offset = kwsexec (kwset, beg, --len, &kwsmatch);
+ if (offset == (size_t) -1)
+ {
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1)
+ free (mb_properties);
+#endif /* MBS_SUPPORT */
+ return offset;
+ }
+ try = beg + offset;
len = kwsmatch.size[0];
}
else
@@ -407,15 +572,153 @@ Fexecute (char *buf, size_t size, char **endp)
goto success;
}
- return 0;
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1)
+ free (mb_properties);
+#endif /* MBS_SUPPORT */
+ return -1;
success:
- if ((end = memchr(beg + len, eol, (buf + size) - (beg + len))) != 0)
- ++end;
- else
- end = buf + size;
- *endp = end;
- while (beg > buf && beg[-1] != '\n')
+ end = memchr (beg + len, eol, (buf + size) - (beg + len));
+ end++;
+ while (buf < beg && beg[-1] != eol)
--beg;
- return beg;
+ *match_size = end - beg;
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1)
+ free (mb_properties);
+#endif /* MBS_SUPPORT */
+ return beg - buf;
+}
+
+#if HAVE_LIBPCRE
+/* Compiled internal form of a Perl regular expression. */
+static pcre *cre;
+
+/* Additional information about the pattern. */
+static pcre_extra *extra;
+#endif
+
+static void
+Pcompile (char const *pattern, size_t size)
+{
+#if !HAVE_LIBPCRE
+ error (2, 0, _("The -P option is not supported"));
+#else
+ int e;
+ char const *ep;
+ char *re = xmalloc (4 * size + 7);
+ int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
+ char const *patlim = pattern + size;
+ char *n = re;
+ char const *p;
+ char const *pnul;
+
+ /* FIXME: Remove this restriction. */
+ if (eolbyte != '\n')
+ error (2, 0, _("The -P and -z options cannot be combined"));
+
+ *n = '\0';
+ if (match_lines)
+ strcpy (n, "^(");
+ if (match_words)
+ strcpy (n, "\\b(");
+ n += strlen (n);
+
+ /* The PCRE interface doesn't allow NUL bytes in the pattern, so
+ replace each NUL byte in the pattern with the four characters
+ "\000", removing a preceding backslash if there are an odd
+ number of backslashes before the NUL.
+
+ FIXME: This method does not work with some multibyte character
+ encodings, notably Shift-JIS, where a multibyte character can end
+ in a backslash byte. */
+ for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
+ {
+ memcpy (n, p, pnul - p);
+ n += pnul - p;
+ for (p = pnul; pattern < p && p[-1] == '\\'; p--)
+ continue;
+ n -= (pnul - p) & 1;
+ strcpy (n, "\\000");
+ n += 4;
+ }
+
+ memcpy (n, p, patlim - p);
+ n += patlim - p;
+ *n = '\0';
+ if (match_words)
+ strcpy (n, ")\\b");
+ if (match_lines)
+ strcpy (n, ")$");
+
+ cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
+ if (!cre)
+ error (2, 0, ep);
+
+ extra = pcre_study (cre, 0, &ep);
+ if (ep)
+ error (2, 0, ep);
+
+ free (re);
+#endif
+}
+
+static size_t
+Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
+{
+#if !HAVE_LIBPCRE
+ abort ();
+ return -1;
+#else
+ /* This array must have at least two elements; everything after that
+ is just for performance improvement in pcre_exec. */
+ int sub[300];
+
+ int e = pcre_exec (cre, extra, buf, size, 0, 0,
+ sub, sizeof sub / sizeof *sub);
+
+ if (e <= 0)
+ {
+ switch (e)
+ {
+ case PCRE_ERROR_NOMATCH:
+ return -1;
+
+ case PCRE_ERROR_NOMEMORY:
+ error (2, 0, _("Memory exhausted"));
+
+ default:
+ abort ();
+ }
+ }
+ else
+ {
+ /* Narrow down to the line we've found. */
+ char const *beg = buf + sub[0];
+ char const *end = buf + sub[1];
+ char const *buflim = buf + size;
+ char eol = eolbyte;
+ if (!exact)
+ {
+ end = memchr (end, eol, buflim - end);
+ end++;
+ while (buf < beg && beg[-1] != eol)
+ --beg;
+ }
+
+ *match_size = end - beg;
+ return beg - buf;
+ }
+#endif
}
+
+struct matcher const matchers[] = {
+ { "default", Gcompile, EGexecute },
+ { "grep", Gcompile, EGexecute },
+ { "egrep", Ecompile, EGexecute },
+ { "awk", Ecompile, EGexecute },
+ { "fgrep", Fcompile, Fexecute },
+ { "perl", Pcompile, Pexecute },
+ { "", 0, 0 },
+};