vendor/misc-GNU/grep

author: Tim J. Robbins <tjr@FreeBSD.org> 2004-07-04 09:52:08 +0000
committer: Tim J. Robbins <tjr@FreeBSD.org> 2004-07-04 09:52:08 +0000
commit: 6fdbbb54872ec7fac83387296f0165f7ad3400a9 (patch)
tree: 70ffe417c9436d51a3807610738f2a5993f7f0fc /gnu/usr.bin/grep/search.c
parent: 7a39f4da90d53085484e72b7504a3e53dbc9f051 (diff)
1 files changed, 513 insertions, 210 deletions
diff --git a/gnu/usr.bin/grep/search.c b/gnu/usr.bin/grep/search.c
index 9763dba5f5bd9..7bd233fbcaf14 100644
--- a/gnu/usr.bin/grep/search.c
+++ b/gnu/usr.bin/grep/search.c
@@ -22,54 +22,71 @@
 # include <config.h>
 #endif
 #include <sys/types.h>
+#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
+/* We can handle multibyte strings.  */
+# define MBS_SUPPORT
+# include <wchar.h>
+# include <wctype.h>
+#endif
+
 #include "system.h"
 #include "grep.h"
 #include "regex.h"
 #include "dfa.h"
 #include "kwset.h"
+#include "error.h"
+#include "xalloc.h"
+#ifdef HAVE_LIBPCRE
+# include <pcre.h>
+#endif
 
 #define NCHAR (UCHAR_MAX + 1)
 
-static void Gcompile PARAMS((char *, size_t));
-static void Ecompile PARAMS((char *, size_t));
-static char *EGexecute PARAMS((char *, size_t, char **));
-static void Fcompile PARAMS((char *, size_t));
-static char *Fexecute PARAMS((char *, size_t, char **));
-static void kwsinit PARAMS((void));
-
-/* Here is the matchers vector for the main program. */
-struct matcher matchers[] = {
-  { "default", Gcompile, EGexecute },
-  { "grep", Gcompile, EGexecute },
-  { "egrep", Ecompile, EGexecute },
-  { "awk", Ecompile, EGexecute },
-  { "fgrep", Fcompile, Fexecute },
-  { 0, 0, 0 },
-};
-
 /* For -w, we also consider _ to be word constituent.  */
 #define WCHAR(C) (ISALNUM(C) || (C) == '_')
 
 /* DFA compiled regexp. */
 static struct dfa dfa;
 
-/* Regex compiled regexp. */
-static struct re_pattern_buffer regexbuf;
+/* The Regex compiled patterns.  */
+static struct patterns
+{
+  /* Regex compiled regexp. */
+  struct re_pattern_buffer regexbuf;
+  struct re_registers regs; /* This is here on account of a BRAIN-DEAD
+			       Q@#%!# library interface in regex.c.  */
+} patterns0;
+
+struct patterns *patterns;
+size_t pcount;
 
 /* KWset compiled pattern.  For Ecompile and Gcompile, we compile
    a list of strings, at least one of which is known to occur in
    any string matching the regexp. */
 static kwset_t kwset;
 
-/* Last compiled fixed string known to exactly match the regexp.
-   If kwsexec() returns < lastexact, then we don't need to
+/* Number of compiled fixed strings known to exactly match the regexp.
+   If kwsexec returns < kwset_exact_matches, then we don't need to
    call the regexp matcher at all. */
-static int lastexact;
+static int kwset_exact_matches;
+
+#if defined(MBS_SUPPORT)
+static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
+#endif
+static void kwsinit PARAMS ((void));
+static void kwsmusts PARAMS ((void));
+static void Gcompile PARAMS ((char const *, size_t));
+static void Ecompile PARAMS ((char const *, size_t));
+static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
+static void Fcompile PARAMS ((char const *, size_t));
+static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
+static void Pcompile PARAMS ((char const *, size_t ));
+static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
 
 void
 dfaerror (char const *mesg)
 {
-  fatal(mesg, 0);
+  error (2, 0, mesg);
 }
 
 static void
@@ -80,10 +97,10 @@ kwsinit (void)
 
   if (match_icase)
     for (i = 0; i < NCHAR; ++i)
-      trans[i] = TOLOWER(i);
+      trans[i] = TOLOWER (i);
 
-  if (!(kwset = kwsalloc(match_icase ? trans : (char *) 0)))
-    fatal("memory exhausted", 0);
+  if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
+    error (2, 0, _("memory exhausted"));
 }
 
 /* If the DFA turns out to have some set of fixed strings one of
@@ -93,12 +110,12 @@ kwsinit (void)
 static void
 kwsmusts (void)
 {
-  struct dfamust *dm;
-  char *err;
+  struct dfamust const *dm;
+  char const *err;
 
   if (dfa.musts)
     {
-      kwsinit();
+      kwsinit ();
       /* First, we compile in the substrings known to be exact
 	 matches.  The kwset matcher will return the index
 	 of the matching string that it chooses. */
@@ -106,9 +123,9 @@ kwsmusts (void)
 	{
 	  if (!dm->exact)
 	    continue;
-	  ++lastexact;
-	  if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
-	    fatal(err, 0);
+	  ++kwset_exact_matches;
+	  if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
+	    error (2, 0, err);
 	}
       /* Now, we compile the substrings that will require
 	 the use of the regexp matcher.  */
@@ -116,24 +133,90 @@ kwsmusts (void)
 	{
 	  if (dm->exact)
 	    continue;
-	  if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
-	    fatal(err, 0);
+	  if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
+	    error (2, 0, err);
+	}
+      if ((err = kwsprep (kwset)) != 0)
+	error (2, 0, err);
+    }
+}
+
+#ifdef MBS_SUPPORT
+/* Allocate an array with one entry per byte of "buf" and scan the
+   multibyte string: each entry that begins a character holds that
+   character's length in bytes, and every other entry is left zero.
+   The caller must free the array.  */
+static char*
+check_multibyte_string(char const *buf, size_t size)
+{
+  char *mb_properties = malloc(size);
+  mbstate_t cur_state;
+  int i;
+  memset(&cur_state, 0, sizeof(mbstate_t));
+  memset(mb_properties, 0, sizeof(char)*size);
+  for (i = 0; i < size ;)
+    {
+      size_t mbclen;
+      mbclen = mbrlen(buf + i, size - i, &cur_state);
+
+      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+	{
+	  /* An invalid sequence, or a truncated multibyte character.
+	     We treat it as a singlebyte character.  */
+	  mbclen = 1;
 	}
-      if ((err = kwsprep(kwset)) != 0)
-	fatal(err, 0);
+      mb_properties[i] = mbclen;
+      i += mbclen;
     }
+
+  return mb_properties;
 }
+#endif
 
 static void
-Gcompile (char *pattern, size_t size)
+Gcompile (char const *pattern, size_t size)
 {
   const char *err;
+  char const *sep;
+  size_t total = size;
+  char const *motif = pattern;
+
+  re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
+  dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
+
+  /* For the GNU regex compiler we have to pass the patterns separately to
+     detect errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and
+     "]", for which GNU regex should raise a syntax error.  The same goes for
+     backrefs, which should be local to each pattern.  */
+  do
+    {
+      size_t len;
+      sep = memchr (motif, '\n', total);
+      if (sep)
+	{
+	  len = sep - motif;
+	  sep++;
+	  total -= (len + 1);
+	}
+      else
+	{
+	  len = total;
+	  total = 0;
+	}
+
+      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
+      if (patterns == NULL)
+	error (2, errno, _("memory exhausted"));
+
+      patterns[pcount] = patterns0;
 
-  re_set_syntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
-  dfasyntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
+      if ((err = re_compile_pattern (motif, len,
+				    &(patterns[pcount].regexbuf))) != 0)
+	error (2, 0, err);
+      pcount++;
 
-  if ((err = re_compile_pattern(pattern, size, &regexbuf)) != 0)
-    fatal(err, 0);
+      motif = sep;
+    } while (sep && total != 0);
 
   /* In the match_words and match_lines cases, we use a different pattern
      for the DFA matcher that will quickly throw out cases that won't work.
@@ -142,49 +225,42 @@ Gcompile (char *pattern, size_t size)
   if (match_words || match_lines)
     {
       /* In the whole-word case, we use the pattern:
-	 (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
+	 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
 	 In the whole-line case, we use the pattern:
-	 ^(userpattern)$.
-	 BUG: Using [A-Za-z_] is locale-dependent!
-	 So will use [:alnum:] */
-
-      char *n = malloc(size + 50);
-      int i = 0;
-
-      strcpy(n, "");
-
-      if (match_lines)
-	strcpy(n, "^\\(");
-      if (match_words)
-	strcpy(n, "\\(^\\|[^[:alnum:]_]\\)\\(");
-
-      i = strlen(n);
-      memcpy(n + i, pattern, size);
+	 ^\(userpattern\)$.  */
+
+      static char const line_beg[] = "^\\(";
+      static char const line_end[] = "\\)$";
+      static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
+      static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
+      char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
+      size_t i;
+      strcpy (n, match_lines ? line_beg : word_beg);
+      i = strlen (n);
+      memcpy (n + i, pattern, size);
       i += size;
-
-      if (match_words)
-	strcpy(n + i, "\\)\\([^[:alnum:]_]\\|$\\)");
-      if (match_lines)
-	strcpy(n + i, "\\)$");
-
-      i += strlen(n + i);
-      dfacomp(n, i, &dfa, 1);
+      strcpy (n + i, match_lines ? line_end : word_end);
+      i += strlen (n + i);
+      pattern = n;
+      size = i;
     }
-  else
-    dfacomp(pattern, size, &dfa, 1);
 
-  kwsmusts();
+  dfacomp (pattern, size, &dfa, 1);
+  kwsmusts ();
 }
 
 static void
-Ecompile (char *pattern, size_t size)
+Ecompile (char const *pattern, size_t size)
 {
   const char *err;
+  const char *sep;
+  size_t total = size;
+  char const *motif = pattern;
 
-  if (strcmp(matcher, "awk") == 0)
+  if (strcmp (matcher, "awk") == 0)
     {
-      re_set_syntax(RE_SYNTAX_AWK);
-      dfasyntax(RE_SYNTAX_AWK, match_icase, eolbyte);
+      re_set_syntax (RE_SYNTAX_AWK);
+      dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
     }
   else
     {
@@ -192,8 +268,38 @@ Ecompile (char *pattern, size_t size)
       dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
     }
 
-  if ((err = re_compile_pattern(pattern, size, &regexbuf)) != 0)
-    fatal(err, 0);
+  /* For the GNU regex compiler we have to pass the patterns separately to
+     detect errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and
+     "]", for which GNU regex should raise a syntax error.  The same goes for
+     backrefs, which should be local to each pattern.  */
+  do
+    {
+      size_t len;
+      sep = memchr (motif, '\n', total);
+      if (sep)
+	{
+	  len = sep - motif;
+	  sep++;
+	  total -= (len + 1);
+	}
+      else
+	{
+	  len = total;
+	  total = 0;
+	}
+
+      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
+      if (patterns == NULL)
+	error (2, errno, _("memory exhausted"));
+      patterns[pcount] = patterns0;
+
+      if ((err = re_compile_pattern (motif, len,
+				    &(patterns[pcount].regexbuf))) != 0)
+	error (2, 0, err);
+      pcount++;
+
+      motif = sep;
+    } while (sep && total != 0);
 
   /* In the match_words and match_lines cases, we use a different pattern
      for the DFA matcher that will quickly throw out cases that won't work.
@@ -202,186 +308,236 @@ Ecompile (char *pattern, size_t size)
   if (match_words || match_lines)
     {
       /* In the whole-word case, we use the pattern:
-	 (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
+	 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
 	 In the whole-line case, we use the pattern:
-	 ^(userpattern)$.
-	 BUG: Using [A-Za-z_] is locale-dependent!
-	 so will use the char class */
-
-      char *n = malloc(size + 50);
-      int i = 0;
-
-      strcpy(n, "");
-
-      if (match_lines)
-	strcpy(n, "^(");
-      if (match_words)
-	strcpy(n, "(^|[^[:alnum:]_])(");
-
+	 ^(userpattern)$.  */
+
+      static char const line_beg[] = "^(";
+      static char const line_end[] = ")$";
+      static char const word_beg[] = "(^|[^[:alnum:]_])(";
+      static char const word_end[] = ")([^[:alnum:]_]|$)";
+      char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
+      size_t i;
+      strcpy (n, match_lines ? line_beg : word_beg);
       i = strlen(n);
-      memcpy(n + i, pattern, size);
+      memcpy (n + i, pattern, size);
       i += size;
-
-      if (match_words)
-	strcpy(n + i, ")([^[:alnum:]_]|$)");
-      if (match_lines)
-	strcpy(n + i, ")$");
-
-      i += strlen(n + i);
-      dfacomp(n, i, &dfa, 1);
+      strcpy (n + i, match_lines ? line_end : word_end);
+      i += strlen (n + i);
+      pattern = n;
+      size = i;
     }
-  else
-    dfacomp(pattern, size, &dfa, 1);
 
-  kwsmusts();
+  dfacomp (pattern, size, &dfa, 1);
+  kwsmusts ();
 }
 
-static char *
-EGexecute (char *buf, size_t size, char **endp)
+static size_t
+EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
 {
-  register char *buflim, *beg, *end, save;
+  register char const *buflim, *beg, *end;
   char eol = eolbyte;
   int backref, start, len;
   struct kwsmatch kwsm;
-  static struct re_registers regs; /* This is static on account of a BRAIN-DEAD
-				    Q@#%!# library interface in regex.c.  */
+  size_t i;
+#ifdef MBS_SUPPORT
+  char *mb_properties = NULL;
+#endif /* MBS_SUPPORT */
+
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1 && kwset)
+    mb_properties = check_multibyte_string(buf, size);
+#endif /* MBS_SUPPORT */
 
   buflim = buf + size;
 
-  for (beg = end = buf; end < buflim; beg = end + 1)
+  for (beg = end = buf; end < buflim; beg = end)
     {
-      if (kwset)
+      if (!exact)
 	{
-	  /* Find a possible match using the KWset matcher. */
-	  beg = kwsexec(kwset, beg, buflim - beg, &kwsm);
-	  if (!beg)
-	    goto failure;
-	  /* Narrow down to the line containing the candidate, and
-	     run it through DFA. */
-	  end = memchr(beg, eol, buflim - beg);
-	  if (!end)
-	    end = buflim;
-	  while (beg > buf && beg[-1] != eol)
-	    --beg;
-	  save = *end;
-	  if (kwsm.index < lastexact)
-	    goto success;
-	  if (!dfaexec(&dfa, beg, end, 0, (int *) 0, &backref))
+	  if (kwset)
 	    {
-	      *end = save;
-	      continue;
+	      /* Find a possible match using the KWset matcher. */
+	      size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
+	      if (offset == (size_t) -1)
+		{
+#ifdef MBS_SUPPORT
+		  if (MB_CUR_MAX > 1)
+		    free(mb_properties);
+#endif
+		  return (size_t)-1;
+		}
+	      beg += offset;
+	      /* Narrow down to the line containing the candidate, and
+		 run it through DFA. */
+	      end = memchr(beg, eol, buflim - beg);
+	      end++;
+#ifdef MBS_SUPPORT
+	      if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
+		continue;
+#endif
+	      while (beg > buf && beg[-1] != eol)
+		--beg;
+	      if (kwsm.index < kwset_exact_matches)
+		goto success;
+	      if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
+		continue;
+	    }
+	  else
+	    {
+	      /* No good fixed strings; start with DFA. */
+	      size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
+	      if (offset == (size_t) -1)
+		break;
+	      /* Narrow down to the line we've found. */
+	      beg += offset;
+	      end = memchr (beg, eol, buflim - beg);
+	      end++;
+	      while (beg > buf && beg[-1] != eol)
+		--beg;
 	    }
-	  *end = save;
-	  /* Successful, no backreferences encountered. */
-	  if (!backref)
-	    goto success;
-	}
-      else
-	{
-	  /* No good fixed strings; start with DFA. */
-	  save = *buflim;
-	  beg = dfaexec(&dfa, beg, buflim, 0, (int *) 0, &backref);
-	  *buflim = save;
-	  if (!beg)
-	    goto failure;
-	  /* Narrow down to the line we've found. */
-	  end = memchr(beg, eol, buflim - beg);
-	  if (!end)
-	    end = buflim;
-	  while (beg > buf && beg[-1] != eol)
-	    --beg;
 	  /* Successful, no backreferences encountered! */
 	  if (!backref)
 	    goto success;
 	}
+      else
+	end = beg + size;
+
       /* If we've made it to this point, this means DFA has seen
 	 a probable match, and we need to run it through Regex. */
-      regexbuf.not_eol = 0;
-      if ((start = re_search(&regexbuf, beg, end - beg, 0, end - beg, &regs)) >= 0)
+      for (i = 0; i < pcount; i++)
 	{
-	  len = regs.end[0] - start;
-	  if ((!match_lines && !match_words)
-	      || (match_lines && len == end - beg))
-	    goto success;
-	  /* If -w, check if the match aligns with word boundaries.
-	     We do this iteratively because:
-	     (a) the line may contain more than one occurence of the pattern, and
-	     (b) Several alternatives in the pattern might be valid at a given
-	     point, and we may need to consider a shorter one to find a word
-	     boundary. */
-	  if (match_words)
-	    while (start >= 0)
-	      {
-		if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
-		    && (len == end - beg
-			|| !WCHAR ((unsigned char) beg[start + len])))
-		  goto success;
-		if (len > 0)
-		  {
-		    /* Try a shorter length anchored at the same place. */
-		    --len;
-		    regexbuf.not_eol = 1;
-		    len = re_match(&regexbuf, beg, start + len, start, &regs);
-		  }
-		if (len <= 0)
+	  patterns[i].regexbuf.not_eol = 0;
+	  if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
+				       end - beg - 1, 0,
+				       end - beg - 1, &(patterns[i].regs))))
+	    {
+	      len = patterns[i].regs.end[0] - start;
+	      if (exact)
+		{
+		  *match_size = len;
+		  return start;
+		}
+	      if ((!match_lines && !match_words)
+		  || (match_lines && len == end - beg - 1))
+		goto success;
+	      /* If -w, check if the match aligns with word boundaries.
+		 We do this iteratively because:
+		 (a) the line may contain more than one occurrence of the
+		 pattern, and
+		 (b) Several alternatives in the pattern might be valid at a
+		 given point, and we may need to consider a shorter one to
+		 find a word boundary.  */
+	      if (match_words)
+		while (start >= 0)
 		  {
-		    /* Try looking further on. */
-		    if (start == end - beg)
-		      break;
-		    ++start;
-		    regexbuf.not_eol = 0;
-		    start = re_search(&regexbuf, beg, end - beg,
-				      start, end - beg - start, &regs);
-		    len = regs.end[0] - start;
+		    if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
+			&& (len == end - beg - 1
+			    || !WCHAR ((unsigned char) beg[start + len])))
+		      goto success;
+		    if (len > 0)
+		      {
+			/* Try a shorter length anchored at the same place. */
+			--len;
+			patterns[i].regexbuf.not_eol = 1;
+			len = re_match (&(patterns[i].regexbuf), beg,
+					start + len, start,
+					&(patterns[i].regs));
+		      }
+		    if (len <= 0)
+		      {
+			/* Try looking further on. */
+			if (start == end - beg - 1)
+			  break;
+			++start;
+			patterns[i].regexbuf.not_eol = 0;
+			start = re_search (&(patterns[i].regexbuf), beg,
+					   end - beg - 1,
+					   start, end - beg - 1 - start,
+					   &(patterns[i].regs));
+			len = patterns[i].regs.end[0] - start;
+		      }
 		  }
-	      }
-	}
-    }
-
- failure:
-  return 0;
+	    }
+	} /* for Regex patterns.  */
+    } /* for (beg = end ..) */
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1 && mb_properties)
+    free (mb_properties);
+#endif /* MBS_SUPPORT */
+  return (size_t) -1;
 
  success:
-  *endp = end < buflim ? end + 1 : end;
-  return beg;
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1 && mb_properties)
+    free (mb_properties);
+#endif /* MBS_SUPPORT */
+  *match_size = end - beg;
+  return beg - buf;
 }
 
 static void
-Fcompile (char *pattern, size_t size)
+Fcompile (char const *pattern, size_t size)
 {
-  char *beg, *lim, *err;
+  char const *beg, *lim, *err;
 
-  kwsinit();
+  kwsinit ();
   beg = pattern;
   do
     {
       for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
 	;
-      if ((err = kwsincr(kwset, beg, lim - beg)) != 0)
-	fatal(err, 0);
+      if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
+	error (2, 0, err);
       if (lim < pattern + size)
 	++lim;
       beg = lim;
     }
   while (beg < pattern + size);
 
-  if ((err = kwsprep(kwset)) != 0)
-    fatal(err, 0);
+  if ((err = kwsprep (kwset)) != 0)
+    error (2, 0, err);
 }
 
-static char *
-Fexecute (char *buf, size_t size, char **endp)
+static size_t
+Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
 {
-  register char *beg, *try, *end;
+  register char const *beg, *try, *end;
   register size_t len;
   char eol = eolbyte;
   struct kwsmatch kwsmatch;
+#ifdef MBS_SUPPORT
+  char *mb_properties;
+  if (MB_CUR_MAX > 1)
+    mb_properties = check_multibyte_string (buf, size);
+#endif /* MBS_SUPPORT */
 
   for (beg = buf; beg <= buf + size; ++beg)
     {
-      if (!(beg = kwsexec(kwset, beg, buf + size - beg, &kwsmatch)))
-	return 0;
+      size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
+      if (offset == (size_t) -1)
+	{
+#ifdef MBS_SUPPORT
+	  if (MB_CUR_MAX > 1)
+	    free(mb_properties);
+#endif /* MBS_SUPPORT */
+	  return offset;
+	}
+#ifdef MBS_SUPPORT
+      if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
+	continue; /* It is a part of multibyte character.  */
+#endif /* MBS_SUPPORT */
+      beg += offset;
       len = kwsmatch.size[0];
+      if (exact)
+	{
+	  *match_size = len;
+#ifdef MBS_SUPPORT
+	  if (MB_CUR_MAX > 1)
+	    free (mb_properties);
+#endif /* MBS_SUPPORT */
+	  return beg - buf;
+	}
       if (match_lines)
 	{
 	  if (beg > buf && beg[-1] != eol)
@@ -391,13 +547,22 @@ Fexecute (char *buf, size_t size, char **endp)
 	  goto success;
 	}
       else if (match_words)
-	for (try = beg; len && try;)
+	for (try = beg; len; )
 	  {
 	    if (try > buf && WCHAR((unsigned char) try[-1]))
 	      break;
 	    if (try + len < buf + size && WCHAR((unsigned char) try[len]))
 	      {
-		try = kwsexec(kwset, beg, --len, &kwsmatch);
+		offset = kwsexec (kwset, beg, --len, &kwsmatch);
+		if (offset == (size_t) -1)
+		  {
+#ifdef MBS_SUPPORT
+		    if (MB_CUR_MAX > 1)
+		      free (mb_properties);
+#endif /* MBS_SUPPORT */
+		    return offset;
+		  }
+		try = beg + offset;
 		len = kwsmatch.size[0];
 	      }
 	    else
@@ -407,15 +572,153 @@ Fexecute (char *buf, size_t size, char **endp)
 	goto success;
     }
 
-  return 0;
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    free (mb_properties);
+#endif /* MBS_SUPPORT */
+  return -1;
 
  success:
-  if ((end = memchr(beg + len, eol, (buf + size) - (beg + len))) != 0)
-    ++end;
-  else
-    end = buf + size;
-  *endp = end;
-  while (beg > buf && beg[-1] != '\n')
+  end = memchr (beg + len, eol, (buf + size) - (beg + len));
+  end++;
+  while (buf < beg && beg[-1] != eol)
     --beg;
-  return beg;
+  *match_size = end - beg;
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    free (mb_properties);
+#endif /* MBS_SUPPORT */
+  return beg - buf;
+}
+
+#if HAVE_LIBPCRE
+/* Compiled internal form of a Perl regular expression.  */
+static pcre *cre;
+
+/* Additional information about the pattern.  */
+static pcre_extra *extra;
+#endif
+
+static void
+Pcompile (char const *pattern, size_t size)
+{
+#if !HAVE_LIBPCRE
+  error (2, 0, _("The -P option is not supported"));
+#else
+  int e;
+  char const *ep;
+  char *re = xmalloc (4 * size + 7);
+  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
+  char const *patlim = pattern + size;
+  char *n = re;
+  char const *p;
+  char const *pnul;
+
+  /* FIXME: Remove this restriction.  */
+  if (eolbyte != '\n')
+    error (2, 0, _("The -P and -z options cannot be combined"));
+
+  *n = '\0';
+  if (match_lines)
+    strcpy (n, "^(");
+  if (match_words)
+    strcpy (n, "\\b(");
+  n += strlen (n);
+
+  /* The PCRE interface doesn't allow NUL bytes in the pattern, so
+     replace each NUL byte in the pattern with the four characters
+     "\000", removing a preceding backslash if there is an odd
+     number of backslashes before the NUL.
+
+     FIXME: This method does not work with some multibyte character
+     encodings, notably Shift-JIS, where a multibyte character can end
+     in a backslash byte.  */
+  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
+    {
+      memcpy (n, p, pnul - p);
+      n += pnul - p;
+      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
+	continue;
+      n -= (pnul - p) & 1;
+      strcpy (n, "\\000");
+      n += 4;
+    }
+
+  memcpy (n, p, patlim - p);
+  n += patlim - p;
+  *n = '\0';
+  if (match_words)
+    strcpy (n, ")\\b");
+  if (match_lines)
+    strcpy (n, ")$");
+
+  cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
+  if (!cre)
+    error (2, 0, ep);
+
+  extra = pcre_study (cre, 0, &ep);
+  if (ep)
+    error (2, 0, ep);
+
+  free (re);
+#endif
+}
+
+static size_t
+Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
+{
+#if !HAVE_LIBPCRE
+  abort ();
+  return -1;
+#else
+  /* This array must have at least two elements; everything after that
+     is just for performance improvement in pcre_exec.  */
+  int sub[300];
+
+  int e = pcre_exec (cre, extra, buf, size, 0, 0,
+		     sub, sizeof sub / sizeof *sub);
+
+  if (e <= 0)
+    {
+      switch (e)
+	{
+	case PCRE_ERROR_NOMATCH:
+	  return -1;
+
+	case PCRE_ERROR_NOMEMORY:
+	  error (2, 0, _("Memory exhausted"));
+
+	default:
+	  abort ();
+	}
+    }
+  else
+    {
+      /* Narrow down to the line we've found.  */
+      char const *beg = buf + sub[0];
+      char const *end = buf + sub[1];
+      char const *buflim = buf + size;
+      char eol = eolbyte;
+      if (!exact)
+	{
+	  end = memchr (end, eol, buflim - end);
+	  end++;
+	  while (buf < beg && beg[-1] != eol)
+	    --beg;
+	}
+
+      *match_size = end - beg;
+      return beg - buf;
+    }
+#endif
 }
+
+struct matcher const matchers[] = {
+  { "default", Gcompile, EGexecute },
+  { "grep", Gcompile, EGexecute },
+  { "egrep", Ecompile, EGexecute },
+  { "awk", Ecompile, EGexecute },
+  { "fgrep", Fcompile, Fexecute },
+  { "perl", Pcompile, Pexecute },
+  { "", 0, 0 },
+};
author	Tim J. Robbins <tjr@FreeBSD.org>	2004-07-04 09:52:08 +0000
committer	Tim J. Robbins <tjr@FreeBSD.org>	2004-07-04 09:52:08 +0000
commit	6fdbbb54872ec7fac83387296f0165f7ad3400a9 (patch)
tree	70ffe417c9436d51a3807610738f2a5993f7f0fc /gnu/usr.bin/grep/search.c
parent	7a39f4da90d53085484e72b7504a3e53dbc9f051 (diff)