Blame grep-2.20-pcre-backported-fixes.patch

23be49
diff --git a/src/grep.h b/src/grep.h
23be49
index 4935872..729c906 100644
23be49
--- a/src/grep.h
23be49
+++ b/src/grep.h
23be49
@@ -27,4 +27,19 @@ extern int match_words;		/* -w */
23be49
 extern int match_lines;		/* -x */
23be49
 extern unsigned char eolbyte;	/* -z */
23be49
 
23be49
+/* An enum textbin describes the file's type, inferred from data read
23be49
+   before the first line is selected for output.  */
23be49
+enum textbin
23be49
+  {
23be49
+    /* Binary, as it contains null bytes and the -z option is not in effect,
23be49
+       or it contains encoding errors.  */
23be49
+    TEXTBIN_BINARY = -1,
23be49
+
23be49
+    /* Not known yet.  Only text has been seen so far.  */
23be49
+    TEXTBIN_UNKNOWN = 0,
23be49
+
23be49
+    /* Text.  */
23be49
+    TEXTBIN_TEXT = 1
23be49
+  };
23be49
+
23be49
 #endif
23be49
diff --git a/src/pcresearch.c b/src/pcresearch.c
23be49
index 820dd00..9938ffc 100644
23be49
--- a/src/pcresearch.c
23be49
+++ b/src/pcresearch.c
23be49
@@ -33,13 +33,19 @@ static pcre *cre;
23be49
 /* Additional information about the pattern.  */
23be49
 static pcre_extra *extra;
23be49
 
23be49
-# ifdef PCRE_STUDY_JIT_COMPILE
23be49
-static pcre_jit_stack *jit_stack;
23be49
-# else
23be49
+# ifndef PCRE_STUDY_JIT_COMPILE
23be49
 #  define PCRE_STUDY_JIT_COMPILE 0
23be49
 # endif
23be49
 #endif
23be49
 
23be49
+/* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
23be49
+   string matches when that flag is used.  */
23be49
+static int empty_match[2];
23be49
+
23be49
+/* This must be at least 2; everything after that is for performance
23be49
+   in pcre_exec.  */
23be49
+enum { NSUB = 300 };
23be49
+
23be49
 void
23be49
 Pcompile (char const *pattern, size_t size)
23be49
 {
23be49
@@ -52,13 +58,17 @@ Pcompile (char const *pattern, size_t size)
23be49
   char const *ep;
23be49
   char *re = xnmalloc (4, size + 7);
23be49
   int flags = (PCRE_MULTILINE
23be49
-               | (match_icase ? PCRE_CASELESS : 0)
23be49
-               | (using_utf8 () ? PCRE_UTF8 : 0));
23be49
+               | (match_icase ? PCRE_CASELESS : 0));
23be49
   char const *patlim = pattern + size;
23be49
   char *n = re;
23be49
   char const *p;
23be49
   char const *pnul;
23be49
 
23be49
+  if (using_utf8 ())
23be49
+    flags |= PCRE_UTF8;
23be49
+  else if (MB_CUR_MAX != 1)
23be49
+    error (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
23be49
+
23be49
   /* FIXME: Remove these restrictions.  */
23be49
   if (memchr (pattern, '\n', size))
23be49
     error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
23be49
@@ -114,14 +124,20 @@ Pcompile (char const *pattern, size_t size)
23be49
       /* A 32K stack is allocated for the machine code by default, which
23be49
          can grow to 512K if necessary. Since JIT uses far less memory
23be49
          than the interpreter, this should be enough in practice.  */
23be49
-      jit_stack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024);
23be49
+      pcre_jit_stack *jit_stack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024);
23be49
       if (!jit_stack)
23be49
         error (EXIT_TROUBLE, 0,
23be49
                _("failed to allocate memory for the PCRE JIT stack"));
23be49
       pcre_assign_jit_stack (extra, NULL, jit_stack);
23be49
     }
23be49
+
23be49
 # endif
23be49
   free (re);
23be49
+
23be49
+  int sub[NSUB];
23be49
+  empty_match[false] = pcre_exec (cre, extra, "", 0, 0,
23be49
+                                  PCRE_NOTBOL, sub, NSUB);
23be49
+  empty_match[true] = pcre_exec (cre, extra, "", 0, 0, 0, sub, NSUB);
23be49
 #endif /* HAVE_LIBPCRE */
23be49
 }
23be49
 
23be49
@@ -134,36 +150,110 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
23be49
   error (EXIT_TROUBLE, 0, _("internal error"));
23be49
   return -1;
23be49
 #else
23be49
-  /* This array must have at least two elements; everything after that
23be49
-     is just for performance improvement in pcre_exec.  */
23be49
-  int sub[300];
23be49
-
23be49
-  const char *line_buf, *line_end, *line_next;
23be49
+  int sub[NSUB];
23be49
+  char const *p = start_ptr ? start_ptr : buf;
23be49
+  bool bol = p[-1] == eolbyte;
23be49
+  char const *line_start = buf;
23be49
   int e = PCRE_ERROR_NOMATCH;
23be49
-  ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
23be49
+  char const *line_end;
23be49
 
23be49
-  /* PCRE can't limit the matching to single lines, therefore we have to
23be49
-     match each line in the buffer separately.  */
23be49
-  for (line_next = buf;
23be49
-       e == PCRE_ERROR_NOMATCH && line_next < buf + size;
23be49
-       start_ofs -= line_next - line_buf)
23be49
+  /* If the input type were unknown, the caller would still be testing
23be49
+     the input, so the buffer could not contain encoding errors and a
23be49
+     multiline search would typically be more efficient.  Otherwise a
23be49
+     single-line search is typically faster, as pcre_exec need not
23be49
+     validate the entire input buffer.  The flag below is set to the
23be49
+     constant TEXTBIN_UNKNOWN (0), so single-line search is always used.  */
23be49
+  bool multiline = TEXTBIN_UNKNOWN;
23be49
+
23be49
+  for (; p < buf + size; p = line_start = line_end + 1)
23be49
     {
23be49
-      line_buf = line_next;
23be49
-      line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
23be49
-      if (line_end == NULL)
23be49
-        line_next = line_end = buf + size;
23be49
-      else
23be49
-        line_next = line_end + 1;
23be49
+      bool too_big;
23be49
 
23be49
-      if (start_ptr && start_ptr >= line_end)
23be49
-        continue;
23be49
+      if (multiline)
23be49
+        {
23be49
+          size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
23be49
+          size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
23be49
+          line_end = memrchr (p, eolbyte, scan_size);
23be49
+          too_big = ! line_end;
23be49
+        }
23be49
+      else
23be49
+        {
23be49
+          line_end = memchr (p, eolbyte, buf + size - p);
23be49
+          too_big = INT_MAX < line_end - p;
23be49
+        }
23be49
 
23be49
-      if (INT_MAX < line_end - line_buf)
23be49
+      if (too_big)
23be49
         error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
23be49
 
23be49
-      e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
23be49
-                     start_ofs < 0 ? 0 : start_ofs, 0,
23be49
-                     sub, sizeof sub / sizeof *sub);
23be49
+      for (;;)
23be49
+        {
23be49
+          /* Skip past bytes that are easily determined to be encoding
23be49
+             errors, treating them as data that cannot match.  This is
23be49
+             faster than having pcre_exec check them.  */
23be49
+          while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
23be49
+            {
23be49
+              p++;
23be49
+              bol = false;
23be49
+            }
23be49
+
23be49
+          /* Check for an empty match; this is faster than letting
23be49
+             pcre_exec do it.  */
23be49
+          int search_bytes = line_end - p;
23be49
+          if (search_bytes == 0)
23be49
+            {
23be49
+              sub[0] = sub[1] = 0;
23be49
+              e = empty_match[bol];
23be49
+              break;
23be49
+            }
23be49
+
23be49
+          int options = 0;
23be49
+          if (!bol)
23be49
+            options |= PCRE_NOTBOL;
23be49
+          if (multiline)
23be49
+            options |= PCRE_NO_UTF8_CHECK;
23be49
+
23be49
+          e = pcre_exec (cre, extra, p, search_bytes, 0,
23be49
+                         options, sub, NSUB);
23be49
+          if (e != PCRE_ERROR_BADUTF8)
23be49
+            {
23be49
+              if (0 < e && multiline && sub[1] - sub[0] != 0)
23be49
+                {
23be49
+                  char const *nl = memchr (p + sub[0], eolbyte,
23be49
+                                           sub[1] - sub[0]);
23be49
+                  if (nl)
23be49
+                    {
23be49
+                      /* This match crosses a line boundary; reject it.  */
23be49
+                      p += sub[0];
23be49
+                      line_end = nl;
23be49
+                      continue;
23be49
+                    }
23be49
+                }
23be49
+              break;
23be49
+            }
23be49
+          int valid_bytes = sub[0];
23be49
+
23be49
+          /* Try to match the string before the encoding error.
23be49
+             Again, handle the empty-match case specially, for speed.  */
23be49
+          if (valid_bytes == 0)
23be49
+            {
23be49
+              sub[1] = 0;
23be49
+              e = empty_match[bol];
23be49
+            }
23be49
+          else
23be49
+            e = pcre_exec (cre, extra, p, valid_bytes, 0,
23be49
+                           options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
23be49
+                           sub, NSUB);
23be49
+          if (e != PCRE_ERROR_NOMATCH)
23be49
+            break;
23be49
+
23be49
+          /* Treat the encoding error as data that cannot match.  */
23be49
+          p += valid_bytes + 1;
23be49
+          bol = false;
23be49
+        }
23be49
+
23be49
+      if (e != PCRE_ERROR_NOMATCH)
23be49
+        break;
23be49
+      bol = true;
23be49
     }
23be49
 
23be49
   if (e <= 0)
23be49
@@ -171,7 +261,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
23be49
       switch (e)
23be49
         {
23be49
         case PCRE_ERROR_NOMATCH:
23be49
-          return -1;
23be49
+          break;
23be49
 
23be49
         case PCRE_ERROR_NOMEMORY:
23be49
           error (EXIT_TROUBLE, 0, _("memory exhausted"));
23be49
@@ -180,10 +270,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
23be49
           error (EXIT_TROUBLE, 0,
23be49
                  _("exceeded PCRE's backtracking limit"));
23be49
 
23be49
-        case PCRE_ERROR_BADUTF8:
23be49
-          error (EXIT_TROUBLE, 0,
23be49
-                 _("invalid UTF-8 byte sequence in input"));
23be49
-
23be49
         default:
23be49
           /* For now, we lump all remaining PCRE failures into this basket.
23be49
              If anyone cares to provide sample grep usage that can trigger
23be49
@@ -192,30 +278,33 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
23be49
           error (EXIT_TROUBLE, 0, _("internal PCRE error: %d"), e);
23be49
         }
23be49
 
23be49
-      /* NOTREACHED */
23be49
       return -1;
23be49
     }
23be49
   else
23be49
     {
23be49
-      /* Narrow down to the line we've found.  */
23be49
-      char const *beg = line_buf + sub[0];
23be49
-      char const *end = line_buf + sub[1];
23be49
-      char const *buflim = buf + size;
23be49
-      char eol = eolbyte;
23be49
-      if (!start_ptr)
23be49
+      char const *matchbeg = p + sub[0];
23be49
+      char const *matchend = p + sub[1];
23be49
+      char const *beg;
23be49
+      char const *end;
23be49
+      if (start_ptr)
23be49
         {
23be49
-          /* FIXME: The case when '\n' is not found indicates a bug:
23be49
-             Since grep is line oriented, the match should never contain
23be49
-             a newline, so there _must_ be a newline following.
23be49
-           */
23be49
-          if (!(end = memchr (end, eol, buflim - end)))
23be49
-            end = buflim;
23be49
-          else
23be49
-            end++;
23be49
-          while (buf < beg && beg[-1] != eol)
23be49
-            --beg;
23be49
+          beg = matchbeg;
23be49
+          end = matchend;
23be49
+        }
23be49
+      else if (multiline)
23be49
+        {
23be49
+          char const *prev_nl = memrchr (line_start - 1, eolbyte,
23be49
+                                         matchbeg - (line_start - 1));
23be49
+          char const *next_nl = memchr (matchend, eolbyte,
23be49
+                                        line_end + 1 - matchend);
23be49
+          beg = prev_nl + 1;
23be49
+          end = next_nl + 1;
23be49
+        }
23be49
+      else
23be49
+        {
23be49
+          beg = line_start;
23be49
+          end = line_end + 1;
23be49
         }
23be49
-
23be49
       *match_size = end - beg;
23be49
       return beg - buf;
23be49
     }
23be49
diff --git a/src/search.h b/src/search.h
23be49
index 14877bc..e671bea 100644
23be49
--- a/src/search.h
23be49
+++ b/src/search.h
23be49
@@ -45,6 +45,7 @@ extern void kwsinit (kwset_t *);
23be49
 
23be49
 extern char *mbtoupper (char const *, size_t *, mb_len_map_t **);
23be49
 extern void build_mbclen_cache (void);
23be49
+extern size_t mbclen_cache[];
23be49
 extern ptrdiff_t mb_goback (char const **, char const *, char const *);
23be49
 extern wint_t mb_prev_wc (char const *, char const *, char const *);
23be49
 extern wint_t mb_next_wc (char const *, char const *);
23be49
diff --git a/src/searchutils.c b/src/searchutils.c
23be49
index 5eb9a12..aba9335 100644
23be49
--- a/src/searchutils.c
23be49
+++ b/src/searchutils.c
23be49
@@ -22,7 +22,7 @@
23be49
 
23be49
 #define NCHAR (UCHAR_MAX + 1)
23be49
 
23be49
-static size_t mbclen_cache[NCHAR];
23be49
+size_t mbclen_cache[NCHAR];
23be49
 
23be49
 void
23be49
 kwsinit (kwset_t *kwset)
23be49
diff --git a/tests/pcre-infloop b/tests/pcre-infloop
23be49
index 1b33e72..8054844 100755
23be49
--- a/tests/pcre-infloop
23be49
+++ b/tests/pcre-infloop
23be49
@@ -18,16 +18,16 @@
23be49
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
23be49
 
23be49
 . "${srcdir=.}/init.sh"; path_prepend_ ../src
23be49
-require_pcre_
23be49
 require_timeout_
23be49
 require_en_utf8_locale_
23be49
 require_compiled_in_MB_support
23be49
+LC_ALL=en_US.UTF-8 require_pcre_
23be49
 
23be49
 printf 'a\201b\r' > in || framework_failure_
23be49
 
23be49
 fail=0
23be49
 
23be49
 LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in
23be49
-test $? = 2 || fail_ "libpcre's match function appears to infloop"
23be49
+test $? = 1 || fail_ "libpcre's match function appears to infloop"
23be49
 
23be49
 Exit $fail
23be49
diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input
23be49
index 913e8ee..abcc7e8 100755
23be49
--- a/tests/pcre-invalid-utf8-input
23be49
+++ b/tests/pcre-invalid-utf8-input
23be49
@@ -8,14 +8,19 @@
23be49
 # notice and this notice are preserved.
23be49
 
23be49
 . "${srcdir=.}/init.sh"; path_prepend_ ../src
23be49
-require_pcre_
23be49
+require_timeout_
23be49
 require_en_utf8_locale_
23be49
+require_compiled_in_MB_support
23be49
+LC_ALL=en_US.UTF-8 require_pcre_
23be49
 
23be49
 fail=0
23be49
 
23be49
-printf 'j\202\nj\n' > in || framework_failure_
23be49
+printf 'j\202j\nj\nk\202\n' > in || framework_failure_
23be49
 
23be49
-LC_ALL=en_US.UTF-8 grep -P j in
23be49
-test $? -eq 2 || fail=1
23be49
+LC_ALL=en_US.UTF-8 timeout 3 grep -P j in
23be49
+test $? -eq 0 || fail=1
23be49
+
23be49
+LC_ALL=en_US.UTF-8 timeout 3 grep -P 'k$' in
23be49
+test $? -eq 1 || fail=1
23be49
 
23be49
 Exit $fail
23be49
diff --git a/tests/pcre-utf8 b/tests/pcre-utf8
23be49
index 41676f4..2dda116 100755
23be49
--- a/tests/pcre-utf8
23be49
+++ b/tests/pcre-utf8
23be49
@@ -8,8 +8,8 @@
23be49
 # notice and this notice are preserved.
23be49
 
23be49
 . "${srcdir=.}/init.sh"; path_prepend_ ../src
23be49
-require_pcre_
23be49
 require_en_utf8_locale_
23be49
+LC_ALL=en_US.UTF-8 require_pcre_
23be49
 
23be49
 fail=0
23be49