Blame SOURCES/grep-2.20-pcre-backported-fixes.patch

51e48f
diff --git a/src/grep.h b/src/grep.h
51e48f
index 4935872..729c906 100644
51e48f
--- a/src/grep.h
51e48f
+++ b/src/grep.h
51e48f
@@ -27,4 +27,19 @@ extern int match_words;		/* -w */
51e48f
 extern int match_lines;		/* -x */
51e48f
 extern unsigned char eolbyte;	/* -z */
51e48f
 
51e48f
+/* An enum textbin describes the file's type, inferred from data read
51e48f
+   before the first line is selected for output.  */
51e48f
+enum textbin
51e48f
+  {
51e48f
+    /* Binary, as it contains null bytes and the -z option is not in effect,
51e48f
+       or it contains encoding errors.  */
51e48f
+    TEXTBIN_BINARY = -1,
51e48f
+
51e48f
+    /* Not known yet.  Only text has been seen so far.  */
51e48f
+    TEXTBIN_UNKNOWN = 0,
51e48f
+
51e48f
+    /* Text.  */
51e48f
+    TEXTBIN_TEXT = 1
51e48f
+  };
51e48f
+
51e48f
 #endif
51e48f
diff --git a/src/pcresearch.c b/src/pcresearch.c
51e48f
index 820dd00..9938ffc 100644
51e48f
--- a/src/pcresearch.c
51e48f
+++ b/src/pcresearch.c
51e48f
@@ -33,13 +33,19 @@ static pcre *cre;
51e48f
 /* Additional information about the pattern.  */
51e48f
 static pcre_extra *extra;
51e48f
 
51e48f
-# ifdef PCRE_STUDY_JIT_COMPILE
51e48f
-static pcre_jit_stack *jit_stack;
51e48f
-# else
51e48f
+# ifndef PCRE_STUDY_JIT_COMPILE
51e48f
 #  define PCRE_STUDY_JIT_COMPILE 0
51e48f
 # endif
51e48f
 #endif
51e48f
 
51e48f
+/* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
51e48f
+   string matches when that flag is used.  */
51e48f
+static int empty_match[2];
51e48f
+
51e48f
+/* This must be at least 2; everything after that is for performance
51e48f
+   in pcre_exec.  */
51e48f
+enum { NSUB = 300 };
51e48f
+
51e48f
 void
51e48f
 Pcompile (char const *pattern, size_t size)
51e48f
 {
51e48f
@@ -52,13 +58,17 @@ Pcompile (char const *pattern, size_t size)
51e48f
   char const *ep;
51e48f
   char *re = xnmalloc (4, size + 7);
51e48f
   int flags = (PCRE_MULTILINE
51e48f
-               | (match_icase ? PCRE_CASELESS : 0)
51e48f
-               | (using_utf8 () ? PCRE_UTF8 : 0));
51e48f
+               | (match_icase ? PCRE_CASELESS : 0));
51e48f
   char const *patlim = pattern + size;
51e48f
   char *n = re;
51e48f
   char const *p;
51e48f
   char const *pnul;
51e48f
 
51e48f
+  if (using_utf8 ())
51e48f
+    flags |= PCRE_UTF8;
51e48f
+  else if (MB_CUR_MAX != 1)
51e48f
+    error (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
51e48f
+
51e48f
   /* FIXME: Remove these restrictions.  */
51e48f
   if (memchr (pattern, '\n', size))
51e48f
     error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
51e48f
@@ -114,14 +124,20 @@ Pcompile (char const *pattern, size_t size)
51e48f
       /* A 32K stack is allocated for the machine code by default, which
51e48f
          can grow to 512K if necessary. Since JIT uses far less memory
51e48f
          than the interpreter, this should be enough in practice.  */
51e48f
-      jit_stack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024);
51e48f
+      pcre_jit_stack *jit_stack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024);
51e48f
       if (!jit_stack)
51e48f
         error (EXIT_TROUBLE, 0,
51e48f
                _("failed to allocate memory for the PCRE JIT stack"));
51e48f
       pcre_assign_jit_stack (extra, NULL, jit_stack);
51e48f
     }
51e48f
+
51e48f
 # endif
51e48f
   free (re);
51e48f
+
51e48f
+  int sub[NSUB];
51e48f
+  empty_match[false] = pcre_exec (cre, extra, "", 0, 0,
51e48f
+                                  PCRE_NOTBOL, sub, NSUB);
51e48f
+  empty_match[true] = pcre_exec (cre, extra, "", 0, 0, 0, sub, NSUB);
51e48f
 #endif /* HAVE_LIBPCRE */
51e48f
 }
51e48f
 
51e48f
@@ -134,36 +150,110 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
51e48f
   error (EXIT_TROUBLE, 0, _("internal error"));
51e48f
   return -1;
51e48f
 #else
51e48f
-  /* This array must have at least two elements; everything after that
51e48f
-     is just for performance improvement in pcre_exec.  */
51e48f
-  int sub[300];
51e48f
-
51e48f
-  const char *line_buf, *line_end, *line_next;
51e48f
+  int sub[NSUB];
51e48f
+  char const *p = start_ptr ? start_ptr : buf;
51e48f
+  bool bol = p[-1] == eolbyte;
51e48f
+  char const *line_start = buf;
51e48f
   int e = PCRE_ERROR_NOMATCH;
51e48f
-  ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
51e48f
+  char const *line_end;
51e48f
 
51e48f
-  /* PCRE can't limit the matching to single lines, therefore we have to
51e48f
-     match each line in the buffer separately.  */
51e48f
-  for (line_next = buf;
51e48f
-       e == PCRE_ERROR_NOMATCH && line_next < buf + size;
51e48f
-       start_ofs -= line_next - line_buf)
51e48f
+  /* If the input type is unknown, the caller is still testing the
51e48f
+     input, which means the current buffer cannot contain encoding
51e48f
+     errors and a multiline search is typically more efficient.
51e48f
+     Otherwise, a single-line search is typically faster, so that
51e48f
+     pcre_exec doesn't waste time validating the entire input
51e48f
+     buffer.  */
51e48f
+  bool multiline = TEXTBIN_UNKNOWN;
51e48f
+
51e48f
+  for (; p < buf + size; p = line_start = line_end + 1)
51e48f
     {
51e48f
-      line_buf = line_next;
51e48f
-      line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
51e48f
-      if (line_end == NULL)
51e48f
-        line_next = line_end = buf + size;
51e48f
-      else
51e48f
-        line_next = line_end + 1;
51e48f
+      bool too_big;
51e48f
 
51e48f
-      if (start_ptr && start_ptr >= line_end)
51e48f
-        continue;
51e48f
+      if (multiline)
51e48f
+        {
51e48f
+          size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
51e48f
+          size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
51e48f
+          line_end = memrchr (p, eolbyte, scan_size);
51e48f
+          too_big = ! line_end;
51e48f
+        }
51e48f
+      else
51e48f
+        {
51e48f
+          line_end = memchr (p, eolbyte, buf + size - p);
51e48f
+          too_big = INT_MAX < line_end - p;
51e48f
+        }
51e48f
 
51e48f
-      if (INT_MAX < line_end - line_buf)
51e48f
+      if (too_big)
51e48f
         error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
51e48f
 
51e48f
-      e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
51e48f
-                     start_ofs < 0 ? 0 : start_ofs, 0,
51e48f
-                     sub, sizeof sub / sizeof *sub);
51e48f
+      for (;;)
51e48f
+        {
51e48f
+          /* Skip past bytes that are easily determined to be encoding
51e48f
+             errors, treating them as data that cannot match.  This is
51e48f
+             faster than having pcre_exec check them.  */
51e48f
+          while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
51e48f
+            {
51e48f
+              p++;
51e48f
+              bol = false;
51e48f
+            }
51e48f
+
51e48f
+          /* Check for an empty match; this is faster than letting
51e48f
+             pcre_exec do it.  */
51e48f
+          int search_bytes = line_end - p;
51e48f
+          if (search_bytes == 0)
51e48f
+            {
51e48f
+              sub[0] = sub[1] = 0;
51e48f
+              e = empty_match[bol];
51e48f
+              break;
51e48f
+            }
51e48f
+
51e48f
+          int options = 0;
51e48f
+          if (!bol)
51e48f
+            options |= PCRE_NOTBOL;
51e48f
+          if (multiline)
51e48f
+            options |= PCRE_NO_UTF8_CHECK;
51e48f
+
51e48f
+          e = pcre_exec (cre, extra, p, search_bytes, 0,
51e48f
+                         options, sub, NSUB);
51e48f
+          if (e != PCRE_ERROR_BADUTF8)
51e48f
+            {
51e48f
+              if (0 < e && multiline && sub[1] - sub[0] != 0)
51e48f
+                {
51e48f
+                  char const *nl = memchr (p + sub[0], eolbyte,
51e48f
+                                           sub[1] - sub[0]);
51e48f
+                  if (nl)
51e48f
+                    {
51e48f
+                      /* This match crosses a line boundary; reject it.  */
51e48f
+                      p += sub[0];
51e48f
+                      line_end = nl;
51e48f
+                      continue;
51e48f
+                    }
51e48f
+                }
51e48f
+              break;
51e48f
+            }
51e48f
+          int valid_bytes = sub[0];
51e48f
+
51e48f
+          /* Try to match the string before the encoding error.
51e48f
+             Again, handle the empty-match case specially, for speed.  */
51e48f
+          if (valid_bytes == 0)
51e48f
+            {
51e48f
+              sub[1] = 0;
51e48f
+              e = empty_match[bol];
51e48f
+            }
51e48f
+          else
51e48f
+            e = pcre_exec (cre, extra, p, valid_bytes, 0,
51e48f
+                           options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
51e48f
+                           sub, NSUB);
51e48f
+          if (e != PCRE_ERROR_NOMATCH || valid_bytes < 0)
51e48f
+            break;
51e48f
+
51e48f
+          /* Treat the encoding error as data that cannot match.  */
51e48f
+          p += valid_bytes + 1;
51e48f
+          bol = false;
51e48f
+        }
51e48f
+
51e48f
+      if (e != PCRE_ERROR_NOMATCH)
51e48f
+        break;
51e48f
+      bol = true;
51e48f
     }
51e48f
 
51e48f
   if (e <= 0)
51e48f
@@ -171,7 +261,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
51e48f
       switch (e)
51e48f
         {
51e48f
         case PCRE_ERROR_NOMATCH:
51e48f
-          return -1;
51e48f
+          break;
51e48f
 
51e48f
         case PCRE_ERROR_NOMEMORY:
51e48f
           error (EXIT_TROUBLE, 0, _("memory exhausted"));
51e48f
@@ -180,10 +270,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
51e48f
           error (EXIT_TROUBLE, 0,
51e48f
                  _("exceeded PCRE's backtracking limit"));
51e48f
 
51e48f
-        case PCRE_ERROR_BADUTF8:
51e48f
-          error (EXIT_TROUBLE, 0,
51e48f
-                 _("invalid UTF-8 byte sequence in input"));
51e48f
-
51e48f
         default:
51e48f
           /* For now, we lump all remaining PCRE failures into this basket.
51e48f
              If anyone cares to provide sample grep usage that can trigger
51e48f
@@ -192,30 +278,33 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
51e48f
           error (EXIT_TROUBLE, 0, _("internal PCRE error: %d"), e);
51e48f
         }
51e48f
 
51e48f
-      /* NOTREACHED */
51e48f
       return -1;
51e48f
     }
51e48f
   else
51e48f
     {
51e48f
-      /* Narrow down to the line we've found.  */
51e48f
-      char const *beg = line_buf + sub[0];
51e48f
-      char const *end = line_buf + sub[1];
51e48f
-      char const *buflim = buf + size;
51e48f
-      char eol = eolbyte;
51e48f
-      if (!start_ptr)
51e48f
+      char const *matchbeg = p + sub[0];
51e48f
+      char const *matchend = p + sub[1];
51e48f
+      char const *beg;
51e48f
+      char const *end;
51e48f
+      if (start_ptr)
51e48f
         {
51e48f
-          /* FIXME: The case when '\n' is not found indicates a bug:
51e48f
-             Since grep is line oriented, the match should never contain
51e48f
-             a newline, so there _must_ be a newline following.
51e48f
-           */
51e48f
-          if (!(end = memchr (end, eol, buflim - end)))
51e48f
-            end = buflim;
51e48f
-          else
51e48f
-            end++;
51e48f
-          while (buf < beg && beg[-1] != eol)
51e48f
-            --beg;
51e48f
+          beg = matchbeg;
51e48f
+          end = matchend;
51e48f
+        }
51e48f
+      else if (multiline)
51e48f
+        {
51e48f
+          char const *prev_nl = memrchr (line_start - 1, eolbyte,
51e48f
+                                         matchbeg - (line_start - 1));
51e48f
+          char const *next_nl = memchr (matchend, eolbyte,
51e48f
+                                        line_end + 1 - matchend);
51e48f
+          beg = prev_nl + 1;
51e48f
+          end = next_nl + 1;
51e48f
+        }
51e48f
+      else
51e48f
+        {
51e48f
+          beg = line_start;
51e48f
+          end = line_end + 1;
51e48f
         }
51e48f
-
51e48f
       *match_size = end - beg;
51e48f
       return beg - buf;
51e48f
     }
51e48f
diff --git a/src/search.h b/src/search.h
51e48f
index 14877bc..e671bea 100644
51e48f
--- a/src/search.h
51e48f
+++ b/src/search.h
51e48f
@@ -45,6 +45,7 @@ extern void kwsinit (kwset_t *);
51e48f
 
51e48f
 extern char *mbtoupper (char const *, size_t *, mb_len_map_t **);
51e48f
 extern void build_mbclen_cache (void);
51e48f
+extern size_t mbclen_cache[];
51e48f
 extern ptrdiff_t mb_goback (char const **, char const *, char const *);
51e48f
 extern wint_t mb_prev_wc (char const *, char const *, char const *);
51e48f
 extern wint_t mb_next_wc (char const *, char const *);
51e48f
diff --git a/src/searchutils.c b/src/searchutils.c
51e48f
index 5eb9a12..aba9335 100644
51e48f
--- a/src/searchutils.c
51e48f
+++ b/src/searchutils.c
51e48f
@@ -22,7 +22,7 @@
51e48f
 
51e48f
 #define NCHAR (UCHAR_MAX + 1)
51e48f
 
51e48f
-static size_t mbclen_cache[NCHAR];
51e48f
+size_t mbclen_cache[NCHAR];
51e48f
 
51e48f
 void
51e48f
 kwsinit (kwset_t *kwset)
51e48f
diff --git a/tests/pcre-infloop b/tests/pcre-infloop
51e48f
index 1b33e72..8054844 100755
51e48f
--- a/tests/pcre-infloop
51e48f
+++ b/tests/pcre-infloop
51e48f
@@ -18,16 +18,16 @@
51e48f
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
51e48f
 
51e48f
 . "${srcdir=.}/init.sh"; path_prepend_ ../src
51e48f
-require_pcre_
51e48f
 require_timeout_
51e48f
 require_en_utf8_locale_
51e48f
 require_compiled_in_MB_support
51e48f
+LC_ALL=en_US.UTF-8 require_pcre_
51e48f
 
51e48f
 printf 'a\201b\r' > in || framework_failure_
51e48f
 
51e48f
 fail=0
51e48f
 
51e48f
 LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in
51e48f
-test $? = 2 || fail_ "libpcre's match function appears to infloop"
51e48f
+test $? = 1 || fail_ "libpcre's match function appears to infloop"
51e48f
 
51e48f
 Exit $fail
51e48f
diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input
51e48f
index 913e8ee..abcc7e8 100755
51e48f
--- a/tests/pcre-invalid-utf8-input
51e48f
+++ b/tests/pcre-invalid-utf8-input
51e48f
@@ -8,14 +8,19 @@
51e48f
 # notice and this notice are preserved.
51e48f
 
51e48f
 . "${srcdir=.}/init.sh"; path_prepend_ ../src
51e48f
-require_pcre_
51e48f
+require_timeout_
51e48f
 require_en_utf8_locale_
51e48f
+require_compiled_in_MB_support
51e48f
+LC_ALL=en_US.UTF-8 require_pcre_
51e48f
 
51e48f
 fail=0
51e48f
 
51e48f
-printf 'j\202\nj\n' > in || framework_failure_
51e48f
+printf 'j\202j\nj\nk\202\n' > in || framework_failure_
51e48f
 
51e48f
-LC_ALL=en_US.UTF-8 grep -P j in
51e48f
-test $? -eq 2 || fail=1
51e48f
+LC_ALL=en_US.UTF-8 timeout 3 grep -P j in
51e48f
+test $? -eq 0 || fail=1
51e48f
+
51e48f
+LC_ALL=en_US.UTF-8 timeout 3 grep -P 'k$' in
51e48f
+test $? -eq 1 || fail=1
51e48f
 
51e48f
 Exit $fail
51e48f
diff --git a/tests/pcre-utf8 b/tests/pcre-utf8
51e48f
index 41676f4..2dda116 100755
51e48f
--- a/tests/pcre-utf8
51e48f
+++ b/tests/pcre-utf8
51e48f
@@ -8,8 +8,8 @@
51e48f
 # notice and this notice are preserved.
51e48f
 
51e48f
 . "${srcdir=.}/init.sh"; path_prepend_ ../src
51e48f
-require_pcre_
51e48f
 require_en_utf8_locale_
51e48f
+LC_ALL=en_US.UTF-8 require_pcre_
51e48f
 
51e48f
 fail=0
51e48f