Blame grep-2.6.3-dfa-convert-to-wide-char.patch

Jaroslav Škarvada 6ccb10
From ff191d4667709b52758fcc5bdc568726d1616be4 Mon Sep 17 00:00:00 2001
Jaroslav Škarvada 6ccb10
From: Paolo Bonzini <bonzini@gnu.org>
Jaroslav Škarvada 6ccb10
Date: Tue, 4 May 2010 17:26:09 +0200
Jaroslav Škarvada 6ccb10
Subject: [PATCH] dfa: convert to wide character line-by-line
Jaroslav Škarvada 6ccb10
MIME-Version: 1.0
Jaroslav Škarvada 6ccb10
Content-Type: text/plain; charset=UTF-8
Jaroslav Škarvada 6ccb10
Content-Transfer-Encoding: 8bit
Jaroslav Škarvada 6ccb10
Jaroslav Škarvada 6ccb10
This provides a nice speedup for -m in general, but especially
Jaroslav Škarvada 6ccb10
it avoids quadratic complexity in case we have to go to glibc.
Jaroslav Škarvada 6ccb10
Jaroslav Škarvada 6ccb10
Testcases:
Jaroslav Škarvada 6ccb10
Jaroslav Škarvada 6ccb10
   # From upstream backref-multibyte-slow
Jaroslav Škarvada 6ccb10
   yes aba | sed 10000q > aba.txt
Jaroslav Škarvada 6ccb10
   time ./egrep -c '^([a-z]).\1$' aba.txt
Jaroslav Škarvada 6ccb10
Jaroslav Škarvada 6ccb10
   # From rbiba
Jaroslav Škarvada 6ccb10
   time grep '^[a-f][h-j][l-ž]$' cestina-sorted.txt
Jaroslav Škarvada 6ccb10
Jaroslav Škarvada 6ccb10
* src/dfa.c (prepare_wc_buf): Extract out of dfaexec.  Convert
Jaroslav Škarvada 6ccb10
only up to the next newline.
Jaroslav Škarvada 6ccb10
(dfaexec): Exit multibyte processing loop if past buf_end.
Jaroslav Škarvada 6ccb10
Call prepare_wc_buf again after processing a newline.
Jaroslav Škarvada 6ccb10
---
Jaroslav Škarvada 6ccb10
 src/dfa.c |   96 +++++++++++++++++++++++++++++++++++++-----------------------
Jaroslav Škarvada 6ccb10
 1 files changed, 59 insertions(+), 37 deletions(-)
Jaroslav Škarvada 6ccb10
Jaroslav Škarvada 6ccb10
diff --git a/src/dfa.c b/src/dfa.c
Jaroslav Škarvada 6ccb10
index 523fe05..70aa5a8 100644
Jaroslav Škarvada 6ccb10
--- a/src/dfa.c
Jaroslav Škarvada 6ccb10
+++ b/src/dfa.c
Jaroslav Škarvada 6ccb10
@@ -2824,6 +2824,53 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)
Jaroslav Škarvada 6ccb10
 
Jaroslav Škarvada 6ccb10
 #endif /* MBS_SUPPORT */
Jaroslav Škarvada 6ccb10
 
Jaroslav Škarvada 6ccb10
+/* Initialize mblen_buf and inputwcs with data from the next line.  */
Jaroslav Škarvada 6ccb10
+
Jaroslav Škarvada 6ccb10
+static void
Jaroslav Škarvada 6ccb10
+prepare_wc_buf (const char *begin, const char *end)
Jaroslav Škarvada 6ccb10
+{
Jaroslav Škarvada 6ccb10
+  unsigned char eol = eolbyte;
Jaroslav Škarvada 6ccb10
+  size_t remain_bytes, i;
Jaroslav Škarvada 6ccb10
+
Jaroslav Škarvada 6ccb10
+  buf_begin = (unsigned char *) begin;
Jaroslav Škarvada 6ccb10
+
Jaroslav Škarvada 6ccb10
+  remain_bytes = 0;
Jaroslav Škarvada 6ccb10
+  for (i = 0; i < end - begin + 1; i++)
Jaroslav Škarvada 6ccb10
+    {
Jaroslav Škarvada 6ccb10
+      if (remain_bytes == 0)
Jaroslav Škarvada 6ccb10
+        {
Jaroslav Škarvada 6ccb10
+          remain_bytes
Jaroslav Škarvada 6ccb10
+            = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
Jaroslav Škarvada 6ccb10
+          if (remain_bytes < 1
Jaroslav Škarvada 6ccb10
+              || remain_bytes == (size_t) -1
Jaroslav Škarvada 6ccb10
+              || remain_bytes == (size_t) -2
Jaroslav Škarvada 6ccb10
+              || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
Jaroslav Škarvada 6ccb10
+            {
Jaroslav Škarvada 6ccb10
+              remain_bytes = 0;
Jaroslav Škarvada 6ccb10
+              inputwcs[i] = (wchar_t)begin[i];
Jaroslav Škarvada 6ccb10
+              mblen_buf[i] = 0;
Jaroslav Škarvada 6ccb10
+              if (begin[i] == eol)
Jaroslav Škarvada 6ccb10
+                break;
Jaroslav Škarvada 6ccb10
+            }
Jaroslav Škarvada 6ccb10
+          else
Jaroslav Škarvada 6ccb10
+            {
Jaroslav Škarvada 6ccb10
+              mblen_buf[i] = remain_bytes;
Jaroslav Škarvada 6ccb10
+              remain_bytes--;
Jaroslav Škarvada 6ccb10
+            }
Jaroslav Škarvada 6ccb10
+        }
Jaroslav Škarvada 6ccb10
+      else
Jaroslav Škarvada 6ccb10
+        {
Jaroslav Škarvada 6ccb10
+          mblen_buf[i] = remain_bytes;
Jaroslav Škarvada 6ccb10
+          inputwcs[i] = 0;
Jaroslav Škarvada 6ccb10
+          remain_bytes--;
Jaroslav Škarvada 6ccb10
+        }
Jaroslav Škarvada 6ccb10
+    }
Jaroslav Škarvada 6ccb10
+
Jaroslav Škarvada 6ccb10
+  buf_end = (unsigned char *) (begin + i);
Jaroslav Škarvada 6ccb10
+  mblen_buf[i] = 0;
Jaroslav Škarvada 6ccb10
+  inputwcs[i] = 0; /* sentinel */
Jaroslav Škarvada 6ccb10
+}
Jaroslav Škarvada 6ccb10
+
Jaroslav Škarvada 6ccb10
 /* Search through a buffer looking for a match to the given struct dfa.
Jaroslav Škarvada 6ccb10
    Find the first occurrence of a string matching the regexp in the
Jaroslav Škarvada 6ccb10
    buffer, and the shortest possible version thereof.  Return a pointer to
Jaroslav Škarvada 6ccb10
@@ -2870,43 +2917,10 @@ dfaexec (struct dfa *d, char const *begin, char *end,
Jaroslav Škarvada 6ccb10
 #ifdef MBS_SUPPORT
Jaroslav Škarvada 6ccb10
   if (d->mb_cur_max > 1)
Jaroslav Škarvada 6ccb10
     {
Jaroslav Škarvada 6ccb10
-      int remain_bytes, i;
Jaroslav Škarvada 6ccb10
-      buf_begin = (unsigned char *) begin;
Jaroslav Škarvada 6ccb10
-      buf_end = (unsigned char *) end;
Jaroslav Škarvada 6ccb10
-
Jaroslav Škarvada 6ccb10
-      /* initialize mblen_buf, and inputwcs.  */
Jaroslav Škarvada 6ccb10
       MALLOC(mblen_buf, unsigned char, end - begin + 2);
Jaroslav Škarvada 6ccb10
       MALLOC(inputwcs, wchar_t, end - begin + 2);
Jaroslav Škarvada 6ccb10
       memset(&mbs, 0, sizeof(mbstate_t));
Jaroslav Škarvada 6ccb10
-      remain_bytes = 0;
Jaroslav Škarvada 6ccb10
-      for (i = 0; i < end - begin + 1; i++)
Jaroslav Škarvada 6ccb10
-	{
Jaroslav Škarvada 6ccb10
-	  if (remain_bytes == 0)
Jaroslav Škarvada 6ccb10
-	    {
Jaroslav Škarvada 6ccb10
-	      remain_bytes
Jaroslav Škarvada 6ccb10
-		= mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
Jaroslav Škarvada 6ccb10
-	      if (remain_bytes < 1
Jaroslav Škarvada 6ccb10
-		|| (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
Jaroslav Škarvada 6ccb10
-		{
Jaroslav Škarvada 6ccb10
-		  remain_bytes = 0;
Jaroslav Škarvada 6ccb10
-		  inputwcs[i] = (wchar_t)begin[i];
Jaroslav Škarvada 6ccb10
-		  mblen_buf[i] = 0;
Jaroslav Škarvada 6ccb10
-		}
Jaroslav Škarvada 6ccb10
-	      else
Jaroslav Škarvada 6ccb10
-		{
Jaroslav Škarvada 6ccb10
-		  mblen_buf[i] = remain_bytes;
Jaroslav Škarvada 6ccb10
-		  remain_bytes--;
Jaroslav Škarvada 6ccb10
-		}
Jaroslav Škarvada 6ccb10
-	    }
Jaroslav Škarvada 6ccb10
-	  else
Jaroslav Škarvada 6ccb10
-	    {
Jaroslav Škarvada 6ccb10
-	      mblen_buf[i] = remain_bytes;
Jaroslav Škarvada 6ccb10
-	      inputwcs[i] = 0;
Jaroslav Škarvada 6ccb10
-	      remain_bytes--;
Jaroslav Škarvada 6ccb10
-	    }
Jaroslav Škarvada 6ccb10
-	}
Jaroslav Škarvada 6ccb10
-      mblen_buf[i] = 0;
Jaroslav Škarvada 6ccb10
-      inputwcs[i] = 0; /* sentinel */
Jaroslav Škarvada 6ccb10
+      prepare_wc_buf (p, end);
Jaroslav Škarvada 6ccb10
     }
Jaroslav Škarvada 6ccb10
 #endif /* MBS_SUPPORT */
Jaroslav Škarvada 6ccb10
 
Jaroslav Škarvada 6ccb10
@@ -2916,7 +2930,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
Jaroslav Škarvada 6ccb10
       if (d->mb_cur_max > 1)
Jaroslav Škarvada 6ccb10
 	while ((t = trans[s]))
Jaroslav Škarvada 6ccb10
 	  {
Jaroslav Škarvada 6ccb10
-	    if ((char *) p > end)
Jaroslav Škarvada 6ccb10
+	    if (p > buf_end)
Jaroslav Škarvada 6ccb10
 	      break;
Jaroslav Škarvada 6ccb10
 	    s1 = s;
Jaroslav Škarvada 6ccb10
 	    SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
Jaroslav Škarvada 6ccb10
@@ -2985,8 +2999,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
Jaroslav Škarvada 6ccb10
 	}
Jaroslav Škarvada 6ccb10
 
Jaroslav Škarvada 6ccb10
       /* If the previous character was a newline, count it. */
Jaroslav Škarvada 6ccb10
-      if (count && (char *) p <= end && p[-1] == eol)
Jaroslav Škarvada 6ccb10
-	++*count;
Jaroslav Škarvada 6ccb10
+      if ((char *) p <= end && p[-1] == eol)
Jaroslav Škarvada 6ccb10
+        {
Jaroslav Škarvada 6ccb10
+          if (count)
Jaroslav Škarvada 6ccb10
+            ++*count;
Jaroslav Škarvada 6ccb10
+
Jaroslav Škarvada 6ccb10
+#ifdef MBS_SUPPORT
Jaroslav Škarvada 6ccb10
+          if (d->mb_cur_max > 1)
Jaroslav Škarvada 6ccb10
+            prepare_wc_buf (p, end);
Jaroslav Škarvada 6ccb10
+#endif
Jaroslav Škarvada 6ccb10
+        }
Jaroslav Škarvada 6ccb10
 
Jaroslav Škarvada 6ccb10
       /* Check if we've run off the end of the buffer. */
Jaroslav Škarvada 6ccb10
       if ((char *) p > end)
Jaroslav Škarvada 6ccb10
-- 
Jaroslav Škarvada 6ccb10
1.6.6.1
Jaroslav Škarvada 6ccb10