|
Jaroslav Škarvada |
6ccb10 |
From 01422220ebf40f829c1f00418a96873b82f206ff Mon Sep 17 00:00:00 2001
|
|
Jaroslav Škarvada |
6ccb10 |
From: Paolo Bonzini <bonzini@gnu.org>
|
|
Jaroslav Škarvada |
6ccb10 |
Date: Mon, 19 Apr 2010 14:50:23 +0200
|
|
Jaroslav Škarvada |
6ccb10 |
Subject: [PATCH 1/2] dfa: optimize UTF-8 period
|
|
Jaroslav Škarvada |
6ccb10 |
|
|
Jaroslav Škarvada |
6ccb10 |
Backport of upstream commits 7a0ad00 and 42ac56a.
|
|
Jaroslav Škarvada |
6ccb10 |
|
|
Jaroslav Škarvada |
6ccb10 |
* src/dfa.h (struct dfa): Add utf8_anychar_classes.
|
|
Jaroslav Škarvada |
6ccb10 |
* src/dfa.c (add_utf8_anychar): New.
|
|
Jaroslav Škarvada |
6ccb10 |
(atom): Simplify if/else nesting. Call add_utf8_anychar for ANYCHAR
|
|
Jaroslav Škarvada |
6ccb10 |
in UTF-8 locales.
|
|
Jaroslav Škarvada |
6ccb10 |
(dfaoptimize): Abort on ANYCHAR.
|
|
Jaroslav Škarvada |
6ccb10 |
---
|
|
Jaroslav Škarvada |
6ccb10 |
src/dfa.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------
|
|
Jaroslav Škarvada |
6ccb10 |
src/dfa.h | 1 +
|
|
Jaroslav Škarvada |
6ccb10 |
2 files changed, 82 insertions(+), 14 deletions(-)
|
|
Jaroslav Škarvada |
6ccb10 |
|
|
Jaroslav Škarvada |
6ccb10 |
diff --git a/src/dfa.c b/src/dfa.c
|
|
Jaroslav Škarvada |
6ccb10 |
index ba78b08..e13c361 100644
|
|
Jaroslav Škarvada |
6ccb10 |
--- a/src/dfa.c
|
|
Jaroslav Škarvada |
6ccb10 |
+++ b/src/dfa.c
|
|
Jaroslav Škarvada |
6ccb10 |
@@ -1191,6 +1191,55 @@ addtok_wc (wint_t wc)
|
|
Jaroslav Škarvada |
6ccb10 |
}
|
|
Jaroslav Škarvada |
6ccb10 |
#endif
|
|
Jaroslav Škarvada |
6ccb10 |
|
|
Jaroslav Škarvada |
6ccb10 |
+static void
|
|
Jaroslav Škarvada |
6ccb10 |
+add_utf8_anychar (void)
|
|
Jaroslav Škarvada |
6ccb10 |
+{
|
|
Jaroslav Škarvada |
6ccb10 |
+ static const charclass utf8_classes[5] = {
|
|
Jaroslav Škarvada |
6ccb10 |
+ { 0, 0, 0, 0, ~0, ~0, 0, 0 }, /* 80-bf: non-lead bytes */
|
|
Jaroslav Škarvada |
6ccb10 |
+ { ~0, ~0, ~0, ~0, ~0, ~0, 0, 0xff000000 }, /* 00-bf, f8-ff: 1-byte/invalid */
|
|
Jaroslav Škarvada |
6ccb10 |
+ { 0, 0, 0, 0, 0, 0, ~0, 0 }, /* c0-df: 2-byte sequence */
|
|
Jaroslav Škarvada |
6ccb10 |
+ { 0, 0, 0, 0, 0, 0, 0, 0xffff }, /* e0-ef: 3-byte sequence */
|
|
Jaroslav Škarvada |
6ccb10 |
+ { 0, 0, 0, 0, 0, 0, 0, 0xff0000 } /* f0-f7: 4-byte sequence */
|
|
Jaroslav Škarvada |
6ccb10 |
+ };
|
|
Jaroslav Škarvada |
6ccb10 |
+ const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
|
|
Jaroslav Škarvada |
6ccb10 |
+ unsigned int i;
|
|
Jaroslav Škarvada |
6ccb10 |
+
|
|
Jaroslav Škarvada |
6ccb10 |
+ /* Define the five character classes that are needed below. */
|
|
Jaroslav Škarvada |
6ccb10 |
+ if (dfa->utf8_anychar_classes[0] == 0)
|
|
Jaroslav Škarvada |
6ccb10 |
+ for (i = 0; i < n; i++)
|
|
Jaroslav Škarvada |
6ccb10 |
+ {
|
|
Jaroslav Škarvada |
6ccb10 |
+ charclass c;
|
|
Jaroslav Škarvada |
6ccb10 |
+ memcpy (c, utf8_classes[i], sizeof c);
|
|
Jaroslav Škarvada |
6ccb10 |
+ if (i == 1)
|
|
Jaroslav Škarvada |
6ccb10 |
+ {
|
|
Jaroslav Škarvada |
6ccb10 |
+ if (!(syntax_bits & RE_DOT_NEWLINE))
|
|
Jaroslav Škarvada |
6ccb10 |
+ clrbit (eolbyte, c);
|
|
Jaroslav Škarvada |
6ccb10 |
+ if (syntax_bits & RE_DOT_NOT_NULL)
|
|
Jaroslav Škarvada |
6ccb10 |
+ clrbit ('\0', c);
|
|
Jaroslav Škarvada |
6ccb10 |
+ }
|
|
Jaroslav Škarvada |
6ccb10 |
+ dfa->utf8_anychar_classes[i] = CSET + charclass_index(c);
|
|
Jaroslav Škarvada |
6ccb10 |
+ }
|
|
Jaroslav Škarvada |
6ccb10 |
+
|
|
Jaroslav Škarvada |
6ccb10 |
+ /* A valid UTF-8 character is
|
|
Jaroslav Škarvada |
6ccb10 |
+
|
|
Jaroslav Škarvada |
6ccb10 |
+ ([0x00-0x7f]
|
|
Jaroslav Škarvada |
6ccb10 |
+ |[0xc2-0xdf][0x80-0xbf]
|
|
Jaroslav Škarvada |
6ccb10 |
+ |[0xe0-0xef][0x80-0xbf][0x80-0xbf]
|
|
Jaroslav Škarvada |
6ccb10 |
+ |[0xf0-0xf7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
|
|
Jaroslav Škarvada |
6ccb10 |
+
|
|
Jaroslav Škarvada |
6ccb10 |
+ which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x80-0xbf]
|
|
Jaroslav Škarvada |
6ccb10 |
+ and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
|
|
Jaroslav Škarvada |
6ccb10 |
+ Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
|
|
Jaroslav Škarvada |
6ccb10 |
+ for (i = 1; i < n; i++)
|
|
Jaroslav Škarvada |
6ccb10 |
+ addtok (dfa->utf8_anychar_classes[i]);
|
|
Jaroslav Škarvada |
6ccb10 |
+ while (--i > 1)
|
|
Jaroslav Škarvada |
6ccb10 |
+ {
|
|
Jaroslav Škarvada |
6ccb10 |
+ addtok (dfa->utf8_anychar_classes[0]);
|
|
Jaroslav Škarvada |
6ccb10 |
+ addtok (CAT);
|
|
Jaroslav Škarvada |
6ccb10 |
+ addtok (OR);
|
|
Jaroslav Škarvada |
6ccb10 |
+ }
|
|
Jaroslav Škarvada |
6ccb10 |
+}
|
|
Jaroslav Škarvada |
6ccb10 |
+
|
|
Jaroslav Škarvada |
6ccb10 |
/* The grammar understood by the parser is as follows.
|
|
Jaroslav Škarvada |
6ccb10 |
|
|
Jaroslav Škarvada |
6ccb10 |
regexp:
|
|
Jaroslav Škarvada |
6ccb10 |
@@ -1229,8 +1278,12 @@ addtok_wc (wint_t wc)
|
|
Jaroslav Škarvada |
6ccb10 |
static void
|
|
Jaroslav Škarvada |
6ccb10 |
atom (void)
|
|
Jaroslav Škarvada |
6ccb10 |
{
|
|
Jaroslav Škarvada |
6ccb10 |
+ if (0)
|
|
Jaroslav Škarvada |
6ccb10 |
+ {
|
|
Jaroslav Škarvada |
6ccb10 |
+ /* empty */
|
|
Jaroslav Škarvada |
6ccb10 |
+ }
|
|
Jaroslav Škarvada |
6ccb10 |
#ifdef MBS_SUPPORT
|
|
Jaroslav Škarvada |
6ccb10 |
- if (tok == WCHAR)
|
|
Jaroslav Škarvada |
6ccb10 |
+ else if (tok == WCHAR)
|
|
Jaroslav Škarvada |
6ccb10 |
{
|
|
Jaroslav Škarvada |
6ccb10 |
addtok_wc (case_fold ? towlower(wctok) : wctok);
|
|
Jaroslav Škarvada |
6ccb10 |
#ifndef GREP
|
|
Jaroslav Škarvada |
6ccb10 |
@@ -1242,16 +1295,28 @@ atom (void)
|
|
Jaroslav Škarvada |
6ccb10 |
#endif
|
|
Jaroslav Škarvada |
6ccb10 |
|
|
Jaroslav Škarvada |
6ccb10 |
tok = lex();
|
|
Jaroslav Škarvada |
6ccb10 |
- return;
|
|
Jaroslav Škarvada |
6ccb10 |
+ }
|
|
Jaroslav Škarvada |
6ccb10 |
+
|
|
Jaroslav Škarvada |
6ccb10 |
+ else if (tok == ANYCHAR && using_utf8())
|
|
Jaroslav Škarvada |
6ccb10 |
+ {
|
|
Jaroslav Škarvada |
6ccb10 |
+ /* For UTF-8 expand the period to a series of CSETs that define a valid
|
|
Jaroslav Škarvada |
6ccb10 |
+ UTF-8 character. This avoids using the slow multibyte path. I'm
|
|
Jaroslav Škarvada |
6ccb10 |
+ pretty sure it would be both profitable and correct to do it for
|
|
Jaroslav Škarvada |
6ccb10 |
+ any encoding; however, the optimization must be done manually as
|
|
Jaroslav Škarvada |
6ccb10 |
+ it is done above in add_utf8_anychar. So, let's start with
|
|
Jaroslav Škarvada |
6ccb10 |
+ UTF-8: it is the most used, and the structure of the encoding
|
|
Jaroslav Škarvada |
6ccb10 |
+ makes the correctness more obvious. */
|
|
Jaroslav Škarvada |
6ccb10 |
+ add_utf8_anychar();
|
|
Jaroslav Škarvada |
6ccb10 |
+ tok = lex();
|
|
Jaroslav Škarvada |
6ccb10 |
}
|
|
Jaroslav Škarvada |
6ccb10 |
#endif /* MBS_SUPPORT */
|
|
Jaroslav Škarvada |
6ccb10 |
|
|
Jaroslav Škarvada |
6ccb10 |
- if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
|
|
Jaroslav Škarvada |
6ccb10 |
- || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
|
|
Jaroslav Škarvada |
6ccb10 |
+ else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
|
|
Jaroslav Škarvada |
6ccb10 |
+ || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
|
|
Jaroslav Škarvada |
6ccb10 |
#ifdef MBS_SUPPORT
|
|
Jaroslav Škarvada |
6ccb10 |
- || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
|
|
Jaroslav Škarvada |
6ccb10 |
+ || tok == ANYCHAR || tok == MBCSET
|
|
Jaroslav Škarvada |
6ccb10 |
#endif /* MBS_SUPPORT */
|
|
Jaroslav Škarvada |
6ccb10 |
- || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
|
|
Jaroslav Škarvada |
6ccb10 |
+ || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
|
|
Jaroslav Škarvada |
6ccb10 |
{
|
|
Jaroslav Škarvada |
6ccb10 |
addtok(tok);
|
|
Jaroslav Škarvada |
6ccb10 |
tok = lex();
|
|
Jaroslav Škarvada |
6ccb10 |
@@ -3027,14 +3092,16 @@ dfaoptimize (struct dfa *d)
|
|
Jaroslav Škarvada |
6ccb10 |
for (i = 0; i < d->tindex; ++i)
|
|
Jaroslav Škarvada |
6ccb10 |
{
|
|
Jaroslav Škarvada |
6ccb10 |
switch(d->tokens[i])
|
|
Jaroslav Škarvada |
6ccb10 |
- {
|
|
Jaroslav Škarvada |
6ccb10 |
- case ANYCHAR:
|
|
Jaroslav Škarvada |
6ccb10 |
- case MBCSET:
|
|
Jaroslav Škarvada |
6ccb10 |
- /* Requires multi-byte algorithm. */
|
|
Jaroslav Škarvada |
6ccb10 |
- return;
|
|
Jaroslav Škarvada |
6ccb10 |
- default:
|
|
Jaroslav Škarvada |
6ccb10 |
- break;
|
|
Jaroslav Škarvada |
6ccb10 |
- }
|
|
Jaroslav Škarvada |
6ccb10 |
+ {
|
|
Jaroslav Škarvada |
6ccb10 |
+ case ANYCHAR:
|
|
Jaroslav Škarvada |
6ccb10 |
+ /* Should already have been lowered to CSETs by atom. */
|
|
Jaroslav Škarvada |
6ccb10 |
+ abort ();
|
|
Jaroslav Škarvada |
6ccb10 |
+ case MBCSET:
|
|
Jaroslav Škarvada |
6ccb10 |
+ /* Requires multi-byte algorithm. */
|
|
Jaroslav Škarvada |
6ccb10 |
+ return;
|
|
Jaroslav Škarvada |
6ccb10 |
+ default:
|
|
Jaroslav Škarvada |
6ccb10 |
+ break;
|
|
Jaroslav Škarvada |
6ccb10 |
+ }
|
|
Jaroslav Škarvada |
6ccb10 |
}
|
|
Jaroslav Škarvada |
6ccb10 |
|
|
Jaroslav Škarvada |
6ccb10 |
free_mbdata (d);
|
|
Jaroslav Škarvada |
6ccb10 |
diff --git a/src/dfa.h b/src/dfa.h
|
|
Jaroslav Škarvada |
6ccb10 |
index 1c85207..42c177a 100644
|
|
Jaroslav Škarvada |
6ccb10 |
--- a/src/dfa.h
|
|
Jaroslav Škarvada |
6ccb10 |
+++ b/src/dfa.h
|
|
Jaroslav Škarvada |
6ccb10 |
@@ -283,6 +283,7 @@ struct dfa
|
|
Jaroslav Škarvada |
6ccb10 |
with dfaparse(). */
|
|
Jaroslav Škarvada |
6ccb10 |
#ifdef MBS_SUPPORT
|
|
Jaroslav Škarvada |
6ccb10 |
unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */
|
|
Jaroslav Škarvada |
6ccb10 |
+ int utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
|
|
Jaroslav Škarvada |
6ccb10 |
|
|
Jaroslav Škarvada |
6ccb10 |
/* The following are used only if MB_CUR_MAX > 1. */
|
|
Jaroslav Škarvada |
6ccb10 |
|
|
Jaroslav Škarvada |
6ccb10 |
--
|
|
Jaroslav Škarvada |
6ccb10 |
1.6.6.1
|
|
Jaroslav Škarvada |
6ccb10 |
|