|
|
13d99c |
--- a/src/dfa.c
|
|
|
13d99c |
+++ b/src/dfa.c
|
|
|
13d99c |
@@ -1238,6 +1238,20 @@ parse_bracket_exp (void)
|
|
|
13d99c |
return CSET + charclass_index (ccl);
|
|
|
13d99c |
}
|
|
|
13d99c |
|
|
|
13d99c |
+#define PUSH_LEX_STATE(s) \
|
|
|
13d99c |
+ do \
|
|
|
13d99c |
+ { \
|
|
|
13d99c |
+ char const *lexptr_saved = lexptr; \
|
|
|
13d99c |
+ size_t lexleft_saved = lexleft; \
|
|
|
13d99c |
+ lexptr = (s); \
|
|
|
13d99c |
+ lexleft = strlen (lexptr)
|
|
|
13d99c |
+
|
|
|
13d99c |
+#define POP_LEX_STATE() \
|
|
|
13d99c |
+ lexptr = lexptr_saved; \
|
|
|
13d99c |
+ lexleft = lexleft_saved; \
|
|
|
13d99c |
+ } \
|
|
|
13d99c |
+ while (0)
|
|
|
13d99c |
+
|
|
|
13d99c |
static token
|
|
|
13d99c |
lex (void)
|
|
|
13d99c |
{
|
|
|
13d99c |
@@ -1485,20 +1499,6 @@ lex (void)
|
|
|
13d99c |
return lasttok = CSET + charclass_index (ccl);
|
|
|
13d99c |
}
|
|
|
13d99c |
|
|
|
13d99c |
-#define PUSH_LEX_STATE(s) \
|
|
|
13d99c |
- do \
|
|
|
13d99c |
- { \
|
|
|
13d99c |
- char const *lexptr_saved = lexptr; \
|
|
|
13d99c |
- size_t lexleft_saved = lexleft; \
|
|
|
13d99c |
- lexptr = (s); \
|
|
|
13d99c |
- lexleft = strlen (lexptr)
|
|
|
13d99c |
-
|
|
|
13d99c |
-#define POP_LEX_STATE() \
|
|
|
13d99c |
- lexptr = lexptr_saved; \
|
|
|
13d99c |
- lexleft = lexleft_saved; \
|
|
|
13d99c |
- } \
|
|
|
13d99c |
- while (0)
|
|
|
13d99c |
-
|
|
|
13d99c |
/* FIXME: see if optimizing this, as is done with ANYCHAR and
|
|
|
13d99c |
add_utf8_anychar, makes sense. */
|
|
|
13d99c |
|
|
|
13d99c |
@@ -1518,14 +1518,33 @@ lex (void)
|
|
|
13d99c |
case 'W':
|
|
|
13d99c |
if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
|
|
|
13d99c |
goto normal_char;
|
|
|
13d99c |
- zeroset (ccl);
|
|
|
13d99c |
- for (c2 = 0; c2 < NOTCHAR; ++c2)
|
|
|
13d99c |
- if (IS_WORD_CONSTITUENT (c2))
|
|
|
13d99c |
- setbit (c2, ccl);
|
|
|
13d99c |
- if (c == 'W')
|
|
|
13d99c |
- notset (ccl);
|
|
|
13d99c |
+
|
|
|
13d99c |
+ if (!dfa->multibyte)
|
|
|
13d99c |
+ {
|
|
|
13d99c |
+ zeroset (ccl);
|
|
|
13d99c |
+ for (c2 = 0; c2 < NOTCHAR; ++c2)
|
|
|
13d99c |
+ if (IS_WORD_CONSTITUENT (c2))
|
|
|
13d99c |
+ setbit (c2, ccl);
|
|
|
13d99c |
+ if (c == 'W')
|
|
|
13d99c |
+ notset (ccl);
|
|
|
13d99c |
+ laststart = false;
|
|
|
13d99c |
+ return lasttok = CSET + charclass_index (ccl);
|
|
|
13d99c |
+ }
|
|
|
13d99c |
+
|
|
|
13d99c |
+ /* FIXME: see if optimizing this, as is done with ANYCHAR and
|
|
|
13d99c |
+ add_utf8_anychar, makes sense. */
|
|
|
13d99c |
+
|
|
|
13d99c |
+ /* \w and \W are documented to be equivalent to [_[:alnum:]] and
|
|
|
13d99c |
+ [^_[:alnum:]] respectively, so tell the lexer to process those
|
|
|
13d99c |
+ strings, each minus its "already processed" '['. */
|
|
|
13d99c |
+ PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]");
|
|
|
13d99c |
+
|
|
|
13d99c |
+ lasttok = parse_bracket_exp ();
|
|
|
13d99c |
+
|
|
|
13d99c |
+ POP_LEX_STATE ();
|
|
|
13d99c |
+
|
|
|
13d99c |
laststart = false;
|
|
|
13d99c |
- return lasttok = CSET + charclass_index (ccl);
|
|
|
13d99c |
+ return lasttok;
|
|
|
13d99c |
|
|
|
13d99c |
case '[':
|
|
|
13d99c |
if (backslash)
|
|
|
13d99c |
--- a/tests/Makefile.am
|
|
|
13d99c |
+++ b/tests/Makefile.am
|
|
|
13d99c |
@@ -110,6 +110,7 @@ TESTS = \
|
|
|
13d99c |
warn-char-classes \
|
|
|
13d99c |
word-delim-multibyte \
|
|
|
13d99c |
word-multi-file \
|
|
|
13d99c |
+ word-multibyte \
|
|
|
13d99c |
yesno
|
|
|
13d99c |
|
|
|
13d99c |
EXTRA_DIST = \
|
|
|
13d99c |
--- a/tests/Makefile.in
|
|
|
13d99c |
+++ b/tests/Makefile.in
|
|
|
13d99c |
@@ -1409,6 +1409,7 @@ TESTS = \
|
|
|
13d99c |
warn-char-classes \
|
|
|
13d99c |
word-delim-multibyte \
|
|
|
13d99c |
word-multi-file \
|
|
|
13d99c |
+ word-multibyte \
|
|
|
13d99c |
yesno
|
|
|
13d99c |
|
|
|
13d99c |
EXTRA_DIST = \
|
|
|
13d99c |
@@ -2286,6 +2287,13 @@ word-multi-file.log: word-multi-file
|
|
|
13d99c |
--log-file $$b.log --trs-file $$b.trs \
|
|
|
13d99c |
$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
|
|
|
13d99c |
"$$tst" $(AM_TESTS_FD_REDIRECT)
|
|
|
13d99c |
+word-multibyte.log: word-multibyte
|
|
|
13d99c |
+ @p='word-multibyte'; \
|
|
|
13d99c |
+ b='word-multibyte'; \
|
|
|
13d99c |
+ $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
|
|
|
13d99c |
+ --log-file $$b.log --trs-file $$b.trs \
|
|
|
13d99c |
+ $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
|
|
|
13d99c |
+ "$$tst" $(AM_TESTS_FD_REDIRECT)
|
|
|
13d99c |
yesno.log: yesno
|
|
|
13d99c |
@p='yesno'; \
|
|
|
13d99c |
b='yesno'; \
|
|
|
13d99c |
--- /dev/null
|
|
|
13d99c |
+++ b/tests/word-multibyte
|
|
|
13d99c |
@@ -0,0 +1,23 @@
|
|
|
13d99c |
+#!/bin/sh
|
|
|
13d99c |
+# This would fail for grep-2.20
|
|
|
13d99c |
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
|
|
|
13d99c |
+
|
|
|
13d99c |
+require_en_utf8_locale_
|
|
|
13d99c |
+
|
|
|
13d99c |
+printf '\303\241\n' > in || framework_failure_ # bytes C3 A1 in octal; \x escapes are not portable printf
|
|
|
13d99c |
+LC_ALL=en_US.UTF-8
|
|
|
13d99c |
+export LC_ALL
|
|
|
13d99c |
+
|
|
|
13d99c |
+fail=0
|
|
|
13d99c |
+
|
|
|
13d99c |
+for LOC in en_US.UTF-8 zh_CN $LOCALE_FR_UTF8; do
|
|
|
13d99c |
+ out=out1-$LOC
|
|
|
13d99c |
+ LC_ALL=$LOC grep '\w' in >$out || fail=1
|
|
|
13d99c |
+ compare in $out || fail=1
|
|
|
13d99c |
+
|
|
|
13d99c |
+ out=out2-$LOC
|
|
|
13d99c |
+ LC_ALL=$LOC grep '\W' in >$out && fail=1
|
|
|
13d99c |
+ compare /dev/null $out || fail=1
|
|
|
13d99c |
+done
|
|
|
13d99c |
+
|
|
|
13d99c |
+Exit $fail
|