Speed up multibyte searching in grep's matchers (src/search.c).

* Remove check_multibyte_string() together with the per-call buffer copy
  and property array it required in EGexecute and Fexecute.
* Validate each kwset/DFA match offset instead by stepping through the
  buffer with mbrlen(); an offset that lands inside a multibyte character
  is rejected.
* Record in using_utf8 whether nl_langinfo (CODESET) reports "UTF-8"; when
  it does, the mbrlen() walk is skipped.
* In EGexecute, bypass kwsexec() when matching case-insensitively in a
  multibyte locale.
* When mbrlen() returns (size_t) -2 it has already absorbed the partial
  character into the conversion state, so reset the state before
  abandoning the candidate; otherwise the next scan starts mid-character.

Indentation in this copy uses spaces throughout and may not match the
pristine sources byte for byte; apply with patch's -l
(--ignore-whitespace) option.

--- grep-2.5.1/src/search.c 2004-12-16 17:46:57.039678304 +0000
+++ grep-2.5.1/src/search.c 2004-12-17 13:03:49.300731757 +0000
@@ -39,6 +39,9 @@
 #ifdef HAVE_LIBPCRE
 # include <pcre.h>
 #endif
+#ifdef HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
 
 #define NCHAR (UCHAR_MAX + 1)
 
@@ -70,9 +73,10 @@
    call the regexp matcher at all. */
 static int kwset_exact_matches;
 
-#if defined(MBS_SUPPORT)
-static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
-#endif
+/* UTF-8 encoding allows some optimizations that we can't otherwise
+   assume in a multibyte encoding. */
+static int using_utf8;
+
 static void kwsinit PARAMS ((void));
 static void kwsmusts PARAMS ((void));
 static void Gcompile PARAMS ((char const *, size_t));
@@ -84,6 +88,15 @@
 static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
 
 void
+check_utf8 (void)
+{
+#ifdef HAVE_LANGINFO_CODESET
+  if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
+    using_utf8 = 1;
+#endif
+}
+
+void
 dfaerror (char const *mesg)
 {
   error (2, 0, mesg);
@@ -141,47 +154,6 @@
     }
 }
 
-#ifdef MBS_SUPPORT
-/* This function allocate the array which correspond to "buf".
-   Then this check multibyte string and mark on the positions which
-   are not singlebyte character nor the first byte of a multibyte
-   character. Caller must free the array. */
-static char*
-check_multibyte_string(char const *buf, size_t size)
-{
-  char *mb_properties = xmalloc(size);
-  mbstate_t cur_state;
-  wchar_t wc;
-  int i;
-  memset(&cur_state, 0, sizeof(mbstate_t));
-  memset(mb_properties, 0, sizeof(char)*size);
-  for (i = 0; i < size ;)
-    {
-      size_t mbclen;
-      mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state);
-
-      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
-        {
-          /* An invalid sequence, or a truncated multibyte character.
-             We treat it as a singlebyte character. */
-          mbclen = 1;
-        }
-      else if (match_icase)
-        {
-          if (iswupper((wint_t)wc))
-            {
-              wc = towlower((wint_t)wc);
-              wcrtomb(buf + i, wc, &cur_state);
-            }
-        }
-      mb_properties[i] = mbclen;
-      i += mbclen;
-    }
-
-  return mb_properties;
-}
-#endif
-
 static void
 Gcompile (char const *pattern, size_t size)
 {
@@ -190,6 +162,7 @@
   size_t total = size;
   char const *motif = pattern;
 
+  check_utf8 ();
   re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0));
   dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
 
@@ -266,6 +239,7 @@
   size_t total = size;
   char const *motif = pattern;
 
+  check_utf8 ();
   if (strcmp (matcher, "awk") == 0)
     {
       re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0));
@@ -350,18 +324,8 @@
   struct kwsmatch kwsm;
   size_t i, ret_val;
 #ifdef MBS_SUPPORT
-  char *mb_properties = NULL;
-  if (MB_CUR_MAX > 1)
-    {
-      if (match_icase)
-        {
-          char *case_buf = xmalloc(size);
-          memcpy(case_buf, buf, size);
-          buf = case_buf;
-        }
-      if (kwset)
-        mb_properties = check_multibyte_string(buf, size);
-    }
+  mbstate_t mbs;
+  memset (&mbs, '\0', sizeof (mbstate_t));
 #endif /* MBS_SUPPORT */
 
   buflim = buf + size;
@@ -373,21 +337,66 @@
       if (kwset)
         {
           /* Find a possible match using the KWset matcher. */
-          size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
+#ifdef MBS_SUPPORT
+          size_t bytes_left = 0;
+#endif /* MBS_SUPPORT */
+          size_t offset;
+#ifdef MBS_SUPPORT
+          /* kwsexec doesn't work with match_icase and multibyte input. */
+          if (match_icase && MB_CUR_MAX > 1)
+            /* Avoid kwset */
+            offset = 0;
+          else
+#endif /* MBS_SUPPORT */
+          offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
           if (offset == (size_t) -1)
             goto failure;
+#ifdef MBS_SUPPORT
+          if (MB_CUR_MAX > 1 && !using_utf8)
+            {
+              bytes_left = offset;
+              while (bytes_left)
+                {
+                  size_t len = mbrlen (beg, bytes_left, &mbs);
+                  if (len == (size_t) -1 || len == 0)
+                    {
+                      /* Incomplete character: treat as single-byte. */
+                      memset (&mbs, '\0', sizeof (mbstate_t));
+                      beg++;
+                      bytes_left--;
+                      continue;
+                    }
+
+                  if (len == (size_t) -2)
+                    {
+                      /* Offset points inside multibyte character:
+                       * no good. */
+                      memset (&mbs, '\0', sizeof (mbstate_t));
+                      break;
+                    }
+
+                  beg += len;
+                  bytes_left -= len;
+                }
+            }
+          else
+#endif /* MBS_SUPPORT */
           beg += offset;
           /* Narrow down to the line containing the candidate, and
              run it through DFA. */
           end = memchr(beg, eol, buflim - beg);
           end++;
 #ifdef MBS_SUPPORT
-          if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
+          if (MB_CUR_MAX > 1 && bytes_left)
             continue;
-#endif
+#endif /* MBS_SUPPORT */
           while (beg > buf && beg[-1] != eol)
             --beg;
-          if (kwsm.index < kwset_exact_matches)
+          if (
+#ifdef MBS_SUPPORT
+              !(match_icase && MB_CUR_MAX > 1) &&
+#endif /* MBS_SUPPORT */
+              (kwsm.index < kwset_exact_matches))
             goto success_in_beg_and_end;
           if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
             continue;
@@ -395,13 +404,50 @@
       else
         {
           /* No good fixed strings; start with DFA. */
+#ifdef MBS_SUPPORT
+          size_t bytes_left = 0;
+#endif /* MBS_SUPPORT */
           size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
           if (offset == (size_t) -1)
             break;
           /* Narrow down to the line we've found. */
+#ifdef MBS_SUPPORT
+          if (MB_CUR_MAX > 1 && !using_utf8)
+            {
+              bytes_left = offset;
+              while (bytes_left)
+                {
+                  size_t len = mbrlen (beg, bytes_left, &mbs);
+                  if (len == (size_t) -1 || len == 0)
+                    {
+                      /* Incomplete character: treat as single-byte. */
+                      memset (&mbs, '\0', sizeof (mbstate_t));
+                      beg++;
+                      bytes_left--;
+                      continue;
+                    }
+
+                  if (len == (size_t) -2)
+                    {
+                      /* Offset points inside multibyte character:
+                       * no good. */
+                      memset (&mbs, '\0', sizeof (mbstate_t));
+                      break;
+                    }
+
+                  beg += len;
+                  bytes_left -= len;
+                }
+            }
+          else
+#endif /* MBS_SUPPORT */
           beg += offset;
           end = memchr (beg, eol, buflim - beg);
           end++;
+#ifdef MBS_SUPPORT
+          if (MB_CUR_MAX > 1 && bytes_left)
+            continue;
+#endif /* MBS_SUPPORT */
           while (beg > buf && beg[-1] != eol)
             --beg;
         }
@@ -469,15 +515,6 @@
     } /* for (beg = end ..) */
 
  failure:
-#ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
-    {
-      if (mb_properties)
-        free (mb_properties);
-      if (match_icase)
-        free ((char *) buf);
-    }
-#endif /* MBS_SUPPORT */
   return (size_t) -1;
 
  success_in_beg_and_end:
@@ -486,15 +523,6 @@
   /* FALLTHROUGH */
 
  success_in_start_and_len:
-#ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
-    {
-      if (mb_properties)
-        free (mb_properties);
-      if (match_icase)
-        free ((char *) buf);
-    }
-#endif /* MBS_SUPPORT */
   *match_size = len;
   return start;
 }
@@ -504,6 +532,7 @@
 {
   char const *beg, *lim, *err;
 
+  check_utf8 ();
   kwsinit ();
   beg = pattern;
   do
@@ -531,17 +560,8 @@
   struct kwsmatch kwsmatch;
   size_t ret_val;
 #ifdef MBS_SUPPORT
-  char *mb_properties = NULL;
-  if (MB_CUR_MAX > 1)
-    {
-      if (match_icase)
-        {
-          char *case_buf = xmalloc(size);
-          memcpy(case_buf, buf, size);
-          buf = case_buf;
-        }
-      mb_properties = check_multibyte_string(buf, size);
-    }
+  mbstate_t mbs;
+  memset (&mbs, '\0', sizeof (mbstate_t));
 #endif /* MBS_SUPPORT */
 
   for (beg = buf; beg <= buf + size; ++beg)
@@ -550,8 +570,36 @@
       if (offset == (size_t) -1)
         goto failure;
 #ifdef MBS_SUPPORT
-      if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
-        continue; /* It is a part of multibyte character. */
+      if (MB_CUR_MAX > 1 && !using_utf8)
+        {
+          size_t bytes_left = offset;
+          while (bytes_left)
+            {
+              size_t len = mbrlen (beg, bytes_left, &mbs);
+              if (len == (size_t) -1 || len == 0)
+                {
+                  /* Incomplete character: treat as single-byte. */
+                  memset (&mbs, '\0', sizeof (mbstate_t));
+                  beg++;
+                  bytes_left--;
+                  continue;
+                }
+
+              if (len == (size_t) -2)
+                {
+                  /* Offset points inside multibyte character: no good. */
+                  memset (&mbs, '\0', sizeof (mbstate_t));
+                  break;
+                }
+
+              beg += len;
+              bytes_left -= len;
+            }
+
+          if (bytes_left)
+            continue;
+        }
+      else
 #endif /* MBS_SUPPORT */
       beg += offset;
       len = kwsmatch.size[0];
@@ -587,6 +635,39 @@
               if (offset == -1) {
                 break; /* Try a different anchor. */
               }
+#ifdef MBS_SUPPORT
+              if (MB_CUR_MAX > 1 && !using_utf8)
+                {
+                  size_t bytes_left = offset;
+                  while (bytes_left)
+                    {
+                      size_t len = mbrlen (beg, bytes_left, &mbs);
+                      if (len == (size_t) -1 || len == 0)
+                        {
+                          /* Incomplete character: treat as single-byte. */
+                          memset (&mbs, '\0', sizeof (mbstate_t));
+                          beg++;
+                          bytes_left--;
+                          continue;
+                        }
+
+                      if (len == (size_t) -2)
+                        {
+                          /* Offset points inside multibyte character:
+                           * no good. */
+                          memset (&mbs, '\0', sizeof (mbstate_t));
+                          break;
+                        }
+
+                      beg += len;
+                      bytes_left -= len;
+                    }
+
+                  if (bytes_left)
+                    break; /* Try a different anchor. */
+                }
+              else
+#endif /* MBS_SUPPORT */
               beg += offset;
               len = kwsmatch.size[0];
             }
@@ -597,19 +678,31 @@
     }
 
  failure:
+  return -1;
+
+ success:
 #ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
+  if (MB_CUR_MAX > 1 && !using_utf8)
     {
-      if (match_icase)
-        free((char *) buf);
-      if (mb_properties)
-        free(mb_properties);
+      end = beg + len;
+      while (end < buf + size)
+        {
+          size_t len = mbrlen (end, buf + size - end, &mbs);
+          if (len == (size_t) -1 || len == (size_t) -2 || len == 0)
+            {
+              memset (&mbs, '\0', sizeof (mbstate_t));
+              len = 1;
+            }
+          if (len == 1 && *end == eol)
+            break;
+
+          end += len;
+        }
     }
+  else
 #endif /* MBS_SUPPORT */
-  return -1;
-
- success:
   end = memchr (beg + len, eol, (buf + size) - (beg + len));
+
   end++;
   while (buf < beg && beg[-1] != eol)
     --beg;
@@ -618,15 +711,6 @@
 
  success_in_beg_and_len:
   *match_size = len;
-#ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
-    {
-      if (mb_properties)
-        free (mb_properties);
-      if (match_icase)
-        free ((char *) buf);
-    }
-#endif /* MBS_SUPPORT */
   return beg - buf;
 }
 