|
|
23be49 |
diff --git a/src/grep.h b/src/grep.h
|
|
|
23be49 |
index 4935872..729c906 100644
|
|
|
23be49 |
--- a/src/grep.h
|
|
|
23be49 |
+++ b/src/grep.h
|
|
|
23be49 |
@@ -27,4 +27,19 @@ extern int match_words; /* -w */
|
|
|
23be49 |
extern int match_lines; /* -x */
|
|
|
23be49 |
extern unsigned char eolbyte; /* -z */
|
|
|
23be49 |
|
|
|
23be49 |
+/* An enum textbin describes the file's type, inferred from data read
|
|
|
23be49 |
+ before the first line is selected for output. */
|
|
|
23be49 |
+enum textbin
|
|
|
23be49 |
+ {
|
|
|
23be49 |
+ /* Binary, as it contains null bytes and the -z option is not in effect,
|
|
|
23be49 |
+ or it contains encoding errors. */
|
|
|
23be49 |
+ TEXTBIN_BINARY = -1,
|
|
|
23be49 |
+
|
|
|
23be49 |
+ /* Not known yet. Only text has been seen so far. */
|
|
|
23be49 |
+ TEXTBIN_UNKNOWN = 0,
|
|
|
23be49 |
+
|
|
|
23be49 |
+ /* Text. */
|
|
|
23be49 |
+ TEXTBIN_TEXT = 1
|
|
|
23be49 |
+ };
|
|
|
23be49 |
+
|
|
|
23be49 |
#endif
|
|
|
23be49 |
diff --git a/src/pcresearch.c b/src/pcresearch.c
|
|
|
23be49 |
index 820dd00..9938ffc 100644
|
|
|
23be49 |
--- a/src/pcresearch.c
|
|
|
23be49 |
+++ b/src/pcresearch.c
|
|
|
23be49 |
@@ -33,13 +33,19 @@ static pcre *cre;
|
|
|
23be49 |
/* Additional information about the pattern. */
|
|
|
23be49 |
static pcre_extra *extra;
|
|
|
23be49 |
|
|
|
23be49 |
-# ifdef PCRE_STUDY_JIT_COMPILE
|
|
|
23be49 |
-static pcre_jit_stack *jit_stack;
|
|
|
23be49 |
-# else
|
|
|
23be49 |
+# ifndef PCRE_STUDY_JIT_COMPILE
|
|
|
23be49 |
# define PCRE_STUDY_JIT_COMPILE 0
|
|
|
23be49 |
# endif
|
|
|
23be49 |
#endif
|
|
|
23be49 |
|
|
|
23be49 |
+/* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
|
|
|
23be49 |
+ string matches when that flag is used; each entry is a pcre_exec result. */
|
|
|
23be49 |
+static int empty_match[2];
|
|
|
23be49 |
+
|
|
|
23be49 |
+/* This must be at least 2; everything after that is for performance
|
|
|
23be49 |
+ in pcre_exec. */
|
|
|
23be49 |
+enum { NSUB = 300 };
|
|
|
23be49 |
+
|
|
|
23be49 |
void
|
|
|
23be49 |
Pcompile (char const *pattern, size_t size)
|
|
|
23be49 |
{
|
|
|
23be49 |
@@ -52,13 +58,17 @@ Pcompile (char const *pattern, size_t size)
|
|
|
23be49 |
char const *ep;
|
|
|
23be49 |
char *re = xnmalloc (4, size + 7);
|
|
|
23be49 |
int flags = (PCRE_MULTILINE
|
|
|
23be49 |
- | (match_icase ? PCRE_CASELESS : 0)
|
|
|
23be49 |
- | (using_utf8 () ? PCRE_UTF8 : 0));
|
|
|
23be49 |
+ | (match_icase ? PCRE_CASELESS : 0));
|
|
|
23be49 |
char const *patlim = pattern + size;
|
|
|
23be49 |
char *n = re;
|
|
|
23be49 |
char const *p;
|
|
|
23be49 |
char const *pnul;
|
|
|
23be49 |
|
|
|
23be49 |
+ if (using_utf8 ())
|
|
|
23be49 |
+ flags |= PCRE_UTF8;
|
|
|
23be49 |
+ else if (MB_CUR_MAX != 1)
|
|
|
23be49 |
+ error (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
|
|
|
23be49 |
+
|
|
|
23be49 |
/* FIXME: Remove these restrictions. */
|
|
|
23be49 |
if (memchr (pattern, '\n', size))
|
|
|
23be49 |
error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
|
|
|
23be49 |
@@ -114,14 +124,20 @@ Pcompile (char const *pattern, size_t size)
|
|
|
23be49 |
/* A 32K stack is allocated for the machine code by default, which
|
|
|
23be49 |
can grow to 512K if necessary. Since JIT uses far less memory
|
|
|
23be49 |
than the interpreter, this should be enough in practice. */
|
|
|
23be49 |
- jit_stack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024);
|
|
|
23be49 |
+ pcre_jit_stack *jit_stack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024);
|
|
|
23be49 |
if (!jit_stack)
|
|
|
23be49 |
error (EXIT_TROUBLE, 0,
|
|
|
23be49 |
_("failed to allocate memory for the PCRE JIT stack"));
|
|
|
23be49 |
pcre_assign_jit_stack (extra, NULL, jit_stack);
|
|
|
23be49 |
}
|
|
|
23be49 |
+
|
|
|
23be49 |
# endif
|
|
|
23be49 |
free (re);
|
|
|
23be49 |
+
|
|
|
23be49 |
+ int sub[NSUB];
|
|
|
23be49 |
+ empty_match[false] = pcre_exec (cre, extra, "", 0, 0,
|
|
|
23be49 |
+ PCRE_NOTBOL, sub, NSUB);
|
|
|
23be49 |
+ empty_match[true] = pcre_exec (cre, extra, "", 0, 0, 0, sub, NSUB);
|
|
|
23be49 |
#endif /* HAVE_LIBPCRE */
|
|
|
23be49 |
}
|
|
|
23be49 |
|
|
|
23be49 |
@@ -134,36 +150,110 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
|
|
|
23be49 |
error (EXIT_TROUBLE, 0, _("internal error"));
|
|
|
23be49 |
return -1;
|
|
|
23be49 |
#else
|
|
|
23be49 |
- /* This array must have at least two elements; everything after that
|
|
|
23be49 |
- is just for performance improvement in pcre_exec. */
|
|
|
23be49 |
- int sub[300];
|
|
|
23be49 |
-
|
|
|
23be49 |
- const char *line_buf, *line_end, *line_next;
|
|
|
23be49 |
+ int sub[NSUB];
|
|
|
23be49 |
+ char const *p = start_ptr ? start_ptr : buf;
|
|
|
23be49 |
+ bool bol = p[-1] == eolbyte;
|
|
|
23be49 |
+ char const *line_start = buf;
|
|
|
23be49 |
int e = PCRE_ERROR_NOMATCH;
|
|
|
23be49 |
- ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
|
|
|
23be49 |
+ char const *line_end;
|
|
|
23be49 |
|
|
|
23be49 |
- /* PCRE can't limit the matching to single lines, therefore we have to
|
|
|
23be49 |
- match each line in the buffer separately. */
|
|
|
23be49 |
- for (line_next = buf;
|
|
|
23be49 |
- e == PCRE_ERROR_NOMATCH && line_next < buf + size;
|
|
|
23be49 |
- start_ofs -= line_next - line_buf)
|
|
|
23be49 |
+ /* If the input type is unknown, the caller is still testing the
|
|
|
23be49 |
+ input, which means the current buffer cannot contain encoding
|
|
|
23be49 |
+ errors and a multiline search is typically more efficient.
|
|
|
23be49 |
+ Otherwise, a single-line search is typically faster, so that
|
|
|
23be49 |
+ pcre_exec doesn't waste time validating the entire input
|
|
|
23be49 |
+ buffer. NOTE(review): TEXTBIN_UNKNOWN is 0, so multiline is always false as written. */
|
|
|
23be49 |
+ bool multiline = TEXTBIN_UNKNOWN;
|
|
|
23be49 |
+
|
|
|
23be49 |
+ for (; p < buf + size; p = line_start = line_end + 1)
|
|
|
23be49 |
{
|
|
|
23be49 |
- line_buf = line_next;
|
|
|
23be49 |
- line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
|
|
|
23be49 |
- if (line_end == NULL)
|
|
|
23be49 |
- line_next = line_end = buf + size;
|
|
|
23be49 |
- else
|
|
|
23be49 |
- line_next = line_end + 1;
|
|
|
23be49 |
+ bool too_big;
|
|
|
23be49 |
|
|
|
23be49 |
- if (start_ptr && start_ptr >= line_end)
|
|
|
23be49 |
- continue;
|
|
|
23be49 |
+ if (multiline)
|
|
|
23be49 |
+ {
|
|
|
23be49 |
+ size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
|
|
|
23be49 |
+ size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
|
|
|
23be49 |
+ line_end = memrchr (p, eolbyte, scan_size);
|
|
|
23be49 |
+ too_big = ! line_end;
|
|
|
23be49 |
+ }
|
|
|
23be49 |
+ else
|
|
|
23be49 |
+ {
|
|
|
23be49 |
+ line_end = memchr (p, eolbyte, buf + size - p);
|
|
|
23be49 |
+ too_big = INT_MAX < line_end - p;
|
|
|
23be49 |
+ }
|
|
|
23be49 |
|
|
|
23be49 |
- if (INT_MAX < line_end - line_buf)
|
|
|
23be49 |
+ if (too_big)
|
|
|
23be49 |
error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
|
|
|
23be49 |
|
|
|
23be49 |
- e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
|
|
|
23be49 |
- start_ofs < 0 ? 0 : start_ofs, 0,
|
|
|
23be49 |
- sub, sizeof sub / sizeof *sub);
|
|
|
23be49 |
+ for (;;)
|
|
|
23be49 |
+ {
|
|
|
23be49 |
+ /* Skip past bytes that are easily determined to be encoding
|
|
|
23be49 |
+ errors, treating them as data that cannot match. This is
|
|
|
23be49 |
+ faster than having pcre_exec check them. */
|
|
|
23be49 |
+ while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
|
|
|
23be49 |
+ {
|
|
|
23be49 |
+ p++;
|
|
|
23be49 |
+ bol = false;
|
|
|
23be49 |
+ }
|
|
|
23be49 |
+
|
|
|
23be49 |
+ /* Check for an empty match; this is faster than letting
|
|
|
23be49 |
+ pcre_exec do it. */
|
|
|
23be49 |
+ int search_bytes = line_end - p;
|
|
|
23be49 |
+ if (search_bytes == 0)
|
|
|
23be49 |
+ {
|
|
|
23be49 |
+ sub[0] = sub[1] = 0;
|
|
|
23be49 |
+ e = empty_match[bol];
|
|
|
23be49 |
+ break;
|
|
|
23be49 |
+ }
|
|
|
23be49 |
+
|
|
|
23be49 |
+ int options = 0;
|
|
|
23be49 |
+ if (!bol)
|
|
|
23be49 |
+ options |= PCRE_NOTBOL;
|
|
|
23be49 |
+ if (multiline)
|
|
|
23be49 |
+ options |= PCRE_NO_UTF8_CHECK;
|
|
|
23be49 |
+
|
|
|
23be49 |
+ e = pcre_exec (cre, extra, p, search_bytes, 0,
|
|
|
23be49 |
+ options, sub, NSUB);
|
|
|
23be49 |
+ if (e != PCRE_ERROR_BADUTF8)
|
|
|
23be49 |
+ {
|
|
|
23be49 |
+ if (0 < e && multiline && sub[1] - sub[0] != 0)
|
|
|
23be49 |
+ {
|
|
|
23be49 |
+ char const *nl = memchr (p + sub[0], eolbyte,
|
|
|
23be49 |
+ sub[1] - sub[0]);
|
|
|
23be49 |
+ if (nl)
|
|
|
23be49 |
+ {
|
|
|
23be49 |
+ /* This match crosses a line boundary; reject it. */
|
|
|
23be49 |
+ p += sub[0];
|
|
|
23be49 |
+ line_end = nl;
|
|
|
23be49 |
+ continue;
|
|
|
23be49 |
+ }
|
|
|
23be49 |
+ }
|
|
|
23be49 |
+ break;
|
|
|
23be49 |
+ }
|
|
|
23be49 |
+ int valid_bytes = sub[0];
|
|
|
23be49 |
+
|
|
|
23be49 |
+ /* Try to match the string before the encoding error.
|
|
|
23be49 |
+ Again, handle the empty-match case specially, for speed. */
|
|
|
23be49 |
+ if (valid_bytes == 0)
|
|
|
23be49 |
+ {
|
|
|
23be49 |
+ sub[1] = 0;
|
|
|
23be49 |
+ e = empty_match[bol];
|
|
|
23be49 |
+ }
|
|
|
23be49 |
+ else
|
|
|
23be49 |
+ e = pcre_exec (cre, extra, p, valid_bytes, 0,
|
|
|
23be49 |
+ options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
|
|
|
23be49 |
+ sub, NSUB);
|
|
|
23be49 |
+ if (e != PCRE_ERROR_NOMATCH)
|
|
|
23be49 |
+ break;
|
|
|
23be49 |
+
|
|
|
23be49 |
+ /* Treat the encoding error as data that cannot match. */
|
|
|
23be49 |
+ p += valid_bytes + 1;
|
|
|
23be49 |
+ bol = false;
|
|
|
23be49 |
+ }
|
|
|
23be49 |
+
|
|
|
23be49 |
+ if (e != PCRE_ERROR_NOMATCH)
|
|
|
23be49 |
+ break;
|
|
|
23be49 |
+ bol = true;
|
|
|
23be49 |
}
|
|
|
23be49 |
|
|
|
23be49 |
if (e <= 0)
|
|
|
23be49 |
@@ -171,7 +261,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
|
|
|
23be49 |
switch (e)
|
|
|
23be49 |
{
|
|
|
23be49 |
case PCRE_ERROR_NOMATCH:
|
|
|
23be49 |
- return -1;
|
|
|
23be49 |
+ break;
|
|
|
23be49 |
|
|
|
23be49 |
case PCRE_ERROR_NOMEMORY:
|
|
|
23be49 |
error (EXIT_TROUBLE, 0, _("memory exhausted"));
|
|
|
23be49 |
@@ -180,10 +270,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
|
|
|
23be49 |
error (EXIT_TROUBLE, 0,
|
|
|
23be49 |
_("exceeded PCRE's backtracking limit"));
|
|
|
23be49 |
|
|
|
23be49 |
- case PCRE_ERROR_BADUTF8:
|
|
|
23be49 |
- error (EXIT_TROUBLE, 0,
|
|
|
23be49 |
- _("invalid UTF-8 byte sequence in input"));
|
|
|
23be49 |
-
|
|
|
23be49 |
default:
|
|
|
23be49 |
/* For now, we lump all remaining PCRE failures into this basket.
|
|
|
23be49 |
If anyone cares to provide sample grep usage that can trigger
|
|
|
23be49 |
@@ -192,30 +278,33 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
|
|
|
23be49 |
error (EXIT_TROUBLE, 0, _("internal PCRE error: %d"), e);
|
|
|
23be49 |
}
|
|
|
23be49 |
|
|
|
23be49 |
- /* NOTREACHED */
|
|
|
23be49 |
return -1;
|
|
|
23be49 |
}
|
|
|
23be49 |
else
|
|
|
23be49 |
{
|
|
|
23be49 |
- /* Narrow down to the line we've found. */
|
|
|
23be49 |
- char const *beg = line_buf + sub[0];
|
|
|
23be49 |
- char const *end = line_buf + sub[1];
|
|
|
23be49 |
- char const *buflim = buf + size;
|
|
|
23be49 |
- char eol = eolbyte;
|
|
|
23be49 |
- if (!start_ptr)
|
|
|
23be49 |
+ char const *matchbeg = p + sub[0];
|
|
|
23be49 |
+ char const *matchend = p + sub[1];
|
|
|
23be49 |
+ char const *beg;
|
|
|
23be49 |
+ char const *end;
|
|
|
23be49 |
+ if (start_ptr)
|
|
|
23be49 |
{
|
|
|
23be49 |
- /* FIXME: The case when '\n' is not found indicates a bug:
|
|
|
23be49 |
- Since grep is line oriented, the match should never contain
|
|
|
23be49 |
- a newline, so there _must_ be a newline following.
|
|
|
23be49 |
- */
|
|
|
23be49 |
- if (!(end = memchr (end, eol, buflim - end)))
|
|
|
23be49 |
- end = buflim;
|
|
|
23be49 |
- else
|
|
|
23be49 |
- end++;
|
|
|
23be49 |
- while (buf < beg && beg[-1] != eol)
|
|
|
23be49 |
- --beg;
|
|
|
23be49 |
+ beg = matchbeg;
|
|
|
23be49 |
+ end = matchend;
|
|
|
23be49 |
+ }
|
|
|
23be49 |
+ else if (multiline)
|
|
|
23be49 |
+ {
|
|
|
23be49 |
+ char const *prev_nl = memrchr (line_start - 1, eolbyte,
|
|
|
23be49 |
+ matchbeg - (line_start - 1));
|
|
|
23be49 |
+ char const *next_nl = memchr (matchend, eolbyte,
|
|
|
23be49 |
+ line_end + 1 - matchend);
|
|
|
23be49 |
+ beg = prev_nl + 1;
|
|
|
23be49 |
+ end = next_nl + 1;
|
|
|
23be49 |
+ }
|
|
|
23be49 |
+ else
|
|
|
23be49 |
+ {
|
|
|
23be49 |
+ beg = line_start;
|
|
|
23be49 |
+ end = line_end + 1;
|
|
|
23be49 |
}
|
|
|
23be49 |
-
|
|
|
23be49 |
*match_size = end - beg;
|
|
|
23be49 |
return beg - buf;
|
|
|
23be49 |
}
|
|
|
23be49 |
diff --git a/src/search.h b/src/search.h
|
|
|
23be49 |
index 14877bc..e671bea 100644
|
|
|
23be49 |
--- a/src/search.h
|
|
|
23be49 |
+++ b/src/search.h
|
|
|
23be49 |
@@ -45,6 +45,7 @@ extern void kwsinit (kwset_t *);
|
|
|
23be49 |
|
|
|
23be49 |
extern char *mbtoupper (char const *, size_t *, mb_len_map_t **);
|
|
|
23be49 |
extern void build_mbclen_cache (void);
|
|
|
23be49 |
+extern size_t mbclen_cache[];
|
|
|
23be49 |
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
|
|
|
23be49 |
extern wint_t mb_prev_wc (char const *, char const *, char const *);
|
|
|
23be49 |
extern wint_t mb_next_wc (char const *, char const *);
|
|
|
23be49 |
diff --git a/src/searchutils.c b/src/searchutils.c
|
|
|
23be49 |
index 5eb9a12..aba9335 100644
|
|
|
23be49 |
--- a/src/searchutils.c
|
|
|
23be49 |
+++ b/src/searchutils.c
|
|
|
23be49 |
@@ -22,7 +22,7 @@
|
|
|
23be49 |
|
|
|
23be49 |
#define NCHAR (UCHAR_MAX + 1)
|
|
|
23be49 |
|
|
|
23be49 |
-static size_t mbclen_cache[NCHAR];
|
|
|
23be49 |
+size_t mbclen_cache[NCHAR];
|
|
|
23be49 |
|
|
|
23be49 |
void
|
|
|
23be49 |
kwsinit (kwset_t *kwset)
|
|
|
23be49 |
diff --git a/tests/pcre-infloop b/tests/pcre-infloop
|
|
|
23be49 |
index 1b33e72..8054844 100755
|
|
|
23be49 |
--- a/tests/pcre-infloop
|
|
|
23be49 |
+++ b/tests/pcre-infloop
|
|
|
23be49 |
@@ -18,16 +18,16 @@
|
|
|
23be49 |
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
23be49 |
|
|
|
23be49 |
. "${srcdir=.}/init.sh"; path_prepend_ ../src
|
|
|
23be49 |
-require_pcre_
|
|
|
23be49 |
require_timeout_
|
|
|
23be49 |
require_en_utf8_locale_
|
|
|
23be49 |
require_compiled_in_MB_support
|
|
|
23be49 |
+LC_ALL=en_US.UTF-8 require_pcre_
|
|
|
23be49 |
|
|
|
23be49 |
printf 'a\201b\r' > in || framework_failure_
|
|
|
23be49 |
|
|
|
23be49 |
fail=0
|
|
|
23be49 |
|
|
|
23be49 |
LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in
|
|
|
23be49 |
-test $? = 2 || fail_ "libpcre's match function appears to infloop"
|
|
|
23be49 |
+test $? = 1 || fail_ "libpcre's match function appears to infloop"
|
|
|
23be49 |
|
|
|
23be49 |
Exit $fail
|
|
|
23be49 |
diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input
|
|
|
23be49 |
index 913e8ee..abcc7e8 100755
|
|
|
23be49 |
--- a/tests/pcre-invalid-utf8-input
|
|
|
23be49 |
+++ b/tests/pcre-invalid-utf8-input
|
|
|
23be49 |
@@ -8,14 +8,19 @@
|
|
|
23be49 |
# notice and this notice are preserved.
|
|
|
23be49 |
|
|
|
23be49 |
. "${srcdir=.}/init.sh"; path_prepend_ ../src
|
|
|
23be49 |
-require_pcre_
|
|
|
23be49 |
+require_timeout_
|
|
|
23be49 |
require_en_utf8_locale_
|
|
|
23be49 |
+require_compiled_in_MB_support
|
|
|
23be49 |
+LC_ALL=en_US.UTF-8 require_pcre_
|
|
|
23be49 |
|
|
|
23be49 |
fail=0
|
|
|
23be49 |
|
|
|
23be49 |
-printf 'j\202\nj\n' > in || framework_failure_
|
|
|
23be49 |
+printf 'j\202j\nj\nk\202\n' > in || framework_failure_
|
|
|
23be49 |
|
|
|
23be49 |
-LC_ALL=en_US.UTF-8 grep -P j in
|
|
|
23be49 |
-test $? -eq 2 || fail=1
|
|
|
23be49 |
+LC_ALL=en_US.UTF-8 timeout 3 grep -P j in
|
|
|
23be49 |
+test $? -eq 0 || fail=1
|
|
|
23be49 |
+
|
|
|
23be49 |
+LC_ALL=en_US.UTF-8 timeout 3 grep -P 'k$' in
|
|
|
23be49 |
+test $? -eq 1 || fail=1
|
|
|
23be49 |
|
|
|
23be49 |
Exit $fail
|
|
|
23be49 |
diff --git a/tests/pcre-utf8 b/tests/pcre-utf8
|
|
|
23be49 |
index 41676f4..2dda116 100755
|
|
|
23be49 |
--- a/tests/pcre-utf8
|
|
|
23be49 |
+++ b/tests/pcre-utf8
|
|
|
23be49 |
@@ -8,8 +8,8 @@
|
|
|
23be49 |
# notice and this notice are preserved.
|
|
|
23be49 |
|
|
|
23be49 |
. "${srcdir=.}/init.sh"; path_prepend_ ../src
|
|
|
23be49 |
-require_pcre_
|
|
|
23be49 |
require_en_utf8_locale_
|
|
|
23be49 |
+LC_ALL=en_US.UTF-8 require_pcre_
|
|
|
23be49 |
|
|
|
23be49 |
fail=0
|
|
|
23be49 |
|