2110 lines
55 KiB
Diff
2110 lines
55 KiB
Diff
Submitted by: Alexander E. Patrakov
|
|
Date: 2005-08-13
|
|
Initial Package Version: 2.5.1a
|
|
Upstream Status: Partially accepted, partially rejected, but required for LSB >= 2.0 certification
|
|
Origin: RedHat
|
|
Description: Various fixes from RedHat. Individual patches:
|
|
|
|
grep-2.5.1-fgrep.patch
|
|
grep-2.5.1-bracket.patch
|
|
grep-2.5-i18n.patch
|
|
grep-2.5.1-oi.patch
|
|
grep-2.5.1-manpage.patch
|
|
grep-2.5.1-color.patch
|
|
grep-2.5.1-icolor.patch
|
|
grep-2.5.1-egf-speedup.patch
|
|
grep-2.5.1-dfa-optional.patch
|
|
grep-2.5.1-tests.patch
|
|
grep-2.5.1-w.patch
|
|
|
|
Testcases:
|
|
|
|
-fgrep: ???, but required for other patches
|
|
-bracket: echo "[" | LANG=en_US.UTF-8 grep "[[:space:]]"
|
|
-i18n: many fixes for multibyte locale support, required for LSB.
|
|
-oi: echo xxYYzz | LANG=C grep -i -o yy
|
|
-manpage: fixes a typo (--line-buffering should be --line-buffered) and adds a missing .TP before the -P entry
|
|
-color: restores the background color correctly by emitting the erase-to-end-of-line sequence (ESC[K) after colored matches
|
|
-icolor: ??? echo 'spam foo SPAM FOO' | grep -i --color spam
|
|
(but that's also fixed by -oi. Is this patch just a cleanup?)
|
|
-egf-speedup: without this, grep is as slow as a snail in UTF-8 locales.
|
|
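-dfa-optional: disables the DFA matcher in multibyte locales by default; set the GREP_USE_DFA environment variable to a non-zero number to re-enable it (or to 0 to disable it everywhere).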
|
|
-w: (echo 'foo';echo 'fo') > /tmp/testfile && grep -F -w fo /tmp/testfile
|
|
|
|
diff -urN grep-2.5.1a.orig/doc/grep.1 grep-2.5.1a/doc/grep.1
|
|
--- grep-2.5.1a.orig/doc/grep.1 2004-11-12 16:26:37.000000000 +0500
|
|
+++ grep-2.5.1a/doc/grep.1 2005-10-23 09:49:43.000000000 +0600
|
|
@@ -191,6 +191,7 @@
|
|
.I PATTERN
|
|
as a list of fixed strings, separated by newlines,
|
|
any of which is to be matched.
|
|
+.TP
|
|
.BR \-P ", " \-\^\-perl-regexp
|
|
Interpret
|
|
.I PATTERN
|
|
@@ -302,7 +303,7 @@
|
|
This is especially useful for tools like zgrep, e.g.
|
|
.B "gzip -cd foo.gz |grep --label=foo something"
|
|
.TP
|
|
-.BR \-\^\-line-buffering
|
|
+.BR \-\^\-line-buffered
|
|
Use line buffering, it can be a performance penality.
|
|
.TP
|
|
.BR \-q ", " \-\^\-quiet ", " \-\^\-silent
|
|
diff -urN grep-2.5.1a.orig/lib/posix/regex.h grep-2.5.1a/lib/posix/regex.h
|
|
--- grep-2.5.1a.orig/lib/posix/regex.h 2001-04-02 23:56:50.000000000 +0600
|
|
+++ grep-2.5.1a/lib/posix/regex.h 2005-10-23 09:49:31.000000000 +0600
|
|
@@ -109,6 +109,10 @@
|
|
If not set, \{, \}, {, and } are literals. */
|
|
#define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
|
|
|
|
+/* If this bit is set, then ignore case when matching.
|
|
+ If not set, then case is significant. */
|
|
+#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1)
|
|
+
|
|
/* If this bit is set, +, ? and | aren't recognized as operators.
|
|
If not set, they are. */
|
|
#define RE_LIMITED_OPS (RE_INTERVALS << 1)
|
|
diff -urN grep-2.5.1a.orig/src/dfa.c grep-2.5.1a/src/dfa.c
|
|
--- grep-2.5.1a.orig/src/dfa.c 2001-09-26 22:57:55.000000000 +0600
|
|
+++ grep-2.5.1a/src/dfa.c 2005-10-23 09:49:17.000000000 +0600
|
|
@@ -414,7 +414,7 @@
|
|
|
|
/* This function fetch a wide character, and update cur_mb_len,
|
|
used only if the current locale is a multibyte environment. */
|
|
-static wchar_t
|
|
+static wint_t
|
|
fetch_wc (char const *eoferr)
|
|
{
|
|
wchar_t wc;
|
|
@@ -423,7 +423,7 @@
|
|
if (eoferr != 0)
|
|
dfaerror (eoferr);
|
|
else
|
|
- return -1;
|
|
+ return WEOF;
|
|
}
|
|
|
|
cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
|
|
@@ -459,7 +459,7 @@
|
|
static void
|
|
parse_bracket_exp_mb ()
|
|
{
|
|
- wchar_t wc, wc1, wc2;
|
|
+ wint_t wc, wc1, wc2;
|
|
|
|
/* Work area to build a mb_char_classes. */
|
|
struct mb_char_classes *work_mbc;
|
|
@@ -496,7 +496,7 @@
|
|
work_mbc->invert = 0;
|
|
do
|
|
{
|
|
- wc1 = -1; /* mark wc1 is not initialized". */
|
|
+ wc1 = WEOF; /* mark wc1 is not initialized". */
|
|
|
|
/* Note that if we're looking at some other [:...:] construct,
|
|
we just treat it as a bunch of ordinary characters. We can do
|
|
@@ -586,7 +586,7 @@
|
|
work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
|
|
}
|
|
}
|
|
- wc = -1;
|
|
+ wc1 = wc = WEOF;
|
|
}
|
|
else
|
|
/* We treat '[' as a normal character here. */
|
|
@@ -600,7 +600,7 @@
|
|
wc = fetch_wc(("Unbalanced ["));
|
|
}
|
|
|
|
- if (wc1 == -1)
|
|
+ if (wc1 == WEOF)
|
|
wc1 = fetch_wc(_("Unbalanced ["));
|
|
|
|
if (wc1 == L'-')
|
|
@@ -630,17 +630,17 @@
|
|
}
|
|
REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
|
|
range_sts_al, work_mbc->nranges + 1);
|
|
- work_mbc->range_sts[work_mbc->nranges] = wc;
|
|
+ work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
|
|
REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
|
|
range_ends_al, work_mbc->nranges + 1);
|
|
- work_mbc->range_ends[work_mbc->nranges++] = wc2;
|
|
+ work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
|
|
}
|
|
- else if (wc != -1)
|
|
+ else if (wc != WEOF)
|
|
/* build normal characters. */
|
|
{
|
|
REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
|
|
work_mbc->nchars + 1);
|
|
- work_mbc->chars[work_mbc->nchars++] = wc;
|
|
+ work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
|
|
}
|
|
}
|
|
while ((wc = wc1) != L']');
|
|
@@ -2552,6 +2552,8 @@
|
|
}
|
|
|
|
/* match with a character? */
|
|
+ if (case_fold)
|
|
+ wc = towlower (wc);
|
|
for (i = 0; i<work_mbc->nchars; i++)
|
|
{
|
|
if (wc == work_mbc->chars[i])
|
|
diff -urN grep-2.5.1a.orig/src/grep.c grep-2.5.1a/src/grep.c
|
|
--- grep-2.5.1a.orig/src/grep.c 2004-11-12 16:25:35.000000000 +0500
|
|
+++ grep-2.5.1a/src/grep.c 2005-10-23 09:50:06.000000000 +0600
|
|
@@ -30,6 +30,12 @@
|
|
# include <sys/time.h>
|
|
# include <sys/resource.h>
|
|
#endif
|
|
+#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
|
|
+/* We can handle multibyte string. */
|
|
+# define MBS_SUPPORT
|
|
+# include <wchar.h>
|
|
+# include <wctype.h>
|
|
+#endif
|
|
#include <stdio.h>
|
|
#include "system.h"
|
|
#include "getopt.h"
|
|
@@ -558,33 +564,6 @@
|
|
{
|
|
size_t match_size;
|
|
size_t match_offset;
|
|
- if(match_icase)
|
|
- {
|
|
- /* Yuck, this is tricky */
|
|
- char *buf = (char*) xmalloc (lim - beg);
|
|
- char *ibeg = buf;
|
|
- char *ilim = ibeg + (lim - beg);
|
|
- int i;
|
|
- for (i = 0; i < lim - beg; i++)
|
|
- ibeg[i] = tolower (beg[i]);
|
|
- while ((match_offset = (*execute) (ibeg, ilim-ibeg, &match_size, 1))
|
|
- != (size_t) -1)
|
|
- {
|
|
- char const *b = beg + match_offset;
|
|
- if (b == lim)
|
|
- break;
|
|
- fwrite (beg, sizeof (char), match_offset, stdout);
|
|
- printf ("\33[%sm", grep_color);
|
|
- fwrite (b, sizeof (char), match_size, stdout);
|
|
- fputs ("\33[00m", stdout);
|
|
- beg = b + match_size;
|
|
- ibeg = ibeg + match_offset + match_size;
|
|
- }
|
|
- fwrite (beg, 1, lim - beg, stdout);
|
|
- free (buf);
|
|
- lastout = lim;
|
|
- return;
|
|
- }
|
|
while (lim-beg && (match_offset = (*execute) (beg, lim - beg, &match_size, 1))
|
|
!= (size_t) -1)
|
|
{
|
|
@@ -601,6 +580,7 @@
|
|
fputs ("\33[00m", stdout);
|
|
beg = b + match_size;
|
|
}
|
|
+ fputs ("\33[K", stdout);
|
|
}
|
|
fwrite (beg, 1, lim - beg, stdout);
|
|
if (ferror (stdout))
|
|
@@ -1697,6 +1677,37 @@
|
|
if (!install_matcher (matcher) && !install_matcher ("default"))
|
|
abort ();
|
|
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (MB_CUR_MAX != 1 && match_icase)
|
|
+ {
|
|
+ wchar_t wc;
|
|
+ mbstate_t cur_state, prev_state;
|
|
+ int i, len = strlen(keys);
|
|
+
|
|
+ memset(&cur_state, 0, sizeof(mbstate_t));
|
|
+ for (i = 0; i <= len ;)
|
|
+ {
|
|
+ size_t mbclen;
|
|
+ mbclen = mbrtowc(&wc, keys + i, len - i, &cur_state);
|
|
+ if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
|
|
+ {
|
|
+ /* An invalid sequence, or a truncated multibyte character.
|
|
+ We treat it as a singlebyte character. */
|
|
+ mbclen = 1;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ if (iswupper((wint_t)wc))
|
|
+ {
|
|
+ wc = towlower((wint_t)wc);
|
|
+ wcrtomb(keys + i, wc, &cur_state);
|
|
+ }
|
|
+ }
|
|
+ i += mbclen;
|
|
+ }
|
|
+ }
|
|
+#endif /* MBS_SUPPORT */
|
|
+
|
|
(*compile)(keys, keycc);
|
|
|
|
if ((argc - optind > 1 && !no_filenames) || with_filenames)
|
|
diff -urN grep-2.5.1a.orig/src/search.c grep-2.5.1a/src/search.c
|
|
--- grep-2.5.1a.orig/src/search.c 2001-04-19 09:42:14.000000000 +0600
|
|
+++ grep-2.5.1a/src/search.c 2005-10-23 09:51:25.000000000 +0600
|
|
@@ -18,9 +18,13 @@
|
|
|
|
/* Written August 1992 by Mike Haertel. */
|
|
|
|
+#ifndef _GNU_SOURCE
|
|
+# define _GNU_SOURCE 1
|
|
+#endif
|
|
#ifdef HAVE_CONFIG_H
|
|
# include <config.h>
|
|
#endif
|
|
+#include <assert.h>
|
|
#include <sys/types.h>
|
|
#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
|
|
/* We can handle multibyte string. */
|
|
@@ -31,7 +35,7 @@
|
|
|
|
#include "system.h"
|
|
#include "grep.h"
|
|
-#include "regex.h"
|
|
+#include <regex.h>
|
|
#include "dfa.h"
|
|
#include "kwset.h"
|
|
#include "error.h"
|
|
@@ -39,6 +43,9 @@
|
|
#ifdef HAVE_LIBPCRE
|
|
# include <pcre.h>
|
|
#endif
|
|
+#ifdef HAVE_LANGINFO_CODESET
|
|
+# include <langinfo.h>
|
|
+#endif
|
|
|
|
#define NCHAR (UCHAR_MAX + 1)
|
|
|
|
@@ -70,9 +77,10 @@
|
|
call the regexp matcher at all. */
|
|
static int kwset_exact_matches;
|
|
|
|
-#if defined(MBS_SUPPORT)
|
|
-static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
|
|
-#endif
|
|
+/* UTF-8 encoding allows some optimizations that we can't otherwise
|
|
+ assume in a multibyte encoding. */
|
|
+static int using_utf8;
|
|
+
|
|
static void kwsinit PARAMS ((void));
|
|
static void kwsmusts PARAMS ((void));
|
|
static void Gcompile PARAMS ((char const *, size_t));
|
|
@@ -84,6 +92,15 @@
|
|
static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
|
|
|
|
void
|
|
+check_utf8 (void)
|
|
+{
|
|
+#ifdef HAVE_LANGINFO_CODESET
|
|
+ if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
|
|
+ using_utf8 = 1;
|
|
+#endif
|
|
+}
|
|
+
|
|
+void
|
|
dfaerror (char const *mesg)
|
|
{
|
|
error (2, 0, mesg);
|
|
@@ -141,38 +158,6 @@
|
|
}
|
|
}
|
|
|
|
-#ifdef MBS_SUPPORT
|
|
-/* This function allocate the array which correspond to "buf".
|
|
- Then this check multibyte string and mark on the positions which
|
|
- are not singlebyte character nor the first byte of a multibyte
|
|
- character. Caller must free the array. */
|
|
-static char*
|
|
-check_multibyte_string(char const *buf, size_t size)
|
|
-{
|
|
- char *mb_properties = malloc(size);
|
|
- mbstate_t cur_state;
|
|
- int i;
|
|
- memset(&cur_state, 0, sizeof(mbstate_t));
|
|
- memset(mb_properties, 0, sizeof(char)*size);
|
|
- for (i = 0; i < size ;)
|
|
- {
|
|
- size_t mbclen;
|
|
- mbclen = mbrlen(buf + i, size - i, &cur_state);
|
|
-
|
|
- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
|
|
- {
|
|
- /* An invalid sequence, or a truncated multibyte character.
|
|
- We treat it as a singlebyte character. */
|
|
- mbclen = 1;
|
|
- }
|
|
- mb_properties[i] = mbclen;
|
|
- i += mbclen;
|
|
- }
|
|
-
|
|
- return mb_properties;
|
|
-}
|
|
-#endif
|
|
-
|
|
static void
|
|
Gcompile (char const *pattern, size_t size)
|
|
{
|
|
@@ -181,7 +166,8 @@
|
|
size_t total = size;
|
|
char const *motif = pattern;
|
|
|
|
- re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
|
|
+ check_utf8 ();
|
|
+ re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0));
|
|
dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
|
|
|
|
/* For GNU regex compiler we have to pass the patterns separately to detect
|
|
@@ -233,7 +219,7 @@
|
|
static char const line_end[] = "\\)$";
|
|
static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
|
|
static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
|
|
- char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
|
|
+ char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
|
|
size_t i;
|
|
strcpy (n, match_lines ? line_beg : word_beg);
|
|
i = strlen (n);
|
|
@@ -257,14 +243,15 @@
|
|
size_t total = size;
|
|
char const *motif = pattern;
|
|
|
|
+ check_utf8 ();
|
|
if (strcmp (matcher, "awk") == 0)
|
|
{
|
|
- re_set_syntax (RE_SYNTAX_AWK);
|
|
+ re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0));
|
|
dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
|
|
}
|
|
else
|
|
{
|
|
- re_set_syntax (RE_SYNTAX_POSIX_EGREP);
|
|
+ re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0));
|
|
dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
|
|
}
|
|
|
|
@@ -316,7 +303,7 @@
|
|
static char const line_end[] = ")$";
|
|
static char const word_beg[] = "(^|[^[:alnum:]_])(";
|
|
static char const word_end[] = ")([^[:alnum:]_]|$)";
|
|
- char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
|
|
+ char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
|
|
size_t i;
|
|
strcpy (n, match_lines ? line_beg : word_beg);
|
|
i = strlen(n);
|
|
@@ -339,15 +326,35 @@
|
|
char eol = eolbyte;
|
|
int backref, start, len;
|
|
struct kwsmatch kwsm;
|
|
- size_t i;
|
|
+ size_t i, ret_val;
|
|
+ static int use_dfa;
|
|
+ static int use_dfa_checked = 0;
|
|
#ifdef MBS_SUPPORT
|
|
- char *mb_properties = NULL;
|
|
+ const char *last_char = NULL;
|
|
+ int mb_cur_max = MB_CUR_MAX;
|
|
+ mbstate_t mbs;
|
|
+ memset (&mbs, '\0', sizeof (mbstate_t));
|
|
#endif /* MBS_SUPPORT */
|
|
|
|
+ if (!use_dfa_checked)
|
|
+ {
|
|
+ char *grep_use_dfa = getenv ("GREP_USE_DFA");
|
|
+ if (!grep_use_dfa)
|
|
+ {
|
|
#ifdef MBS_SUPPORT
|
|
- if (MB_CUR_MAX > 1 && kwset)
|
|
- mb_properties = check_multibyte_string(buf, size);
|
|
+ /* Turn off DFA when processing multibyte input. */
|
|
+ use_dfa = (MB_CUR_MAX == 1);
|
|
+#else
|
|
+ use_dfa = 1;
|
|
#endif /* MBS_SUPPORT */
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ use_dfa = atoi (grep_use_dfa);
|
|
+ }
|
|
+
|
|
+ use_dfa_checked = 1;
|
|
+ }
|
|
|
|
buflim = buf + size;
|
|
|
|
@@ -358,47 +365,124 @@
|
|
if (kwset)
|
|
{
|
|
/* Find a possible match using the KWset matcher. */
|
|
- size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
|
|
+#ifdef MBS_SUPPORT
|
|
+ size_t bytes_left = 0;
|
|
+#endif /* MBS_SUPPORT */
|
|
+ size_t offset;
|
|
+#ifdef MBS_SUPPORT
|
|
+ /* kwsexec doesn't work with match_icase and multibyte input. */
|
|
+ if (match_icase && mb_cur_max > 1)
|
|
+ /* Avoid kwset */
|
|
+ offset = 0;
|
|
+ else
|
|
+#endif /* MBS_SUPPORT */
|
|
+ offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
|
|
if (offset == (size_t) -1)
|
|
- {
|
|
+ goto failure;
|
|
#ifdef MBS_SUPPORT
|
|
- if (MB_CUR_MAX > 1)
|
|
- free(mb_properties);
|
|
-#endif
|
|
- return (size_t)-1;
|
|
+ if (mb_cur_max > 1 && !using_utf8)
|
|
+ {
|
|
+ bytes_left = offset;
|
|
+ while (bytes_left)
|
|
+ {
|
|
+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
|
|
+
|
|
+ last_char = beg;
|
|
+ if (mlen == (size_t) -1 || mlen == 0)
|
|
+ {
|
|
+ /* Incomplete character: treat as single-byte. */
|
|
+ memset (&mbs, '\0', sizeof (mbstate_t));
|
|
+ beg++;
|
|
+ bytes_left--;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (mlen == (size_t) -2)
|
|
+ /* Offset points inside multibyte character:
|
|
+ * no good. */
|
|
+ break;
|
|
+
|
|
+ beg += mlen;
|
|
+ bytes_left -= mlen;
|
|
+ }
|
|
}
|
|
+ else
|
|
+#endif /* MBS_SUPPORT */
|
|
beg += offset;
|
|
/* Narrow down to the line containing the candidate, and
|
|
run it through DFA. */
|
|
end = memchr(beg, eol, buflim - beg);
|
|
end++;
|
|
#ifdef MBS_SUPPORT
|
|
- if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
|
|
+ if (mb_cur_max > 1 && bytes_left)
|
|
continue;
|
|
-#endif
|
|
+#endif /* MBS_SUPPORT */
|
|
while (beg > buf && beg[-1] != eol)
|
|
--beg;
|
|
- if (kwsm.index < kwset_exact_matches)
|
|
- goto success;
|
|
- if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
|
|
+ if (
|
|
+#ifdef MBS_SUPPORT
|
|
+ !(match_icase && mb_cur_max > 1) &&
|
|
+#endif /* MBS_SUPPORT */
|
|
+ (kwsm.index < kwset_exact_matches))
|
|
+ goto success_in_beg_and_end;
|
|
+ if (use_dfa &&
|
|
+ dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
|
|
continue;
|
|
}
|
|
else
|
|
{
|
|
/* No good fixed strings; start with DFA. */
|
|
- size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
|
|
+#ifdef MBS_SUPPORT
|
|
+ size_t bytes_left = 0;
|
|
+#endif /* MBS_SUPPORT */
|
|
+ size_t offset = 0;
|
|
+ if (use_dfa)
|
|
+ offset = dfaexec (&dfa, beg, buflim - beg, &backref);
|
|
if (offset == (size_t) -1)
|
|
break;
|
|
/* Narrow down to the line we've found. */
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (mb_cur_max > 1 && !using_utf8)
|
|
+ {
|
|
+ bytes_left = offset;
|
|
+ while (bytes_left)
|
|
+ {
|
|
+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
|
|
+
|
|
+ last_char = beg;
|
|
+ if (mlen == (size_t) -1 || mlen == 0)
|
|
+ {
|
|
+ /* Incomplete character: treat as single-byte. */
|
|
+ memset (&mbs, '\0', sizeof (mbstate_t));
|
|
+ beg++;
|
|
+ bytes_left--;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (mlen == (size_t) -2)
|
|
+ /* Offset points inside multibyte character:
|
|
+ * no good. */
|
|
+ break;
|
|
+
|
|
+ beg += mlen;
|
|
+ bytes_left -= mlen;
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+#endif /* MBS_SUPPORT */
|
|
beg += offset;
|
|
end = memchr (beg, eol, buflim - beg);
|
|
end++;
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (mb_cur_max > 1 && bytes_left)
|
|
+ continue;
|
|
+#endif /* MBS_SUPPORT */
|
|
while (beg > buf && beg[-1] != eol)
|
|
--beg;
|
|
}
|
|
/* Successful, no backreferences encountered! */
|
|
- if (!backref)
|
|
- goto success;
|
|
+ if (use_dfa && !backref)
|
|
+ goto success_in_beg_and_end;
|
|
}
|
|
else
|
|
end = beg + size;
|
|
@@ -413,14 +497,11 @@
|
|
end - beg - 1, &(patterns[i].regs))))
|
|
{
|
|
len = patterns[i].regs.end[0] - start;
|
|
- if (exact)
|
|
- {
|
|
- *match_size = len;
|
|
- return start;
|
|
- }
|
|
+ if (exact && !match_words)
|
|
+ goto success_in_start_and_len;
|
|
if ((!match_lines && !match_words)
|
|
|| (match_lines && len == end - beg - 1))
|
|
- goto success;
|
|
+ goto success_in_beg_and_end;
|
|
/* If -w, check if the match aligns with word boundaries.
|
|
We do this iteratively because:
|
|
(a) the line may contain more than one occurence of the
|
|
@@ -431,10 +512,84 @@
|
|
if (match_words)
|
|
while (start >= 0)
|
|
{
|
|
- if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
|
|
- && (len == end - beg - 1
|
|
- || !WCHAR ((unsigned char) beg[start + len])))
|
|
- goto success;
|
|
+ int lword_match = 0;
|
|
+ if (start == 0)
|
|
+ lword_match = 1;
|
|
+ else
|
|
+ {
|
|
+ assert (start > 0);
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (mb_cur_max > 1)
|
|
+ {
|
|
+ const char *s;
|
|
+ int mr;
|
|
+ wchar_t pwc;
|
|
+
|
|
+ if (using_utf8)
|
|
+ {
|
|
+ s = beg + start - 1;
|
|
+ while (s > buf
|
|
+ && (unsigned char) *s >= 0x80
|
|
+ && (unsigned char) *s <= 0xbf)
|
|
+ --s;
|
|
+ }
|
|
+ else
|
|
+ s = last_char;
|
|
+ mr = mbtowc (&pwc, s, beg + start - s);
|
|
+ if (mr <= 0)
|
|
+ {
|
|
+ memset (&mbs, '\0', sizeof (mbstate_t));
|
|
+ lword_match = 1;
|
|
+ }
|
|
+ else if (!(iswalnum (pwc) || pwc == L'_')
|
|
+ && mr == (int) (beg + start - s))
|
|
+ lword_match = 1;
|
|
+ }
|
|
+ else
|
|
+#endif /* MBS_SUPPORT */
|
|
+ if (!WCHAR ((unsigned char) beg[start - 1]))
|
|
+ lword_match = 1;
|
|
+ }
|
|
+
|
|
+ if (lword_match)
|
|
+ {
|
|
+ int rword_match = 0;
|
|
+ if (start + len == end - beg - 1)
|
|
+ rword_match = 1;
|
|
+ else
|
|
+ {
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (mb_cur_max > 1)
|
|
+ {
|
|
+ wchar_t nwc;
|
|
+ int mr;
|
|
+
|
|
+ mr = mbtowc (&nwc, beg + start + len,
|
|
+ end - beg - start - len - 1);
|
|
+ if (mr <= 0)
|
|
+ {
|
|
+ memset (&mbs, '\0', sizeof (mbstate_t));
|
|
+ rword_match = 1;
|
|
+ }
|
|
+ else if (!iswalnum (nwc) && nwc != L'_')
|
|
+ rword_match = 1;
|
|
+ }
|
|
+ else
|
|
+#endif /* MBS_SUPPORT */
|
|
+ if (!WCHAR ((unsigned char) beg[start + len]))
|
|
+ rword_match = 1;
|
|
+ }
|
|
+
|
|
+ if (rword_match)
|
|
+ {
|
|
+ if (!exact)
|
|
+ /* Returns the whole line. */
|
|
+ goto success_in_beg_and_end;
|
|
+ else
|
|
+ /* Returns just this word match. */
|
|
+ goto success_in_start_and_len;
|
|
+ }
|
|
+ }
|
|
if (len > 0)
|
|
{
|
|
/* Try a shorter length anchored at the same place. */
|
|
@@ -461,26 +616,154 @@
|
|
}
|
|
} /* for Regex patterns. */
|
|
} /* for (beg = end ..) */
|
|
-#ifdef MBS_SUPPORT
|
|
- if (MB_CUR_MAX > 1 && mb_properties)
|
|
- free (mb_properties);
|
|
-#endif /* MBS_SUPPORT */
|
|
+
|
|
+ failure:
|
|
return (size_t) -1;
|
|
|
|
- success:
|
|
-#ifdef MBS_SUPPORT
|
|
- if (MB_CUR_MAX > 1 && mb_properties)
|
|
- free (mb_properties);
|
|
-#endif /* MBS_SUPPORT */
|
|
- *match_size = end - beg;
|
|
- return beg - buf;
|
|
+ success_in_beg_and_end:
|
|
+ len = end - beg;
|
|
+ start = beg - buf;
|
|
+ /* FALLTHROUGH */
|
|
+
|
|
+ success_in_start_and_len:
|
|
+ *match_size = len;
|
|
+ return start;
|
|
}
|
|
|
|
+#ifdef MBS_SUPPORT
|
|
+static int f_i_multibyte; /* whether we're using the new -Fi MB method */
|
|
+static struct
|
|
+{
|
|
+ wchar_t **patterns;
|
|
+ size_t count, maxlen;
|
|
+ unsigned char *match;
|
|
+} Fimb;
|
|
+#endif
|
|
+
|
|
static void
|
|
Fcompile (char const *pattern, size_t size)
|
|
{
|
|
+ int mb_cur_max = MB_CUR_MAX;
|
|
char const *beg, *lim, *err;
|
|
|
|
+ check_utf8 ();
|
|
+#ifdef MBS_SUPPORT
|
|
+ /* Support -F -i for UTF-8 input. */
|
|
+ if (match_icase && mb_cur_max > 1)
|
|
+ {
|
|
+ mbstate_t mbs;
|
|
+ wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t));
|
|
+ const char *patternend = pattern;
|
|
+ size_t wcsize;
|
|
+ kwset_t fimb_kwset = NULL;
|
|
+ char *starts = NULL;
|
|
+ wchar_t *wcbeg, *wclim;
|
|
+ size_t allocated = 0;
|
|
+
|
|
+ memset (&mbs, '\0', sizeof (mbs));
|
|
+# ifdef __GNU_LIBRARY__
|
|
+ wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs);
|
|
+ if (patternend != pattern + size)
|
|
+ wcsize = (size_t) -1;
|
|
+# else
|
|
+ {
|
|
+ char *patterncopy = xmalloc (size + 1);
|
|
+
|
|
+ memcpy (patterncopy, pattern, size);
|
|
+ patterncopy[size] = '\0';
|
|
+ patternend = patterncopy;
|
|
+ wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs);
|
|
+ if (patternend != patterncopy + size)
|
|
+ wcsize = (size_t) -1;
|
|
+ free (patterncopy);
|
|
+ }
|
|
+# endif
|
|
+ if (wcsize + 2 <= 2)
|
|
+ {
|
|
+fimb_fail:
|
|
+ free (wcpattern);
|
|
+ free (starts);
|
|
+ if (fimb_kwset)
|
|
+ kwsfree (fimb_kwset);
|
|
+ free (Fimb.patterns);
|
|
+ Fimb.patterns = NULL;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ if (!(fimb_kwset = kwsalloc (NULL)))
|
|
+ error (2, 0, _("memory exhausted"));
|
|
+
|
|
+ starts = xmalloc (mb_cur_max * 3);
|
|
+ wcbeg = wcpattern;
|
|
+ do
|
|
+ {
|
|
+ int i;
|
|
+ size_t wclen;
|
|
+
|
|
+ if (Fimb.count >= allocated)
|
|
+ {
|
|
+ if (allocated == 0)
|
|
+ allocated = 128;
|
|
+ else
|
|
+ allocated *= 2;
|
|
+ Fimb.patterns = xrealloc (Fimb.patterns,
|
|
+ sizeof (wchar_t *) * allocated);
|
|
+ }
|
|
+ Fimb.patterns[Fimb.count++] = wcbeg;
|
|
+ for (wclim = wcbeg;
|
|
+ wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim)
|
|
+ *wclim = towlower (*wclim);
|
|
+ *wclim = L'\0';
|
|
+ wclen = wclim - wcbeg;
|
|
+ if (wclen > Fimb.maxlen)
|
|
+ Fimb.maxlen = wclen;
|
|
+ if (wclen > 3)
|
|
+ wclen = 3;
|
|
+ if (wclen == 0)
|
|
+ {
|
|
+ if ((err = kwsincr (fimb_kwset, "", 0)) != 0)
|
|
+ error (2, 0, err);
|
|
+ }
|
|
+ else
|
|
+ for (i = 0; i < (1 << wclen); i++)
|
|
+ {
|
|
+ char *p = starts;
|
|
+ int j, k;
|
|
+
|
|
+ for (j = 0; j < wclen; ++j)
|
|
+ {
|
|
+ wchar_t wc = wcbeg[j];
|
|
+ if (i & (1 << j))
|
|
+ {
|
|
+ wc = towupper (wc);
|
|
+ if (wc == wcbeg[j])
|
|
+ continue;
|
|
+ }
|
|
+ k = wctomb (p, wc);
|
|
+ if (k <= 0)
|
|
+ goto fimb_fail;
|
|
+ p += k;
|
|
+ }
|
|
+ if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0)
|
|
+ error (2, 0, err);
|
|
+ }
|
|
+ if (wclim < wcpattern + wcsize)
|
|
+ ++wclim;
|
|
+ wcbeg = wclim;
|
|
+ }
|
|
+ while (wcbeg < wcpattern + wcsize);
|
|
+ f_i_multibyte = 1;
|
|
+ kwset = fimb_kwset;
|
|
+ free (starts);
|
|
+ Fimb.match = xmalloc (Fimb.count);
|
|
+ if ((err = kwsprep (kwset)) != 0)
|
|
+ error (2, 0, err);
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+#endif /* MBS_SUPPORT */
|
|
+
|
|
+
|
|
kwsinit ();
|
|
beg = pattern;
|
|
do
|
|
@@ -499,6 +782,76 @@
|
|
error (2, 0, err);
|
|
}
|
|
|
|
+#ifdef MBS_SUPPORT
|
|
+static int
|
|
+Fimbexec (const char *buf, size_t size, size_t *plen, int exact)
|
|
+{
|
|
+ size_t len, letter, i;
|
|
+ int ret = -1;
|
|
+ mbstate_t mbs;
|
|
+ wchar_t wc;
|
|
+ int patterns_left;
|
|
+
|
|
+ assert (match_icase && f_i_multibyte == 1);
|
|
+ assert (MB_CUR_MAX > 1);
|
|
+
|
|
+ memset (&mbs, '\0', sizeof (mbs));
|
|
+ memset (Fimb.match, '\1', Fimb.count);
|
|
+ letter = len = 0;
|
|
+ patterns_left = 1;
|
|
+ while (patterns_left && len <= size)
|
|
+ {
|
|
+ size_t c;
|
|
+
|
|
+ patterns_left = 0;
|
|
+ if (len < size)
|
|
+ {
|
|
+ c = mbrtowc (&wc, buf + len, size - len, &mbs);
|
|
+ if (c + 2 <= 2)
|
|
+ return ret;
|
|
+
|
|
+ wc = towlower (wc);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ c = 1;
|
|
+ wc = L'\0';
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < Fimb.count; i++)
|
|
+ {
|
|
+ if (Fimb.match[i])
|
|
+ {
|
|
+ if (Fimb.patterns[i][letter] == L'\0')
|
|
+ {
|
|
+ /* Found a match. */
|
|
+ *plen = len;
|
|
+ if (!exact && !match_words)
|
|
+ return 0;
|
|
+ else
|
|
+ {
|
|
+ /* For -w or exact look for longest match. */
|
|
+ ret = 0;
|
|
+ Fimb.match[i] = '\0';
|
|
+ continue;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (Fimb.patterns[i][letter] == wc)
|
|
+ patterns_left = 1;
|
|
+ else
|
|
+ Fimb.match[i] = '\0';
|
|
+ }
|
|
+ }
|
|
+
|
|
+ len += c;
|
|
+ letter++;
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+#endif /* MBS_SUPPORT */
|
|
+
|
|
static size_t
|
|
Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
|
|
{
|
|
@@ -506,88 +859,268 @@
|
|
register size_t len;
|
|
char eol = eolbyte;
|
|
struct kwsmatch kwsmatch;
|
|
+ size_t ret_val;
|
|
#ifdef MBS_SUPPORT
|
|
- char *mb_properties;
|
|
- if (MB_CUR_MAX > 1)
|
|
- mb_properties = check_multibyte_string (buf, size);
|
|
+ int mb_cur_max = MB_CUR_MAX;
|
|
+ mbstate_t mbs;
|
|
+ memset (&mbs, '\0', sizeof (mbstate_t));
|
|
+ const char *last_char = NULL;
|
|
#endif /* MBS_SUPPORT */
|
|
|
|
for (beg = buf; beg <= buf + size; ++beg)
|
|
{
|
|
- size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
|
|
+ size_t offset;
|
|
+ offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
|
|
+
|
|
if (offset == (size_t) -1)
|
|
- {
|
|
+ goto failure;
|
|
#ifdef MBS_SUPPORT
|
|
- if (MB_CUR_MAX > 1)
|
|
- free(mb_properties);
|
|
-#endif /* MBS_SUPPORT */
|
|
- return offset;
|
|
+ if (mb_cur_max > 1 && !using_utf8)
|
|
+ {
|
|
+ size_t bytes_left = offset;
|
|
+ while (bytes_left)
|
|
+ {
|
|
+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
|
|
+
|
|
+ last_char = beg;
|
|
+ if (mlen == (size_t) -1 || mlen == 0)
|
|
+ {
|
|
+ /* Incomplete character: treat as single-byte. */
|
|
+ memset (&mbs, '\0', sizeof (mbstate_t));
|
|
+ beg++;
|
|
+ bytes_left--;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (mlen == (size_t) -2)
|
|
+ /* Offset points inside multibyte character: no good. */
|
|
+ break;
|
|
+
|
|
+ beg += mlen;
|
|
+ bytes_left -= mlen;
|
|
+ }
|
|
+
|
|
+ if (bytes_left)
|
|
+ continue;
|
|
}
|
|
-#ifdef MBS_SUPPORT
|
|
- if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
|
|
- continue; /* It is a part of multibyte character. */
|
|
+ else
|
|
#endif /* MBS_SUPPORT */
|
|
beg += offset;
|
|
- len = kwsmatch.size[0];
|
|
- if (exact)
|
|
- {
|
|
- *match_size = len;
|
|
#ifdef MBS_SUPPORT
|
|
- if (MB_CUR_MAX > 1)
|
|
- free (mb_properties);
|
|
+ /* For f_i_multibyte, the string at beg now matches first 3 chars of
|
|
+ one of the search strings (less if there are shorter search strings).
|
|
+ See if this is a real match. */
|
|
+ if (f_i_multibyte
|
|
+ && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact))
|
|
+ goto next_char;
|
|
#endif /* MBS_SUPPORT */
|
|
- return beg - buf;
|
|
- }
|
|
+ len = kwsmatch.size[0];
|
|
+ if (exact && !match_words)
|
|
+ goto success_in_beg_and_len;
|
|
if (match_lines)
|
|
{
|
|
if (beg > buf && beg[-1] != eol)
|
|
- continue;
|
|
+ goto next_char;
|
|
if (beg + len < buf + size && beg[len] != eol)
|
|
- continue;
|
|
+ goto next_char;
|
|
goto success;
|
|
}
|
|
else if (match_words)
|
|
- for (try = beg; len; )
|
|
- {
|
|
- if (try > buf && WCHAR((unsigned char) try[-1]))
|
|
- break;
|
|
- if (try + len < buf + size && WCHAR((unsigned char) try[len]))
|
|
- {
|
|
- offset = kwsexec (kwset, beg, --len, &kwsmatch);
|
|
- if (offset == (size_t) -1)
|
|
- {
|
|
+ {
|
|
+ while (len)
|
|
+ {
|
|
+ int word_match = 0;
|
|
+ if (beg > buf)
|
|
+ {
|
|
#ifdef MBS_SUPPORT
|
|
- if (MB_CUR_MAX > 1)
|
|
- free (mb_properties);
|
|
+ if (mb_cur_max > 1)
|
|
+ {
|
|
+ const char *s;
|
|
+ int mr;
|
|
+ wchar_t pwc;
|
|
+
|
|
+ if (using_utf8)
|
|
+ {
|
|
+ s = beg - 1;
|
|
+ while (s > buf
|
|
+ && (unsigned char) *s >= 0x80
|
|
+ && (unsigned char) *s <= 0xbf)
|
|
+ --s;
|
|
+ }
|
|
+ else
|
|
+ s = last_char;
|
|
+ mr = mbtowc (&pwc, s, beg - s);
|
|
+ if (mr <= 0)
|
|
+ memset (&mbs, '\0', sizeof (mbstate_t));
|
|
+ else if ((iswalnum (pwc) || pwc == L'_')
|
|
+ && mr == (int) (beg - s))
|
|
+ goto next_char;
|
|
+ }
|
|
+ else
|
|
#endif /* MBS_SUPPORT */
|
|
- return offset;
|
|
- }
|
|
- try = beg + offset;
|
|
- len = kwsmatch.size[0];
|
|
- }
|
|
- else
|
|
- goto success;
|
|
- }
|
|
+ if (WCHAR ((unsigned char) beg[-1]))
|
|
+ goto next_char;
|
|
+ }
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (mb_cur_max > 1)
|
|
+ {
|
|
+ wchar_t nwc;
|
|
+ int mr;
|
|
+
|
|
+ mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
|
|
+ if (mr <= 0)
|
|
+ {
|
|
+ memset (&mbs, '\0', sizeof (mbstate_t));
|
|
+ word_match = 1;
|
|
+ }
|
|
+ else if (!iswalnum (nwc) && nwc != L'_')
|
|
+ word_match = 1;
|
|
+ }
|
|
+ else
|
|
+#endif /* MBS_SUPPORT */
|
|
+ if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len]))
|
|
+ word_match = 1;
|
|
+ if (word_match)
|
|
+ {
|
|
+ if (!exact)
|
|
+ /* Returns the whole line now we know there's a word match. */
|
|
+ goto success;
|
|
+ else
|
|
+ /* Returns just this word match. */
|
|
+ goto success_in_beg_and_len;
|
|
+ }
|
|
+ if (len > 0)
|
|
+ {
|
|
+ /* Try a shorter length anchored at the same place. */
|
|
+ --len;
|
|
+ offset = kwsexec (kwset, beg, len, &kwsmatch);
|
|
+
|
|
+ if (offset == -1)
|
|
+ goto next_char; /* Try a different anchor. */
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (mb_cur_max > 1 && !using_utf8)
|
|
+ {
|
|
+ size_t bytes_left = offset;
|
|
+ while (bytes_left)
|
|
+ {
|
|
+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
|
|
+
|
|
+ last_char = beg;
|
|
+ if (mlen == (size_t) -1 || mlen == 0)
|
|
+ {
|
|
+ /* Incomplete character: treat as single-byte. */
|
|
+ memset (&mbs, '\0', sizeof (mbstate_t));
|
|
+ beg++;
|
|
+ bytes_left--;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (mlen == (size_t) -2)
|
|
+ {
|
|
+ /* Offset points inside multibyte character:
|
|
+ * no good. */
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ beg += mlen;
|
|
+ bytes_left -= mlen;
|
|
+ }
|
|
+
|
|
+ if (bytes_left)
|
|
+ {
|
|
+ memset (&mbs, '\0', sizeof (mbstate_t));
|
|
+ goto next_char; /* Try a different anchor. */
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+#endif /* MBS_SUPPORT */
|
|
+ beg += offset;
|
|
+#ifdef MBS_SUPPORT
|
|
+ /* The string at beg now matches first 3 chars of one of
|
|
+ the search strings (less if there are shorter search
|
|
+ strings). See if this is a real match. */
|
|
+ if (f_i_multibyte
|
|
+ && Fimbexec (beg, len - offset, &kwsmatch.size[0],
|
|
+ exact))
|
|
+ goto next_char;
|
|
+#endif /* MBS_SUPPORT */
|
|
+ len = kwsmatch.size[0];
|
|
+ }
|
|
+ }
|
|
+ }
|
|
else
|
|
goto success;
|
|
- }
|
|
-
|
|
+next_char:;
|
|
#ifdef MBS_SUPPORT
|
|
- if (MB_CUR_MAX > 1)
|
|
- free (mb_properties);
|
|
+ /* Advance to next character. For MB_CUR_MAX == 1 case this is handled
|
|
+ by ++beg above. */
|
|
+ if (mb_cur_max > 1)
|
|
+ {
|
|
+ if (using_utf8)
|
|
+ {
|
|
+ unsigned char c = *beg;
|
|
+ if (c >= 0xc2)
|
|
+ {
|
|
+ if (c < 0xe0)
|
|
+ ++beg;
|
|
+ else if (c < 0xf0)
|
|
+ beg += 2;
|
|
+ else if (c < 0xf8)
|
|
+ beg += 3;
|
|
+ else if (c < 0xfc)
|
|
+ beg += 4;
|
|
+ else if (c < 0xfe)
|
|
+ beg += 5;
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ size_t l = mbrlen (beg, buf + size - beg, &mbs);
|
|
+
|
|
+ last_char = beg;
|
|
+ if (l + 2 >= 2)
|
|
+ beg += l - 1;
|
|
+ else
|
|
+ memset (&mbs, '\0', sizeof (mbstate_t));
|
|
+ }
|
|
+ }
|
|
#endif /* MBS_SUPPORT */
|
|
+ }
|
|
+
|
|
+ failure:
|
|
return -1;
|
|
|
|
success:
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (mb_cur_max > 1 && !using_utf8)
|
|
+ {
|
|
+ end = beg + len;
|
|
+ while (end < buf + size)
|
|
+ {
|
|
+ size_t mlen = mbrlen (end, buf + size - end, &mbs);
|
|
+ if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
|
|
+ {
|
|
+ memset (&mbs, '\0', sizeof (mbstate_t));
|
|
+ mlen = 1;
|
|
+ }
|
|
+ if (mlen == 1 && *end == eol)
|
|
+ break;
|
|
+
|
|
+ end += mlen;
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+#endif /* MBS_SUPPORT */
|
|
end = memchr (beg + len, eol, (buf + size) - (beg + len));
|
|
+
|
|
end++;
|
|
while (buf < beg && beg[-1] != eol)
|
|
--beg;
|
|
- *match_size = end - beg;
|
|
-#ifdef MBS_SUPPORT
|
|
- if (MB_CUR_MAX > 1)
|
|
- free (mb_properties);
|
|
-#endif /* MBS_SUPPORT */
|
|
+ len = end - beg;
|
|
+ /* FALLTHROUGH */
|
|
+
|
|
+ success_in_beg_and_len:
|
|
+ *match_size = len;
|
|
return beg - buf;
|
|
}
|
|
|
|
diff -urN grep-2.5.1a.orig/src/search.c.orig grep-2.5.1a/src/search.c.orig
|
|
--- grep-2.5.1a.orig/src/search.c.orig 1970-01-01 05:00:00.000000000 +0500
|
|
+++ grep-2.5.1a/src/search.c.orig 2005-10-23 09:48:39.000000000 +0600
|
|
@@ -0,0 +1,714 @@
|
|
+/* search.c - searching subroutines using dfa, kwset and regex for grep.
|
|
+ Copyright 1992, 1998, 2000 Free Software Foundation, Inc.
|
|
+
|
|
+ This program is free software; you can redistribute it and/or modify
|
|
+ it under the terms of the GNU General Public License as published by
|
|
+ the Free Software Foundation; either version 2, or (at your option)
|
|
+ any later version.
|
|
+
|
|
+ This program is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
+ GNU General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU General Public License
|
|
+ along with this program; if not, write to the Free Software
|
|
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
|
+ 02111-1307, USA. */
|
|
+
|
|
+/* Written August 1992 by Mike Haertel. */
|
|
+
|
|
+#ifdef HAVE_CONFIG_H
|
|
+# include <config.h>
|
|
+#endif
|
|
+#include <sys/types.h>
|
|
+#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
|
|
+/* We can handle multibyte strings. */
|
|
+# define MBS_SUPPORT
|
|
+# include <wchar.h>
|
|
+# include <wctype.h>
|
|
+#endif
|
|
+
|
|
+#include "system.h"
|
|
+#include "grep.h"
|
|
+#include "regex.h"
|
|
+#include "dfa.h"
|
|
+#include "kwset.h"
|
|
+#include "error.h"
|
|
+#include "xalloc.h"
|
|
+#ifdef HAVE_LIBPCRE
|
|
+# include <pcre.h>
|
|
+#endif
|
|
+
|
|
+#define NCHAR (UCHAR_MAX + 1)
|
|
+
|
|
+/* For -w, we also consider _ to be word constituent. */
|
|
+#define WCHAR(C) (ISALNUM(C) || (C) == '_')
|
|
+
|
|
+/* DFA compiled regexp. */
|
|
+static struct dfa dfa;
|
|
+
|
|
+/* The Regex compiled patterns. */
|
|
+static struct patterns
|
|
+{
|
|
+ /* Regex compiled regexp. */
|
|
+ struct re_pattern_buffer regexbuf;
|
|
+ struct re_registers regs; /* This is here on account of a BRAIN-DEAD
|
|
+ Q@#%!# library interface in regex.c. */
|
|
+} patterns0;
|
|
+
|
|
+struct patterns *patterns;
|
|
+size_t pcount;
|
|
+
|
|
+/* KWset compiled pattern. For Ecompile and Gcompile, we compile
|
|
+ a list of strings, at least one of which is known to occur in
|
|
+ any string matching the regexp. */
|
|
+static kwset_t kwset;
|
|
+
|
|
+/* Number of compiled fixed strings known to exactly match the regexp.
|
|
+ If kwsexec returns < kwset_exact_matches, then we don't need to
|
|
+ call the regexp matcher at all. */
|
|
+static int kwset_exact_matches;
|
|
+
|
|
+#if defined(MBS_SUPPORT)
|
|
+static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
|
|
+#endif
|
|
+static void kwsinit PARAMS ((void));
|
|
+static void kwsmusts PARAMS ((void));
|
|
+static void Gcompile PARAMS ((char const *, size_t));
|
|
+static void Ecompile PARAMS ((char const *, size_t));
|
|
+static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
|
|
+static void Fcompile PARAMS ((char const *, size_t));
|
|
+static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
|
|
+static void Pcompile PARAMS ((char const *, size_t ));
|
|
+static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
|
|
+
|
|
+void
|
|
+dfaerror (char const *mesg)
|
|
+{
|
|
+ error (2, 0, mesg);
|
|
+}
|
|
+
|
|
+static void
|
|
+kwsinit (void)
|
|
+{
|
|
+ static char trans[NCHAR];
|
|
+ int i;
|
|
+
|
|
+ if (match_icase)
|
|
+ for (i = 0; i < NCHAR; ++i)
|
|
+ trans[i] = TOLOWER (i);
|
|
+
|
|
+ if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
|
|
+ error (2, 0, _("memory exhausted"));
|
|
+}
|
|
+
|
|
+/* If the DFA turns out to have some set of fixed strings one of
|
|
+ which must occur in the match, then we build a kwset matcher
|
|
+ to find those strings, and thus quickly filter out impossible
|
|
+ matches. */
|
|
+static void
|
|
+kwsmusts (void)
|
|
+{
|
|
+ struct dfamust const *dm;
|
|
+ char const *err;
|
|
+
|
|
+ if (dfa.musts)
|
|
+ {
|
|
+ kwsinit ();
|
|
+ /* First, we compile in the substrings known to be exact
|
|
+ matches. The kwset matcher will return the index
|
|
+ of the matching string that it chooses. */
|
|
+ for (dm = dfa.musts; dm; dm = dm->next)
|
|
+ {
|
|
+ if (!dm->exact)
|
|
+ continue;
|
|
+ ++kwset_exact_matches;
|
|
+ if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
|
|
+ error (2, 0, err);
|
|
+ }
|
|
+ /* Now, we compile the substrings that will require
|
|
+ the use of the regexp matcher. */
|
|
+ for (dm = dfa.musts; dm; dm = dm->next)
|
|
+ {
|
|
+ if (dm->exact)
|
|
+ continue;
|
|
+ if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
|
|
+ error (2, 0, err);
|
|
+ }
|
|
+ if ((err = kwsprep (kwset)) != 0)
|
|
+ error (2, 0, err);
|
|
+ }
|
|
+}
|
|
+
|
|
+#ifdef MBS_SUPPORT
|
|
+/* This function allocates the array which corresponds to "buf".
|
|
+ Then it checks the multibyte string and marks the positions which
|
|
+ are neither a singlebyte character nor the first byte of a multibyte
|
|
+ character. Caller must free the array. */
|
|
+static char*
|
|
+check_multibyte_string(char const *buf, size_t size)
|
|
+{
|
|
+ char *mb_properties = malloc(size);
|
|
+ mbstate_t cur_state;
|
|
+ int i;
|
|
+ memset(&cur_state, 0, sizeof(mbstate_t));
|
|
+ memset(mb_properties, 0, sizeof(char)*size);
|
|
+ for (i = 0; i < size ;)
|
|
+ {
|
|
+ size_t mbclen;
|
|
+ mbclen = mbrlen(buf + i, size - i, &cur_state);
|
|
+
|
|
+ if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
|
|
+ {
|
|
+ /* An invalid sequence, or a truncated multibyte character.
|
|
+ We treat it as a singlebyte character. */
|
|
+ mbclen = 1;
|
|
+ }
|
|
+ mb_properties[i] = mbclen;
|
|
+ i += mbclen;
|
|
+ }
|
|
+
|
|
+ return mb_properties;
|
|
+}
|
|
+#endif
|
|
+
|
|
+static void
|
|
+Gcompile (char const *pattern, size_t size)
|
|
+{
|
|
+ const char *err;
|
|
+ char const *sep;
|
|
+ size_t total = size;
|
|
+ char const *motif = pattern;
|
|
+
|
|
+ re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
|
|
+ dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
|
|
+
|
|
+ /* For GNU regex compiler we have to pass the patterns separately to detect
|
|
+ errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
|
|
+ GNU regex should have raised a syntax error. The same for backref, where
|
|
+ the backref should have been local to each pattern. */
|
|
+ do
|
|
+ {
|
|
+ size_t len;
|
|
+ sep = memchr (motif, '\n', total);
|
|
+ if (sep)
|
|
+ {
|
|
+ len = sep - motif;
|
|
+ sep++;
|
|
+ total -= (len + 1);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ len = total;
|
|
+ total = 0;
|
|
+ }
|
|
+
|
|
+ patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
|
|
+ if (patterns == NULL)
|
|
+ error (2, errno, _("memory exhausted"));
|
|
+
|
|
+ patterns[pcount] = patterns0;
|
|
+
|
|
+ if ((err = re_compile_pattern (motif, len,
|
|
+ &(patterns[pcount].regexbuf))) != 0)
|
|
+ error (2, 0, err);
|
|
+ pcount++;
|
|
+
|
|
+ motif = sep;
|
|
+ } while (sep && total != 0);
|
|
+
|
|
+ /* In the match_words and match_lines cases, we use a different pattern
|
|
+ for the DFA matcher that will quickly throw out cases that won't work.
|
|
+ Then if DFA succeeds we do some hairy stuff using the regex matcher
|
|
+ to decide whether the match should really count. */
|
|
+ if (match_words || match_lines)
|
|
+ {
|
|
+ /* In the whole-word case, we use the pattern:
|
|
+ \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
|
|
+ In the whole-line case, we use the pattern:
|
|
+ ^\(userpattern\)$. */
|
|
+
|
|
+ static char const line_beg[] = "^\\(";
|
|
+ static char const line_end[] = "\\)$";
|
|
+ static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
|
|
+ static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
|
|
+ char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
|
|
+ size_t i;
|
|
+ strcpy (n, match_lines ? line_beg : word_beg);
|
|
+ i = strlen (n);
|
|
+ memcpy (n + i, pattern, size);
|
|
+ i += size;
|
|
+ strcpy (n + i, match_lines ? line_end : word_end);
|
|
+ i += strlen (n + i);
|
|
+ pattern = n;
|
|
+ size = i;
|
|
+ }
|
|
+
|
|
+ dfacomp (pattern, size, &dfa, 1);
|
|
+ kwsmusts ();
|
|
+}
|
|
+
|
|
+static void
|
|
+Ecompile (char const *pattern, size_t size)
|
|
+{
|
|
+ const char *err;
|
|
+ const char *sep;
|
|
+ size_t total = size;
|
|
+ char const *motif = pattern;
|
|
+
|
|
+ if (strcmp (matcher, "awk") == 0)
|
|
+ {
|
|
+ re_set_syntax (RE_SYNTAX_AWK);
|
|
+ dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ re_set_syntax (RE_SYNTAX_POSIX_EGREP);
|
|
+ dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
|
|
+ }
|
|
+
|
|
+ /* For GNU regex compiler we have to pass the patterns separately to detect
|
|
+ errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
|
|
+ GNU regex should have raised a syntax error. The same for backref, where
|
|
+ the backref should have been local to each pattern. */
|
|
+ do
|
|
+ {
|
|
+ size_t len;
|
|
+ sep = memchr (motif, '\n', total);
|
|
+ if (sep)
|
|
+ {
|
|
+ len = sep - motif;
|
|
+ sep++;
|
|
+ total -= (len + 1);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ len = total;
|
|
+ total = 0;
|
|
+ }
|
|
+
|
|
+ patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
|
|
+ if (patterns == NULL)
|
|
+ error (2, errno, _("memory exhausted"));
|
|
+ patterns[pcount] = patterns0;
|
|
+
|
|
+ if ((err = re_compile_pattern (motif, len,
|
|
+ &(patterns[pcount].regexbuf))) != 0)
|
|
+ error (2, 0, err);
|
|
+ pcount++;
|
|
+
|
|
+ motif = sep;
|
|
+ } while (sep && total != 0);
|
|
+
|
|
+ /* In the match_words and match_lines cases, we use a different pattern
|
|
+ for the DFA matcher that will quickly throw out cases that won't work.
|
|
+ Then if DFA succeeds we do some hairy stuff using the regex matcher
|
|
+ to decide whether the match should really count. */
|
|
+ if (match_words || match_lines)
|
|
+ {
|
|
+ /* In the whole-word case, we use the pattern:
|
|
+ (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
|
|
+ In the whole-line case, we use the pattern:
|
|
+ ^(userpattern)$. */
|
|
+
|
|
+ static char const line_beg[] = "^(";
|
|
+ static char const line_end[] = ")$";
|
|
+ static char const word_beg[] = "(^|[^[:alnum:]_])(";
|
|
+ static char const word_end[] = ")([^[:alnum:]_]|$)";
|
|
+ char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
|
|
+ size_t i;
|
|
+ strcpy (n, match_lines ? line_beg : word_beg);
|
|
+ i = strlen(n);
|
|
+ memcpy (n + i, pattern, size);
|
|
+ i += size;
|
|
+ strcpy (n + i, match_lines ? line_end : word_end);
|
|
+ i += strlen (n + i);
|
|
+ pattern = n;
|
|
+ size = i;
|
|
+ }
|
|
+
|
|
+ dfacomp (pattern, size, &dfa, 1);
|
|
+ kwsmusts ();
|
|
+}
|
|
+
|
|
+static size_t
|
|
+EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
|
|
+{
|
|
+ register char const *buflim, *beg, *end;
|
|
+ char eol = eolbyte;
|
|
+ int backref, start, len;
|
|
+ struct kwsmatch kwsm;
|
|
+ size_t i;
|
|
+#ifdef MBS_SUPPORT
|
|
+ char *mb_properties = NULL;
|
|
+#endif /* MBS_SUPPORT */
|
|
+
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (MB_CUR_MAX > 1 && kwset)
|
|
+ mb_properties = check_multibyte_string(buf, size);
|
|
+#endif /* MBS_SUPPORT */
|
|
+
|
|
+ buflim = buf + size;
|
|
+
|
|
+ for (beg = end = buf; end < buflim; beg = end)
|
|
+ {
|
|
+ if (!exact)
|
|
+ {
|
|
+ if (kwset)
|
|
+ {
|
|
+ /* Find a possible match using the KWset matcher. */
|
|
+ size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
|
|
+ if (offset == (size_t) -1)
|
|
+ goto failure;
|
|
+ beg += offset;
|
|
+ /* Narrow down to the line containing the candidate, and
|
|
+ run it through DFA. */
|
|
+ end = memchr(beg, eol, buflim - beg);
|
|
+ end++;
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
|
|
+ continue;
|
|
+#endif
|
|
+ while (beg > buf && beg[-1] != eol)
|
|
+ --beg;
|
|
+ if (kwsm.index < kwset_exact_matches)
|
|
+ goto success_in_beg_and_end;
|
|
+ if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
|
|
+ continue;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ /* No good fixed strings; start with DFA. */
|
|
+ size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
|
|
+ if (offset == (size_t) -1)
|
|
+ break;
|
|
+ /* Narrow down to the line we've found. */
|
|
+ beg += offset;
|
|
+ end = memchr (beg, eol, buflim - beg);
|
|
+ end++;
|
|
+ while (beg > buf && beg[-1] != eol)
|
|
+ --beg;
|
|
+ }
|
|
+ /* Successful, no backreferences encountered! */
|
|
+ if (!backref)
|
|
+ goto success_in_beg_and_end;
|
|
+ }
|
|
+ else
|
|
+ end = beg + size;
|
|
+
|
|
+ /* If we've made it to this point, this means DFA has seen
|
|
+ a probable match, and we need to run it through Regex. */
|
|
+ for (i = 0; i < pcount; i++)
|
|
+ {
|
|
+ patterns[i].regexbuf.not_eol = 0;
|
|
+ if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
|
|
+ end - beg - 1, 0,
|
|
+ end - beg - 1, &(patterns[i].regs))))
|
|
+ {
|
|
+ len = patterns[i].regs.end[0] - start;
|
|
+ if (exact && !match_words)
|
|
+ goto success_in_start_and_len;
|
|
+ if ((!match_lines && !match_words)
|
|
+ || (match_lines && len == end - beg - 1))
|
|
+ goto success_in_beg_and_end;
|
|
+ /* If -w, check if the match aligns with word boundaries.
|
|
+ We do this iteratively because:
|
|
+ (a) the line may contain more than one occurrence of the
|
|
+ pattern, and
|
|
+ (b) Several alternatives in the pattern might be valid at a
|
|
+ given point, and we may need to consider a shorter one to
|
|
+ find a word boundary. */
|
|
+ if (match_words)
|
|
+ while (start >= 0)
|
|
+ {
|
|
+ if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
|
|
+ && (len == end - beg - 1
|
|
+ || !WCHAR ((unsigned char) beg[start + len])))
|
|
+ goto success_in_beg_and_end;
|
|
+ if (len > 0)
|
|
+ {
|
|
+ /* Try a shorter length anchored at the same place. */
|
|
+ --len;
|
|
+ patterns[i].regexbuf.not_eol = 1;
|
|
+ len = re_match (&(patterns[i].regexbuf), beg,
|
|
+ start + len, start,
|
|
+ &(patterns[i].regs));
|
|
+ }
|
|
+ if (len <= 0)
|
|
+ {
|
|
+ /* Try looking further on. */
|
|
+ if (start == end - beg - 1)
|
|
+ break;
|
|
+ ++start;
|
|
+ patterns[i].regexbuf.not_eol = 0;
|
|
+ start = re_search (&(patterns[i].regexbuf), beg,
|
|
+ end - beg - 1,
|
|
+ start, end - beg - 1 - start,
|
|
+ &(patterns[i].regs));
|
|
+ len = patterns[i].regs.end[0] - start;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ } /* for Regex patterns. */
|
|
+ } /* for (beg = end ..) */
|
|
+
|
|
+ failure:
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (MB_CUR_MAX > 1 && mb_properties)
|
|
+ free (mb_properties);
|
|
+#endif /* MBS_SUPPORT */
|
|
+ return (size_t) -1;
|
|
+
|
|
+ success_in_beg_and_end:
|
|
+ len = end - beg;
|
|
+ start = beg - buf;
|
|
+ /* FALLTHROUGH */
|
|
+
|
|
+ success_in_start_and_len:
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (MB_CUR_MAX > 1 && mb_properties)
|
|
+ free (mb_properties);
|
|
+#endif /* MBS_SUPPORT */
|
|
+ *match_size = len;
|
|
+ return start;
|
|
+}
|
|
+
|
|
+static void
|
|
+Fcompile (char const *pattern, size_t size)
|
|
+{
|
|
+ char const *beg, *lim, *err;
|
|
+
|
|
+ kwsinit ();
|
|
+ beg = pattern;
|
|
+ do
|
|
+ {
|
|
+ for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
|
|
+ ;
|
|
+ if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
|
|
+ error (2, 0, err);
|
|
+ if (lim < pattern + size)
|
|
+ ++lim;
|
|
+ beg = lim;
|
|
+ }
|
|
+ while (beg < pattern + size);
|
|
+
|
|
+ if ((err = kwsprep (kwset)) != 0)
|
|
+ error (2, 0, err);
|
|
+}
|
|
+
|
|
+static size_t
|
|
+Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
|
|
+{
|
|
+ register char const *beg, *try, *end;
|
|
+ register size_t len;
|
|
+ char eol = eolbyte;
|
|
+ struct kwsmatch kwsmatch;
|
|
+#ifdef MBS_SUPPORT
|
|
+ char *mb_properties;
|
|
+ if (MB_CUR_MAX > 1)
|
|
+ mb_properties = check_multibyte_string (buf, size);
|
|
+#endif /* MBS_SUPPORT */
|
|
+
|
|
+ for (beg = buf; beg <= buf + size; ++beg)
|
|
+ {
|
|
+ size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
|
|
+ if (offset == (size_t) -1)
|
|
+ goto failure;
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
|
|
+ continue; /* It is a part of multibyte character. */
|
|
+#endif /* MBS_SUPPORT */
|
|
+ beg += offset;
|
|
+ len = kwsmatch.size[0];
|
|
+ if (exact && !match_words)
|
|
+ goto success_in_beg_and_len;
|
|
+ if (match_lines)
|
|
+ {
|
|
+ if (beg > buf && beg[-1] != eol)
|
|
+ continue;
|
|
+ if (beg + len < buf + size && beg[len] != eol)
|
|
+ continue;
|
|
+ goto success;
|
|
+ }
|
|
+ else if (match_words)
|
|
+ for (try = beg; len; )
|
|
+ {
|
|
+ if (try > buf && WCHAR((unsigned char) try[-1]))
|
|
+ break;
|
|
+ if (try + len < buf + size && WCHAR((unsigned char) try[len]))
|
|
+ {
|
|
+ offset = kwsexec (kwset, beg, --len, &kwsmatch);
|
|
+ if (offset == (size_t) -1)
|
|
+ {
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (MB_CUR_MAX > 1)
|
|
+ free (mb_properties);
|
|
+#endif /* MBS_SUPPORT */
|
|
+ return offset;
|
|
+ }
|
|
+ try = beg + offset;
|
|
+ len = kwsmatch.size[0];
|
|
+ }
|
|
+ else
|
|
+ goto success;
|
|
+ }
|
|
+ else
|
|
+ goto success;
|
|
+ }
|
|
+
|
|
+ failure:
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (MB_CUR_MAX > 1)
|
|
+ free (mb_properties);
|
|
+#endif /* MBS_SUPPORT */
|
|
+ return -1;
|
|
+
|
|
+ success:
|
|
+ end = memchr (beg + len, eol, (buf + size) - (beg + len));
|
|
+ end++;
|
|
+ while (buf < beg && beg[-1] != eol)
|
|
+ --beg;
|
|
+ len = end - beg;
|
|
+ /* FALLTHROUGH */
|
|
+
|
|
+ success_in_beg_and_len:
|
|
+ *match_size = len;
|
|
+#ifdef MBS_SUPPORT
|
|
+ if (MB_CUR_MAX > 1)
|
|
+ free (mb_properties);
|
|
+#endif /* MBS_SUPPORT */
|
|
+ return beg - buf;
|
|
+}
|
|
+
|
|
+#if HAVE_LIBPCRE
|
|
+/* Compiled internal form of a Perl regular expression. */
|
|
+static pcre *cre;
|
|
+
|
|
+/* Additional information about the pattern. */
|
|
+static pcre_extra *extra;
|
|
+#endif
|
|
+
|
|
+static void
|
|
+Pcompile (char const *pattern, size_t size)
|
|
+{
|
|
+#if !HAVE_LIBPCRE
|
|
+ error (2, 0, _("The -P option is not supported"));
|
|
+#else
|
|
+ int e;
|
|
+ char const *ep;
|
|
+ char *re = xmalloc (4 * size + 7);
|
|
+ int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
|
|
+ char const *patlim = pattern + size;
|
|
+ char *n = re;
|
|
+ char const *p;
|
|
+ char const *pnul;
|
|
+
|
|
+ /* FIXME: Remove this restriction. */
|
|
+ if (eolbyte != '\n')
|
|
+ error (2, 0, _("The -P and -z options cannot be combined"));
|
|
+
|
|
+ *n = '\0';
|
|
+ if (match_lines)
|
|
+ strcpy (n, "^(");
|
|
+ if (match_words)
|
|
+ strcpy (n, "\\b(");
|
|
+ n += strlen (n);
|
|
+
|
|
+ /* The PCRE interface doesn't allow NUL bytes in the pattern, so
|
|
+ replace each NUL byte in the pattern with the four characters
|
|
+ "\000", removing a preceding backslash if there is an odd
|
|
+ number of backslashes before the NUL.
|
|
+
|
|
+ FIXME: This method does not work with some multibyte character
|
|
+ encodings, notably Shift-JIS, where a multibyte character can end
|
|
+ in a backslash byte. */
|
|
+ for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
|
|
+ {
|
|
+ memcpy (n, p, pnul - p);
|
|
+ n += pnul - p;
|
|
+ for (p = pnul; pattern < p && p[-1] == '\\'; p--)
|
|
+ continue;
|
|
+ n -= (pnul - p) & 1;
|
|
+ strcpy (n, "\\000");
|
|
+ n += 4;
|
|
+ }
|
|
+
|
|
+ memcpy (n, p, patlim - p);
|
|
+ n += patlim - p;
|
|
+ *n = '\0';
|
|
+ if (match_words)
|
|
+ strcpy (n, ")\\b");
|
|
+ if (match_lines)
|
|
+ strcpy (n, ")$");
|
|
+
|
|
+ cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
|
|
+ if (!cre)
|
|
+ error (2, 0, ep);
|
|
+
|
|
+ extra = pcre_study (cre, 0, &ep);
|
|
+ if (ep)
|
|
+ error (2, 0, ep);
|
|
+
|
|
+ free (re);
|
|
+#endif
|
|
+}
|
|
+
|
|
+static size_t
|
|
+Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
|
|
+{
|
|
+#if !HAVE_LIBPCRE
|
|
+ abort ();
|
|
+ return -1;
|
|
+#else
|
|
+ /* This array must have at least two elements; everything after that
|
|
+ is just for performance improvement in pcre_exec. */
|
|
+ int sub[300];
|
|
+
|
|
+ int e = pcre_exec (cre, extra, buf, size, 0, 0,
|
|
+ sub, sizeof sub / sizeof *sub);
|
|
+
|
|
+ if (e <= 0)
|
|
+ {
|
|
+ switch (e)
|
|
+ {
|
|
+ case PCRE_ERROR_NOMATCH:
|
|
+ return -1;
|
|
+
|
|
+ case PCRE_ERROR_NOMEMORY:
|
|
+ error (2, 0, _("Memory exhausted"));
|
|
+
|
|
+ default:
|
|
+ abort ();
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ /* Narrow down to the line we've found. */
|
|
+ char const *beg = buf + sub[0];
|
|
+ char const *end = buf + sub[1];
|
|
+ char const *buflim = buf + size;
|
|
+ char eol = eolbyte;
|
|
+ if (!exact)
|
|
+ {
|
|
+ end = memchr (end, eol, buflim - end);
|
|
+ end++;
|
|
+ while (buf < beg && beg[-1] != eol)
|
|
+ --beg;
|
|
+ }
|
|
+
|
|
+ *match_size = end - beg;
|
|
+ return beg - buf;
|
|
+ }
|
|
+#endif
|
|
+}
|
|
+
|
|
+struct matcher const matchers[] = {
|
|
+ { "default", Gcompile, EGexecute },
|
|
+ { "grep", Gcompile, EGexecute },
|
|
+ { "egrep", Ecompile, EGexecute },
|
|
+ { "awk", Ecompile, EGexecute },
|
|
+ { "fgrep", Fcompile, Fexecute },
|
|
+ { "perl", Pcompile, Pexecute },
|
|
+ { "", 0, 0 },
|
|
+};
|
|
diff -urN grep-2.5.1a.orig/tests/fmbtest.sh grep-2.5.1a/tests/fmbtest.sh
|
|
--- grep-2.5.1a.orig/tests/fmbtest.sh 1970-01-01 05:00:00.000000000 +0500
|
|
+++ grep-2.5.1a/tests/fmbtest.sh 2005-10-23 09:51:12.000000000 +0600
|
|
@@ -0,0 +1,111 @@
|
|
+#!/bin/sh
|
|
+
|
|
+: ${srcdir=.}
|
|
+
|
|
+# If cs_CZ.UTF-8 locale doesn't work, skip this test silently
|
|
+LC_ALL=cs_CZ.UTF-8 locale -k LC_CTYPE 2>/dev/null | ${GREP} -q charmap.*UTF-8 \
|
|
+ || exit 77
|
|
+
|
|
+failures=0
|
|
+
|
|
+cat > csinput <<EOF
|
|
+01 Žluťoučká číše
|
|
+ČíŠE 02
|
|
+03 Z číší Čiší cosi
|
|
+04 Čí
|
|
+Še 05
|
|
+06 ČČČČČČČíšČÍŠčíš
|
|
+07 ČČČ ČČČČíšČÍŠčíšEEEE
|
|
+čAs 08
|
|
+09Čapka
|
|
+10ČaSy se měnÍ
|
|
+ČÍšE11
|
|
+Čas12
|
|
+𝇕ČÍšE𝇓13
|
|
+ŽČÍšE𝇓14
|
|
+𝇕ČÍšEŽ15
|
|
+ŽČÍšEŽ16
|
|
+ČÍšE𝇓17
|
|
+ČÍšEŽ18
|
|
+19𝇕ČÍše
|
|
+20ŽČÍše
|
|
+EOF
|
|
+cat > cspatfile <<EOF
|
|
+ČÍšE
|
|
+Čas
|
|
+EOF
|
|
+
|
|
+for mode in F G E; do
|
|
+
|
|
+test1="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode} -f cspatfile csinput \
|
|
+ | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
|
|
+if test "$test1" != "11 12 13 14 15 16 17 18"; then
|
|
+ echo "Test #1 ${mode} failed: $test1"
|
|
+ failures=1
|
|
+fi
|
|
+
|
|
+test2="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}i -f cspatfile csinput \
|
|
+ | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
|
|
+if test "$test2" != "01 02 07 08 10 11 12 13 14 15 16 17 18 19 20"; then
|
|
+ echo "Test #2 ${mode} failed: $test2"
|
|
+ failures=1
|
|
+fi
|
|
+
|
|
+test3="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}i -e 'ČÍšE' -e 'Čas' csinput \
|
|
+ | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
|
|
+if test "$test3" != "01 02 07 08 10 11 12 13 14 15 16 17 18 19 20"; then
|
|
+ echo "Test #3 ${mode} failed: $test3"
|
|
+ failures=1
|
|
+fi
|
|
+
|
|
+test4="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}iw -f cspatfile csinput \
|
|
+ | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
|
|
+if test "$test4" != "01 02 08 13 17 19"; then
|
|
+ echo "Test #4 ${mode} failed: $test4"
|
|
+ failures=1
|
|
+fi
|
|
+
|
|
+done
|
|
+
|
|
+# Test that -F --color=always prefers longer matches.
|
|
+test5="`echo 'Cosi tu ČišÍ...' \
|
|
+ | LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -Fi -e 'čiš' -e 'čiší'`"
|
|
+if echo "$test5" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*mČišÍ.*\[.*m\(.\[K\)\?\.\.\.'; then
|
|
+ :
|
|
+else
|
|
+ echo "Test #5 F failed: $test5"
|
|
+ failures=1
|
|
+fi
|
|
+
|
|
+for mode in G E; do
|
|
+
|
|
+# Test that -{G,E} --color=always prefers earlier pattern matches.
|
|
+test6="`echo 'Cosi tu ČišÍ...' \
|
|
+ | LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -${mode}i -e 'čiš' -e 'čiší'`"
|
|
+if echo "$test6" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*mČiš.*\[.*m\(.\[K\)\?Í\.\.\.'; then
|
|
+ :
|
|
+else
|
|
+ echo "Test #6 ${mode} failed: $test6"
|
|
+ failures=1
|
|
+fi
|
|
+
|
|
+# Test that -{G,E} --color=always prefers earlier pattern matches.
|
|
+test7="`echo 'Cosi tu ČišÍ...' \
|
|
+ | LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -${mode}i -e 'čiší' -e 'čiš'`"
|
|
+if echo "$test7" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*mČišÍ.*\[.*m\(.\[K\)\?\.\.\.'; then
|
|
+ :
|
|
+else
|
|
+ echo "Test #7 ${mode} failed: $test7"
|
|
+ failures=1
|
|
+fi
|
|
+
|
|
+test8="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}i -e 'Č.šE' -e 'Č[a-f]s' csinput \
|
|
+ | LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
|
|
+if test "$test8" != "01 02 07 08 10 11 12 13 14 15 16 17 18 19 20"; then
|
|
+ echo "Test #8 ${mode} failed: $test8"
|
|
+ failures=1
|
|
+fi
|
|
+
|
|
+done
|
|
+
|
|
+exit $failures
|
|
diff -urN grep-2.5.1a.orig/tests/Makefile.am grep-2.5.1a/tests/Makefile.am
|
|
--- grep-2.5.1a.orig/tests/Makefile.am 2001-03-07 09:11:27.000000000 +0500
|
|
+++ grep-2.5.1a/tests/Makefile.am 2005-10-23 09:51:12.000000000 +0600
|
|
@@ -3,7 +3,8 @@
|
|
AWK=@AWK@
|
|
|
|
TESTS = warning.sh khadafy.sh spencer1.sh bre.sh ere.sh \
|
|
- status.sh empty.sh options.sh backref.sh file.sh
|
|
+ status.sh empty.sh options.sh backref.sh file.sh \
|
|
+ fmbtest.sh
|
|
EXTRA_DIST = $(TESTS) \
|
|
khadafy.lines khadafy.regexp \
|
|
spencer1.awk spencer1.tests \
|
|
diff -urN grep-2.5.1a.orig/tests/Makefile.in grep-2.5.1a/tests/Makefile.in
|
|
--- grep-2.5.1a.orig/tests/Makefile.in 2002-03-26 21:09:36.000000000 +0500
|
|
+++ grep-2.5.1a/tests/Makefile.in 2005-10-23 09:51:13.000000000 +0600
|
|
@@ -97,7 +97,8 @@
|
|
AWK = @AWK@
|
|
|
|
TESTS = warning.sh khadafy.sh spencer1.sh bre.sh ere.sh \
|
|
- status.sh empty.sh options.sh backref.sh file.sh
|
|
+ status.sh empty.sh options.sh backref.sh file.sh \
|
|
+ fmbtest.sh
|
|
|
|
EXTRA_DIST = $(TESTS) \
|
|
khadafy.lines khadafy.regexp \
|