|
|
e63663 |
diff --git a/src/cut.c b/src/cut.c
|
|
|
e63663 |
index 7ab6be4..022d0ad 100644
|
|
|
e63663 |
--- a/src/cut.c
|
|
|
e63663 |
+++ b/src/cut.c
|
|
|
e63663 |
@@ -28,6 +28,11 @@
|
|
|
e63663 |
#include <assert.h>
|
|
|
e63663 |
#include <getopt.h>
|
|
|
e63663 |
#include <sys/types.h>
|
|
|
e63663 |
+
|
|
|
e63663 |
+/* Get mbstate_t, mbrtowc(). */
|
|
|
e63663 |
+#if HAVE_WCHAR_H
|
|
|
e63663 |
+# include <wchar.h>
|
|
|
e63663 |
+#endif
|
|
|
e63663 |
#include "system.h"
|
|
|
e63663 |
|
|
|
e63663 |
#include "error.h"
|
|
|
e63663 |
@@ -38,6 +43,18 @@
|
|
|
e63663 |
|
|
|
e63663 |
#include "set-fields.h"
|
|
|
e63663 |
|
|
|
e63663 |
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
|
|
|
e63663 |
+ installation; work around this configuration error. */
|
|
|
e63663 |
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
|
|
|
e63663 |
+# undef MB_LEN_MAX
|
|
|
e63663 |
+# define MB_LEN_MAX 16
|
|
|
e63663 |
+#endif
|
|
|
e63663 |
+
|
|
|
e63663 |
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
|
|
|
e63663 |
+#if HAVE_MBRTOWC && defined mbstate_t
|
|
|
e63663 |
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
|
|
|
e63663 |
+#endif
|
|
|
e63663 |
+
|
|
|
e63663 |
/* The official name of this program (e.g., no 'g' prefix). */
|
|
|
e63663 |
#define PROGRAM_NAME "cut"
|
|
|
e63663 |
|
|
|
e63663 |
@@ -54,6 +71,52 @@
|
|
|
e63663 |
} \
|
|
|
e63663 |
while (0)
|
|
|
e63663 |
|
|
|
e63663 |
+/* Refill the buffer BUF to get a multibyte character. */
|
|
|
e63663 |
+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
|
|
|
e63663 |
+ do \
|
|
|
e63663 |
+ { \
|
|
|
e63663 |
+ if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
|
|
|
e63663 |
+ { \
|
|
|
e63663 |
+ memmove (BUF, BUFPOS, BUFLEN); \
|
|
|
e63663 |
+ BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
|
|
|
e63663 |
+ BUFPOS = BUF; \
|
|
|
e63663 |
+ } \
|
|
|
e63663 |
+ } \
|
|
|
e63663 |
+ while (0)
|
|
|
e63663 |
+
|
|
|
e63663 |
+/* Get the wide character at BUFPOS. BUFPOS itself is not advanced.
|
|
|
e63663 |
+ If byte sequence is not valid as a character, CONVFAIL is true. Otherwise false. */
|
|
|
e63663 |
+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
|
|
|
e63663 |
+ do \
|
|
|
e63663 |
+ { \
|
|
|
e63663 |
+ mbstate_t state_bak; \
|
|
|
e63663 |
+ \
|
|
|
e63663 |
+ if (BUFLEN < 1) \
|
|
|
e63663 |
+ { \
|
|
|
e63663 |
+ WC = WEOF; \
|
|
|
e63663 |
+ break; \
|
|
|
e63663 |
+ } \
|
|
|
e63663 |
+ \
|
|
|
e63663 |
+ /* Get a wide character. */ \
|
|
|
e63663 |
+ CONVFAIL = false; \
|
|
|
e63663 |
+ state_bak = STATE; \
|
|
|
e63663 |
+ MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \
|
|
|
e63663 |
+ \
|
|
|
e63663 |
+ switch (MBLENGTH) \
|
|
|
e63663 |
+ { \
|
|
|
e63663 |
+ case (size_t)-1: \
|
|
|
e63663 |
+ case (size_t)-2: \
|
|
|
e63663 |
+ CONVFAIL = true; \
|
|
|
e63663 |
+ STATE = state_bak; \
|
|
|
e63663 |
+ /* Fall through. */ \
|
|
|
e63663 |
+ \
|
|
|
e63663 |
+ case 0: \
|
|
|
e63663 |
+ MBLENGTH = 1; \
|
|
|
e63663 |
+ break; \
|
|
|
e63663 |
+ } \
|
|
|
e63663 |
+ } \
|
|
|
e63663 |
+ while (0)
|
|
|
e63663 |
+
|
|
|
e63663 |
|
|
|
e63663 |
/* Pointer inside RP. When checking if a byte or field is selected
|
|
|
e63663 |
by a finite range, we check if it is between CURRENT_RP.LO
|
|
|
e63663 |
@@ -61,6 +124,9 @@
|
|
|
e63663 |
CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
|
|
|
e63663 |
static struct field_range_pair *current_rp;
|
|
|
e63663 |
|
|
|
e63663 |
+/* Length of the delimiter given as argument to -d. */
|
|
|
e63663 |
+size_t delimlen;
|
|
|
e63663 |
+
|
|
|
e63663 |
/* This buffer is used to support the semantics of the -s option
|
|
|
e63663 |
(or lack of same) when the specified field list includes (does
|
|
|
e63663 |
not include) the first field. In both of those cases, the entire
|
|
|
e63663 |
@@ -77,15 +143,25 @@ enum operating_mode
|
|
|
e63663 |
{
|
|
|
e63663 |
undefined_mode,
|
|
|
e63663 |
|
|
|
e63663 |
- /* Output characters that are in the given bytes. */
|
|
|
e63663 |
+ /* Output bytes that are at the given positions. */
|
|
|
e63663 |
byte_mode,
|
|
|
e63663 |
|
|
|
e63663 |
+ /* Output characters that are at the given positions. */
|
|
|
e63663 |
+ character_mode,
|
|
|
e63663 |
+
|
|
|
e63663 |
/* Output the given delimiter-separated fields. */
|
|
|
e63663 |
field_mode
|
|
|
e63663 |
};
|
|
|
e63663 |
|
|
|
e63663 |
static enum operating_mode operating_mode;
|
|
|
e63663 |
|
|
|
e63663 |
+/* If nonzero, when in byte mode, don't split multibyte characters. */
|
|
|
e63663 |
+static int byte_mode_character_aware;
|
|
|
e63663 |
+
|
|
|
e63663 |
+/* If nonzero, the single-byte locale functions are used even
|
|
|
e63663 |
+ if this program runs in a multibyte locale. */
|
|
|
e63663 |
+static int force_singlebyte_mode;
|
|
|
e63663 |
+
|
|
|
e63663 |
/* If true do not output lines containing no delimiter characters.
|
|
|
e63663 |
Otherwise, all such lines are printed. This option is valid only
|
|
|
e63663 |
with field mode. */
|
|
|
e63663 |
@@ -97,6 +173,9 @@ static bool complement;
|
|
|
e63663 |
|
|
|
e63663 |
/* The delimiter character for field mode. */
|
|
|
e63663 |
static unsigned char delim;
|
|
|
e63663 |
+#if HAVE_WCHAR_H
|
|
|
e63663 |
+static wchar_t wcdelim;
|
|
|
e63663 |
+#endif
|
|
|
e63663 |
|
|
|
e63663 |
/* The delimiter for each line/record. */
|
|
|
e63663 |
static unsigned char line_delim = '\n';
|
|
|
e63663 |
@@ -164,7 +243,7 @@ Print selected parts of lines from each FILE to standard output.\n\
|
|
|
e63663 |
-f, --fields=LIST select only these fields; also print any line\n\
|
|
|
e63663 |
that contains no delimiter character, unless\n\
|
|
|
e63663 |
the -s option is specified\n\
|
|
|
e63663 |
- -n (ignored)\n\
|
|
|
e63663 |
+ -n with -b: don't split multibyte characters\n\
|
|
|
e63663 |
"), stdout);
|
|
|
e63663 |
fputs (_("\
|
|
|
e63663 |
--complement complement the set of selected bytes, characters\n\
|
|
|
e63663 |
@@ -280,6 +359,82 @@ cut_bytes (FILE *stream)
|
|
|
e63663 |
}
|
|
|
e63663 |
}
|
|
|
e63663 |
|
|
|
e63663 |
+#if HAVE_MBRTOWC
|
|
|
e63663 |
+/* This function is used in the following two cases.
|
|
|
e63663 |
+
|
|
|
e63663 |
+ 1. Read from the stream STREAM, printing to standard output any selected
|
|
|
e63663 |
+ characters.
|
|
|
e63663 |
+
|
|
|
e63663 |
+ 2. Read from stream STREAM, printing to standard output any selected bytes,
|
|
|
e63663 |
+ without splitting multibyte characters. */
|
|
|
e63663 |
+
|
|
|
e63663 |
+static void
|
|
|
e63663 |
+cut_characters_or_cut_bytes_no_split (FILE *stream)
|
|
|
e63663 |
+{
|
|
|
e63663 |
+ uintmax_t idx; /* number of bytes or characters in the line so far. */
|
|
|
e63663 |
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
|
|
|
e63663 |
+ char *bufpos; /* Next read position of BUF. */
|
|
|
e63663 |
+ size_t buflen; /* The length of the byte sequence in buf. */
|
|
|
e63663 |
+ wint_t wc; /* A gotten wide character. */
|
|
|
e63663 |
+ size_t mblength; /* The byte size of a multibyte character which shows
|
|
|
e63663 |
+ as the same character as WC. */
|
|
|
e63663 |
+ mbstate_t state; /* State of the stream. */
|
|
|
e63663 |
+ bool convfail = false; /* true, when conversion failed. Otherwise false. */
|
|
|
e63663 |
+ /* Whether to begin printing delimiters between ranges for the current line.
|
|
|
e63663 |
+ Set after we've begun printing data corresponding to the first range. */
|
|
|
e63663 |
+ bool print_delimiter = false;
|
|
|
e63663 |
+
|
|
|
e63663 |
+ idx = 0;
|
|
|
e63663 |
+ buflen = 0;
|
|
|
e63663 |
+ bufpos = buf;
|
|
|
e63663 |
+ memset (&state, '\0', sizeof(mbstate_t));
|
|
|
e63663 |
+
|
|
|
e63663 |
+ current_rp = frp;
|
|
|
e63663 |
+
|
|
|
e63663 |
+ while (1)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ REFILL_BUFFER (buf, bufpos, buflen, stream);
|
|
|
e63663 |
+
|
|
|
e63663 |
+ GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
|
|
|
e63663 |
+ (void) convfail; /* ignore unused */
|
|
|
e63663 |
+
|
|
|
e63663 |
+ if (wc == WEOF)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ if (idx > 0)
|
|
|
e63663 |
+ putchar (line_delim);
|
|
|
e63663 |
+ break;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ else if (wc == line_delim)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ putchar (line_delim);
|
|
|
e63663 |
+ idx = 0;
|
|
|
e63663 |
+ print_delimiter = false;
|
|
|
e63663 |
+ current_rp = frp;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ else
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ next_item (&idx);
|
|
|
e63663 |
+ if (print_kth (idx))
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ if (output_delimiter_specified)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ if (print_delimiter && is_range_start_index (idx))
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ fwrite (output_delimiter_string, sizeof (char),
|
|
|
e63663 |
+ output_delimiter_length, stdout);
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ print_delimiter = true;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ fwrite (bufpos, mblength, sizeof(char), stdout);
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+
|
|
|
e63663 |
+ buflen -= mblength;
|
|
|
e63663 |
+ bufpos += mblength;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+}
|
|
|
e63663 |
+#endif
|
|
|
e63663 |
+
|
|
|
e63663 |
/* Read from stream STREAM, printing to standard output any selected fields. */
|
|
|
e63663 |
|
|
|
e63663 |
static void
|
|
|
e63663 |
@@ -425,13 +580,211 @@ cut_fields (FILE *stream)
|
|
|
e63663 |
}
|
|
|
e63663 |
}
|
|
|
e63663 |
|
|
|
e63663 |
+#if HAVE_MBRTOWC
|
|
|
e63663 |
+static void
|
|
|
e63663 |
+cut_fields_mb (FILE *stream)
|
|
|
e63663 |
+{
|
|
|
e63663 |
+ int c;
|
|
|
e63663 |
+ uintmax_t field_idx;
|
|
|
e63663 |
+ int found_any_selected_field;
|
|
|
e63663 |
+ int buffer_first_field;
|
|
|
e63663 |
+ int empty_input;
|
|
|
e63663 |
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
|
|
|
e63663 |
+ char *bufpos; /* Next read position of BUF. */
|
|
|
e63663 |
+ size_t buflen; /* The length of the byte sequence in buf. */
|
|
|
e63663 |
+ wint_t wc = 0; /* A gotten wide character. */
|
|
|
e63663 |
+ size_t mblength; /* The byte size of a multibyte character which shows
|
|
|
e63663 |
+ as the same character as WC. */
|
|
|
e63663 |
+ mbstate_t state; /* State of the stream. */
|
|
|
e63663 |
+ bool convfail = false; /* true, when conversion failed. Otherwise false. */
|
|
|
e63663 |
+
|
|
|
e63663 |
+ current_rp = frp;
|
|
|
e63663 |
+
|
|
|
e63663 |
+ found_any_selected_field = 0;
|
|
|
e63663 |
+ field_idx = 1;
|
|
|
e63663 |
+ bufpos = buf;
|
|
|
e63663 |
+ buflen = 0;
|
|
|
e63663 |
+ memset (&state, '\0', sizeof(mbstate_t));
|
|
|
e63663 |
+
|
|
|
e63663 |
+ c = getc (stream);
|
|
|
e63663 |
+ empty_input = (c == EOF);
|
|
|
e63663 |
+ if (c != EOF)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ ungetc (c, stream);
|
|
|
e63663 |
+ wc = 0;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ else
|
|
|
e63663 |
+ wc = WEOF;
|
|
|
e63663 |
+
|
|
|
e63663 |
+ /* To support the semantics of the -s flag, we may have to buffer
|
|
|
e63663 |
+ all of the first field to determine whether it is `delimited.'
|
|
|
e63663 |
+ But that is unnecessary if all non-delimited lines must be printed
|
|
|
e63663 |
+ and the first field has been selected, or if non-delimited lines
|
|
|
e63663 |
+ must be suppressed and the first field has *not* been selected.
|
|
|
e63663 |
+ That is because a non-delimited line has exactly one field. */
|
|
|
e63663 |
+ buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
|
|
|
e63663 |
+
|
|
|
e63663 |
+ while (1)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ if (field_idx == 1 && buffer_first_field)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ int len = 0;
|
|
|
e63663 |
+
|
|
|
e63663 |
+ while (1)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ REFILL_BUFFER (buf, bufpos, buflen, stream);
|
|
|
e63663 |
+
|
|
|
e63663 |
+ GET_NEXT_WC_FROM_BUFFER
|
|
|
e63663 |
+ (wc, bufpos, buflen, mblength, state, convfail);
|
|
|
e63663 |
+
|
|
|
e63663 |
+ if (wc == WEOF)
|
|
|
e63663 |
+ break;
|
|
|
e63663 |
+
|
|
|
e63663 |
+ field_1_buffer = xrealloc (field_1_buffer, len + mblength);
|
|
|
e63663 |
+ memcpy (field_1_buffer + len, bufpos, mblength);
|
|
|
e63663 |
+ len += mblength;
|
|
|
e63663 |
+ buflen -= mblength;
|
|
|
e63663 |
+ bufpos += mblength;
|
|
|
e63663 |
+
|
|
|
e63663 |
+ if (!convfail && (wc == line_delim || wc == wcdelim))
|
|
|
e63663 |
+ break;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+
|
|
|
e63663 |
+ if (len <= 0 && wc == WEOF)
|
|
|
e63663 |
+ break;
|
|
|
e63663 |
+
|
|
|
e63663 |
+ /* If the first field extends to the end of line (it is not
|
|
|
e63663 |
+ delimited) and we are printing all non-delimited lines,
|
|
|
e63663 |
+ print this one. */
|
|
|
e63663 |
+ if (convfail || (!convfail && wc != wcdelim))
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ if (suppress_non_delimited)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ /* Empty. */
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ else
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ fwrite (field_1_buffer, sizeof (char), len, stdout);
|
|
|
e63663 |
+ /* Make sure the output line is newline terminated. */
|
|
|
e63663 |
+ if (convfail || (!convfail && wc != line_delim))
|
|
|
e63663 |
+ putchar (line_delim);
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ continue;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+
|
|
|
e63663 |
+ if (print_kth (1))
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ /* Print the field, but not the trailing delimiter. */
|
|
|
e63663 |
+ fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
|
|
|
e63663 |
+ found_any_selected_field = 1;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ next_item (&field_idx);
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+
|
|
|
e63663 |
+ if (wc != WEOF)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ if (print_kth (field_idx))
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ if (found_any_selected_field)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ fwrite (output_delimiter_string, sizeof (char),
|
|
|
e63663 |
+ output_delimiter_length, stdout);
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ found_any_selected_field = 1;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+
|
|
|
e63663 |
+ while (1)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ REFILL_BUFFER (buf, bufpos, buflen, stream);
|
|
|
e63663 |
+
|
|
|
e63663 |
+ GET_NEXT_WC_FROM_BUFFER
|
|
|
e63663 |
+ (wc, bufpos, buflen, mblength, state, convfail);
|
|
|
e63663 |
+
|
|
|
e63663 |
+ if (wc == WEOF)
|
|
|
e63663 |
+ break;
|
|
|
e63663 |
+ else if (!convfail && (wc == wcdelim || wc == line_delim))
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ buflen -= mblength;
|
|
|
e63663 |
+ bufpos += mblength;
|
|
|
e63663 |
+ break;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+
|
|
|
e63663 |
+ if (print_kth (field_idx))
|
|
|
e63663 |
+ fwrite (bufpos, mblength, sizeof(char), stdout);
|
|
|
e63663 |
+
|
|
|
e63663 |
+ buflen -= mblength;
|
|
|
e63663 |
+ bufpos += mblength;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+
|
|
|
e63663 |
+ if ((!convfail || wc == line_delim) && buflen < 1)
|
|
|
e63663 |
+ wc = WEOF;
|
|
|
e63663 |
+
|
|
|
e63663 |
+ if (!convfail && wc == wcdelim)
|
|
|
e63663 |
+ next_item (&field_idx);
|
|
|
e63663 |
+ else if (wc == WEOF || (!convfail && wc == line_delim))
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ if (found_any_selected_field
|
|
|
e63663 |
+ || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
|
|
|
e63663 |
+ putchar (line_delim);
|
|
|
e63663 |
+ if (wc == WEOF)
|
|
|
e63663 |
+ break;
|
|
|
e63663 |
+ field_idx = 1;
|
|
|
e63663 |
+ current_rp = frp;
|
|
|
e63663 |
+ found_any_selected_field = 0;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+}
|
|
|
e63663 |
+#endif
|
|
|
e63663 |
+
|
|
|
e63663 |
static void
|
|
|
e63663 |
cut_stream (FILE *stream)
|
|
|
e63663 |
{
|
|
|
e63663 |
- if (operating_mode == byte_mode)
|
|
|
e63663 |
- cut_bytes (stream);
|
|
|
e63663 |
+#if HAVE_MBRTOWC
|
|
|
e63663 |
+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ switch (operating_mode)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ case byte_mode:
|
|
|
e63663 |
+ if (byte_mode_character_aware)
|
|
|
e63663 |
+ cut_characters_or_cut_bytes_no_split (stream);
|
|
|
e63663 |
+ else
|
|
|
e63663 |
+ cut_bytes (stream);
|
|
|
e63663 |
+ break;
|
|
|
e63663 |
+
|
|
|
e63663 |
+ case character_mode:
|
|
|
e63663 |
+ cut_characters_or_cut_bytes_no_split (stream);
|
|
|
e63663 |
+ break;
|
|
|
e63663 |
+
|
|
|
e63663 |
+ case field_mode:
|
|
|
e63663 |
+ if (delimlen == 1)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ /* Check if we have utf8 multibyte locale, so we can use this
|
|
|
e63663 |
+ optimization because of uniqueness of characters, which is
|
|
|
e63663 |
+ not true for e.g. SJIS */
|
|
|
e63663 |
+ char * loc = setlocale(LC_CTYPE, NULL);
|
|
|
e63663 |
+ if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") ||
|
|
|
e63663 |
+ strstr (loc, "UTF8") || strstr (loc, "utf8")))
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ cut_fields (stream);
|
|
|
e63663 |
+ break;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ cut_fields_mb (stream);
|
|
|
e63663 |
+ break;
|
|
|
e63663 |
+
|
|
|
e63663 |
+ default:
|
|
|
e63663 |
+ abort ();
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ }
|
|
|
e63663 |
else
|
|
|
e63663 |
- cut_fields (stream);
|
|
|
e63663 |
+#endif
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ if (operating_mode == field_mode)
|
|
|
e63663 |
+ cut_fields (stream);
|
|
|
e63663 |
+ else
|
|
|
e63663 |
+ cut_bytes (stream);
|
|
|
e63663 |
+ }
|
|
|
e63663 |
}
|
|
|
e63663 |
|
|
|
e63663 |
/* Process file FILE to standard output.
|
|
|
e63663 |
@@ -483,6 +836,7 @@ main (int argc, char **argv)
|
|
|
e63663 |
bool ok;
|
|
|
e63663 |
bool delim_specified = false;
|
|
|
e63663 |
char *spec_list_string IF_LINT ( = NULL);
|
|
|
e63663 |
+ char mbdelim[MB_LEN_MAX + 1];
|
|
|
e63663 |
|
|
|
e63663 |
initialize_main (&argc, &argv);
|
|
|
e63663 |
set_program_name (argv[0]);
|
|
|
e63663 |
@@ -505,7 +859,6 @@ main (int argc, char **argv)
|
|
|
e63663 |
switch (optc)
|
|
|
e63663 |
{
|
|
|
e63663 |
case 'b':
|
|
|
e63663 |
- case 'c':
|
|
|
e63663 |
/* Build the byte list. */
|
|
|
e63663 |
if (operating_mode != undefined_mode)
|
|
|
e63663 |
FATAL_ERROR (_("only one type of list may be specified"));
|
|
|
e63663 |
@@ -513,6 +866,14 @@ main (int argc, char **argv)
|
|
|
e63663 |
spec_list_string = optarg;
|
|
|
e63663 |
break;
|
|
|
e63663 |
|
|
|
e63663 |
+ case 'c':
|
|
|
e63663 |
+ /* Build the character list. */
|
|
|
e63663 |
+ if (operating_mode != undefined_mode)
|
|
|
e63663 |
+ FATAL_ERROR (_("only one type of list may be specified"));
|
|
|
e63663 |
+ operating_mode = character_mode;
|
|
|
e63663 |
+ spec_list_string = optarg;
|
|
|
e63663 |
+ break;
|
|
|
e63663 |
+
|
|
|
e63663 |
case 'f':
|
|
|
e63663 |
/* Build the field list. */
|
|
|
e63663 |
if (operating_mode != undefined_mode)
|
|
|
e63663 |
@@ -524,10 +885,38 @@ main (int argc, char **argv)
|
|
|
e63663 |
case 'd':
|
|
|
e63663 |
/* New delimiter. */
|
|
|
e63663 |
/* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
|
|
|
e63663 |
- if (optarg[0] != '\0' && optarg[1] != '\0')
|
|
|
e63663 |
- FATAL_ERROR (_("the delimiter must be a single character"));
|
|
|
e63663 |
- delim = optarg[0];
|
|
|
e63663 |
- delim_specified = true;
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+#if HAVE_MBRTOWC
|
|
|
e63663 |
+ if(MB_CUR_MAX > 1)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ mbstate_t state;
|
|
|
e63663 |
+
|
|
|
e63663 |
+ memset (&state, '\0', sizeof(mbstate_t));
|
|
|
e63663 |
+ delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
|
|
|
e63663 |
+
|
|
|
e63663 |
+ if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
|
|
|
e63663 |
+ ++force_singlebyte_mode;
|
|
|
e63663 |
+ else
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ delimlen = (delimlen < 1) ? 1 : delimlen;
|
|
|
e63663 |
+ if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
|
|
|
e63663 |
+ FATAL_ERROR (_("the delimiter must be a single character"));
|
|
|
e63663 |
+ memcpy (mbdelim, optarg, delimlen);
|
|
|
e63663 |
+ mbdelim[delimlen] = '\0';
|
|
|
e63663 |
+ if (delimlen == 1)
|
|
|
e63663 |
+ delim = *optarg;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+
|
|
|
e63663 |
+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
|
|
|
e63663 |
+#endif
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ if (optarg[0] != '\0' && optarg[1] != '\0')
|
|
|
e63663 |
+ FATAL_ERROR (_("the delimiter must be a single character"));
|
|
|
e63663 |
+ delim = (unsigned char) optarg[0];
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+ delim_specified = true;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
break;
|
|
|
e63663 |
|
|
|
e63663 |
case OUTPUT_DELIMITER_OPTION:
|
|
|
e63663 |
@@ -540,6 +929,7 @@ main (int argc, char **argv)
|
|
|
e63663 |
break;
|
|
|
e63663 |
|
|
|
e63663 |
case 'n':
|
|
|
e63663 |
+ byte_mode_character_aware = 1;
|
|
|
e63663 |
break;
|
|
|
e63663 |
|
|
|
e63663 |
case 's':
|
|
|
e63663 |
@@ -579,15 +969,34 @@ main (int argc, char **argv)
|
|
|
e63663 |
| (complement ? SETFLD_COMPLEMENT : 0) );
|
|
|
e63663 |
|
|
|
e63663 |
if (!delim_specified)
|
|
|
e63663 |
- delim = '\t';
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ delim = '\t';
|
|
|
e63663 |
+#ifdef HAVE_MBRTOWC
|
|
|
e63663 |
+ wcdelim = L'\t';
|
|
|
e63663 |
+ mbdelim[0] = '\t';
|
|
|
e63663 |
+ mbdelim[1] = '\0';
|
|
|
e63663 |
+ delimlen = 1;
|
|
|
e63663 |
+#endif
|
|
|
e63663 |
+ }
|
|
|
e63663 |
|
|
|
e63663 |
if (output_delimiter_string == NULL)
|
|
|
e63663 |
{
|
|
|
e63663 |
- static char dummy[2];
|
|
|
e63663 |
- dummy[0] = delim;
|
|
|
e63663 |
- dummy[1] = '\0';
|
|
|
e63663 |
- output_delimiter_string = dummy;
|
|
|
e63663 |
- output_delimiter_length = 1;
|
|
|
e63663 |
+#ifdef HAVE_MBRTOWC
|
|
|
e63663 |
+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ output_delimiter_string = xstrdup(mbdelim);
|
|
|
e63663 |
+ output_delimiter_length = delimlen;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
+
|
|
|
e63663 |
+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
|
|
|
e63663 |
+#endif
|
|
|
e63663 |
+ {
|
|
|
e63663 |
+ static char dummy[2];
|
|
|
e63663 |
+ dummy[0] = delim;
|
|
|
e63663 |
+ dummy[1] = '\0';
|
|
|
e63663 |
+ output_delimiter_string = dummy;
|
|
|
e63663 |
+ output_delimiter_length = 1;
|
|
|
e63663 |
+ }
|
|
|
e63663 |
}
|
|
|
e63663 |
|
|
|
e63663 |
if (optind == argc)
|