04161d
diff --git a/src/cut.c b/src/cut.c
04161d
index 7ab6be4..022d0ad 100644
04161d
--- a/src/cut.c
04161d
+++ b/src/cut.c
04161d
@@ -28,6 +28,11 @@
04161d
 #include <assert.h>
04161d
 #include <getopt.h>
04161d
 #include <sys/types.h>
04161d
+
04161d
+/* Get mbstate_t, mbrtowc().  */
04161d
+#if HAVE_WCHAR_H
04161d
+# include <wchar.h>
04161d
+#endif
04161d
 #include "system.h"
04161d
 
04161d
 #include "error.h"
04161d
@@ -38,6 +43,18 @@
04161d
 
04161d
 #include "set-fields.h"
04161d
 
04161d
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
04161d
+   installation; work around this configuration error.        */
04161d
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
04161d
+# undef MB_LEN_MAX
04161d
+# define MB_LEN_MAX 16
04161d
+#endif
04161d
+
04161d
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t.  */
04161d
+#if HAVE_MBRTOWC && defined mbstate_t
04161d
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
04161d
+#endif
04161d
+
04161d
 /* The official name of this program (e.g., no 'g' prefix).  */
04161d
 #define PROGRAM_NAME "cut"
04161d
 
04161d
@@ -54,6 +71,52 @@
04161d
     }									\
04161d
   while (0)
04161d
 
04161d
+/* Refill the buffer BUF to get a multibyte character. */
04161d
+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM)                        \
04161d
+  do                                                                        \
04161d
+    {                                                                        \
04161d
+      if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM))        \
04161d
+        {                                                                \
04161d
+          memmove (BUF, BUFPOS, BUFLEN);                                \
04161d
+          BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
04161d
+          BUFPOS = BUF;                                                        \
04161d
+        }                                                                \
04161d
+    }                                                                        \
04161d
+  while (0)
04161d
+
04161d
+/* Decode the wide character at BUFPOS into WC; BUFPOS itself is not advanced.
04161d
+   CONVFAIL is set to true if the byte sequence is invalid or incomplete, false otherwise. */
04161d
+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
04161d
+  do                                                                        \
04161d
+    {                                                                        \
04161d
+      mbstate_t state_bak;                                                \
04161d
+                                                                        \
04161d
+      if (BUFLEN < 1)                                                        \
04161d
+        {                                                                \
04161d
+          WC = WEOF;                                                        \
04161d
+          break;                                                        \
04161d
+        }                                                                \
04161d
+                                                                        \
04161d
+      /* Get a wide character. */                                        \
04161d
+      CONVFAIL = false;                                                        \
04161d
+      state_bak = STATE;                                                \
04161d
+      MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE);        \
04161d
+                                                                        \
04161d
+      switch (MBLENGTH)                                                        \
04161d
+        {                                                                \
04161d
+        case (size_t)-1:                                                \
04161d
+        case (size_t)-2:                                                \
04161d
+          CONVFAIL = true;                                                        \
04161d
+          STATE = state_bak;                                                \
04161d
+          /* Fall through. */                                                \
04161d
+                                                                        \
04161d
+        case 0:                                                                \
04161d
+          MBLENGTH = 1;                                                        \
04161d
+          break;                                                        \
04161d
+        }                                                                \
04161d
+    }                                                                        \
04161d
+  while (0)
04161d
+
04161d
 
04161d
 /* Pointer inside RP.  When checking if a byte or field is selected
04161d
    by a finite range, we check if it is between CURRENT_RP.LO
04161d
@@ -61,6 +124,9 @@
04161d
    CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */
04161d
 static struct field_range_pair *current_rp;
04161d
 
04161d
+/* Length of the delimiter given as argument to -d.  */
04161d
+size_t delimlen;
04161d
+
04161d
 /* This buffer is used to support the semantics of the -s option
04161d
    (or lack of same) when the specified field list includes (does
04161d
    not include) the first field.  In both of those cases, the entire
04161d
@@ -77,15 +143,25 @@ enum operating_mode
04161d
   {
04161d
     undefined_mode,
04161d
 
04161d
-    /* Output characters that are in the given bytes. */
04161d
+    /* Output bytes that are at the given positions. */
04161d
     byte_mode,
04161d
 
04161d
+    /* Output characters that are at the given positions. */
04161d
+    character_mode,
04161d
+
04161d
     /* Output the given delimiter-separated fields. */
04161d
     field_mode
04161d
   };
04161d
 
04161d
 static enum operating_mode operating_mode;
04161d
 
04161d
+/* If nonzero, when in byte mode, don't split multibyte characters.  */
04161d
+static int byte_mode_character_aware;
04161d
+
04161d
+/* If nonzero, use the single-byte code paths even when
04161d
+   this program runs in a multibyte locale. */
04161d
+static int force_singlebyte_mode;
04161d
+
04161d
 /* If true do not output lines containing no delimiter characters.
04161d
    Otherwise, all such lines are printed.  This option is valid only
04161d
    with field mode.  */
04161d
@@ -97,6 +173,9 @@ static bool complement;
04161d
 
04161d
 /* The delimiter character for field mode. */
04161d
 static unsigned char delim;
04161d
+#if HAVE_WCHAR_H
04161d
+static wchar_t wcdelim;
04161d
+#endif
04161d
 
04161d
 /* The delimiter for each line/record. */
04161d
 static unsigned char line_delim = '\n';
04161d
@@ -164,7 +243,7 @@ Print selected parts of lines from each FILE to standard output.\n\
04161d
   -f, --fields=LIST       select only these fields;  also print any line\n\
04161d
                             that contains no delimiter character, unless\n\
04161d
                             the -s option is specified\n\
04161d
-  -n                      (ignored)\n\
04161d
+  -n                      with -b: don't split multibyte characters\n\
04161d
 "), stdout);
04161d
       fputs (_("\
04161d
       --complement        complement the set of selected bytes, characters\n\
04161d
@@ -280,6 +359,82 @@ cut_bytes (FILE *stream)
04161d
     }
04161d
 }
04161d
 
04161d
+#if HAVE_MBRTOWC
04161d
+/* This function is used in the following two cases.
04161d
+
04161d
+   1. Read from the stream STREAM, printing to standard output any selected
04161d
+   characters.
04161d
+
04161d
+   2. Read from stream STREAM, printing to standard output any selected bytes,
04161d
+   without splitting multibyte characters.  */
04161d
+
04161d
+static void
04161d
+cut_characters_or_cut_bytes_no_split (FILE *stream)
04161d
+{
04161d
+  uintmax_t idx;             /* number of bytes or characters in the line so far. */
04161d
+  char buf[MB_LEN_MAX + BUFSIZ];  /* For spooling a read byte sequence. */
04161d
+  char *bufpos;                /* Next read position of BUF. */
04161d
+  size_t buflen;        /* The length of the byte sequence in buf. */
04161d
+  wint_t wc;                /* A gotten wide character. */
04161d
+  size_t mblength;        /* The length in bytes of the multibyte sequence
04161d
+                           that corresponds to WC. */
04161d
+  mbstate_t state;        /* State of the stream. */
04161d
+  bool convfail = false;  /* true, when conversion failed. Otherwise false. */
04161d
+  /* Whether to begin printing delimiters between ranges for the current line.
04161d
+     Set after we've begun printing data corresponding to the first range.  */
04161d
+  bool print_delimiter = false;
04161d
+
04161d
+  idx = 0;
04161d
+  buflen = 0;
04161d
+  bufpos = buf;
04161d
+  memset (&state, '\0', sizeof(mbstate_t));
04161d
+
04161d
+  current_rp = frp;
04161d
+
04161d
+  while (1)
04161d
+    {
04161d
+      REFILL_BUFFER (buf, bufpos, buflen, stream);
04161d
+
04161d
+      GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
04161d
+      (void) convfail;  /* ignore unused */
04161d
+
04161d
+      if (wc == WEOF)
04161d
+        {
04161d
+          if (idx > 0)
04161d
+            putchar (line_delim);
04161d
+          break;
04161d
+        }
04161d
+      else if (wc == line_delim)
04161d
+        {
04161d
+          putchar (line_delim);
04161d
+          idx = 0;
04161d
+          print_delimiter = false;
04161d
+          current_rp = frp;
04161d
+        }
04161d
+      else
04161d
+        {
04161d
+          next_item (&idx);
04161d
+          if (print_kth (idx))
04161d
+            {
04161d
+              if (output_delimiter_specified)
04161d
+                {
04161d
+                  if (print_delimiter && is_range_start_index (idx))
04161d
+                    {
04161d
+                      fwrite (output_delimiter_string, sizeof (char),
04161d
+                              output_delimiter_length, stdout);
04161d
+                    }
04161d
+                  print_delimiter = true;
04161d
+                }
04161d
+              fwrite (bufpos, mblength, sizeof(char), stdout);
04161d
+            }
04161d
+        }
04161d
+
04161d
+      buflen -= mblength;
04161d
+      bufpos += mblength;
04161d
+    }
04161d
+}
04161d
+#endif
04161d
+
04161d
 /* Read from stream STREAM, printing to standard output any selected fields.  */
04161d
 
04161d
 static void
04161d
@@ -425,13 +580,211 @@ cut_fields (FILE *stream)
04161d
     }
04161d
 }
04161d
 
04161d
+#if HAVE_MBRTOWC
04161d
+static void
04161d
+cut_fields_mb (FILE *stream)
04161d
+{
04161d
+  int c;
04161d
+  uintmax_t field_idx;
04161d
+  int found_any_selected_field;
04161d
+  int buffer_first_field;
04161d
+  int empty_input;
04161d
+  char buf[MB_LEN_MAX + BUFSIZ];  /* For spooling a read byte sequence. */
04161d
+  char *bufpos;                /* Next read position of BUF. */
04161d
+  size_t buflen;        /* The length of the byte sequence in buf. */
04161d
+  wint_t wc = 0;        /* A gotten wide character. */
04161d
+  size_t mblength;        /* The length in bytes of the multibyte sequence
04161d
+                           that corresponds to WC. */
04161d
+  mbstate_t state;        /* State of the stream. */
04161d
+  bool convfail = false;  /* true, when conversion failed. Otherwise false. */
04161d
+
04161d
+  current_rp = frp;
04161d
+
04161d
+  found_any_selected_field = 0;
04161d
+  field_idx = 1;
04161d
+  bufpos = buf;
04161d
+  buflen = 0;
04161d
+  memset (&state, '\0', sizeof(mbstate_t));
04161d
+
04161d
+  c = getc (stream);
04161d
+  empty_input = (c == EOF);
04161d
+  if (c != EOF)
04161d
+  {
04161d
+    ungetc (c, stream);
04161d
+    wc = 0;
04161d
+  }
04161d
+  else
04161d
+    wc = WEOF;
04161d
+
04161d
+  /* To support the semantics of the -s flag, we may have to buffer
04161d
+     all of the first field to determine whether it is `delimited.'
04161d
+     But that is unnecessary if all non-delimited lines must be printed
04161d
+     and the first field has been selected, or if non-delimited lines
04161d
+     must be suppressed and the first field has *not* been selected.
04161d
+     That is because a non-delimited line has exactly one field.  */
04161d
+  buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
04161d
+
04161d
+  while (1)
04161d
+    {
04161d
+      if (field_idx == 1 && buffer_first_field)
04161d
+        {
04161d
+          int len = 0;
04161d
+
04161d
+          while (1)
04161d
+            {
04161d
+              REFILL_BUFFER (buf, bufpos, buflen, stream);
04161d
+
04161d
+              GET_NEXT_WC_FROM_BUFFER
04161d
+                (wc, bufpos, buflen, mblength, state, convfail);
04161d
+
04161d
+              if (wc == WEOF)
04161d
+                break;
04161d
+
04161d
+              field_1_buffer = xrealloc (field_1_buffer, len + mblength);
04161d
+              memcpy (field_1_buffer + len, bufpos, mblength);
04161d
+              len += mblength;
04161d
+              buflen -= mblength;
04161d
+              bufpos += mblength;
04161d
+
04161d
+              if (!convfail && (wc == line_delim || wc == wcdelim))
04161d
+                break;
04161d
+            }
04161d
+
04161d
+          if (len <= 0 && wc == WEOF)
04161d
+            break;
04161d
+
04161d
+          /* If the first field extends to the end of line (it is not
04161d
+             delimited) and we are printing all non-delimited lines,
04161d
+             print this one.  */
04161d
+          if (convfail || (!convfail && wc != wcdelim))
04161d
+            {
04161d
+              if (suppress_non_delimited)
04161d
+                {
04161d
+                  /* Empty.        */
04161d
+                }
04161d
+              else
04161d
+                {
04161d
+                  fwrite (field_1_buffer, sizeof (char), len, stdout);
04161d
+                  /* Make sure the output line is newline terminated.  */
04161d
+                  if (convfail || (!convfail && wc != line_delim))
04161d
+                    putchar (line_delim);
04161d
+                }
04161d
+              continue;
04161d
+            }
04161d
+
04161d
+          if (print_kth (1))
04161d
+            {
04161d
+              /* Print the field, but not the trailing delimiter.  */
04161d
+              fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
04161d
+              found_any_selected_field = 1;
04161d
+            }
04161d
+          next_item (&field_idx);
04161d
+        }
04161d
+
04161d
+      if (wc != WEOF)
04161d
+        {
04161d
+          if (print_kth (field_idx))
04161d
+            {
04161d
+              if (found_any_selected_field)
04161d
+                {
04161d
+                  fwrite (output_delimiter_string, sizeof (char),
04161d
+                          output_delimiter_length, stdout);
04161d
+                }
04161d
+              found_any_selected_field = 1;
04161d
+            }
04161d
+
04161d
+          while (1)
04161d
+            {
04161d
+              REFILL_BUFFER (buf, bufpos, buflen, stream);
04161d
+
04161d
+              GET_NEXT_WC_FROM_BUFFER
04161d
+                (wc, bufpos, buflen, mblength, state, convfail);
04161d
+
04161d
+              if (wc == WEOF)
04161d
+                break;
04161d
+              else if (!convfail && (wc == wcdelim || wc == line_delim))
04161d
+                {
04161d
+                  buflen -= mblength;
04161d
+                  bufpos += mblength;
04161d
+                  break;
04161d
+                }
04161d
+
04161d
+              if (print_kth (field_idx))
04161d
+                fwrite (bufpos, mblength, sizeof(char), stdout);
04161d
+
04161d
+              buflen -= mblength;
04161d
+              bufpos += mblength;
04161d
+            }
04161d
+        }
04161d
+
04161d
+      if ((!convfail || wc == line_delim) && buflen < 1)
04161d
+        wc = WEOF;
04161d
+
04161d
+      if (!convfail && wc == wcdelim)
04161d
+        next_item (&field_idx);
04161d
+      else if (wc == WEOF || (!convfail && wc == line_delim))
04161d
+        {
04161d
+          if (found_any_selected_field
04161d
+              || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
04161d
+            putchar (line_delim);
04161d
+          if (wc == WEOF)
04161d
+            break;
04161d
+          field_idx = 1;
04161d
+          current_rp = frp;
04161d
+          found_any_selected_field = 0;
04161d
+        }
04161d
+    }
04161d
+}
04161d
+#endif
04161d
+
04161d
 static void
04161d
 cut_stream (FILE *stream)
04161d
 {
04161d
-  if (operating_mode == byte_mode)
04161d
-    cut_bytes (stream);
04161d
+#if HAVE_MBRTOWC
04161d
+  if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
04161d
+    {
04161d
+      switch (operating_mode)
04161d
+        {
04161d
+        case byte_mode:
04161d
+          if (byte_mode_character_aware)
04161d
+            cut_characters_or_cut_bytes_no_split (stream);
04161d
+          else
04161d
+            cut_bytes (stream);
04161d
+          break;
04161d
+
04161d
+        case character_mode:
04161d
+          cut_characters_or_cut_bytes_no_split (stream);
04161d
+          break;
04161d
+
04161d
+        case field_mode:
04161d
+          if (delimlen == 1)
04161d
+            {
04161d
+              /* In a UTF-8 locale a single-byte delimiter cannot be part of a
04161d
+                 multibyte character, so the byte-oriented cut_fields is a safe
04161d
+                 optimization; this does not hold for encodings such as SJIS.  */
04161d
+              char * loc = setlocale(LC_CTYPE, NULL);
04161d
+              if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") ||
04161d
+                  strstr (loc, "UTF8") || strstr (loc, "utf8")))
04161d
+                {
04161d
+                  cut_fields (stream);
04161d
+                  break;
04161d
+                }
04161d
+            }
04161d
+          cut_fields_mb (stream);
04161d
+          break;
04161d
+
04161d
+        default:
04161d
+          abort ();
04161d
+        }
04161d
+    }
04161d
   else
04161d
-    cut_fields (stream);
04161d
+#endif
04161d
+    {
04161d
+      if (operating_mode == field_mode)
04161d
+        cut_fields (stream);
04161d
+      else
04161d
+        cut_bytes (stream);
04161d
+    }
04161d
 }
04161d
 
04161d
 /* Process file FILE to standard output.
04161d
@@ -483,6 +836,7 @@ main (int argc, char **argv)
04161d
   bool ok;
04161d
   bool delim_specified = false;
04161d
   char *spec_list_string IF_LINT ( = NULL);
04161d
+  char mbdelim[MB_LEN_MAX + 1];
04161d
 
04161d
   initialize_main (&argc, &argv);
04161d
   set_program_name (argv[0]);
04161d
@@ -505,7 +859,6 @@ main (int argc, char **argv)
04161d
       switch (optc)
04161d
         {
04161d
         case 'b':
04161d
-        case 'c':
04161d
           /* Build the byte list. */
04161d
           if (operating_mode != undefined_mode)
04161d
             FATAL_ERROR (_("only one type of list may be specified"));
04161d
@@ -513,6 +866,14 @@ main (int argc, char **argv)
04161d
           spec_list_string = optarg;
04161d
           break;
04161d
 
04161d
+        case 'c':
04161d
+          /* Build the character list. */
04161d
+          if (operating_mode != undefined_mode)
04161d
+            FATAL_ERROR (_("only one type of list may be specified"));
04161d
+          operating_mode = character_mode;
04161d
+          spec_list_string = optarg;
04161d
+          break;
04161d
+
04161d
         case 'f':
04161d
           /* Build the field list. */
04161d
           if (operating_mode != undefined_mode)
04161d
@@ -524,10 +885,38 @@ main (int argc, char **argv)
04161d
         case 'd':
04161d
           /* New delimiter. */
04161d
           /* Interpret -d '' to mean 'use the NUL byte as the delimiter.'  */
04161d
-          if (optarg[0] != '\0' && optarg[1] != '\0')
04161d
-            FATAL_ERROR (_("the delimiter must be a single character"));
04161d
-          delim = optarg[0];
04161d
-          delim_specified = true;
04161d
+            {
04161d
+#if HAVE_MBRTOWC
04161d
+              if(MB_CUR_MAX > 1)
04161d
+                {
04161d
+                  mbstate_t state;
04161d
+
04161d
+                  memset (&state, '\0', sizeof(mbstate_t));
04161d
+                  delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
04161d
+
04161d
+                  if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
04161d
+                    ++force_singlebyte_mode;
04161d
+                  else
04161d
+                    {
04161d
+                      delimlen = (delimlen < 1) ? 1 : delimlen;
04161d
+                      if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
04161d
+                        FATAL_ERROR (_("the delimiter must be a single character"));
04161d
+                      memcpy (mbdelim, optarg, delimlen);
04161d
+                      mbdelim[delimlen] = '\0';
04161d
+                      if (delimlen == 1)
04161d
+                        delim = *optarg;
04161d
+                    }
04161d
+                }
04161d
+
04161d
+              if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
04161d
+#endif
04161d
+                {
04161d
+                  if (optarg[0] != '\0' && optarg[1] != '\0')
04161d
+                    FATAL_ERROR (_("the delimiter must be a single character"));
04161d
+                  delim = (unsigned char) optarg[0];
04161d
+                }
04161d
+            delim_specified = true;
04161d
+          }
04161d
           break;
04161d
 
04161d
         case OUTPUT_DELIMITER_OPTION:
04161d
@@ -540,6 +929,7 @@ main (int argc, char **argv)
04161d
           break;
04161d
 
04161d
         case 'n':
04161d
+          byte_mode_character_aware = 1;
04161d
           break;
04161d
 
04161d
         case 's':
04161d
@@ -579,15 +969,34 @@ main (int argc, char **argv)
04161d
               | (complement ? SETFLD_COMPLEMENT : 0) );
04161d
 
04161d
   if (!delim_specified)
04161d
-    delim = '\t';
04161d
+    {
04161d
+      delim = '\t';
04161d
+#ifdef HAVE_MBRTOWC
04161d
+      wcdelim = L'\t';
04161d
+      mbdelim[0] = '\t';
04161d
+      mbdelim[1] = '\0';
04161d
+      delimlen = 1;
04161d
+#endif
04161d
+    }
04161d
 
04161d
   if (output_delimiter_string == NULL)
04161d
     {
04161d
-      static char dummy[2];
04161d
-      dummy[0] = delim;
04161d
-      dummy[1] = '\0';
04161d
-      output_delimiter_string = dummy;
04161d
-      output_delimiter_length = 1;
04161d
+#ifdef HAVE_MBRTOWC
04161d
+      if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
04161d
+        {
04161d
+          output_delimiter_string = xstrdup(mbdelim);
04161d
+          output_delimiter_length = delimlen;
04161d
+        }
04161d
+
04161d
+      if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
04161d
+#endif
04161d
+        {
04161d
+          static char dummy[2];
04161d
+          dummy[0] = delim;
04161d
+          dummy[1] = '\0';
04161d
+          output_delimiter_string = dummy;
04161d
+          output_delimiter_length = 1;
04161d
+        }
04161d
     }
04161d
 
04161d
   if (optind == argc)