Blame SOURCES/binutils.unicode.patch

5d235a
diff -rup binutils.orig/binutils/NEWS binutils-2.36.1/binutils/NEWS
5d235a
--- binutils.orig/binutils/NEWS	2021-10-21 16:56:20.322761363 +0100
5d235a
+++ binutils-2.36.1/binutils/NEWS	2021-10-21 16:56:29.692696238 +0100
5d235a
@@ -151,6 +151,15 @@ Changes in 2.32:
5d235a
 
5d235a
 Changes in 2.31:
5d235a
 
5d235a
+* Tools which display names or strings (readelf, strings, nm, objdump)
5d235a
+  have a new command line option which controls how unicode characters are
5d235a
+  handled.  By default they are treated as normal for the tool.  Using
5d235a
+  --unicode=locale will display them according to the current locale.
5d235a
+  Using --unicode=hex will display them as hex byte values, whilst
5d235a
+  --unicode=escape will display them as escape sequences.  In addition
5d235a
+  using --unicode=highlight will display them as unicode escape sequences
5d235a
+  highlighted in red (if supported by the output device).
5d235a
+
5d235a
 * Add support for disassembling netronome Flow Processor (NFP) firmware files.
5d235a
 
5d235a
 * The AArch64 port now supports showing disassembly notes which are emitted
5d235a
Only in binutils-2.36.1/binutils/: NEWS.orig
5d235a
diff -rup binutils.orig/binutils/doc/binutils.texi binutils-2.36.1/binutils/doc/binutils.texi
5d235a
--- binutils.orig/binutils/doc/binutils.texi	2021-10-21 16:56:20.324761349 +0100
5d235a
+++ binutils-2.36.1/binutils/doc/binutils.texi	2021-10-21 16:56:29.694696225 +0100
5d235a
@@ -799,6 +799,7 @@ nm [@option{-A}|@option{-o}|@option{--pr
5d235a
    [@option{-g}|@option{--extern-only}] [@option{-h}|@option{--help}]
5d235a
    [@option{--ifunc-chars=@var{CHARS}}]
5d235a
    [@option{-l}|@option{--line-numbers}] [@option{--inlines}]
5d235a
+   [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
5d235a
    [@option{-n}|@option{-v}|@option{--numeric-sort}]
5d235a
    [@option{-P}|@option{--portability}] [@option{-p}|@option{--no-sort}]
5d235a
    [@option{-r}|@option{--reverse-sort}] [@option{-S}|@option{--print-size}]
5d235a
@@ -1114,6 +1115,21 @@ Use @var{radix} as the radix for printin
5d235a
 @cindex undefined symbols
5d235a
 Display only undefined symbols (those external to each object file).
5d235a
 
5d235a
+@item -U @var{[d|i|l|e|x|h]}
5d235a
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
5d235a
+Controls the display of UTF-8 encoded multibyte characters in strings.
5d235a
+The default (@option{--unicode=default}) is to give them no special
5d235a
+treatment.  The @option{--unicode=locale} option displays the sequence
5d235a
+in the current locale, which may or may not support them.  The options
5d235a
+@option{--unicode=hex} and @option{--unicode=invalid} display them as
5d235a
+hex byte sequences enclosed by angle brackets or curly braces respectively.
5d235a
+
5d235a
+The @option{--unicode=escape} option displays them as escape sequences
5d235a
+(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
5d235a
+them as escape sequences highlighted in red (if supported by the
5d235a
+output device).  The colouring is intended to draw attention to the
5d235a
+presence of unicode sequences where they might not be expected.
5d235a
+
5d235a
 @item -V
5d235a
 @itemx --version
5d235a
 Show the version number of @command{nm} and exit.
5d235a
@@ -2210,6 +2226,7 @@ objdump [@option{-a}|@option{--archive-h
5d235a
         [@option{--prefix-strip=}@var{level}]
5d235a
         [@option{--insn-width=}@var{width}]
5d235a
         [@option{--visualize-jumps[=color|=extended-color|=off]}
5d235a
+        [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
5d235a
         [@option{-V}|@option{--version}]
5d235a
         [@option{-H}|@option{--help}]
5d235a
         @var{objfile}@dots{}
5d235a
@@ -2877,6 +2894,21 @@ When displaying symbols include those wh
5d235a
 special in some way and which would not normally be of interest to the
5d235a
 user.
5d235a
 
5d235a
+@item -U @var{[d|i|l|e|x|h]}
5d235a
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
5d235a
+Controls the display of UTF-8 encoded multibyte characters in strings.
5d235a
+The default (@option{--unicode=default}) is to give them no special
5d235a
+treatment.  The @option{--unicode=locale} option displays the sequence
5d235a
+in the current locale, which may or may not support them.  The options
5d235a
+@option{--unicode=hex} and @option{--unicode=invalid} display them as
5d235a
+hex byte sequences enclosed by angle brackets or curly braces respectively.
5d235a
+
5d235a
+The @option{--unicode=escape} option displays them as escape sequences
5d235a
+(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
5d235a
+them as escape sequences highlighted in red (if supported by the
5d235a
+output device).  The colouring is intended to draw attention to the
5d235a
+presence of unicode sequences where they might not be expected.
5d235a
+
5d235a
 @item -V
5d235a
 @itemx --version
5d235a
 Print the version number of @command{objdump} and exit.
5d235a
@@ -3153,6 +3185,7 @@ strings [@option{-afovV}] [@option{-}@va
5d235a
         [@option{-n} @var{min-len}] [@option{--bytes=}@var{min-len}]
5d235a
         [@option{-t} @var{radix}] [@option{--radix=}@var{radix}]
5d235a
         [@option{-e} @var{encoding}] [@option{--encoding=}@var{encoding}]
5d235a
+        [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
5d235a
         [@option{-}] [@option{--all}] [@option{--print-file-name}]
5d235a
         [@option{-T} @var{bfdname}] [@option{--target=}@var{bfdname}]
5d235a
         [@option{-w}] [@option{--include-all-whitespace}]
5d235a
@@ -3244,6 +3277,28 @@ single-8-bit-byte characters, @samp{b} =
5d235a
 littleendian.  Useful for finding wide character strings. (@samp{l}
5d235a
 and @samp{b} apply to, for example, Unicode UTF-16/UCS-2 encodings).
5d235a
 
5d235a
+@item -U @var{[d|i|l|e|x|h]}
5d235a
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
5d235a
+Controls the display of UTF-8 encoded multibyte characters in strings.
5d235a
+The default (@option{--unicode=default}) is to give them no special
5d235a
+treatment, and instead rely upon the setting of the
5d235a
+@option{--encoding} option.  The other values for this option
5d235a
+automatically enable @option{--encoding=S}.
5d235a
+
5d235a
+The @option{--unicode=invalid} option treats them as non-graphic
5d235a
+characters and hence not part of a valid string.  All the remaining
5d235a
+options treat them as valid string characters.
5d235a
+
5d235a
+The @option{--unicode=locale} option displays them in the current
5d235a
+locale, which may or may not support UTF-8 encoding.  The
5d235a
+@option{--unicode=hex} option displays them as hex byte sequences
5d235a
+enclosed between @var{<>} characters.  The @option{--unicode=escape}
5d235a
+option displays them as escape sequences (@var{\uxxxx}) and the
5d235a
+@option{--unicode=highlight} option displays them as escape sequences
5d235a
+highlighted in red (if supported by the output device).  The colouring
5d235a
+is intended to draw attention to the presence of unicode sequences
5d235a
+where they might not be expected.
5d235a
+
5d235a
 @item -T @var{bfdname}
5d235a
 @itemx --target=@var{bfdname}
5d235a
 @cindex object code format
5d235a
@@ -4766,6 +4821,7 @@ readelf [@option{-a}|@option{--all}]
5d235a
         [@option{-W}|@option{--wide}]
5d235a
         [@option{-T}|@option{--silent-truncation}]
5d235a
         [@option{-H}|@option{--help}]
5d235a
+        [@option{-U} @var{method}|@option{--unicode=}@var{method}]
5d235a
         @var{elffile}@dots{}
5d235a
 @c man end
5d235a
 @end smallexample
5d235a
@@ -4887,6 +4943,28 @@ necessary in order to demangle truly com
5d235a
 that if the recursion limit is disabled then stack exhaustion is
5d235a
 possible and any bug reports about such an event will be rejected.
5d235a
 
5d235a
+@item -U @var{[d|i|l|e|x|h]}
5d235a
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
5d235a
+Controls the display of non-ASCII characters in identifier names.
5d235a
+The default (@option{--unicode=locale} or @option{--unicode=default}) is
5d235a
+to treat them as multibyte characters and display them in the current
5d235a
+locale.  All other versions of this option treat the bytes as UTF-8
5d235a
+encoded values and attempt to interpret them.  If they cannot be
5d235a
+interpreted or if the @option{--unicode=invalid} option is used then
5d235a
+they are displayed as a sequence of hex bytes, enclosed in curly
5d235a
+parenthesis characters.
5d235a
+
5d235a
+Using the @option{--unicode=escape} option will display the characters
5d235a
+as unicode escape sequences (@var{\uxxxx}).  Using the
5d235a
+@option{--unicode=hex} option will display the characters as hex byte
5d235a
+sequences enclosed between angle brackets.
5d235a
+
5d235a
+Using the @option{--unicode=highlight} option will display the characters as
5d235a
+unicode escape sequences but it will also highlight them in red,
5d235a
+assuming that colouring is supported by the output device.  The
5d235a
+colouring is intended to draw attention to the presence of unicode
5d235a
+sequences when they might not be expected.
5d235a
+
5d235a
 @item -e
5d235a
 @itemx --headers
5d235a
 Display all the headers in the file.  Equivalent to @option{-h -l -S}.
5d235a
Only in binutils-2.36.1/binutils/doc: binutils.texi.orig
5d235a
diff -rup binutils.orig/binutils/nm.c binutils-2.36.1/binutils/nm.c
5d235a
--- binutils.orig/binutils/nm.c	2021-10-21 16:56:20.318761391 +0100
5d235a
+++ binutils-2.36.1/binutils/nm.c	2021-10-21 16:59:56.105261602 +0100
5d235a
@@ -38,6 +38,11 @@
5d235a
 #include "bucomm.h"
5d235a
 #include "plugin-api.h"
5d235a
 #include "plugin.h"
5d235a
+#include "safe-ctype.h"
5d235a
+
5d235a
+#ifndef streq
5d235a
+#define streq(a,b) (strcmp ((a),(b)) == 0)
5d235a
+#endif
5d235a
 
5d235a
 /* When sorting by size, we use this structure to hold the size and a
5d235a
    pointer to the minisymbol.  */
5d235a
@@ -192,6 +197,18 @@ static const char *plugin_target = NULL;
5d235a
 static bfd *lineno_cache_bfd;
5d235a
 static bfd *lineno_cache_rel_bfd;
5d235a
 
5d235a
+typedef enum unicode_display_type
5d235a
+{
5d235a
+  unicode_default = 0,
5d235a
+  unicode_locale,
5d235a
+  unicode_escape,
5d235a
+  unicode_hex,
5d235a
+  unicode_highlight,
5d235a
+  unicode_invalid
5d235a
+} unicode_display_type;
5d235a
+
5d235a
+static unicode_display_type unicode_display = unicode_default;
5d235a
+
5d235a
 enum long_option_values
5d235a
 {
5d235a
   OPTION_TARGET = 200,
5d235a
@@ -234,6 +251,7 @@ static struct option long_options[] =
5d235a
   {"target", required_argument, 0, OPTION_TARGET},
5d235a
   {"defined-only", no_argument, &defined_only, 1},
5d235a
   {"undefined-only", no_argument, &undefined_only, 1},
5d235a
+  {"unicode", required_argument, NULL, 'U'},
5d235a
   {"version", no_argument, &show_version, 1},
5d235a
   {"with-symbol-versions", no_argument, NULL,
5d235a
    OPTION_WITH_SYMBOL_VERSIONS},
5d235a
@@ -285,6 +303,8 @@ usage (FILE *stream, int status)
5d235a
   -t, --radix=RADIX      Use RADIX for printing symbol values\n\
5d235a
       --target=BFDNAME   Specify the target object format as BFDNAME\n\
5d235a
   -u, --undefined-only   Display only undefined symbols\n\
5d235a
+  -U {d|l|i|x|e|h}       Specify how to treat UTF-8 encoded unicode characters\n\
5d235a
+      --unicode={default|locale|invalid|hex|escape|highlight}\n\
5d235a
       --with-symbol-versions  Display version strings after symbol names\n\
5d235a
   -X 32_64               (ignored)\n\
5d235a
   @FILE                  Read options from FILE\n\
5d235a
@@ -400,6 +420,189 @@ get_coff_symbol_type (const struct inter
5d235a
   return bufp;
5d235a
 }
5d235a
 
5d235a
+/* Convert a potential UTF-8 encoded sequence in IN into characters in OUT.
5d235a
+   The conversion format is controlled by the unicode_display variable.
5d235a
+   Returns the number of characters added to OUT.
5d235a
+   Returns the number of bytes consumed from IN in CONSUMED.
5d235a
+   Always consumes at least one byte and displays at least one character.  */
5d235a
+   
5d235a
+static unsigned int
5d235a
+display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
5d235a
+{
5d235a
+  char *        orig_out = out;
5d235a
+  unsigned int  nchars = 0;
5d235a
+
5d235a
+  if (unicode_display == unicode_default)
5d235a
+    goto invalid;
5d235a
+
5d235a
+  if (in[0] < 0xc0)
5d235a
+    goto invalid;
5d235a
+
5d235a
+  if ((in[1] & 0xc0) != 0x80)
5d235a
+    goto invalid;
5d235a
+
5d235a
+  if ((in[0] & 0x20) == 0)
5d235a
+    {
5d235a
+      nchars = 2;
5d235a
+      goto valid;
5d235a
+    }
5d235a
+
5d235a
+  if ((in[2] & 0xc0) != 0x80)
5d235a
+    goto invalid;
5d235a
+
5d235a
+  if ((in[0] & 0x10) == 0)
5d235a
+    {
5d235a
+      nchars = 3;
5d235a
+      goto valid;
5d235a
+    }
5d235a
+
5d235a
+  if ((in[3] & 0xc0) != 0x80)
5d235a
+    goto invalid;
5d235a
+
5d235a
+  nchars = 4;
5d235a
+
5d235a
+ valid:
5d235a
+  switch (unicode_display)
5d235a
+    {
5d235a
+    case unicode_locale:
5d235a
+      /* Copy the bytes into the output buffer as is.  */
5d235a
+      memcpy (out, in, nchars);
5d235a
+      out += nchars;
5d235a
+      break;
5d235a
+
5d235a
+    case unicode_invalid:
5d235a
+    case unicode_hex:
5d235a
+      {
5d235a
+      unsigned int j;
5d235a
+
5d235a
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '<' : '{');
5d235a
+      for (j = 0; j < nchars; j++)
5d235a
+	out += sprintf (out, "%02x", in [j]);
5d235a
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '>' : '}');
5d235a
+      }
5d235a
+      break;
5d235a
+      
5d235a
+    case unicode_highlight:
5d235a
+      if (isatty (1))
5d235a
+	out += sprintf (out, "\x1B[31;47m"); /* Red.  */
5d235a
+      /* Fall through.  */
5d235a
+    case unicode_escape:
5d235a
+      switch (nchars)
5d235a
+	{
5d235a
+	case 2:
5d235a
+	  out += sprintf (out, "\\u%02x%02x",
5d235a
+		  ((in[0] & 0x1c) >> 2), 
5d235a
+		  ((in[0] & 0x03) << 6) | (in[1] & 0x3f));
5d235a
+	  break;
5d235a
+
5d235a
+	case 3:
5d235a
+	  out += sprintf (out, "\\u%02x%02x",
5d235a
+		  ((in[0] & 0x0f) << 4) | ((in[1] & 0x3c) >> 2),
5d235a
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3f)));
5d235a
+	  break;
5d235a
+
5d235a
+	case 4:
5d235a
+	  out += sprintf (out, "\\u%02x%02x%02x",
5d235a
+		  ((in[0] & 0x07) << 6) | ((in[1] & 0x3c) >> 2),
5d235a
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3c) >> 2),
5d235a
+		  ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
5d235a
+	  break;
5d235a
+	default:
5d235a
+	  /* URG.  */
5d235a
+	  break;
5d235a
+	}
5d235a
+
5d235a
+      if (unicode_display == unicode_highlight && isatty (1))
5d235a
+	out += sprintf (out, "\033[0m"); /* Default colour.  */
5d235a
+      break;
5d235a
+
5d235a
+    default:
5d235a
+      /* URG */
5d235a
+      break;
5d235a
+    }
5d235a
+
5d235a
+  * consumed = nchars;
5d235a
+  return out - orig_out;
5d235a
+
5d235a
+ invalid:
5d235a
+  /* Not a valid UTF-8 sequence.  */
5d235a
+  *out = *in;
5d235a
+  * consumed = 1;
5d235a
+  return 1;
5d235a
+}
5d235a
+
5d235a
+/* Convert any UTF-8 encoded characters in IN into the form specified by
5d235a
+   unicode_display.  Also converts control characters.  Returns a static
5d235a
+   buffer if conversion was necessary.
5d235a
+   Code stolen from objdump.c:sanitize_string().  */
5d235a
+
5d235a
+static const char *
5d235a
+convert_utf8 (const char * in)
5d235a
+{
5d235a
+  static char *  buffer = NULL;
5d235a
+  static size_t  buffer_len = 0;
5d235a
+  const char *   original = in;
5d235a
+  char *         out;
5d235a
+
5d235a
+  /* Paranoia.  */
5d235a
+  if (in == NULL)
5d235a
+    return "";
5d235a
+
5d235a
+  /* See if any conversion is necessary.
5d235a
+     In the majority of cases it will not be needed.  */
5d235a
+  do
5d235a
+    {
5d235a
+      unsigned char c = *in++;
5d235a
+
5d235a
+      if (c == 0)
5d235a
+	return original;
5d235a
+
5d235a
+      if (ISCNTRL (c))
5d235a
+	break;
5d235a
+
5d235a
+      if (unicode_display != unicode_default && c >= 0xc0)
5d235a
+	break;
5d235a
+    }
5d235a
+  while (1);
5d235a
+
5d235a
+  /* Copy the input, translating as needed.  */
5d235a
+  in = original;
5d235a
+  if (buffer_len < (strlen (in) * 9))
5d235a
+    {
5d235a
+      free ((void *) buffer);
5d235a
+      buffer_len = strlen (in) * 9;
5d235a
+      buffer = xmalloc (buffer_len + 1);
5d235a
+    }
5d235a
+
5d235a
+  out = buffer;
5d235a
+  do
5d235a
+    {
5d235a
+      unsigned char c = *in++;
5d235a
+
5d235a
+      if (c == 0)
5d235a
+	break;
5d235a
+
5d235a
+      if (ISCNTRL (c))
5d235a
+	{
5d235a
+	  *out++ = '^';
5d235a
+	  *out++ = c + 0x40;
5d235a
+	}
5d235a
+      else if (unicode_display != unicode_default && c >= 0xc0)
5d235a
+	{
5d235a
+	  unsigned int num_consumed;
5d235a
+
5d235a
+	  out += display_utf8 ((const unsigned char *)(in - 1), out, & num_consumed);
5d235a
+	  in += num_consumed - 1;
5d235a
+	}
5d235a
+      else
5d235a
+	*out++ = c;
5d235a
+    }
5d235a
+  while (1);
5d235a
+
5d235a
+  *out = 0;
5d235a
+  return buffer;
5d235a
+}
5d235a
+
5d235a
 /* Print symbol name NAME, read from ABFD, with printf format FORM,
5d235a
    demangling it if requested.  */
5d235a
 
5d235a
@@ -418,6 +621,9 @@ print_symname (const char *form, struct
5d235a
 	name = alloc;
5d235a
     }
5d235a
 
5d235a
+  if (unicode_display != unicode_default)
5d235a
+    name = convert_utf8 (name);
5d235a
+
5d235a
   if (info != NULL && info->elfinfo)
5d235a
     {
5d235a
       const char *version_string;
5d235a
@@ -1738,7 +1944,7 @@ main (int argc, char **argv)
5d235a
     fatal (_("fatal error: libbfd ABI mismatch"));
5d235a
   set_default_bfd_target ();
5d235a
 
5d235a
-  while ((c = getopt_long (argc, argv, "aABCDef:gHhlnopPrSst:uvVvX:",
5d235a
+  while ((c = getopt_long (argc, argv, "aABCDef:gHhlnopPrSst:uU:vVvX:",
5d235a
 			   long_options, (int *) 0)) != EOF)
5d235a
     {
5d235a
       switch (c)
5d235a
@@ -1828,6 +2034,24 @@ main (int argc, char **argv)
5d235a
 	case 'u':
5d235a
 	  undefined_only = 1;
5d235a
 	  break;
5d235a
+
5d235a
+	case 'U':
5d235a
+	  if (streq (optarg, "default") || streq (optarg, "d"))
5d235a
+	    unicode_display = unicode_default;
5d235a
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
5d235a
+	    unicode_display = unicode_locale;
5d235a
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
5d235a
+	    unicode_display = unicode_escape;
5d235a
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
5d235a
+	    unicode_display = unicode_invalid;
5d235a
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
5d235a
+	    unicode_display = unicode_hex;
5d235a
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
5d235a
+	    unicode_display = unicode_highlight;
5d235a
+	  else
5d235a
+	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
5d235a
+	  break;
5d235a
+
5d235a
 	case 'V':
5d235a
 	  show_version = 1;
5d235a
 	  break;
5d235a
Only in binutils-2.36.1/binutils/: nm.c.orig
5d235a
Only in binutils-2.36.1/binutils/: nm.c.rej
5d235a
diff -rup binutils.orig/binutils/objdump.c binutils-2.36.1/binutils/objdump.c
5d235a
--- binutils.orig/binutils/objdump.c	2021-10-21 16:56:20.320761377 +0100
5d235a
+++ binutils-2.36.1/binutils/objdump.c	2021-10-21 16:56:29.695696218 +0100
5d235a
@@ -205,6 +205,18 @@ static const struct objdump_private_desc
5d235a
 
5d235a
 /* The list of detected jumps inside a function.  */
5d235a
 static struct jump_info *detected_jumps = NULL;
5d235a
+
5d235a
+typedef enum unicode_display_type
5d235a
+{
5d235a
+  unicode_default = 0,
5d235a
+  unicode_locale,
5d235a
+  unicode_escape,
5d235a
+  unicode_hex,
5d235a
+  unicode_highlight,
5d235a
+  unicode_invalid
5d235a
+} unicode_display_type;
5d235a
+
5d235a
+static unicode_display_type unicode_display = unicode_default;
5d235a
 
5d235a
 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
5d235a
 static void
5d235a
@@ -247,6 +259,9 @@ usage (FILE *stream, int status)
5d235a
   -r, --reloc              Display the relocation entries in the file\n\
5d235a
   -R, --dynamic-reloc      Display the dynamic relocation entries in the file\n\
5d235a
   @<file>                  Read options from <file>\n\
5d235a
+  -U[d|l|i|x|e|h]          Controls the display of UTF-8 unicode characters\n\
5d235a
+  --unicode=[default|locale|invalid|hex|escape|highlight]\n"));
5d235a
+      fprintf (stream, _("\
5d235a
   -v, --version            Display this program's version number\n\
5d235a
   -i, --info               List object formats and architectures supported\n\
5d235a
   -H, --help               Display this information\n\
5d235a
@@ -395,6 +410,7 @@ static struct option long_options[]=
5d235a
   {"stop-address", required_argument, NULL, OPTION_STOP_ADDRESS},
5d235a
   {"syms", no_argument, NULL, 't'},
5d235a
   {"target", required_argument, NULL, 'b'},
5d235a
+  {"unicode", required_argument, NULL, 'U'},
5d235a
   {"version", no_argument, NULL, 'V'},
5d235a
   {"wide", no_argument, NULL, 'w'},
5d235a
   {"prefix", required_argument, NULL, OPTION_PREFIX},
5d235a
@@ -414,10 +430,124 @@ nonfatal (const char *msg)
5d235a
   bfd_nonfatal (msg);
5d235a
   exit_status = 1;
5d235a
 }
5d235a
+
5d235a
+/* Convert a potential UTF-8 encoded sequence in IN into characters in OUT.
5d235a
+   The conversion format is controlled by the unicode_display variable.
5d235a
+   Returns the number of characters added to OUT.
5d235a
+   Returns the number of bytes consumed from IN in CONSUMED.
5d235a
+   Always consumes at least one byte and displays at least one character.  */
5d235a
+   
5d235a
+static unsigned int
5d235a
+display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
5d235a
+{
5d235a
+  char *        orig_out = out;
5d235a
+  unsigned int  nchars = 0;
5d235a
+
5d235a
+  if (unicode_display == unicode_default)
5d235a
+    goto invalid;
5d235a
+
5d235a
+  if (in[0] < 0xc0)
5d235a
+    goto invalid;
5d235a
+
5d235a
+  if ((in[1] & 0xc0) != 0x80)
5d235a
+    goto invalid;
5d235a
+
5d235a
+  if ((in[0] & 0x20) == 0)
5d235a
+    {
5d235a
+      nchars = 2;
5d235a
+      goto valid;
5d235a
+    }
5d235a
+
5d235a
+  if ((in[2] & 0xc0) != 0x80)
5d235a
+    goto invalid;
5d235a
+
5d235a
+  if ((in[0] & 0x10) == 0)
5d235a
+    {
5d235a
+      nchars = 3;
5d235a
+      goto valid;
5d235a
+    }
5d235a
+
5d235a
+  if ((in[3] & 0xc0) != 0x80)
5d235a
+    goto invalid;
5d235a
+
5d235a
+  nchars = 4;
5d235a
+
5d235a
+ valid:
5d235a
+  switch (unicode_display)
5d235a
+    {
5d235a
+    case unicode_locale:
5d235a
+      /* Copy the bytes into the output buffer as is.  */
5d235a
+      memcpy (out, in, nchars);
5d235a
+      out += nchars;
5d235a
+      break;
5d235a
+
5d235a
+    case unicode_invalid:
5d235a
+    case unicode_hex:
5d235a
+      {
5d235a
+      unsigned int j;
5d235a
+
5d235a
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '<' : '{');
5d235a
+      for (j = 0; j < nchars; j++)
5d235a
+	out += sprintf (out, "%02x", in [j]);
5d235a
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '>' : '}');
5d235a
+      }
5d235a
+      break;
5d235a
+      
5d235a
+    case unicode_highlight:
5d235a
+      if (isatty (1))
5d235a
+	out += sprintf (out, "\x1B[31;47m"); /* Red.  */
5d235a
+      /* Fall through.  */
5d235a
+    case unicode_escape:
5d235a
+      switch (nchars)
5d235a
+	{
5d235a
+	case 2:
5d235a
+	  out += sprintf (out, "\\u%02x%02x",
5d235a
+		  ((in[0] & 0x1c) >> 2), 
5d235a
+		  ((in[0] & 0x03) << 6) | (in[1] & 0x3f));
5d235a
+	  break;
5d235a
+
5d235a
+	case 3:
5d235a
+	  out += sprintf (out, "\\u%02x%02x",
5d235a
+		  ((in[0] & 0x0f) << 4) | ((in[1] & 0x3c) >> 2),
5d235a
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3f)));
5d235a
+	  break;
5d235a
+
5d235a
+	case 4:
5d235a
+	  out += sprintf (out, "\\u%02x%02x%02x",
5d235a
+		  ((in[0] & 0x07) << 6) | ((in[1] & 0x3c) >> 2),
5d235a
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3c) >> 2),
5d235a
+		  ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
5d235a
+	  break;
5d235a
+	default:
5d235a
+	  /* URG.  */
5d235a
+	  break;
5d235a
+	}
5d235a
+
5d235a
+      if (unicode_display == unicode_highlight && isatty (1))
5d235a
+	out += sprintf (out, "\033[0m"); /* Default colour.  */
5d235a
+      break;
5d235a
+
5d235a
+    default:
5d235a
+      /* URG */
5d235a
+      break;
5d235a
+    }
5d235a
+
5d235a
+  * consumed = nchars;
5d235a
+  return out - orig_out;
5d235a
+
5d235a
+ invalid:
5d235a
+  /* Not a valid UTF-8 sequence.  */
5d235a
+  *out = *in;
5d235a
+  * consumed = 1;
5d235a
+  return 1;
5d235a
+}
5d235a
 
5d235a
 /* Returns a version of IN with any control characters
5d235a
    replaced by escape sequences.  Uses a static buffer
5d235a
-   if necessary.  */
5d235a
+   if necessary.
5d235a
+
5d235a
+   If unicode display is enabled, then also handles the
5d235a
+   conversion of unicode characters.  */
5d235a
 
5d235a
 static const char *
5d235a
 sanitize_string (const char * in)
5d235a
@@ -435,40 +565,50 @@ sanitize_string (const char * in)
5d235a
      of cases it will not be needed.  */
5d235a
   do
5d235a
     {
5d235a
-      char c = *in++;
5d235a
+      unsigned char c = *in++;
5d235a
 
5d235a
       if (c == 0)
5d235a
 	return original;
5d235a
 
5d235a
       if (ISCNTRL (c))
5d235a
 	break;
5d235a
+
5d235a
+      if (unicode_display != unicode_default && c >= 0xc0)
5d235a
+	break;
5d235a
     }
5d235a
   while (1);
5d235a
 
5d235a
   /* Copy the input, translating as needed.  */
5d235a
   in = original;
5d235a
-  if (buffer_len < (strlen (in) * 2))
5d235a
+  if (buffer_len < (strlen (in) * 9))
5d235a
     {
5d235a
       free ((void *) buffer);
5d235a
-      buffer_len = strlen (in) * 2;
5d235a
+      buffer_len = strlen (in) * 9;
5d235a
       buffer = xmalloc (buffer_len + 1);
5d235a
     }
5d235a
 
5d235a
   out = buffer;
5d235a
   do
5d235a
     {
5d235a
-      char c = *in++;
5d235a
+      unsigned char c = *in++;
5d235a
 
5d235a
       if (c == 0)
5d235a
 	break;
5d235a
 
5d235a
-      if (!ISCNTRL (c))
5d235a
-	*out++ = c;
5d235a
-      else
5d235a
+      if (ISCNTRL (c))
5d235a
 	{
5d235a
 	  *out++ = '^';
5d235a
 	  *out++ = c + 0x40;
5d235a
 	}
5d235a
+      else if (unicode_display != unicode_default && c >= 0xc0)
5d235a
+	{
5d235a
+	  unsigned int num_consumed;
5d235a
+
5d235a
+	  out += display_utf8 ((const unsigned char *)(in - 1), out, & num_consumed);
5d235a
+	  in += num_consumed - 1;
5d235a
+	}
5d235a
+      else
5d235a
+	*out++ = c;
5d235a
     }
5d235a
   while (1);
5d235a
 
5d235a
@@ -476,7 +616,6 @@ sanitize_string (const char * in)
5d235a
   return buffer;
5d235a
 }
5d235a
 
5d235a
-
5d235a
 /* Returns TRUE if the specified section should be dumped.  */
5d235a
 
5d235a
 static bfd_boolean
5d235a
@@ -1055,6 +1194,8 @@ objdump_print_symname (bfd *abfd, struct
5d235a
 
5d235a
   name = sanitize_string (name);
5d235a
 
5d235a
+  name = sanitize_string (name);
5d235a
+
5d235a
   if (inf != NULL)
5d235a
     {
5d235a
       (*inf->fprintf_func) (inf->stream, "%s", name);
5d235a
@@ -3136,7 +3277,7 @@ disassemble_section (bfd *abfd, asection
5d235a
   if (!bfd_malloc_and_get_section (abfd, section, &data))
5d235a
     {
5d235a
       non_fatal (_("Reading section %s failed because: %s"),
5d235a
-		 section->name, bfd_errmsg (bfd_get_error ()));
5d235a
+		 sanitize_string (section->name), bfd_errmsg (bfd_get_error ()));
5d235a
       return;
5d235a
     }
5d235a
 
5d235a
@@ -4341,7 +4482,7 @@ dump_section (bfd *abfd, asection *secti
5d235a
   if (!bfd_get_full_section_contents (abfd, section, &data))
5d235a
     {
5d235a
       non_fatal (_("Reading section %s failed because: %s"),
5d235a
-		 section->name, bfd_errmsg (bfd_get_error ()));
5d235a
+		 sanitize_string (section->name), bfd_errmsg (bfd_get_error ()));
5d235a
       return;
5d235a
     }
5d235a
 
5d235a
@@ -4481,6 +4622,24 @@ dump_symbols (bfd *abfd ATTRIBUTE_UNUSED
5d235a
 		  free (alloc);
5d235a
 		}
5d235a
 	    }
5d235a
+	  else if (unicode_display != unicode_default
5d235a
+		   && name != NULL && *name != '\0')
5d235a
+	    {
5d235a
+	      const char * sanitized_name;
5d235a
+
5d235a
+	      /* If we want to sanitize the name, we do it here, and
5d235a
+		 temporarily clobber it while calling bfd_print_symbol.
5d235a
+		 FIXME: This is a gross hack.  */
5d235a
+	      sanitized_name = sanitize_string (name);
5d235a
+	      if (sanitized_name != name)
5d235a
+		(*current)->name = sanitized_name;
5d235a
+	      else
5d235a
+		sanitized_name = NULL;
5d235a
+	      bfd_print_symbol (cur_bfd, stdout, *current,
5d235a
+				bfd_print_symbol_all);
5d235a
+	      if (sanitized_name != NULL)
5d235a
+		(*current)->name = name;
5d235a
+	    }
5d235a
 	  else
5d235a
 	    bfd_print_symbol (cur_bfd, stdout, *current,
5d235a
 			      bfd_print_symbol_all);
5d235a
@@ -5162,7 +5321,7 @@ main (int argc, char **argv)
5d235a
   set_default_bfd_target ();
5d235a
 
5d235a
   while ((c = getopt_long (argc, argv,
5d235a
-			   "pP:ib:m:M:VvCdDlfFaHhrRtTxsSI:j:wE:zgeGW::",
5d235a
+			   "pP:ib:m:M:VvCdDlfFaHhrRtTxsSI:j:wE:zgeGW::U:",
5d235a
 			   long_options, (int *) 0))
5d235a
 	 != EOF)
5d235a
     {
5d235a
@@ -5441,6 +5600,23 @@ main (int argc, char **argv)
5d235a
 	  seenflag = TRUE;
5d235a
 	  break;
5d235a
 
5d235a
+	case 'U':
5d235a
+	  if (streq (optarg, "default") || streq (optarg, "d"))
5d235a
+	    unicode_display = unicode_default;
5d235a
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
5d235a
+	    unicode_display = unicode_locale;
5d235a
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
5d235a
+	    unicode_display = unicode_escape;
5d235a
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
5d235a
+	    unicode_display = unicode_invalid;
5d235a
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
5d235a
+	    unicode_display = unicode_hex;
5d235a
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
5d235a
+	    unicode_display = unicode_highlight;
5d235a
+	  else
5d235a
+	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
5d235a
+	  break;
5d235a
+
5d235a
 	case 'H':
5d235a
 	  usage (stdout, 0);
5d235a
 	  /* No need to set seenflag or to break - usage() does not return.  */
5d235a
Only in binutils-2.36.1/binutils/: objdump.c.orig
5d235a
diff -rup binutils.orig/binutils/readelf.c binutils-2.36.1/binutils/readelf.c
5d235a
--- binutils.orig/binutils/readelf.c	2021-10-21 16:56:20.323761356 +0100
5d235a
+++ binutils-2.36.1/binutils/readelf.c	2021-10-21 17:00:54.169858044 +0100
5d235a
@@ -321,6 +321,18 @@ typedef enum print_mode
5d235a
 }
5d235a
 print_mode;
5d235a
 
5d235a
+typedef enum unicode_display_type
5d235a
+{
5d235a
+  unicode_locale,
5d235a
+  unicode_escape,
5d235a
+  unicode_hex,
5d235a
+  unicode_highlight,
5d235a
+  unicode_invalid
5d235a
+} unicode_display_type;
5d235a
+
5d235a
+static unicode_display_type unicode_display = unicode_locale;
5d235a
+
5d235a
+  
5d235a
 /* Versioned symbol info.  */
5d235a
 enum versioned_symbol_info
5d235a
 {
5d235a
@@ -613,11 +625,18 @@ print_symbol (signed int width, const ch
5d235a
       if (c == 0)
5d235a
 	break;
5d235a
 
5d235a
-      /* Do not print control characters directly as they can affect terminal
5d235a
-	 settings.  Such characters usually appear in the names generated
5d235a
-	 by the assembler for local labels.  */
5d235a
-      if (ISCNTRL (c))
5d235a
+      if (ISPRINT (c))
5d235a
+	{
5d235a
+	  putchar (c);
5d235a
+	  width_remaining --;
5d235a
+	  num_printed ++;
5d235a
+	}
5d235a
+      else if (ISCNTRL (c))
5d235a
 	{
5d235a
+	  /* Do not print control characters directly as they can affect terminal
5d235a
+	     settings.  Such characters usually appear in the names generated
5d235a
+	     by the assembler for local labels.  */
5d235a
+
5d235a
 	  if (width_remaining < 2)
5d235a
 	    break;
5d235a
 
5d235a
@@ -625,11 +644,135 @@ print_symbol (signed int width, const ch
5d235a
 	  width_remaining -= 2;
5d235a
 	  num_printed += 2;
5d235a
 	}
5d235a
-      else if (ISPRINT (c))
5d235a
+      else if (c == 0x7f)
5d235a
 	{
5d235a
-	  putchar (c);
5d235a
-	  width_remaining --;
5d235a
-	  num_printed ++;
5d235a
+	  if (width_remaining < 5)
5d235a
+	    break;
5d235a
+	  printf ("<DEL>");
5d235a
+	  width_remaining -= 5;
5d235a
+	  num_printed += 5;
5d235a
+	}
5d235a
+      else if (unicode_display != unicode_locale)
5d235a
+	{
5d235a
+	  /* Display unicode characters as something else.  */
5d235a
+	  unsigned char bytes[4];
5d235a
+	  bfd_boolean   is_utf8;
5d235a
+	  unsigned int  nbytes;
5d235a
+
5d235a
+	  bytes[0] = c;
5d235a
+
5d235a
+	  if (bytes[0] < 0xc0)
5d235a
+	    {
5d235a
+	      nbytes = 1;
5d235a
+	      is_utf8 = FALSE;
5d235a
+	    }
5d235a
+	  else
5d235a
+	    {
5d235a
+	      bytes[1] = *symbol++;
5d235a
+
5d235a
+	      if ((bytes[1] & 0xc0) != 0x80)
5d235a
+		{
5d235a
+		  is_utf8 = FALSE;
5d235a
+		  /* Do not consume this character.  It may only
5d235a
+		     be the first byte in the sequence that was
5d235a
+		     corrupt.  */
5d235a
+		  --symbol;
5d235a
+		  nbytes = 1;
5d235a
+		}
5d235a
+	      else if ((bytes[0] & 0x20) == 0)
5d235a
+		{
5d235a
+		  is_utf8 = TRUE;
5d235a
+		  nbytes = 2;
5d235a
+		}
5d235a
+	      else
5d235a
+		{
5d235a
+		  bytes[2] = *symbol++;
5d235a
+
5d235a
+		  if ((bytes[2] & 0xc0) != 0x80)
5d235a
+		    {
5d235a
+		      is_utf8 = FALSE;
5d235a
+		      symbol -= 2;
5d235a
+		      nbytes = 1;
5d235a
+		    }
5d235a
+		  else if ((bytes[0] & 0x10) == 0)
5d235a
+		    {
5d235a
+		      is_utf8 = TRUE;
5d235a
+		      nbytes = 3;
5d235a
+		    }
5d235a
+		  else
5d235a
+		    {
5d235a
+		      bytes[3] = *symbol++;
5d235a
+
5d235a
+		      nbytes = 4;
5d235a
+
5d235a
+		      if ((bytes[3] & 0xc0) != 0x80)
5d235a
+			{
5d235a
+			  is_utf8 = FALSE;
5d235a
+			  symbol -= 3;
5d235a
+			  nbytes = 1;
5d235a
+			}
5d235a
+		      else
5d235a
+			is_utf8 = TRUE;
5d235a
+		    }
5d235a
+		}
5d235a
+	    }
5d235a
+
5d235a
+	  if (unicode_display == unicode_invalid)
5d235a
+	    is_utf8 = FALSE;
5d235a
+
5d235a
+	  if (unicode_display == unicode_hex || ! is_utf8)
5d235a
+	    {
5d235a
+	      unsigned int i;
5d235a
+
5d235a
+	      if (width_remaining < (nbytes * 2) + 2)
5d235a
+		break;
5d235a
+	  
5d235a
+	      putchar (is_utf8 ? '<' : '{');
5d235a
+	      for (i = 0; i < nbytes; i++)
5d235a
+		printf ("%02x", bytes[i]);
5d235a
+	      putchar (is_utf8 ? '>' : '}');
5d235a
+	    }
5d235a
+	  else
5d235a
+	    {
5d235a
+	      if (unicode_display == unicode_highlight && isatty (1))
5d235a
+		printf ("\x1B[31;47m"); /* Red.  */
5d235a
+	      
5d235a
+	      switch (nbytes)
5d235a
+		{
5d235a
+		case 2:
5d235a
+		  if (width_remaining < 6)
5d235a
+		    break;
5d235a
+		  printf ("\\u%02x%02x",
5d235a
+			  (bytes[0] & 0x1c) >> 2, 
5d235a
+			  ((bytes[0] & 0x03) << 6) | (bytes[1] & 0x3f));
5d235a
+		  break;
5d235a
+		case 3:
5d235a
+		  if (width_remaining < 6)
5d235a
+		    break;
5d235a
+		  printf ("\\u%02x%02x",
5d235a
+			  ((bytes[0] & 0x0f) << 4) | ((bytes[1] & 0x3c) >> 2),
5d235a
+			  ((bytes[1] & 0x03) << 6) | (bytes[2] & 0x3f));
5d235a
+		  break;
5d235a
+		case 4:
5d235a
+		  if (width_remaining < 8)
5d235a
+		    break;
5d235a
+		  printf ("\\u%02x%02x%02x",
5d235a
+			  ((bytes[0] & 0x07) << 2) | ((bytes[1] & 0x30) >> 4),
5d235a
+			  ((bytes[1] & 0x0f) << 4) | ((bytes[2] & 0x3c) >> 2),
5d235a
+			  ((bytes[2] & 0x03) << 6) | (bytes[3] & 0x3f));
5d235a
+		  
5d235a
+		  break;
5d235a
+		default:
5d235a
+		  /* URG.  */
5d235a
+		  break;
5d235a
+		}
5d235a
+
5d235a
+	      if (unicode_display == unicode_highlight && isatty (1))
5d235a
+		printf ("\033[0m"); /* Default colour.  */
5d235a
+	    }
5d235a
+	  
5d235a
+	  if (bytes[nbytes - 1] == 0)
5d235a
+	    break;
5d235a
 	}
5d235a
       else
5d235a
 	{
5d235a
@@ -4555,6 +4698,7 @@ static struct option options[] =
5d235a
   {"syms",	       no_argument, 0, 's'},
5d235a
   {"silent-truncation",no_argument, 0, 'T'},
5d235a
   {"section-details",  no_argument, 0, 't'},
5d235a
+  {"unicode",          required_argument, 0, 'U'},
5d235a
   {"unwind",	       no_argument, 0, 'u'},
5d235a
   {"version-info",     no_argument, 0, 'V'},
5d235a
   {"version",	       no_argument, 0, 'v'},
5d235a
@@ -4652,6 +4796,11 @@ usage (FILE * stream)
5d235a
 #endif
5d235a
   fprintf (stream, _("\
5d235a
   -I --histogram         Display histogram of bucket list lengths\n\
5d235a
+  -U --unicode=[locale|escape|hex|highlight|invalid]\n\
5d235a
+                         Display unicode characters as determined by the current locale\n\
5d235a
+                          (default), escape sequences, \"<hex sequences>\", highlighted\n\
5d235a
+                          escape sequences, or treat them as invalid and display as\n\
5d235a
+                          \"{hex sequences}\"\n\
5d235a
   -W --wide              Allow output width to exceed 80 characters\n\
5d235a
   -T --silent-truncation If a symbol name is truncated, do not add a suffix [...]\n\
5d235a
   @<file>                Read options from <file>\n\
5d235a
@@ -4748,7 +4897,7 @@ parse_args (struct dump_data *dumpdata,
5d235a
     usage (stderr);
5d235a
 
5d235a
   while ((c = getopt_long
5d235a
-	  (argc, argv, "ACDHILNR:STVWacdeghi:lnp:rstuvw::x:z", options, NULL)) != EOF)
5d235a
+	  (argc, argv, "ACDHILNR:STU:VWacdeghi:lnp:rstuvw::x:z", options, NULL)) != EOF)
5d235a
     {
5d235a
       switch (c)
5d235a
 	{
5d235a
@@ -4905,6 +5054,25 @@ parse_args (struct dump_data *dumpdata,
5d235a
 	  request_dump (dumpdata, DISASS_DUMP);
5d235a
 	  break;
5d235a
 #endif
5d235a
+	case 'U':
5d235a
+	  if (optarg == NULL)
5d235a
+	    error (_("Missing arg to -U/--unicode")); /* Can this happen ?  */
5d235a
+	  else if (streq (optarg, "default") || streq (optarg, "d"))
5d235a
+	    unicode_display = unicode_locale;
5d235a
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
5d235a
+	    unicode_display = unicode_locale;
5d235a
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
5d235a
+	    unicode_display = unicode_escape;
5d235a
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
5d235a
+	    unicode_display = unicode_invalid;
5d235a
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
5d235a
+	    unicode_display = unicode_hex;
5d235a
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
5d235a
+	    unicode_display = unicode_highlight;
5d235a
+	  else
5d235a
+	    error (_("unknown argument to -U/--unicode: %s"), optarg);
5d235a
+	  break;
5d235a
+
5d235a
 	case 'v':
5d235a
 	  print_version (program_name);
5d235a
 	  break;
5d235a
Only in binutils-2.36.1/binutils/: readelf.c.orig
5d235a
Only in binutils-2.36.1/binutils/: readelf.c.rej
5d235a
diff -rup binutils.orig/binutils/strings.c binutils-2.36.1/binutils/strings.c
5d235a
--- binutils.orig/binutils/strings.c	2021-10-21 16:56:20.321761370 +0100
5d235a
+++ binutils-2.36.1/binutils/strings.c	2021-10-21 16:56:29.698696197 +0100
5d235a
@@ -55,6 +55,19 @@
5d235a
    -T {bfdname}
5d235a
 		Specify a non-default object file format.
5d235a
 
5d235a
+  --unicode={default|locale|invalid|hex|escape|highlight}
5d235a
+  -U {d|l|i|x|e|h}
5d235a
+                Determine how to handle UTF-8 unicode characters.  The default
5d235a
+		is no special treatment.  All other versions of this option
5d235a
+		only apply if the encoding is valid and enabling the option
5d235a
+		implies --encoding=S.
5d235a
+		The 'locale' option displays the characters according to the
5d235a
+		current locale.  The 'invalid' option treats them as
5d235a
+		non-string characters.  The 'hex' option displays them as hex
5d235a
+		byte sequences.  The 'escape' option displays them as escape
5d235a
+		sequences and the 'highlight' option displays them as
5d235a
+		coloured escape sequences.
5d235a
+
5d235a
   --output-separator=sep_string
5d235a
   -s sep_string	String used to separate parsed strings in output.
5d235a
 		Default is newline.
5d235a
@@ -76,6 +89,22 @@
5d235a
 #include "safe-ctype.h"
5d235a
 #include "bucomm.h"
5d235a
 
5d235a
+#ifndef streq
5d235a
+#define streq(a,b) (strcmp ((a),(b)) == 0)
5d235a
+#endif
5d235a
+
5d235a
+typedef enum unicode_display_type
5d235a
+{
5d235a
+  unicode_default = 0,
5d235a
+  unicode_locale,
5d235a
+  unicode_escape,
5d235a
+  unicode_hex,
5d235a
+  unicode_highlight,
5d235a
+  unicode_invalid
5d235a
+} unicode_display_type;
5d235a
+
5d235a
+static unicode_display_type unicode_display = unicode_default;
5d235a
+
5d235a
 #define STRING_ISGRAPHIC(c) \
5d235a
       (   (c) >= 0 \
5d235a
        && (c) <= 255 \
5d235a
@@ -94,7 +123,7 @@ extern int errno;
5d235a
 static int address_radix;
5d235a
 
5d235a
 /* Minimum length of sequence of graphic chars to trigger output.  */
5d235a
-static int string_min;
5d235a
+static unsigned int string_min;
5d235a
 
5d235a
 /* Whether or not we include all whitespace as a graphic char.   */
5d235a
 static bfd_boolean include_all_whitespace;
5d235a
@@ -130,6 +159,7 @@ static struct option long_options[] =
5d235a
   {"target", required_argument, NULL, 'T'},
5d235a
   {"output-separator", required_argument, NULL, 's'},
5d235a
   {"help", no_argument, NULL, 'h'},
5d235a
+  {"unicode", required_argument, NULL, 'U'},
5d235a
   {"version", no_argument, NULL, 'v'},
5d235a
   {NULL, 0, NULL, 0}
5d235a
 };
5d235a
@@ -173,7 +203,7 @@ main (int argc, char **argv)
5d235a
   encoding = 's';
5d235a
   output_separator = NULL;
5d235a
 
5d235a
-  while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:Vv0123456789",
5d235a
+  while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
5d235a
 			      long_options, (int *) 0)) != EOF)
5d235a
     {
5d235a
       switch (optc)
5d235a
@@ -246,6 +276,23 @@ main (int argc, char **argv)
5d235a
 	  output_separator = optarg;
5d235a
           break;
5d235a
 
5d235a
+	case 'U':
5d235a
+	  if (streq (optarg, "default") || streq (optarg, "d"))
5d235a
+	    unicode_display = unicode_default;
5d235a
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
5d235a
+	    unicode_display = unicode_locale;
5d235a
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
5d235a
+	    unicode_display = unicode_escape;
5d235a
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
5d235a
+	    unicode_display = unicode_invalid;
5d235a
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
5d235a
+	    unicode_display = unicode_hex;
5d235a
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
5d235a
+	    unicode_display = unicode_highlight;
5d235a
+	  else
5d235a
+	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
5d235a
+	  break;
5d235a
+
5d235a
 	case 'V':
5d235a
 	case 'v':
5d235a
 	  print_version ("strings");
5d235a
@@ -260,6 +307,9 @@ main (int argc, char **argv)
5d235a
 	}
5d235a
     }
5d235a
 
5d235a
+  if (unicode_display != unicode_default)
5d235a
+    encoding = 'S';
5d235a
+
5d235a
   if (numeric_opt != 0)
5d235a
     {
5d235a
       string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
5d235a
@@ -553,11 +603,629 @@ unget_part_char (long c, file_ptr *addre
5d235a
 	}
5d235a
     }
5d235a
 }
5d235a
+
5d235a
+static void
5d235a
+print_filename_and_address (const char * filename, file_ptr address)
5d235a
+{
5d235a
+  if (print_filenames)
5d235a
+    printf ("%s: ", filename);
5d235a
+
5d235a
+  if (! print_addresses)
5d235a
+    return;
5d235a
+
5d235a
+  switch (address_radix)
5d235a
+    {
5d235a
+    case 8:
5d235a
+      if (sizeof (address) > sizeof (long))
5d235a
+	{
5d235a
+#ifndef __MSVCRT__
5d235a
+	  printf ("%7llo ", (unsigned long long) address);
5d235a
+#else
5d235a
+	  printf ("%7I64o ", (unsigned long long) address);
5d235a
+#endif
5d235a
+	}
5d235a
+      else
5d235a
+	printf ("%7lo ", (unsigned long) address);
5d235a
+      break;
5d235a
+
5d235a
+    case 10:
5d235a
+      if (sizeof (address) > sizeof (long))
5d235a
+	{
5d235a
+#ifndef __MSVCRT__
5d235a
+	  printf ("%7llu ", (unsigned long long) address);
5d235a
+#else
5d235a
+	  printf ("%7I64d ", (unsigned long long) address);
5d235a
+#endif
5d235a
+	}
5d235a
+      else
5d235a
+	printf ("%7ld ", (long) address);
5d235a
+      break;
5d235a
+
5d235a
+    case 16:
5d235a
+      if (sizeof (address) > sizeof (long))
5d235a
+	{
5d235a
+#ifndef __MSVCRT__
5d235a
+	  printf ("%7llx ", (unsigned long long) address);
5d235a
+#else
5d235a
+	  printf ("%7I64x ", (unsigned long long) address);
5d235a
+#endif
5d235a
+	}
5d235a
+      else
5d235a
+	printf ("%7lx ", (unsigned long) address);
5d235a
+      break;
5d235a
+    }
5d235a
+}
5d235a
+
5d235a
+/* Return non-zero if the bytes starting at BUFFER form a valid UTF-8 encoding.
5d235a
+   If the encoding is valid then returns the number of bytes it uses.  */
5d235a
+
5d235a
+static unsigned int
5d235a
+is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
5d235a
+{
5d235a
+  if (buffer[0] < 0xc0)
5d235a
+    return 0;
5d235a
+
5d235a
+  if (buflen < 2)
5d235a
+    return 0;
5d235a
+
5d235a
+  if ((buffer[1] & 0xc0) != 0x80)
5d235a
+    return 0;
5d235a
+
5d235a
+  if ((buffer[0] & 0x20) == 0)
5d235a
+    return 2;
5d235a
+
5d235a
+  if (buflen < 3)
5d235a
+    return 0;
5d235a
+
5d235a
+  if ((buffer[2] & 0xc0) != 0x80)
5d235a
+    return 0;
5d235a
+  
5d235a
+  if ((buffer[0] & 0x10) == 0)
5d235a
+    return 3;
5d235a
+
5d235a
+  if (buflen < 4)
5d235a
+    return 0;
5d235a
+
5d235a
+  if ((buffer[3] & 0xc0) != 0x80)
5d235a
+    return 0;
5d235a
+
5d235a
+  return 4;
5d235a
+}
5d235a
+
5d235a
+/* Display a UTF-8 encoded character in BUFFER according to the setting
5d235a
+   of unicode_display.  The character is known to be valid.
5d235a
+   Returns the number of bytes consumed.  */
5d235a
+
5d235a
+static unsigned int
5d235a
+display_utf8_char (const unsigned char * buffer)
5d235a
+{
5d235a
+  unsigned int j;
5d235a
+  unsigned int utf8_len;
5d235a
+
5d235a
+  switch (buffer[0] & 0x30)
5d235a
+    {
5d235a
+    case 0x00:
5d235a
+    case 0x10:
5d235a
+      utf8_len = 2;
5d235a
+      break;
5d235a
+    case 0x20:
5d235a
+      utf8_len = 3;
5d235a
+      break;
5d235a
+    default:
5d235a
+      utf8_len = 4;
5d235a
+    }
5d235a
+      
5d235a
+  switch (unicode_display)
5d235a
+    {
5d235a
+    default:
5d235a
+      fprintf (stderr, "ICE: unexpected unicode display type\n");
5d235a
+      break;
5d235a
+
5d235a
+    case unicode_escape:
5d235a
+    case unicode_highlight:
5d235a
+      if (unicode_display == unicode_highlight && isatty (1))
5d235a
+	printf ("\x1B[31;47m"); /* Red.  */
5d235a
+
5d235a
+      switch (utf8_len)
5d235a
+	{
5d235a
+	case 2:
5d235a
+	  printf ("\\u%02x%02x",
5d235a
+		  ((buffer[0] & 0x1c) >> 2), 
5d235a
+		  ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
5d235a
+	  break;
5d235a
+
5d235a
+	case 3:
5d235a
+	  printf ("\\u%02x%02x",
5d235a
+		  ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
5d235a
+		  ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
5d235a
+	  break;
5d235a
+
5d235a
+	case 4:
5d235a
+	  printf ("\\u%02x%02x%02x",
5d235a
+		  ((buffer[0] & 0x07) << 2) | ((buffer[1] & 0x30) >> 4),
5d235a
+		  ((buffer[1] & 0x0f) << 4) | ((buffer[2] & 0x3c) >> 2),
5d235a
+		  ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
5d235a
+	  break;
5d235a
+	default:
5d235a
+	  /* URG.  */
5d235a
+	  break;
5d235a
+	}
5d235a
+
5d235a
+      if (unicode_display == unicode_highlight && isatty (1))
5d235a
+	printf ("\033[0m"); /* Default colour.  */
5d235a
+      break;
5d235a
+
5d235a
+    case unicode_hex:
5d235a
+      putchar ('<');
5d235a
+      for (j = 0; j < utf8_len; j++)
5d235a
+	printf ("%02x", buffer [j]);
5d235a
+      putchar ('>');
5d235a
+      break;
5d235a
+
5d235a
+    case unicode_locale:
5d235a
+      printf ("%.*s", (int) utf8_len, (const char *) buffer);
5d235a
+      break;
5d235a
+    }
5d235a
+
5d235a
+  return utf8_len;
5d235a
+}
5d235a
+
5d235a
+/* Display strings in BUFFER.  Treat any UTF-8 encoded characters encountered
5d235a
+   according to the setting of the unicode_display variable.  The buffer
5d235a
+   contains BUFLEN bytes.
5d235a
+
5d235a
+   Display the characters as if they started at ADDRESS and are contained in
5d235a
+   FILENAME.  */
5d235a
+
5d235a
+static void
5d235a
+print_unicode_buffer (const char *            filename,
5d235a
+		      file_ptr                address,
5d235a
+		      const unsigned char *   buffer,
5d235a
+		      unsigned long           buflen)
5d235a
+{
5d235a
+  /* Paranoia checks...  */
5d235a
+  if (filename == NULL
5d235a
+      || buffer == NULL
5d235a
+      || unicode_display == unicode_default
5d235a
+      || encoding != 'S'
5d235a
+      || encoding_bytes != 1)
5d235a
+    {
5d235a
+      fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
5d235a
+      return;
5d235a
+    }
5d235a
+
5d235a
+  if (buflen == 0)
5d235a
+    return;
5d235a
+
5d235a
+  /* We must only display strings that are at least string_min *characters*
5d235a
+     long.  So we scan the buffer in two stages.  First we locate the start
5d235a
+     of a potential string.  Then we walk along it until we have found
5d235a
+     string_min characters.  Then we go back to the start point and start
5d235a
+     displaying characters according to the unicode_display setting.  */
5d235a
+
5d235a
+  unsigned long start_point = 0;
5d235a
+  unsigned long i = 0;
5d235a
+  unsigned int char_len = 1;
5d235a
+  unsigned int num_found = 0;
5d235a
+
5d235a
+  for (i = 0; i < buflen; i += char_len)
5d235a
+    {
5d235a
+      int c = buffer[i];
5d235a
+
5d235a
+      char_len = 1;
5d235a
+
5d235a
+      /* Find the first potential character of a string.  */
5d235a
+      if (! STRING_ISGRAPHIC (c))
5d235a
+	{
5d235a
+	  num_found = 0;
5d235a
+	  continue;
5d235a
+	}
5d235a
+
5d235a
+      if (c > 126)
5d235a
+	{
5d235a
+	  if (c < 0xc0)
5d235a
+	    {
5d235a
+	      num_found = 0;
5d235a
+	      continue;
5d235a
+	    }
5d235a
+
5d235a
+	  if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
5d235a
+	    {
5d235a
+	      char_len = 1;
5d235a
+	      num_found = 0;
5d235a
+	      continue;
5d235a
+	    }
5d235a
+
5d235a
+	  if (unicode_display == unicode_invalid)
5d235a
+	    {
5d235a
+	      /* We have found a valid UTF-8 character, but we treat it as non-graphic.  */
5d235a
+	      num_found = 0;
5d235a
+	      continue;
5d235a
+	    }
5d235a
+	}
5d235a
+
5d235a
+      if (num_found == 0)
5d235a
+	/* We have found a potential starting point for a string.  */
5d235a
+	start_point = i;
5d235a
+
5d235a
+      ++ num_found;
5d235a
+
5d235a
+      if (num_found >= string_min)
5d235a
+	break;
5d235a
+    }
5d235a
+
5d235a
+  if (num_found < string_min)
5d235a
+    return;
5d235a
+
5d235a
+  print_filename_and_address (filename, address + start_point);
5d235a
+  
5d235a
+  /* We have found string_min characters.  Display them and any
5d235a
+     more that follow.  */
5d235a
+  for (i = start_point; i < buflen; i += char_len)
5d235a
+    {
5d235a
+      int c = buffer[i];
5d235a
+
5d235a
+      char_len = 1;
5d235a
+
5d235a
+      if (! STRING_ISGRAPHIC (c))
5d235a
+	break;
5d235a
+      else if (c < 127)
5d235a
+	putchar (c);
5d235a
+      else if (! is_valid_utf8 (buffer + i, buflen - i))
5d235a
+	break;
5d235a
+      else if (unicode_display == unicode_invalid)
5d235a
+	break;
5d235a
+      else
5d235a
+	char_len = display_utf8_char (buffer + i);
5d235a
+    }
5d235a
+
5d235a
+  if (output_separator)
5d235a
+    fputs (output_separator, stdout);
5d235a
+  else
5d235a
+    putchar ('\n');
5d235a
+
5d235a
+  /* FIXME: Using tail recursion here is lazy programming...  */
5d235a
+  print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
5d235a
+}
5d235a
+
5d235a
+static int
5d235a
+get_unicode_byte (FILE *           stream,
5d235a
+		  unsigned char *  putback,
5d235a
+		  unsigned int *   num_putback,
5d235a
+		  unsigned int *   num_read)
5d235a
+{
5d235a
+  if (* num_putback > 0)
5d235a
+    {
5d235a
+      * num_putback = * num_putback - 1;
5d235a
+      return putback [* num_putback];
5d235a
+    }
5d235a
+
5d235a
+  * num_read = * num_read + 1;
5d235a
+
5d235a
+#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
5d235a
+  return getc_unlocked (stream);
5d235a
+#else
5d235a
+  return getc (stream);
5d235a
+#endif
5d235a
+}
5d235a
+
5d235a
+/* Helper function for print_unicode_stream.  */
5d235a
+
5d235a
+static void
5d235a
+print_unicode_stream_body (const char *     filename,
5d235a
+			   file_ptr         address,
5d235a
+			   FILE *           stream,
5d235a
+			   unsigned char *  putback_buf,
5d235a
+			   unsigned int     num_putback,
5d235a
+			   unsigned char *  print_buf)
5d235a
+{
5d235a
+  /* It would be nice if we could just read the stream into a buffer
5d235a
+     and then process it with print_unicode_buffer.  But the input
5d235a
+     might be huge or it might be time-locked (eg stdin).  So instead
5d235a
+     we go one byte at a time...  */
5d235a
+
5d235a
+  file_ptr start_point = 0;
5d235a
+  unsigned int num_read = 0;
5d235a
+  unsigned int num_chars = 0;
5d235a
+  unsigned int num_print = 0;
5d235a
+  int c;
5d235a
+
5d235a
+  /* Find a series of string_min characters.  Put them into print_buf.  */
5d235a
+  do
5d235a
+    {
5d235a
+      if (num_chars >= string_min)
5d235a
+	break;
5d235a
+
5d235a
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
5d235a
+      if (c == EOF)
5d235a
+	break;
5d235a
+
5d235a
+      if (! STRING_ISGRAPHIC (c))
5d235a
+	{
5d235a
+	  num_chars = num_print = 0;
5d235a
+	  continue;
5d235a
+	}
5d235a
+
5d235a
+      if (num_chars == 0)
5d235a
+	start_point = num_read - 1;
5d235a
+
5d235a
+      if (c < 127)
5d235a
+	{
5d235a
+	  print_buf[num_print] = c;
5d235a
+	  num_chars ++;
5d235a
+	  num_print ++;
5d235a
+	  continue;
5d235a
+	}
5d235a
+
5d235a
+      if (c < 0xc0)
5d235a
+	{
5d235a
+	  num_chars = num_print = 0;
5d235a
+	  continue;
5d235a
+	}
5d235a
+
5d235a
+      /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
5d235a
+      unsigned char utf8[4];
5d235a
+
5d235a
+      utf8[0] = c;
5d235a
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
5d235a
+      if (c == EOF)
5d235a
+	break;
5d235a
+      utf8[1] = c;
5d235a
+
5d235a
+      if ((utf8[1] & 0xc0) != 0x80)
5d235a
+	{
5d235a
+	  /* Invalid UTF-8.  */
5d235a
+	  putback_buf[num_putback++] = utf8[1];
5d235a
+	  num_chars = num_print = 0;
5d235a
+	  continue;
5d235a
+	}
5d235a
+      else if ((utf8[0] & 0x20) == 0)
5d235a
+	{
5d235a
+	  /* A valid 2-byte UTF-8 encoding.  */
5d235a
+	  if (unicode_display == unicode_invalid)
5d235a
+	    {
5d235a
+	      putback_buf[num_putback++] = utf8[1];
5d235a
+	      num_chars = num_print = 0;
5d235a
+	    }
5d235a
+	  else
5d235a
+	    {
5d235a
+	      print_buf[num_print ++] = utf8[0];
5d235a
+	      print_buf[num_print ++] = utf8[1];
5d235a
+	      num_chars ++;
5d235a
+	    }
5d235a
+	  continue;
5d235a
+	}
5d235a
+
5d235a
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
5d235a
+      if (c == EOF)
5d235a
+	break;
5d235a
+      utf8[2] = c;
5d235a
+
5d235a
+      if ((utf8[2] & 0xc0) != 0x80)
5d235a
+	{
5d235a
+	  /* Invalid UTF-8.  */
5d235a
+	  putback_buf[num_putback++] = utf8[2];
5d235a
+	  putback_buf[num_putback++] = utf8[1];
5d235a
+	  num_chars = num_print = 0;
5d235a
+	  continue;
5d235a
+	}
5d235a
+      else if ((utf8[0] & 0x10) == 0)
5d235a
+	{
5d235a
+	  /* A valid 3-byte UTF-8 encoding.  */
5d235a
+	  if (unicode_display == unicode_invalid)
5d235a
+	    {
5d235a
+	      putback_buf[num_putback++] = utf8[2];
5d235a
+	      putback_buf[num_putback++] = utf8[1];
5d235a
+	      num_chars = num_print = 0;
5d235a
+	    }
5d235a
+	  else
5d235a
+	    {
5d235a
+	      print_buf[num_print ++] = utf8[0];
5d235a
+	      print_buf[num_print ++] = utf8[1];
5d235a
+	      print_buf[num_print ++] = utf8[2];
5d235a
+	      num_chars ++;
5d235a
+	    }
5d235a
+	  continue;
5d235a
+	}
5d235a
+
5d235a
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
5d235a
+      if (c == EOF)
5d235a
+	break;
5d235a
+      utf8[3] = c;
5d235a
+
5d235a
+      if ((utf8[3] & 0xc0) != 0x80)
5d235a
+	{
5d235a
+	  /* Invalid UTF-8.  */
5d235a
+	  putback_buf[num_putback++] = utf8[3];
5d235a
+	  putback_buf[num_putback++] = utf8[2];
5d235a
+	  putback_buf[num_putback++] = utf8[1];
5d235a
+	  num_chars = num_print = 0;
5d235a
+	}
5d235a
+      /* We have a valid 4-byte UTF-8 encoding.  */
5d235a
+      else if (unicode_display == unicode_invalid)
5d235a
+	{
5d235a
+	  putback_buf[num_putback++] = utf8[3];
5d235a
+	  putback_buf[num_putback++] = utf8[2];
5d235a
+	  putback_buf[num_putback++] = utf8[1];
5d235a
+	  num_chars = num_print = 0;
5d235a
+	}
5d235a
+      else
5d235a
+	{
5d235a
+	  print_buf[num_print ++] = utf8[0];
5d235a
+	  print_buf[num_print ++] = utf8[1];
5d235a
+	  print_buf[num_print ++] = utf8[2];
5d235a
+	  print_buf[num_print ++] = utf8[3];
5d235a
+	  num_chars ++;
5d235a
+	}
5d235a
+    }
5d235a
+  while (1);
5d235a
+
5d235a
+  if (num_chars >= string_min)
5d235a
+    {
5d235a
+      /* We know that we have string_min valid characters in print_buf,
5d235a
+	 and there may be more to come in the stream.  Start displaying
5d235a
+	 them.  */
5d235a
+
5d235a
+      print_filename_and_address (filename, address + start_point);
5d235a
+
5d235a
+      unsigned int i;
5d235a
+      for (i = 0; i < num_print;)
5d235a
+	{
5d235a
+	  if (print_buf[i] < 127)
5d235a
+	    putchar (print_buf[i++]);
5d235a
+	  else
5d235a
+	    i += display_utf8_char (print_buf + i);
5d235a
+	}
5d235a
+
5d235a
+      /* OK so now we have to start reading unchecked bytes.  */
5d235a
+
5d235a
+      /* Display further characters for as long as they continue the string.  */
5d235a
+      do
5d235a
+	{
5d235a
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
5d235a
+	  if (c == EOF)
5d235a
+	    break;
5d235a
+
5d235a
+	  if (! STRING_ISGRAPHIC (c))
5d235a
+	    break;
5d235a
+
5d235a
+	  if (c < 127)
5d235a
+	    {
5d235a
+	      putchar (c);
5d235a
+	      continue;
5d235a
+	    }
5d235a
+
5d235a
+	  if (c < 0xc0)
5d235a
+	    break;
5d235a
+
5d235a
+	  /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
5d235a
+	  unsigned char utf8[4];
5d235a
+
5d235a
+	  utf8[0] = c;
5d235a
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
5d235a
+	  if (c == EOF)
5d235a
+	    break;
5d235a
+	  utf8[1] = c;
5d235a
+
5d235a
+	  if ((utf8[1] & 0xc0) != 0x80)
5d235a
+	    {
5d235a
+	      /* Invalid UTF-8.  */
5d235a
+	      putback_buf[num_putback++] = utf8[1];
5d235a
+	      break;
5d235a
+	    }
5d235a
+	  else if ((utf8[0] & 0x20) == 0)
5d235a
+	    {
5d235a
+	      /* Valid 2-byte UTF-8.  */
5d235a
+	      if (unicode_display == unicode_invalid)
5d235a
+		{
5d235a
+		  putback_buf[num_putback++] = utf8[1];
5d235a
+		  break;
5d235a
+		}
5d235a
+	      else
5d235a
+		{
5d235a
+		  (void) display_utf8_char (utf8);
5d235a
+		  continue;
5d235a
+		}
5d235a
+	    }
5d235a
+
5d235a
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
5d235a
+	  if (c == EOF)
5d235a
+	    break;
5d235a
+	  utf8[2] = c;
5d235a
+
5d235a
+	  if ((utf8[2] & 0xc0) != 0x80)
5d235a
+	    {
5d235a
+	      /* Invalid UTF-8.  */
5d235a
+	      putback_buf[num_putback++] = utf8[2];
5d235a
+	      putback_buf[num_putback++] = utf8[1];
5d235a
+	      break;
5d235a
+	    }
5d235a
+	  else if ((utf8[0] & 0x10) == 0)
5d235a
+	    {
5d235a
+	      /* Valid 3-byte UTF-8.  */
5d235a
+	      if (unicode_display == unicode_invalid)
5d235a
+		{
5d235a
+		  putback_buf[num_putback++] = utf8[2];
5d235a
+		  putback_buf[num_putback++] = utf8[1];
5d235a
+		  break;
5d235a
+		}
5d235a
+	      else
5d235a
+		{
5d235a
+		  (void) display_utf8_char (utf8);
5d235a
+		  continue;
5d235a
+		}
5d235a
+	    }
5d235a
+
5d235a
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
5d235a
+	  if (c == EOF)
5d235a
+	    break;
5d235a
+	  utf8[3] = c;
5d235a
+
5d235a
+	  if ((utf8[3] & 0xc0) != 0x80)
5d235a
+	    {
5d235a
+	      /* Invalid UTF-8.  */
5d235a
+	      putback_buf[num_putback++] = utf8[3];
5d235a
+	      putback_buf[num_putback++] = utf8[2];
5d235a
+	      putback_buf[num_putback++] = utf8[1];
5d235a
+	      break;
5d235a
+	    }
5d235a
+	  else if (unicode_display == unicode_invalid)
5d235a
+	    {
5d235a
+	      putback_buf[num_putback++] = utf8[3];
5d235a
+	      putback_buf[num_putback++] = utf8[2];
5d235a
+	      putback_buf[num_putback++] = utf8[1];
5d235a
+	      break;
5d235a
+	    }
5d235a
+	  else
5d235a
+	    /* A valid 4-byte UTF-8 encoding.  */
5d235a
+	    (void) display_utf8_char (utf8);
5d235a
+	}
5d235a
+      while (1);
5d235a
+
5d235a
+      if (output_separator)
5d235a
+	fputs (output_separator, stdout);
5d235a
+      else
5d235a
+	putchar ('\n');
5d235a
+    }
5d235a
+
5d235a
+  if (c != EOF)
5d235a
+    /* FIXME: Using tail recursion here is lazy, but it works.  */
5d235a
+    print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
5d235a
+}
5d235a
+
5d235a
+/* Display strings read in from STREAM.  Treat any UTF-8 encoded characters
5d235a
+   encountered according to the setting of the unicode_display variable.
5d235a
+   The stream is positioned at ADDRESS and is attached to FILENAME.  */
5d235a
+
5d235a
+static void
5d235a
+print_unicode_stream (const char * filename,
5d235a
+		      file_ptr     address,
5d235a
+		      FILE *       stream)
5d235a
+{
5d235a
+  /* Paranoia checks...  */
5d235a
+  if (filename == NULL
5d235a
+      || stream == NULL
5d235a
+      || unicode_display == unicode_default
5d235a
+      || encoding != 'S'
5d235a
+      || encoding_bytes != 1)
5d235a
+    {
5d235a
+      fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
5d235a
+      return;
5d235a
+    }
5d235a
+
5d235a
+  /* Allocate space for string_min 4-byte utf-8 characters.  */
5d235a
+  unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
5d235a
+  /* We should never have to put back more than 4 bytes.  */
5d235a
+  unsigned char putback_buf[5];
5d235a
+  unsigned int num_putback = 0;
5d235a
+
5d235a
+  print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
5d235a
+  free (print_buf);
5d235a
+}
5d235a
 
5d235a
 /* Find the strings in file FILENAME, read from STREAM.
5d235a
    Assume that STREAM is positioned so that the next byte read
5d235a
    is at address ADDRESS in the file.
5d235a
-   Stop reading at address STOP_POINT in the file, if nonzero.
5d235a
 
5d235a
    If STREAM is NULL, do not read from it.
5d235a
    The caller can supply a buffer of characters
5d235a
@@ -570,18 +1238,27 @@ static void
5d235a
 print_strings (const char *filename, FILE *stream, file_ptr address,
5d235a
 	       int stop_point, int magiccount, char *magic)
5d235a
 {
5d235a
+  if (unicode_display != unicode_default)
5d235a
+    {
5d235a
+      if (magic != NULL)
5d235a
+	print_unicode_buffer (filename, address,
5d235a
+			      (const unsigned char *) magic, magiccount);
5d235a
+
5d235a
+      if (stream != NULL)
5d235a
+	print_unicode_stream (filename, address, stream);
5d235a
+      return;
5d235a
+    }
5d235a
+
5d235a
   char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
5d235a
 
5d235a
   while (1)
5d235a
     {
5d235a
       file_ptr start;
5d235a
-      int i;
5d235a
+      unsigned int i;
5d235a
       long c;
5d235a
 
5d235a
       /* See if the next `string_min' chars are all graphic chars.  */
5d235a
     tryline:
5d235a
-      if (stop_point && address >= stop_point)
5d235a
-	break;
5d235a
       start = address;
5d235a
       for (i = 0; i < string_min; i++)
5d235a
 	{
5d235a
@@ -718,6 +1395,8 @@ usage (FILE *stream, int status)
5d235a
   -T --target=<BFDNAME>     Specify the binary file format\n\
5d235a
   -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
5d235a
                             s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
5d235a
+  --unicode={default|locale|invalid|hex|escape|highlight}\n\
5d235a
+  -U {d|l|i|x|e|h}          Specify how to treat UTF-8 encoded unicode characters\n\
5d235a
   -s --output-separator=<string> String used to separate strings in output.\n\
5d235a
   @<file>                   Read options from <file>\n\
5d235a
   -h --help                 Display this information\n\