Blame SOURCES/binutils.unicode.patch

c407ab
diff -rup binutils.orig/binutils/NEWS binutils-2.36.1/binutils/NEWS
c407ab
--- binutils.orig/binutils/NEWS	2021-10-21 16:56:20.322761363 +0100
c407ab
+++ binutils-2.36.1/binutils/NEWS	2021-10-21 16:56:29.692696238 +0100
c407ab
@@ -151,6 +151,15 @@ Changes in 2.32:
c407ab
 
c407ab
 Changes in 2.31:
c407ab
 
c407ab
+* Tools which display names or strings (readelf, strings, nm, objdump)
c407ab
+  have a new command line option which controls how unicode characters are
c407ab
+  handled.  By default they are treated as normal for the tool.  Using
c407ab
+  --unicode=locale will display them according to the current locale.
c407ab
+  Using --unicode=hex will display them as hex byte values, whilst
c407ab
+  --unicode=escape will display them as escape sequences.  In addition
c407ab
+  using --unicode=highlight will display them as unicode escape sequences
c407ab
+  highlighted in red (if supported by the output device).
c407ab
+
c407ab
 * Add support for disassembling netronome Flow Processor (NFP) firmware files.
c407ab
 
c407ab
 * The AArch64 port now supports showing disassembly notes which are emitted
c407ab
Only in binutils-2.36.1/binutils/: NEWS.orig
c407ab
diff -rup binutils.orig/binutils/doc/binutils.texi binutils-2.36.1/binutils/doc/binutils.texi
c407ab
--- binutils.orig/binutils/doc/binutils.texi	2021-10-21 16:56:20.324761349 +0100
c407ab
+++ binutils-2.36.1/binutils/doc/binutils.texi	2021-10-21 16:56:29.694696225 +0100
c407ab
@@ -799,6 +799,7 @@ nm [@option{-A}|@option{-o}|@option{--pr
c407ab
    [@option{-g}|@option{--extern-only}] [@option{-h}|@option{--help}]
c407ab
    [@option{--ifunc-chars=@var{CHARS}}]
c407ab
    [@option{-l}|@option{--line-numbers}] [@option{--inlines}]
c407ab
+   [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
c407ab
    [@option{-n}|@option{-v}|@option{--numeric-sort}]
c407ab
    [@option{-P}|@option{--portability}] [@option{-p}|@option{--no-sort}]
c407ab
    [@option{-r}|@option{--reverse-sort}] [@option{-S}|@option{--print-size}]
c407ab
@@ -1114,6 +1115,21 @@ Use @var{radix} as the radix for printin
c407ab
 @cindex undefined symbols
c407ab
 Display only undefined symbols (those external to each object file).
c407ab
 
c407ab
+@item -U @var{[d|i|l|e|x|h]}
c407ab
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
c407ab
+Controls the display of UTF-8 encoded multibyte characters in strings.
c407ab
+The default (@option{--unicode=default}) is to give them no special
c407ab
+treatment.  The @option{--unicode=locale} option displays the sequence
c407ab
+in the current locale, which may or may not support them.  The options
c407ab
+@option{--unicode=hex} and @option{--unicode=invalid} display them as
c407ab
+hex byte sequences enclosed by either angle brackets or curly braces.
c407ab
+
c407ab
+The @option{--unicode=escape} option displays them as escape sequences
c407ab
+(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
c407ab
+them as escape sequences highlighted in red (if supported by the
c407ab
+output device).  The colouring is intended to draw attention to the
c407ab
+presence of unicode sequences where they might not be expected.
c407ab
+
c407ab
 @item -V
c407ab
 @itemx --version
c407ab
 Show the version number of @command{nm} and exit.
c407ab
@@ -2210,6 +2226,7 @@ objdump [@option{-a}|@option{--archive-h
c407ab
         [@option{--prefix-strip=}@var{level}]
c407ab
         [@option{--insn-width=}@var{width}]
c407ab
         [@option{--visualize-jumps[=color|=extended-color|=off]}
c407ab
+        [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
c407ab
         [@option{-V}|@option{--version}]
c407ab
         [@option{-H}|@option{--help}]
c407ab
         @var{objfile}@dots{}
c407ab
@@ -2877,6 +2894,21 @@ When displaying symbols include those wh
c407ab
 special in some way and which would not normally be of interest to the
c407ab
 user.
c407ab
 
c407ab
+@item -U @var{[d|i|l|e|x|h]}
c407ab
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
c407ab
+Controls the display of UTF-8 encoded multibyte characters in strings.
c407ab
+The default (@option{--unicode=default}) is to give them no special
c407ab
+treatment.  The @option{--unicode=locale} option displays the sequence
c407ab
+in the current locale, which may or may not support them.  The options
c407ab
+@option{--unicode=hex} and @option{--unicode=invalid} display them as
c407ab
+hex byte sequences enclosed by either angle brackets or curly braces.
c407ab
+
c407ab
+The @option{--unicode=escape} option displays them as escape sequences
c407ab
+(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
c407ab
+them as escape sequences highlighted in red (if supported by the
c407ab
+output device).  The colouring is intended to draw attention to the
c407ab
+presence of unicode sequences where they might not be expected.
c407ab
+
c407ab
 @item -V
c407ab
 @itemx --version
c407ab
 Print the version number of @command{objdump} and exit.
c407ab
@@ -3153,6 +3185,7 @@ strings [@option{-afovV}] [@option{-}@va
c407ab
         [@option{-n} @var{min-len}] [@option{--bytes=}@var{min-len}]
c407ab
         [@option{-t} @var{radix}] [@option{--radix=}@var{radix}]
c407ab
         [@option{-e} @var{encoding}] [@option{--encoding=}@var{encoding}]
c407ab
+        [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
c407ab
         [@option{-}] [@option{--all}] [@option{--print-file-name}]
c407ab
         [@option{-T} @var{bfdname}] [@option{--target=}@var{bfdname}]
c407ab
         [@option{-w}] [@option{--include-all-whitespace}]
c407ab
@@ -3244,6 +3277,28 @@ single-8-bit-byte characters, @samp{b} =
c407ab
 littleendian.  Useful for finding wide character strings. (@samp{l}
c407ab
 and @samp{b} apply to, for example, Unicode UTF-16/UCS-2 encodings).
c407ab
 
c407ab
+@item -U @var{[d|i|l|e|x|h]}
c407ab
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
c407ab
+Controls the display of UTF-8 encoded multibyte characters in strings.
c407ab
+The default (@option{--unicode=default}) is to give them no special
c407ab
+treatment, and instead rely upon the setting of the
c407ab
+@option{--encoding} option.  The other values for this option
c407ab
+automatically enable @option{--encoding=S}.
c407ab
+
c407ab
+The @option{--unicode=invalid} option treats them as non-graphic
c407ab
+characters and hence not part of a valid string.  All the remaining
c407ab
+options treat them as valid string characters.
c407ab
+
c407ab
+The @option{--unicode=locale} option displays them in the current
c407ab
+locale, which may or may not support UTF-8 encoding.  The
c407ab
+@option{--unicode=hex} option displays them as hex byte sequences
c407ab
+enclosed between @var{<>} characters.  The @option{--unicode=escape}
c407ab
+option displays them as escape sequences (@var{\uxxxx}) and the
c407ab
+@option{--unicode=highlight} option displays them as escape sequences
c407ab
+highlighted in red (if supported by the output device).  The colouring
c407ab
+is intended to draw attention to the presence of unicode sequences
c407ab
+where they might not be expected.
c407ab
+
c407ab
 @item -T @var{bfdname}
c407ab
 @itemx --target=@var{bfdname}
c407ab
 @cindex object code format
c407ab
@@ -4766,6 +4821,7 @@ readelf [@option{-a}|@option{--all}]
c407ab
         [@option{-W}|@option{--wide}]
c407ab
         [@option{-T}|@option{--silent-truncation}]
c407ab
         [@option{-H}|@option{--help}]
c407ab
+        [@option{-U} @var{method}|@option{--unicode=}@var{method}]
c407ab
         @var{elffile}@dots{}
c407ab
 @c man end
c407ab
 @end smallexample
c407ab
@@ -4887,6 +4943,28 @@ necessary in order to demangle truly com
c407ab
 that if the recursion limit is disabled then stack exhaustion is
c407ab
 possible and any bug reports about such an event will be rejected.
c407ab
 
c407ab
+@item -U @var{[d|i|l|e|x|h]}
c407ab
+@itemx --unicode=[default|invalid|locale|escape|hex|highlight]
c407ab
+Controls the display of non-ASCII characters in identifier names.
c407ab
+The default (@option{--unicode=locale} or @option{--unicode=default}) is
c407ab
+to treat them as multibyte characters and display them in the current
c407ab
+locale.  All other versions of this option treat the bytes as UTF-8
c407ab
+encoded values and attempt to interpret them.  If they cannot be
c407ab
+interpreted or if the @option{--unicode=invalid} option is used then
c407ab
+they are displayed as a sequence of hex bytes, enclosed in curly
c407ab
+parenthesis characters.
c407ab
+
c407ab
+Using the @option{--unicode=escape} option will display the characters
c407ab
+as unicode escape sequences (@var{\uxxxx}).  Using the
c407ab
+@option{--unicode=hex} will display the characters as hex byte
c407ab
+sequences enclosed between angle brackets.
c407ab
+
c407ab
+Using the @option{--unicode=highlight} will display the characters as 
c407ab
+unicode escape sequences but it will also highlight them in red,
c407ab
+assuming that colouring is supported by the output device.  The
c407ab
+colouring is intended to draw attention to the presence of unicode
c407ab
+sequences when they might not be expected.
c407ab
+
c407ab
 @item -e
c407ab
 @itemx --headers
c407ab
 Display all the headers in the file.  Equivalent to @option{-h -l -S}.
c407ab
Only in binutils-2.36.1/binutils/doc: binutils.texi.orig
c407ab
diff -rup binutils.orig/binutils/nm.c binutils-2.36.1/binutils/nm.c
c407ab
--- binutils.orig/binutils/nm.c	2021-10-21 16:56:20.318761391 +0100
c407ab
+++ binutils-2.36.1/binutils/nm.c	2021-10-21 16:59:56.105261602 +0100
c407ab
@@ -38,6 +38,11 @@
c407ab
 #include "bucomm.h"
c407ab
 #include "plugin-api.h"
c407ab
 #include "plugin.h"
c407ab
+#include "safe-ctype.h"
c407ab
+
c407ab
+#ifndef streq
c407ab
+#define streq(a,b) (strcmp ((a),(b)) == 0)
c407ab
+#endif
c407ab
 
c407ab
 /* When sorting by size, we use this structure to hold the size and a
c407ab
    pointer to the minisymbol.  */
c407ab
@@ -192,6 +197,18 @@ static const char *plugin_target = NULL;
c407ab
 static bfd *lineno_cache_bfd;
c407ab
 static bfd *lineno_cache_rel_bfd;
c407ab
 
c407ab
+typedef enum unicode_display_type
c407ab
+{
c407ab
+  unicode_default = 0,
c407ab
+  unicode_locale,
c407ab
+  unicode_escape,
c407ab
+  unicode_hex,
c407ab
+  unicode_highlight,
c407ab
+  unicode_invalid
c407ab
+} unicode_display_type;
c407ab
+
c407ab
+static unicode_display_type unicode_display = unicode_default;
c407ab
+
c407ab
 enum long_option_values
c407ab
 {
c407ab
   OPTION_TARGET = 200,
c407ab
@@ -234,6 +251,7 @@ static struct option long_options[] =
c407ab
   {"target", required_argument, 0, OPTION_TARGET},
c407ab
   {"defined-only", no_argument, &defined_only, 1},
c407ab
   {"undefined-only", no_argument, &undefined_only, 1},
c407ab
+  {"unicode", required_argument, NULL, 'U'},
c407ab
   {"version", no_argument, &show_version, 1},
c407ab
   {"with-symbol-versions", no_argument, NULL,
c407ab
    OPTION_WITH_SYMBOL_VERSIONS},
c407ab
@@ -285,6 +303,8 @@ usage (FILE *stream, int status)
c407ab
   -t, --radix=RADIX      Use RADIX for printing symbol values\n\
c407ab
       --target=BFDNAME   Specify the target object format as BFDNAME\n\
c407ab
   -u, --undefined-only   Display only undefined symbols\n\
c407ab
+  -U {d|s|i|x|e|h}       Specify how to treat UTF-8 encoded unicode characters\n\
c407ab
+      --unicode={default|show|invalid|hex|escape|highlight}\n\
c407ab
       --with-symbol-versions  Display version strings after symbol names\n\
c407ab
   -X 32_64               (ignored)\n\
c407ab
   @FILE                  Read options from FILE\n\
c407ab
@@ -400,6 +420,189 @@ get_coff_symbol_type (const struct inter
c407ab
   return bufp;
c407ab
 }
c407ab
 
c407ab
+/* Convert a potential UTF-8 encoded sequence in IN into characters in OUT.
c407ab
+   The conversion format is controlled by the unicode_display variable.
c407ab
+   Returns the number of characters added to OUT.
c407ab
+   Returns the number of bytes consumed from IN in CONSUMED.
c407ab
+   Always consumes at least one byte and displays at least one character.  */
c407ab
+   
c407ab
+static unsigned int
c407ab
+display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
c407ab
+{
c407ab
+  char *        orig_out = out;
c407ab
+  unsigned int  nchars = 0;
c407ab
+
c407ab
+  if (unicode_display == unicode_default)
c407ab
+    goto invalid;
c407ab
+
c407ab
+  if (in[0] < 0xc0)
c407ab
+    goto invalid;
c407ab
+
c407ab
+  if ((in[1] & 0xc0) != 0x80)
c407ab
+    goto invalid;
c407ab
+
c407ab
+  if ((in[0] & 0x20) == 0)
c407ab
+    {
c407ab
+      nchars = 2;
c407ab
+      goto valid;
c407ab
+    }
c407ab
+
c407ab
+  if ((in[2] & 0xc0) != 0x80)
c407ab
+    goto invalid;
c407ab
+
c407ab
+  if ((in[0] & 0x10) == 0)
c407ab
+    {
c407ab
+      nchars = 3;
c407ab
+      goto valid;
c407ab
+    }
c407ab
+
c407ab
+  if ((in[3] & 0xc0) != 0x80)
c407ab
+    goto invalid;
c407ab
+
c407ab
+  nchars = 4;
c407ab
+
c407ab
+ valid:
c407ab
+  switch (unicode_display)
c407ab
+    {
c407ab
+    case unicode_locale:
c407ab
+      /* Copy the bytes into the output buffer as is.  */
c407ab
+      memcpy (out, in, nchars);
c407ab
+      out += nchars;
c407ab
+      break;
c407ab
+
c407ab
+    case unicode_invalid:
c407ab
+    case unicode_hex:
c407ab
+      {
c407ab
+      unsigned int j;
c407ab
+
c407ab
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '<' : '{');
c407ab
+      for (j = 0; j < nchars; j++)
c407ab
+	out += sprintf (out, "%02x", in [j]);
c407ab
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '>' : '}');
c407ab
+      }
c407ab
+      break;
c407ab
+      
c407ab
+    case unicode_highlight:
c407ab
+      if (isatty (1))
c407ab
+	out += sprintf (out, "\x1B[31;47m"); /* Red.  */
c407ab
+      /* Fall through.  */
c407ab
+    case unicode_escape:
c407ab
+      switch (nchars)
c407ab
+	{
c407ab
+	case 2:
c407ab
+	  out += sprintf (out, "\\u%02x%02x",
c407ab
+		  ((in[0] & 0x1c) >> 2), 
c407ab
+		  ((in[0] & 0x03) << 6) | (in[1] & 0x3f));
c407ab
+	  break;
c407ab
+
c407ab
+	case 3:
c407ab
+	  out += sprintf (out, "\\u%02x%02x",
c407ab
+		  ((in[0] & 0x0f) << 4) | ((in[1] & 0x3c) >> 2),
c407ab
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3f)));
c407ab
+	  break;
c407ab
+
c407ab
+	case 4:
c407ab
+	  out += sprintf (out, "\\u%02x%02x%02x",
c407ab
+		  ((in[0] & 0x07) << 6) | ((in[1] & 0x3c) >> 2),
c407ab
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3c) >> 2),
c407ab
+		  ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
c407ab
+	  break;
c407ab
+	default:
c407ab
+	  /* URG.  */
c407ab
+	  break;
c407ab
+	}
c407ab
+
c407ab
+      if (unicode_display == unicode_highlight && isatty (1))
c407ab
+	out += sprintf (out, "\033[0m"); /* Default colour.  */
c407ab
+      break;
c407ab
+
c407ab
+    default:
c407ab
+      /* URG */
c407ab
+      break;
c407ab
+    }
c407ab
+
c407ab
+  * consumed = nchars;
c407ab
+  return out - orig_out;
c407ab
+
c407ab
+ invalid:
c407ab
+  /* Not a valid UTF-8 sequence.  */
c407ab
+  *out = *in;
c407ab
+  * consumed = 1;
c407ab
+  return 1;
c407ab
+}
c407ab
+
c407ab
+/* Convert any UTF-8 encoded characters in NAME into the form specified by
c407ab
+   unicode_display.  Also converts control characters.  Returns a static
c407ab
+   buffer if conversion was necessary.
c407ab
+   Code stolen from objdump.c:sanitize_string().  */
c407ab
+
c407ab
+static const char *
c407ab
+convert_utf8 (const char * in)
c407ab
+{
c407ab
+  static char *  buffer = NULL;
c407ab
+  static size_t  buffer_len = 0;
c407ab
+  const char *   original = in;
c407ab
+  char *         out;
c407ab
+
c407ab
+  /* Paranoia.  */
c407ab
+  if (in == NULL)
c407ab
+    return "";
c407ab
+
c407ab
+  /* See if any conversion is necessary.
c407ab
+     In the majority of cases it will not be needed.  */
c407ab
+  do
c407ab
+    {
c407ab
+      unsigned char c = *in++;
c407ab
+
c407ab
+      if (c == 0)
c407ab
+	return original;
c407ab
+
c407ab
+      if (ISCNTRL (c))
c407ab
+	break;
c407ab
+
c407ab
+      if (unicode_display != unicode_default && c >= 0xc0)
c407ab
+	break;
c407ab
+    }
c407ab
+  while (1);
c407ab
+
c407ab
+  /* Copy the input, translating as needed.  */
c407ab
+  in = original;
c407ab
+  if (buffer_len < (strlen (in) * 9))
c407ab
+    {
c407ab
+      free ((void *) buffer);
c407ab
+      buffer_len = strlen (in) * 9;
c407ab
+      buffer = xmalloc (buffer_len + 1);
c407ab
+    }
c407ab
+
c407ab
+  out = buffer;
c407ab
+  do
c407ab
+    {
c407ab
+      unsigned char c = *in++;
c407ab
+
c407ab
+      if (c == 0)
c407ab
+	break;
c407ab
+
c407ab
+      if (ISCNTRL (c))
c407ab
+	{
c407ab
+	  *out++ = '^';
c407ab
+	  *out++ = c + 0x40;
c407ab
+	}
c407ab
+      else if (unicode_display != unicode_default && c >= 0xc0)
c407ab
+	{
c407ab
+	  unsigned int num_consumed;
c407ab
+
c407ab
+	  out += display_utf8 ((const unsigned char *)(in - 1), out, & num_consumed);
c407ab
+	  in += num_consumed - 1;
c407ab
+	}
c407ab
+      else
c407ab
+	*out++ = c;
c407ab
+    }
c407ab
+  while (1);
c407ab
+
c407ab
+  *out = 0;
c407ab
+  return buffer;
c407ab
+}
c407ab
+
c407ab
 /* Print symbol name NAME, read from ABFD, with printf format FORM,
c407ab
    demangling it if requested.  */
c407ab
 
c407ab
@@ -418,6 +621,9 @@ print_symname (const char *form, struct
c407ab
 	name = alloc;
c407ab
     }
c407ab
 
c407ab
+  if (unicode_display != unicode_default)
c407ab
+    name = convert_utf8 (name);
c407ab
+
c407ab
   if (info != NULL && info->elfinfo)
c407ab
     {
c407ab
       const char *version_string;
c407ab
@@ -1738,7 +1944,7 @@ main (int argc, char **argv)
c407ab
     fatal (_("fatal error: libbfd ABI mismatch"));
c407ab
   set_default_bfd_target ();
c407ab
 
c407ab
-  while ((c = getopt_long (argc, argv, "aABCDef:gHhlnopPrSst:uvVvX:",
c407ab
+  while ((c = getopt_long (argc, argv, "aABCDef:gHhlnopPrSst:uU:vVvX:",
c407ab
 			   long_options, (int *) 0)) != EOF)
c407ab
     {
c407ab
       switch (c)
c407ab
@@ -1828,6 +2034,24 @@ main (int argc, char **argv)
c407ab
 	case 'u':
c407ab
 	  undefined_only = 1;
c407ab
 	  break;
c407ab
+
c407ab
+	case 'U':
c407ab
+	  if (streq (optarg, "default") || streq (optarg, "d"))
c407ab
+	    unicode_display = unicode_default;
c407ab
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
c407ab
+	    unicode_display = unicode_locale;
c407ab
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
c407ab
+	    unicode_display = unicode_escape;
c407ab
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
c407ab
+	    unicode_display = unicode_invalid;
c407ab
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
c407ab
+	    unicode_display = unicode_hex;
c407ab
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
c407ab
+	    unicode_display = unicode_highlight;
c407ab
+	  else
c407ab
+	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
c407ab
+	  break;
c407ab
+
c407ab
 	case 'V':
c407ab
 	  show_version = 1;
c407ab
 	  break;
c407ab
Only in binutils-2.36.1/binutils/: nm.c.orig
c407ab
Only in binutils-2.36.1/binutils/: nm.c.rej
c407ab
diff -rup binutils.orig/binutils/objdump.c binutils-2.36.1/binutils/objdump.c
c407ab
--- binutils.orig/binutils/objdump.c	2021-10-21 16:56:20.320761377 +0100
c407ab
+++ binutils-2.36.1/binutils/objdump.c	2021-10-21 16:56:29.695696218 +0100
c407ab
@@ -205,6 +205,18 @@ static const struct objdump_private_desc
c407ab
 
c407ab
 /* The list of detected jumps inside a function.  */
c407ab
 static struct jump_info *detected_jumps = NULL;
c407ab
+
c407ab
+typedef enum unicode_display_type
c407ab
+{
c407ab
+  unicode_default = 0,
c407ab
+  unicode_locale,
c407ab
+  unicode_escape,
c407ab
+  unicode_hex,
c407ab
+  unicode_highlight,
c407ab
+  unicode_invalid
c407ab
+} unicode_display_type;
c407ab
+
c407ab
+static unicode_display_type unicode_display = unicode_default;
c407ab
 
c407ab
 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
c407ab
 static void
c407ab
@@ -247,6 +259,9 @@ usage (FILE *stream, int status)
c407ab
   -r, --reloc              Display the relocation entries in the file\n\
c407ab
   -R, --dynamic-reloc      Display the dynamic relocation entries in the file\n\
c407ab
   @<file>                  Read options from <file>\n\
c407ab
+  -U[d|l|i|x|e|h]          Controls the display of UTF-8 unicode characters\n\
c407ab
+  --unicode=[default|locale|invalid|hex|escape|highlight]\n"));
c407ab
+      fprintf (stream, _("\
c407ab
   -v, --version            Display this program's version number\n\
c407ab
   -i, --info               List object formats and architectures supported\n\
c407ab
   -H, --help               Display this information\n\
c407ab
@@ -395,6 +410,7 @@ static struct option long_options[]=
c407ab
   {"stop-address", required_argument, NULL, OPTION_STOP_ADDRESS},
c407ab
   {"syms", no_argument, NULL, 't'},
c407ab
   {"target", required_argument, NULL, 'b'},
c407ab
+  {"unicode", required_argument, NULL, 'U'},
c407ab
   {"version", no_argument, NULL, 'V'},
c407ab
   {"wide", no_argument, NULL, 'w'},
c407ab
   {"prefix", required_argument, NULL, OPTION_PREFIX},
c407ab
@@ -414,10 +430,124 @@ nonfatal (const char *msg)
c407ab
   bfd_nonfatal (msg);
c407ab
   exit_status = 1;
c407ab
 }
c407ab
+
c407ab
+/* Convert a potential UTF-8 encoded sequence in IN into characters in OUT.
c407ab
+   The conversion format is controlled by the unicode_display variable.
c407ab
+   Returns the number of characters added to OUT.
c407ab
+   Returns the number of bytes consumed from IN in CONSUMED.
c407ab
+   Always consumes at least one byte and displays at least one character.  */
c407ab
+   
c407ab
+static unsigned int
c407ab
+display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
c407ab
+{
c407ab
+  char *        orig_out = out;
c407ab
+  unsigned int  nchars = 0;
c407ab
+
c407ab
+  if (unicode_display == unicode_default)
c407ab
+    goto invalid;
c407ab
+
c407ab
+  if (in[0] < 0xc0)
c407ab
+    goto invalid;
c407ab
+
c407ab
+  if ((in[1] & 0xc0) != 0x80)
c407ab
+    goto invalid;
c407ab
+
c407ab
+  if ((in[0] & 0x20) == 0)
c407ab
+    {
c407ab
+      nchars = 2;
c407ab
+      goto valid;
c407ab
+    }
c407ab
+
c407ab
+  if ((in[2] & 0xc0) != 0x80)
c407ab
+    goto invalid;
c407ab
+
c407ab
+  if ((in[0] & 0x10) == 0)
c407ab
+    {
c407ab
+      nchars = 3;
c407ab
+      goto valid;
c407ab
+    }
c407ab
+
c407ab
+  if ((in[3] & 0xc0) != 0x80)
c407ab
+    goto invalid;
c407ab
+
c407ab
+  nchars = 4;
c407ab
+
c407ab
+ valid:
c407ab
+  switch (unicode_display)
c407ab
+    {
c407ab
+    case unicode_locale:
c407ab
+      /* Copy the bytes into the output buffer as is.  */
c407ab
+      memcpy (out, in, nchars);
c407ab
+      out += nchars;
c407ab
+      break;
c407ab
+
c407ab
+    case unicode_invalid:
c407ab
+    case unicode_hex:
c407ab
+      {
c407ab
+      unsigned int j;
c407ab
+
c407ab
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '<' : '{');
c407ab
+      for (j = 0; j < nchars; j++)
c407ab
+	out += sprintf (out, "%02x", in [j]);
c407ab
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '>' : '}');
c407ab
+      }
c407ab
+      break;
c407ab
+      
c407ab
+    case unicode_highlight:
c407ab
+      if (isatty (1))
c407ab
+	out += sprintf (out, "\x1B[31;47m"); /* Red.  */
c407ab
+      /* Fall through.  */
c407ab
+    case unicode_escape:
c407ab
+      switch (nchars)
c407ab
+	{
c407ab
+	case 2:
c407ab
+	  out += sprintf (out, "\\u%02x%02x",
c407ab
+		  ((in[0] & 0x1c) >> 2), 
c407ab
+		  ((in[0] & 0x03) << 6) | (in[1] & 0x3f));
c407ab
+	  break;
c407ab
+
c407ab
+	case 3:
c407ab
+	  out += sprintf (out, "\\u%02x%02x",
c407ab
+		  ((in[0] & 0x0f) << 4) | ((in[1] & 0x3c) >> 2),
c407ab
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3f)));
c407ab
+	  break;
c407ab
+
c407ab
+	case 4:
c407ab
+	  out += sprintf (out, "\\u%02x%02x%02x",
c407ab
+		  ((in[0] & 0x07) << 6) | ((in[1] & 0x3c) >> 2),
c407ab
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3c) >> 2),
c407ab
+		  ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
c407ab
+	  break;
c407ab
+	default:
c407ab
+	  /* URG.  */
c407ab
+	  break;
c407ab
+	}
c407ab
+
c407ab
+      if (unicode_display == unicode_highlight && isatty (1))
c407ab
+	out += sprintf (out, "\033[0m"); /* Default colour.  */
c407ab
+      break;
c407ab
+
c407ab
+    default:
c407ab
+      /* URG */
c407ab
+      break;
c407ab
+    }
c407ab
+
c407ab
+  * consumed = nchars;
c407ab
+  return out - orig_out;
c407ab
+
c407ab
+ invalid:
c407ab
+  /* Not a valid UTF-8 sequence.  */
c407ab
+  *out = *in;
c407ab
+  * consumed = 1;
c407ab
+  return 1;
c407ab
+}
c407ab
 
c407ab
 /* Returns a version of IN with any control characters
c407ab
    replaced by escape sequences.  Uses a static buffer
c407ab
-   if necessary.  */
c407ab
+   if necessary.
c407ab
+
c407ab
+   If unicode display is enabled, then also handles the
c407ab
+   conversion of unicode characters.  */
c407ab
 
c407ab
 static const char *
c407ab
 sanitize_string (const char * in)
c407ab
@@ -435,40 +565,50 @@ sanitize_string (const char * in)
c407ab
      of cases it will not be needed.  */
c407ab
   do
c407ab
     {
c407ab
-      char c = *in++;
c407ab
+      unsigned char c = *in++;
c407ab
 
c407ab
       if (c == 0)
c407ab
 	return original;
c407ab
 
c407ab
       if (ISCNTRL (c))
c407ab
 	break;
c407ab
+
c407ab
+      if (unicode_display != unicode_default && c >= 0xc0)
c407ab
+	break;
c407ab
     }
c407ab
   while (1);
c407ab
 
c407ab
   /* Copy the input, translating as needed.  */
c407ab
   in = original;
c407ab
-  if (buffer_len < (strlen (in) * 2))
c407ab
+  if (buffer_len < (strlen (in) * 9))
c407ab
     {
c407ab
       free ((void *) buffer);
c407ab
-      buffer_len = strlen (in) * 2;
c407ab
+      buffer_len = strlen (in) * 9;
c407ab
       buffer = xmalloc (buffer_len + 1);
c407ab
     }
c407ab
 
c407ab
   out = buffer;
c407ab
   do
c407ab
     {
c407ab
-      char c = *in++;
c407ab
+      unsigned char c = *in++;
c407ab
 
c407ab
       if (c == 0)
c407ab
 	break;
c407ab
 
c407ab
-      if (!ISCNTRL (c))
c407ab
-	*out++ = c;
c407ab
-      else
c407ab
+      if (ISCNTRL (c))
c407ab
 	{
c407ab
 	  *out++ = '^';
c407ab
 	  *out++ = c + 0x40;
c407ab
 	}
c407ab
+      else if (unicode_display != unicode_default && c >= 0xc0)
c407ab
+	{
c407ab
+	  unsigned int num_consumed;
c407ab
+
c407ab
+	  out += display_utf8 ((const unsigned char *)(in - 1), out, & num_consumed);
c407ab
+	  in += num_consumed - 1;
c407ab
+	}
c407ab
+      else
c407ab
+	*out++ = c;
c407ab
     }
c407ab
   while (1);
c407ab
 
c407ab
@@ -476,7 +616,6 @@ sanitize_string (const char * in)
c407ab
   return buffer;
c407ab
 }
c407ab
 
c407ab
-
c407ab
 /* Returns TRUE if the specified section should be dumped.  */
c407ab
 
c407ab
 static bfd_boolean
c407ab
@@ -1055,6 +1194,8 @@ objdump_print_symname (bfd *abfd, struct
c407ab
 
c407ab
   name = sanitize_string (name);
c407ab
 
c407ab
+  name = sanitize_string (name);
c407ab
+
c407ab
   if (inf != NULL)
c407ab
     {
c407ab
       (*inf->fprintf_func) (inf->stream, "%s", name);
c407ab
@@ -3136,7 +3277,7 @@ disassemble_section (bfd *abfd, asection
c407ab
   if (!bfd_malloc_and_get_section (abfd, section, &data))
c407ab
     {
c407ab
       non_fatal (_("Reading section %s failed because: %s"),
c407ab
-		 section->name, bfd_errmsg (bfd_get_error ()));
c407ab
+		 sanitize_string (section->name), bfd_errmsg (bfd_get_error ()));
c407ab
       return;
c407ab
     }
c407ab
 
c407ab
@@ -4341,7 +4482,7 @@ dump_section (bfd *abfd, asection *secti
c407ab
   if (!bfd_get_full_section_contents (abfd, section, &data))
c407ab
     {
c407ab
       non_fatal (_("Reading section %s failed because: %s"),
c407ab
-		 section->name, bfd_errmsg (bfd_get_error ()));
c407ab
+		 sanitize_string (section->name), bfd_errmsg (bfd_get_error ()));
c407ab
       return;
c407ab
     }
c407ab
 
c407ab
@@ -4481,6 +4622,24 @@ dump_symbols (bfd *abfd ATTRIBUTE_UNUSED
c407ab
 		  free (alloc);
c407ab
 		}
c407ab
 	    }
c407ab
+	  else if (unicode_display != unicode_default
c407ab
+		   && name != NULL && *name != '\0')
c407ab
+	    {
c407ab
+	      const char * sanitized_name;
c407ab
+
c407ab
+	      /* If we want to sanitize the name, we do it here, and
c407ab
+		 temporarily clobber it while calling bfd_print_symbol.
c407ab
+		 FIXME: This is a gross hack.  */
c407ab
+	      sanitized_name = sanitize_string (name);
c407ab
+	      if (sanitized_name != name)
c407ab
+		(*current)->name = sanitized_name;
c407ab
+	      else
c407ab
+		sanitized_name = NULL;
c407ab
+	      bfd_print_symbol (cur_bfd, stdout, *current,
c407ab
+				bfd_print_symbol_all);
c407ab
+	      if (sanitized_name != NULL)
c407ab
+		(*current)->name = name;
c407ab
+	    }
c407ab
 	  else
c407ab
 	    bfd_print_symbol (cur_bfd, stdout, *current,
c407ab
 			      bfd_print_symbol_all);
c407ab
@@ -5162,7 +5321,7 @@ main (int argc, char **argv)
c407ab
   set_default_bfd_target ();
c407ab
 
c407ab
   while ((c = getopt_long (argc, argv,
c407ab
-			   "pP:ib:m:M:VvCdDlfFaHhrRtTxsSI:j:wE:zgeGW::",
c407ab
+			   "pP:ib:m:M:VvCdDlfFaHhrRtTxsSI:j:wE:zgeGW::U:",
c407ab
 			   long_options, (int *) 0))
c407ab
 	 != EOF)
c407ab
     {
c407ab
@@ -5441,6 +5600,23 @@ main (int argc, char **argv)
c407ab
 	  seenflag = TRUE;
c407ab
 	  break;
c407ab
 
c407ab
+	case 'U':
c407ab
+	  if (streq (optarg, "default") || streq (optarg, "d"))
c407ab
+	    unicode_display = unicode_default;
c407ab
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
c407ab
+	    unicode_display = unicode_locale;
c407ab
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
c407ab
+	    unicode_display = unicode_escape;
c407ab
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
c407ab
+	    unicode_display = unicode_invalid;
c407ab
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
c407ab
+	    unicode_display = unicode_hex;
c407ab
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
c407ab
+	    unicode_display = unicode_highlight;
c407ab
+	  else
c407ab
+	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
c407ab
+	  break;
c407ab
+
c407ab
 	case 'H':
c407ab
 	  usage (stdout, 0);
c407ab
 	  /* No need to set seenflag or to break - usage() does not return.  */
c407ab
Only in binutils-2.36.1/binutils/: objdump.c.orig
c407ab
diff -rup binutils.orig/binutils/readelf.c binutils-2.36.1/binutils/readelf.c
c407ab
--- binutils.orig/binutils/readelf.c	2021-10-21 16:56:20.323761356 +0100
c407ab
+++ binutils-2.36.1/binutils/readelf.c	2021-10-21 17:00:54.169858044 +0100
c407ab
@@ -321,6 +321,18 @@ typedef enum print_mode
c407ab
 }
c407ab
 print_mode;
c407ab
 
c407ab
+typedef enum unicode_display_type
c407ab
+{
c407ab
+  unicode_locale,
c407ab
+  unicode_escape,
c407ab
+  unicode_hex,
c407ab
+  unicode_highlight,
c407ab
+  unicode_invalid
c407ab
+} unicode_display_type;
c407ab
+
c407ab
+static unicode_display_type unicode_display = unicode_locale;
c407ab
+
c407ab
+  
c407ab
 /* Versioned symbol info.  */
c407ab
 enum versioned_symbol_info
c407ab
 {
c407ab
@@ -613,11 +625,18 @@ print_symbol (signed int width, const ch
c407ab
       if (c == 0)
c407ab
 	break;
c407ab
 
c407ab
-      /* Do not print control characters directly as they can affect terminal
c407ab
-	 settings.  Such characters usually appear in the names generated
c407ab
-	 by the assembler for local labels.  */
c407ab
-      if (ISCNTRL (c))
c407ab
+      if (ISPRINT (c))
c407ab
+	{
c407ab
+	  putchar (c);
c407ab
+	  width_remaining --;
c407ab
+	  num_printed ++;
c407ab
+	}
c407ab
+      else if (ISCNTRL (c))
c407ab
 	{
c407ab
+	  /* Do not print control characters directly as they can affect terminal
c407ab
+	     settings.  Such characters usually appear in the names generated
c407ab
+	     by the assembler for local labels.  */
c407ab
+
c407ab
 	  if (width_remaining < 2)
c407ab
 	    break;
c407ab
 
c407ab
@@ -625,11 +644,135 @@ print_symbol (signed int width, const ch
c407ab
 	  width_remaining -= 2;
c407ab
 	  num_printed += 2;
c407ab
 	}
c407ab
-      else if (ISPRINT (c))
c407ab
+      else if (c == 0x7f)
c407ab
 	{
c407ab
-	  putchar (c);
c407ab
-	  width_remaining --;
c407ab
-	  num_printed ++;
c407ab
+	  if (width_remaining < 5)
c407ab
+	    break;
c407ab
+	  printf ("<DEL>");
c407ab
+	  width_remaining -= 5;
c407ab
+	  num_printed += 5;
c407ab
+	}
c407ab
+      else if (unicode_display != unicode_locale)
c407ab
+	{
c407ab
+	  /* Display unicode characters as something else.  */
c407ab
+	  unsigned char bytes[4];
c407ab
+	  bfd_boolean   is_utf8;
c407ab
+	  unsigned int  nbytes;
c407ab
+
c407ab
+	  bytes[0] = c;
c407ab
+
c407ab
+	  if (bytes[0] < 0xc0)
c407ab
+	    {
c407ab
+	      nbytes = 1;
c407ab
+	      is_utf8 = FALSE;
c407ab
+	    }
c407ab
+	  else
c407ab
+	    {
c407ab
+	      bytes[1] = *symbol++;
c407ab
+
c407ab
+	      if ((bytes[1] & 0xc0) != 0x80)
c407ab
+		{
c407ab
+		  is_utf8 = FALSE;
c407ab
+		  /* Do not consume this character.  It may only
c407ab
+		     be the first byte in the sequence that was
c407ab
+		     corrupt.  */
c407ab
+		  --symbol;
c407ab
+		  nbytes = 1;
c407ab
+		}
c407ab
+	      else if ((bytes[0] & 0x20) == 0)
c407ab
+		{
c407ab
+		  is_utf8 = TRUE;
c407ab
+		  nbytes = 2;
c407ab
+		}
c407ab
+	      else
c407ab
+		{
c407ab
+		  bytes[2] = *symbol++;
c407ab
+
c407ab
+		  if ((bytes[2] & 0xc0) != 0x80)
c407ab
+		    {
c407ab
+		      is_utf8 = FALSE;
c407ab
+		      symbol -= 2;
c407ab
+		      nbytes = 1;
c407ab
+		    }
c407ab
+		  else if ((bytes[0] & 0x10) == 0)
c407ab
+		    {
c407ab
+		      is_utf8 = TRUE;
c407ab
+		      nbytes = 3;
c407ab
+		    }
c407ab
+		  else
c407ab
+		    {
c407ab
+		      bytes[3] = *symbol++;
c407ab
+
c407ab
+		      nbytes = 4;
c407ab
+
c407ab
+		      if ((bytes[3] & 0xc0) != 0x80)
c407ab
+			{
c407ab
+			  is_utf8 = FALSE;
c407ab
+			  symbol -= 3;
c407ab
+			  nbytes = 1;
c407ab
+			}
c407ab
+		      else
c407ab
+			is_utf8 = TRUE;
c407ab
+		    }
c407ab
+		}
c407ab
+	    }
c407ab
+
c407ab
+	  if (unicode_display == unicode_invalid)
c407ab
+	    is_utf8 = FALSE;
c407ab
+
c407ab
+	  if (unicode_display == unicode_hex || ! is_utf8)
c407ab
+	    {
c407ab
+	      unsigned int i;
c407ab
+
c407ab
+	      if (width_remaining < (nbytes * 2) + 2)
c407ab
+		break;
c407ab
+	  
c407ab
+	      putchar (is_utf8 ? '<' : '{');
c407ab
+	      for (i = 0; i < nbytes; i++)
c407ab
+		printf ("%02x", bytes[i]);
c407ab
+	      putchar (is_utf8 ? '>' : '}');
c407ab
+	    }
c407ab
+	  else
c407ab
+	    {
c407ab
+	      if (unicode_display == unicode_highlight && isatty (1))
c407ab
+		printf ("\x1B[31;47m"); /* Red.  */
c407ab
+	      
c407ab
+	      switch (nbytes)
c407ab
+		{
c407ab
+		case 2:
c407ab
+		  if (width_remaining < 6)
c407ab
+		    break;
c407ab
+		  printf ("\\u%02x%02x",
c407ab
+			  (bytes[0] & 0x1c) >> 2, 
c407ab
+			  ((bytes[0] & 0x03) << 6) | (bytes[1] & 0x3f));
c407ab
+		  break;
c407ab
+		case 3:
c407ab
+		  if (width_remaining < 6)
c407ab
+		    break;
c407ab
+		  printf ("\\u%02x%02x",
c407ab
+			  ((bytes[0] & 0x0f) << 4) | ((bytes[1] & 0x3c) >> 2),
c407ab
+			  ((bytes[1] & 0x03) << 6) | (bytes[2] & 0x3f));
c407ab
+		  break;
c407ab
+		case 4:
c407ab
+		  if (width_remaining < 8)
c407ab
+		    break;
c407ab
+		  printf ("\\u%02x%02x%02x",
c407ab
+			  ((bytes[0] & 0x07) << 2) | ((bytes[1] & 0x30) >> 4),
c407ab
+			  ((bytes[1] & 0x0f) << 4) | ((bytes[2] & 0x3c) >> 2),
c407ab
+			  ((bytes[2] & 0x03) << 6) | (bytes[3] & 0x3f));
c407ab
+		  
c407ab
+		  break;
c407ab
+		default:
c407ab
+		  /* URG.  */
c407ab
+		  break;
c407ab
+		}
c407ab
+
c407ab
+	      if (unicode_display == unicode_highlight && isatty (1))
c407ab
+		printf ("\033[0m"); /* Default colour.  */
c407ab
+	    }
c407ab
+	  
c407ab
+	  if (bytes[nbytes - 1] == 0)
c407ab
+	    break;
c407ab
 	}
c407ab
       else
c407ab
 	{
c407ab
@@ -4555,6 +4698,7 @@ static struct option options[] =
c407ab
   {"syms",	       no_argument, 0, 's'},
c407ab
   {"silent-truncation",no_argument, 0, 'T'},
c407ab
   {"section-details",  no_argument, 0, 't'},
c407ab
+  {"unicode",          required_argument, 0, 'U'},
c407ab
   {"unwind",	       no_argument, 0, 'u'},
c407ab
   {"version-info",     no_argument, 0, 'V'},
c407ab
   {"version",	       no_argument, 0, 'v'},
c407ab
@@ -4652,6 +4796,11 @@ usage (FILE * stream)
c407ab
 #endif
c407ab
   fprintf (stream, _("\
c407ab
   -I --histogram         Display histogram of bucket list lengths\n\
c407ab
+  -U --unicode=[locale|escape|hex|highlight|invalid]\n\
c407ab
+                         Display unicode characters as determined by the current locale\n\
c407ab
+                          (default), escape sequences, \"<hex sequences>\", highlighted\n\
c407ab
+                          escape sequences, or treat them as invalid and display as\n\
c407ab
+                          \"{hex sequences}\"\n\
c407ab
   -W --wide              Allow output width to exceed 80 characters\n\
c407ab
   -T --silent-truncation If a symbol name is truncated, do not add a suffix [...]\n\
c407ab
   @<file>                Read options from <file>\n\
c407ab
@@ -4748,7 +4897,7 @@ parse_args (struct dump_data *dumpdata,
c407ab
     usage (stderr);
c407ab
 
c407ab
   while ((c = getopt_long
c407ab
-	  (argc, argv, "ACDHILNR:STVWacdeghi:lnp:rstuvw::x:z", options, NULL)) != EOF)
c407ab
+	  (argc, argv, "ACDHILNR:STU:VWacdeghi:lnp:rstuvw::x:z", options, NULL)) != EOF)
c407ab
     {
c407ab
       switch (c)
c407ab
 	{
c407ab
@@ -4905,6 +5054,25 @@ parse_args (struct dump_data *dumpdata,
c407ab
 	  request_dump (dumpdata, DISASS_DUMP);
c407ab
 	  break;
c407ab
 #endif
c407ab
+	case 'U':
c407ab
+	  if (optarg == NULL)
c407ab
+	    error (_("Missing arg to -U/--unicode")); /* Can this happen ?  */
c407ab
+	  else if (streq (optarg, "default") || streq (optarg, "d"))
c407ab
+	    unicode_display = unicode_locale;
c407ab
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
c407ab
+	    unicode_display = unicode_locale;
c407ab
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
c407ab
+	    unicode_display = unicode_escape;
c407ab
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
c407ab
+	    unicode_display = unicode_invalid;
c407ab
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
c407ab
+	    unicode_display = unicode_hex;
c407ab
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
c407ab
+	    unicode_display = unicode_highlight;
c407ab
+	  else
c407ab
+	    error (_("unknown argument to -U/--unicode: %s"), optarg);
c407ab
+	  break;
c407ab
+
c407ab
 	case 'v':
c407ab
 	  print_version (program_name);
c407ab
 	  break;
c407ab
Only in binutils-2.36.1/binutils/: readelf.c.orig
c407ab
Only in binutils-2.36.1/binutils/: readelf.c.rej
c407ab
diff -rup binutils.orig/binutils/strings.c binutils-2.36.1/binutils/strings.c
c407ab
--- binutils.orig/binutils/strings.c	2021-10-21 16:56:20.321761370 +0100
c407ab
+++ binutils-2.36.1/binutils/strings.c	2021-10-21 16:56:29.698696197 +0100
c407ab
@@ -55,6 +55,19 @@
c407ab
    -T {bfdname}
c407ab
 		Specify a non-default object file format.
c407ab
 
c407ab
+  --unicode={default|locale|invalid|hex|escape|highlight}
c407ab
+  -U {d|l|i|x|e|h}
c407ab
+                Determine how to handle UTF-8 unicode characters.  The default
c407ab
+		is no special treatment.  All other versions of this option
c407ab
+		only apply if the encoding is valid and enabling the option
c407ab
+		implies --encoding=S.
c407ab
+		The 'locale' option displays the characters according to the
c407ab
+		current locale.  The 'invalid' option treats them as
c407ab
+		non-string characters.  The 'hex' option displays them as hex
c407ab
+		byte sequences.  The 'escape' option displays them as escape
c407ab
+		sequences and the 'highlight' option displays them as
c407ab
+		coloured escape sequences.
c407ab
+
c407ab
   --output-separator=sep_string
c407ab
   -s sep_string	String used to separate parsed strings in output.
c407ab
 		Default is newline.
c407ab
@@ -76,6 +89,22 @@
c407ab
 #include "safe-ctype.h"
c407ab
 #include "bucomm.h"
c407ab
 
c407ab
+#ifndef streq
c407ab
+#define streq(a,b) (strcmp ((a),(b)) == 0)
c407ab
+#endif
c407ab
+
c407ab
+typedef enum unicode_display_type
c407ab
+{
c407ab
+  unicode_default = 0,
c407ab
+  unicode_locale,
c407ab
+  unicode_escape,
c407ab
+  unicode_hex,
c407ab
+  unicode_highlight,
c407ab
+  unicode_invalid
c407ab
+} unicode_display_type;
c407ab
+
c407ab
+static unicode_display_type unicode_display = unicode_default;
c407ab
+
c407ab
 #define STRING_ISGRAPHIC(c) \
c407ab
       (   (c) >= 0 \
c407ab
        && (c) <= 255 \
c407ab
@@ -94,7 +123,7 @@ extern int errno;
c407ab
 static int address_radix;
c407ab
 
c407ab
 /* Minimum length of sequence of graphic chars to trigger output.  */
c407ab
-static int string_min;
c407ab
+static unsigned int string_min;
c407ab
 
c407ab
 /* Whether or not we include all whitespace as a graphic char.   */
c407ab
 static bfd_boolean include_all_whitespace;
c407ab
@@ -130,6 +159,7 @@ static struct option long_options[] =
c407ab
   {"target", required_argument, NULL, 'T'},
c407ab
   {"output-separator", required_argument, NULL, 's'},
c407ab
   {"help", no_argument, NULL, 'h'},
c407ab
+  {"unicode", required_argument, NULL, 'U'},
c407ab
   {"version", no_argument, NULL, 'v'},
c407ab
   {NULL, 0, NULL, 0}
c407ab
 };
c407ab
@@ -173,7 +203,7 @@ main (int argc, char **argv)
c407ab
   encoding = 's';
c407ab
   output_separator = NULL;
c407ab
 
c407ab
-  while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:Vv0123456789",
c407ab
+  while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
c407ab
 			      long_options, (int *) 0)) != EOF)
c407ab
     {
c407ab
       switch (optc)
c407ab
@@ -246,6 +276,23 @@ main (int argc, char **argv)
c407ab
 	  output_separator = optarg;
c407ab
           break;
c407ab
 
c407ab
+	case 'U':
c407ab
+	  if (streq (optarg, "default") || streq (optarg, "d"))
c407ab
+	    unicode_display = unicode_default;
c407ab
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
c407ab
+	    unicode_display = unicode_locale;
c407ab
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
c407ab
+	    unicode_display = unicode_escape;
c407ab
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
c407ab
+	    unicode_display = unicode_invalid;
c407ab
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
c407ab
+	    unicode_display = unicode_hex;
c407ab
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
c407ab
+	    unicode_display = unicode_highlight;
c407ab
+	  else
c407ab
+	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
c407ab
+	  break;
c407ab
+
c407ab
 	case 'V':
c407ab
 	case 'v':
c407ab
 	  print_version ("strings");
c407ab
@@ -260,6 +307,9 @@ main (int argc, char **argv)
c407ab
 	}
c407ab
     }
c407ab
 
c407ab
+  if (unicode_display != unicode_default)
c407ab
+    encoding = 'S';
c407ab
+
c407ab
   if (numeric_opt != 0)
c407ab
     {
c407ab
       string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
c407ab
@@ -553,11 +603,629 @@ unget_part_char (long c, file_ptr *addre
c407ab
 	}
c407ab
     }
c407ab
 }
c407ab
+
c407ab
+static void
c407ab
+print_filename_and_address (const char * filename, file_ptr address)
c407ab
+{
c407ab
+  if (print_filenames)
c407ab
+    printf ("%s: ", filename);
c407ab
+
c407ab
+  if (! print_addresses)
c407ab
+    return;
c407ab
+
c407ab
+  switch (address_radix)
c407ab
+    {
c407ab
+    case 8:
c407ab
+      if (sizeof (address) > sizeof (long))
c407ab
+	{
c407ab
+#ifndef __MSVCRT__
c407ab
+	  printf ("%7llo ", (unsigned long long) address);
c407ab
+#else
c407ab
+	  printf ("%7I64o ", (unsigned long long) address);
c407ab
+#endif
c407ab
+	}
c407ab
+      else
c407ab
+	printf ("%7lo ", (unsigned long) address);
c407ab
+      break;
c407ab
+
c407ab
+    case 10:
c407ab
+      if (sizeof (address) > sizeof (long))
c407ab
+	{
c407ab
+#ifndef __MSVCRT__
c407ab
+	  printf ("%7llu ", (unsigned long long) address);
c407ab
+#else
c407ab
+	  printf ("%7I64d ", (unsigned long long) address);
c407ab
+#endif
c407ab
+	}
c407ab
+      else
c407ab
+	printf ("%7ld ", (long) address);
c407ab
+      break;
c407ab
+
c407ab
+    case 16:
c407ab
+      if (sizeof (address) > sizeof (long))
c407ab
+	{
c407ab
+#ifndef __MSVCRT__
c407ab
+	  printf ("%7llx ", (unsigned long long) address);
c407ab
+#else
c407ab
+	  printf ("%7I64x ", (unsigned long long) address);
c407ab
+#endif
c407ab
+	}
c407ab
+      else
c407ab
+	printf ("%7lx ", (unsigned long) address);
c407ab
+      break;
c407ab
+    }
c407ab
+}
c407ab
+
c407ab
+/* Return non-zero if the bytes starting at BUFFER form a valid UTF-8 encoding.
c407ab
+   If the encoding is valid then returns the number of bytes it uses.  */
c407ab
+
c407ab
+static unsigned int
c407ab
+is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
c407ab
+{
c407ab
+  if (buffer[0] < 0xc0)
c407ab
+    return 0;
c407ab
+
c407ab
+  if (buflen < 2)
c407ab
+    return 0;
c407ab
+
c407ab
+  if ((buffer[1] & 0xc0) != 0x80)
c407ab
+    return 0;
c407ab
+
c407ab
+  if ((buffer[0] & 0x20) == 0)
c407ab
+    return 2;
c407ab
+
c407ab
+  if (buflen < 3)
c407ab
+    return 0;
c407ab
+
c407ab
+  if ((buffer[2] & 0xc0) != 0x80)
c407ab
+    return 0;
c407ab
+  
c407ab
+  if ((buffer[0] & 0x10) == 0)
c407ab
+    return 3;
c407ab
+
c407ab
+  if (buflen < 4)
c407ab
+    return 0;
c407ab
+
c407ab
+  if ((buffer[3] & 0xc0) != 0x80)
c407ab
+    return 0;
c407ab
+
c407ab
+  return 4;
c407ab
+}
c407ab
+
c407ab
+/* Display a UTF-8 encoded character in BUFFER according to the setting
c407ab
+   of unicode_display.  The character is known to be valid.
c407ab
+   Returns the number of bytes consumed.  */
c407ab
+
c407ab
+static unsigned int
c407ab
+display_utf8_char (const unsigned char * buffer)
c407ab
+{
c407ab
+  unsigned int j;
c407ab
+  unsigned int utf8_len;
c407ab
+
c407ab
+  switch (buffer[0] & 0x30)
c407ab
+    {
c407ab
+    case 0x00:
c407ab
+    case 0x10:
c407ab
+      utf8_len = 2;
c407ab
+      break;
c407ab
+    case 0x20:
c407ab
+      utf8_len = 3;
c407ab
+      break;
c407ab
+    default:
c407ab
+      utf8_len = 4;
c407ab
+    }
c407ab
+      
c407ab
+  switch (unicode_display)
c407ab
+    {
c407ab
+    default:
c407ab
+      fprintf (stderr, "ICE: unexpected unicode display type\n");
c407ab
+      break;
c407ab
+
c407ab
+    case unicode_escape:
c407ab
+    case unicode_highlight:
c407ab
+      if (unicode_display == unicode_highlight && isatty (1))
c407ab
+	printf ("\x1B[31;47m"); /* Red.  */
c407ab
+
c407ab
+      switch (utf8_len)
c407ab
+	{
c407ab
+	case 2:
c407ab
+	  printf ("\\u%02x%02x",
c407ab
+		  ((buffer[0] & 0x1c) >> 2), 
c407ab
+		  ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
c407ab
+	  break;
c407ab
+
c407ab
+	case 3:
c407ab
+	  printf ("\\u%02x%02x",
c407ab
+		  ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
c407ab
+		  ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
c407ab
+	  break;
c407ab
+
c407ab
+	case 4:
c407ab
+	  printf ("\\u%02x%02x%02x",
c407ab
+		  ((buffer[0] & 0x07) << 2) | ((buffer[1] & 0x30) >> 4),
c407ab
+		  ((buffer[1] & 0x0f) << 4) | ((buffer[2] & 0x3c) >> 2),
c407ab
+		  ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
c407ab
+	  break;
c407ab
+	default:
c407ab
+	  /* URG.  */
c407ab
+	  break;
c407ab
+	}
c407ab
+
c407ab
+      if (unicode_display == unicode_highlight && isatty (1))
c407ab
+	printf ("\033[0m"); /* Default colour.  */
c407ab
+      break;
c407ab
+
c407ab
+    case unicode_hex:
c407ab
+      putchar ('<');
c407ab
+      for (j = 0; j < utf8_len; j++)
c407ab
+	printf ("%02x", buffer [j]);
c407ab
+      putchar ('>');
c407ab
+      break;
c407ab
+
c407ab
+    case unicode_locale:
c407ab
+      printf ("%.*s", (int) utf8_len, buffer);
c407ab
+      break;
c407ab
+    }
c407ab
+
c407ab
+  return utf8_len;
c407ab
+}
c407ab
+
c407ab
+/* Display strings in BUFFER.  Treat any UTF-8 encoded characters encountered
c407ab
+   according to the setting of the unicode_display variable.  The buffer
c407ab
+   contains BUFLEN bytes.
c407ab
+
c407ab
+   Display the characters as if they started at ADDRESS and are contained in
c407ab
+   FILENAME.  */
c407ab
+
c407ab
+static void
c407ab
+print_unicode_buffer (const char *            filename,
c407ab
+		      file_ptr                address,
c407ab
+		      const unsigned char *   buffer,
c407ab
+		      unsigned long           buflen)
c407ab
+{
c407ab
+  /* Paranoia checks...  */
c407ab
+  if (filename == NULL
c407ab
+      || buffer == NULL
c407ab
+      || unicode_display == unicode_default
c407ab
+      || encoding != 'S'
c407ab
+      || encoding_bytes != 1)
c407ab
+    {
c407ab
+      fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
c407ab
+      return;
c407ab
+    }
c407ab
+
c407ab
+  if (buflen == 0)
c407ab
+    return;
c407ab
+
c407ab
+  /* We must only display strings that are at least string_min *characters*
c407ab
+     long.  So we scan the buffer in two stages.  First we locate the start
c407ab
+     of a potential string.  Then we walk along it until we have found
c407ab
+     string_min characters.  Then we go back to the start point and start
c407ab
+     displaying characters according to the unicode_display setting.  */
c407ab
+
c407ab
+  unsigned long start_point = 0;
c407ab
+  unsigned long i = 0;
c407ab
+  unsigned int char_len = 1;
c407ab
+  unsigned int num_found = 0;
c407ab
+
c407ab
+  for (i = 0; i < buflen; i += char_len)
c407ab
+    {
c407ab
+      int c = buffer[i];
c407ab
+
c407ab
+      char_len = 1;
c407ab
+
c407ab
+      /* Find the first potential character of a string.  */
c407ab
+      if (! STRING_ISGRAPHIC (c))
c407ab
+	{
c407ab
+	  num_found = 0;
c407ab
+	  continue;
c407ab
+	}
c407ab
+
c407ab
+      if (c > 126)
c407ab
+	{
c407ab
+	  if (c < 0xc0)
c407ab
+	    {
c407ab
+	      num_found = 0;
c407ab
+	      continue;
c407ab
+	    }
c407ab
+
c407ab
+	  if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
c407ab
+	    {
c407ab
+	      char_len = 1;
c407ab
+	      num_found = 0;
c407ab
+	      continue;
c407ab
+	    }
c407ab
+
c407ab
+	  if (unicode_display == unicode_invalid)
c407ab
+	    {
c407ab
+	      /* We have found a valid UTF-8 character, but we treat it as non-graphic.  */
c407ab
+	      num_found = 0;
c407ab
+	      continue;
c407ab
+	    }
c407ab
+	}
c407ab
+
c407ab
+      if (num_found == 0)
c407ab
+	/* We have found a potential starting point for a string.  */
c407ab
+	start_point = i;
c407ab
+
c407ab
+      ++ num_found;
c407ab
+
c407ab
+      if (num_found >= string_min)
c407ab
+	break;
c407ab
+    }
c407ab
+
c407ab
+  if (num_found < string_min)
c407ab
+    return;
c407ab
+
c407ab
+  print_filename_and_address (filename, address + start_point);
c407ab
+  
c407ab
+  /* We have found string_min characters.  Display them and any
c407ab
+     more that follow.  */
c407ab
+  for (i = start_point; i < buflen; i += char_len)
c407ab
+    {
c407ab
+      int c = buffer[i];
c407ab
+
c407ab
+      char_len = 1;
c407ab
+
c407ab
+      if (! STRING_ISGRAPHIC (c))
c407ab
+	break;
c407ab
+      else if (c < 127)
c407ab
+	putchar (c);
c407ab
+      else if (! is_valid_utf8 (buffer + i, buflen - i))
c407ab
+	break;
c407ab
+      else if (unicode_display == unicode_invalid)
c407ab
+	break;
c407ab
+      else
c407ab
+	char_len = display_utf8_char (buffer + i);
c407ab
+    }
c407ab
+
c407ab
+  if (output_separator)
c407ab
+    fputs (output_separator, stdout);
c407ab
+  else
c407ab
+    putchar ('\n');
c407ab
+
c407ab
+  /* FIXME: Using tail recursion here is lazy programming...  */
c407ab
+  print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
c407ab
+}
c407ab
+
c407ab
+static int
c407ab
+get_unicode_byte (FILE *           stream,
c407ab
+		  unsigned char *  putback,
c407ab
+		  unsigned int *   num_putback,
c407ab
+		  unsigned int *   num_read)
c407ab
+{
c407ab
+  if (* num_putback > 0)
c407ab
+    {
c407ab
+      * num_putback = * num_putback - 1;
c407ab
+      return putback [* num_putback];
c407ab
+    }
c407ab
+
c407ab
+  * num_read = * num_read + 1;
c407ab
+
c407ab
+#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
c407ab
+  return getc_unlocked (stream);
c407ab
+#else
c407ab
+  return getc (stream);
c407ab
+#endif
c407ab
+}
c407ab
+
c407ab
+/* Helper function for print_unicode_stream.  */
c407ab
+
c407ab
+static void
c407ab
+print_unicode_stream_body (const char *     filename,
c407ab
+			   file_ptr         address,
c407ab
+			   FILE *           stream,
c407ab
+			   unsigned char *  putback_buf,
c407ab
+			   unsigned int     num_putback,
c407ab
+			   unsigned char *  print_buf)
c407ab
+{
c407ab
+  /* It would be nice if we could just read the stream into a buffer
c407ab
+     and then process it with print_unicode_buffer.  But the input
c407ab
+     might be huge or it might be time-locked (eg stdin).  So instead
c407ab
+     we go one byte at a time...  */
c407ab
+
c407ab
+  file_ptr start_point = 0;
c407ab
+  unsigned int num_read = 0;
c407ab
+  unsigned int num_chars = 0;
c407ab
+  unsigned int num_print = 0;
c407ab
+  int c;
c407ab
+
c407ab
+  /* Find a series of string_min characters.  Put them into print_buf.  */
c407ab
+  do
c407ab
+    {
c407ab
+      if (num_chars >= string_min)
c407ab
+	break;
c407ab
+
c407ab
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
c407ab
+      if (c == EOF)
c407ab
+	break;
c407ab
+
c407ab
+      if (! STRING_ISGRAPHIC (c))
c407ab
+	{
c407ab
+	  num_chars = num_print = 0;
c407ab
+	  continue;
c407ab
+	}
c407ab
+
c407ab
+      if (num_chars == 0)
c407ab
+	start_point = num_read - 1;
c407ab
+
c407ab
+      if (c < 127)
c407ab
+	{
c407ab
+	  print_buf[num_print] = c;
c407ab
+	  num_chars ++;
c407ab
+	  num_print ++;
c407ab
+	  continue;
c407ab
+	}
c407ab
+
c407ab
+      if (c < 0xc0)
c407ab
+	{
c407ab
+	  num_chars = num_print = 0;
c407ab
+	  continue;
c407ab
+	}
c407ab
+
c407ab
+      /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
c407ab
+      char utf8[4];
c407ab
+
c407ab
+      utf8[0] = c;
c407ab
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
c407ab
+      if (c == EOF)
c407ab
+	break;
c407ab
+      utf8[1] = c;
c407ab
+
c407ab
+      if ((utf8[1] & 0xc0) != 0x80)
c407ab
+	{
c407ab
+	  /* Invalid UTF-8.  */
c407ab
+	  putback_buf[num_putback++] = utf8[1];
c407ab
+	  num_chars = num_print = 0;
c407ab
+	  continue;
c407ab
+	}
c407ab
+      else if ((utf8[0] & 0x20) == 0)
c407ab
+	{
c407ab
+	  /* A valid 2-byte UTF-8 encoding.  */
c407ab
+	  if (unicode_display == unicode_invalid)
c407ab
+	    {
c407ab
+	      putback_buf[num_putback++] = utf8[1];
c407ab
+	      num_chars = num_print = 0;
c407ab
+	    }
c407ab
+	  else
c407ab
+	    {
c407ab
+	      print_buf[num_print ++] = utf8[0];
c407ab
+	      print_buf[num_print ++] = utf8[1];
c407ab
+	      num_chars ++;
c407ab
+	    }
c407ab
+	  continue;
c407ab
+	}
c407ab
+
c407ab
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
c407ab
+      if (c == EOF)
c407ab
+	break;
c407ab
+      utf8[2] = c;
c407ab
+
c407ab
+      if ((utf8[2] & 0xc0) != 0x80)
c407ab
+	{
c407ab
+	  /* Invalid UTF-8.  */
c407ab
+	  putback_buf[num_putback++] = utf8[2];
c407ab
+	  putback_buf[num_putback++] = utf8[1];
c407ab
+	  num_chars = num_print = 0;
c407ab
+	  continue;
c407ab
+	}
c407ab
+      else if ((utf8[0] & 0x10) == 0)
c407ab
+	{
c407ab
+	  /* A valid 3-byte UTF-8 encoding.  */
c407ab
+	  if (unicode_display == unicode_invalid)
c407ab
+	    {
c407ab
+	      putback_buf[num_putback++] = utf8[2];
c407ab
+	      putback_buf[num_putback++] = utf8[1];
c407ab
+	      num_chars = num_print = 0;
c407ab
+	    }
c407ab
+	  else
c407ab
+	    {
c407ab
+	      print_buf[num_print ++] = utf8[0];
c407ab
+	      print_buf[num_print ++] = utf8[1];
c407ab
+	      print_buf[num_print ++] = utf8[2];
c407ab
+	      num_chars ++;
c407ab
+	    }
c407ab
+	  continue;
c407ab
+	}
c407ab
+
c407ab
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
c407ab
+      if (c == EOF)
c407ab
+	break;
c407ab
+      utf8[3] = c;
c407ab
+
c407ab
+      if ((utf8[3] & 0xc0) != 0x80)
c407ab
+	{
c407ab
+	  /* Invalid UTF-8.  */
c407ab
+	  putback_buf[num_putback++] = utf8[3];
c407ab
+	  putback_buf[num_putback++] = utf8[2];
c407ab
+	  putback_buf[num_putback++] = utf8[1];
c407ab
+	  num_chars = num_print = 0;
c407ab
+	}
c407ab
+      /* We have a valid 4-byte UTF-8 encoding.  */
c407ab
+      else if (unicode_display == unicode_invalid)
c407ab
+	{
c407ab
+	  putback_buf[num_putback++] = utf8[3];
c407ab
+	  putback_buf[num_putback++] = utf8[2];
c407ab
+	  putback_buf[num_putback++] = utf8[1];
c407ab
+	  num_chars = num_print = 0;
c407ab
+	}
c407ab
+      else
c407ab
+	{
c407ab
+	  print_buf[num_print ++] = utf8[0];
c407ab
+	  print_buf[num_print ++] = utf8[1];
c407ab
+	  print_buf[num_print ++] = utf8[2];
c407ab
+	  print_buf[num_print ++] = utf8[3];
c407ab
+	  num_chars ++;
c407ab
+	}
c407ab
+    }
c407ab
+  while (1);
c407ab
+
c407ab
+  if (num_chars >= string_min)
c407ab
+    {
c407ab
+      /* We know that we have string_min valid characters in print_buf,
c407ab
+	 and there may be more to come in the stream.  Start displaying
c407ab
+	 them.  */
c407ab
+
c407ab
+      print_filename_and_address (filename, address + start_point);
c407ab
+
c407ab
+      unsigned int i;
c407ab
+      for (i = 0; i < num_print;)
c407ab
+	{
c407ab
+	  if (print_buf[i] < 127)
c407ab
+	    putchar (print_buf[i++]);
c407ab
+	  else
c407ab
+	    i += display_utf8_char (print_buf + i);
c407ab
+	}
c407ab
+
c407ab
+      /* OK so now we have to start reading unchecked bytes.  */
c407ab
+
c407ab
+      /* Display them until EOF, a non-graphic byte or a rejected sequence.  */
c407ab
+      do
c407ab
+	{
c407ab
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
c407ab
+	  if (c == EOF)
c407ab
+	    break;
c407ab
+
c407ab
+	  if (! STRING_ISGRAPHIC (c))
c407ab
+	    break;
c407ab
+
c407ab
+	  if (c < 127)
c407ab
+	    {
c407ab
+	      putchar (c);
c407ab
+	      continue;
c407ab
+	    }
c407ab
+
c407ab
+	  if (c < 0xc0)
c407ab
+	    break;
c407ab
+
c407ab
+	  /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
c407ab
+	  unsigned char utf8[4];
c407ab
+
c407ab
+	  utf8[0] = c;
c407ab
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
c407ab
+	  if (c == EOF)
c407ab
+	    break;
c407ab
+	  utf8[1] = c;
c407ab
+
c407ab
+	  if ((utf8[1] & 0xc0) != 0x80)
c407ab
+	    {
c407ab
+	      /* Invalid UTF-8.  */
c407ab
+	      putback_buf[num_putback++] = utf8[1];
c407ab
+	      break;
c407ab
+	    }
c407ab
+	  else if ((utf8[0] & 0x20) == 0)
c407ab
+	    {
c407ab
+	      /* Valid 2-byte UTF-8.  */
c407ab
+	      if (unicode_display == unicode_invalid)
c407ab
+		{
c407ab
+		  putback_buf[num_putback++] = utf8[1];
c407ab
+		  break;
c407ab
+		}
c407ab
+	      else
c407ab
+		{
c407ab
+		  (void) display_utf8_char (utf8);
c407ab
+		  continue;
c407ab
+		}
c407ab
+	    }
c407ab
+
c407ab
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
c407ab
+	  if (c == EOF)
c407ab
+	    break;
c407ab
+	  utf8[2] = c;
c407ab
+
c407ab
+	  if ((utf8[2] & 0xc0) != 0x80)
c407ab
+	    {
c407ab
+	      /* Invalid UTF-8.  */
c407ab
+	      putback_buf[num_putback++] = utf8[2];
c407ab
+	      putback_buf[num_putback++] = utf8[1];
c407ab
+	      break;
c407ab
+	    }
c407ab
+	  else if ((utf8[0] & 0x10) == 0)
c407ab
+	    {
c407ab
+	      /* Valid 3-byte UTF-8.  */
c407ab
+	      if (unicode_display == unicode_invalid)
c407ab
+		{
c407ab
+		  putback_buf[num_putback++] = utf8[2];
c407ab
+		  putback_buf[num_putback++] = utf8[1];
c407ab
+		  break;
c407ab
+		}
c407ab
+	      else
c407ab
+		{
c407ab
+		  (void) display_utf8_char (utf8);
c407ab
+		  continue;
c407ab
+		}
c407ab
+	    }
c407ab
+
c407ab
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
c407ab
+	  if (c == EOF)
c407ab
+	    break;
c407ab
+	  utf8[3] = c;
c407ab
+
c407ab
+	  if ((utf8[3] & 0xc0) != 0x80)
c407ab
+	    {
c407ab
+	      /* Invalid UTF-8.  */
c407ab
+	      putback_buf[num_putback++] = utf8[3];
c407ab
+	      putback_buf[num_putback++] = utf8[2];
c407ab
+	      putback_buf[num_putback++] = utf8[1];
c407ab
+	      break;
c407ab
+	    }
c407ab
+	  else if (unicode_display == unicode_invalid)
c407ab
+	    {
c407ab
+	      putback_buf[num_putback++] = utf8[3];
c407ab
+	      putback_buf[num_putback++] = utf8[2];
c407ab
+	      putback_buf[num_putback++] = utf8[1];
c407ab
+	      break;
c407ab
+	    }
c407ab
+	  else
c407ab
+	    /* A valid 4-byte UTF-8 encoding.  */
c407ab
+	    (void) display_utf8_char (utf8);
c407ab
+	}
c407ab
+      while (1);
c407ab
+
c407ab
+      if (output_separator)
c407ab
+	fputs (output_separator, stdout);
c407ab
+      else
c407ab
+	putchar ('\n');
c407ab
+    }
c407ab
+
c407ab
+  if (c != EOF)
c407ab
+    /* FIXME: Using tail recursion here is lazy, but it works.  */
c407ab
+    print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
c407ab
+}
c407ab
+
c407ab
+/* Display strings read in from STREAM.  Treat any UTF-8 encoded characters
c407ab
+   encountered according to the setting of the unicode_display variable.
c407ab
+   The stream is positioned at ADDRESS and is attached to FILENAME.  */
c407ab
+
c407ab
+static void
c407ab
+print_unicode_stream (const char * filename,
c407ab
+		      file_ptr     address,
c407ab
+		      FILE *       stream)
c407ab
+{
c407ab
+  /* Paranoia checks...  */
c407ab
+  if (filename == NULL
c407ab
+      || stream == NULL
c407ab
+      || unicode_display == unicode_default
c407ab
+      || encoding != 'S'
c407ab
+      || encoding_bytes != 1)
c407ab
+    {
c407ab
+      fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
c407ab
+      return;
c407ab
+    }
c407ab
+
c407ab
+  /* Allocate space for string_min 4-byte utf-8 characters.  */
c407ab
+  unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
c407ab
+  /* We should never have to put back more than 4 bytes.  */
c407ab
+  unsigned char putback_buf[5];
c407ab
+  unsigned int num_putback = 0;
c407ab
+
c407ab
+  print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
c407ab
+  free (print_buf);
c407ab
+}
c407ab
 
c407ab
 /* Find the strings in file FILENAME, read from STREAM.
c407ab
    Assume that STREAM is positioned so that the next byte read
c407ab
    is at address ADDRESS in the file.
c407ab
-   Stop reading at address STOP_POINT in the file, if nonzero.
c407ab
 
c407ab
    If STREAM is NULL, do not read from it.
c407ab
    The caller can supply a buffer of characters
c407ab
@@ -570,18 +1238,27 @@ static void
c407ab
 print_strings (const char *filename, FILE *stream, file_ptr address,
c407ab
 	       int stop_point, int magiccount, char *magic)
c407ab
 {
c407ab
+  if (unicode_display != unicode_default)
c407ab
+    {
c407ab
+      if (magic != NULL)
c407ab
+	print_unicode_buffer (filename, address,
c407ab
+			      (const unsigned char *) magic, magiccount);
c407ab
+
c407ab
+      if (stream != NULL)
c407ab
+	print_unicode_stream (filename, address, stream);
c407ab
+      return;
c407ab
+    }
c407ab
+
c407ab
   char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
c407ab
 
c407ab
   while (1)
c407ab
     {
c407ab
       file_ptr start;
c407ab
-      int i;
c407ab
+      unsigned int i;
c407ab
       long c;
c407ab
 
c407ab
       /* See if the next `string_min' chars are all graphic chars.  */
c407ab
     tryline:
c407ab
-      if (stop_point && address >= stop_point)
c407ab
-	break;
c407ab
       start = address;
c407ab
       for (i = 0; i < string_min; i++)
c407ab
 	{
c407ab
@@ -718,6 +1395,8 @@ usage (FILE *stream, int status)
c407ab
   -T --target=<BFDNAME>     Specify the binary file format\n\
c407ab
   -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
c407ab
                             s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
c407ab
+  --unicode={default|locale|invalid|hex|escape|highlight}\n\
c407ab
+  -U {d|l|i|x|e|h}          Specify how to treat UTF-8 encoded unicode characters\n\
c407ab
   -s --output-separator=<string> String used to separate strings in output.\n\
c407ab
   @<file>                   Read options from <file>\n\
c407ab
   -h --help                 Display this information\n\