Blame SOURCES/binutils.unicode.patch

2b5b46
diff -rup binutils.orig/binutils/NEWS binutils-2.36.1/binutils/NEWS
2b5b46
--- binutils.orig/binutils/NEWS	2021-10-21 16:56:20.322761363 +0100
2b5b46
+++ binutils-2.36.1/binutils/NEWS	2021-10-21 16:56:29.692696238 +0100
2b5b46
@@ -151,6 +151,15 @@ Changes in 2.32:
2b5b46
 
2b5b46
 Changes in 2.31:
2b5b46
 
2b5b46
+* Tools which display names or strings (readelf, strings, nm, objdump)
2b5b46
+  have a new command line option which controls how unicode characters are
2b5b46
+  handled.  By default they are treated as normal for the tool.  Using
2b5b46
+  --unicode=locale will display them according to the current locale.
2b5b46
+  Using --unicode=hex will display them as hex byte values, whilst
2b5b46
+  --unicode=escape will display them as escape sequences.  In addition
2b5b46
+  using --unicode=highlight will display them as unicode escape sequences
2b5b46
+  highlighted in red (if supported by the output device).
2b5b46
+
2b5b46
 * Add support for disassembling netronome Flow Processor (NFP) firmware files.
2b5b46
 
2b5b46
 * The AArch64 port now supports showing disassembly notes which are emitted
2b5b46
Only in binutils-2.36.1/binutils/: NEWS.orig
2b5b46
diff -rup binutils.orig/binutils/doc/binutils.texi binutils-2.36.1/binutils/doc/binutils.texi
2b5b46
--- binutils.orig/binutils/doc/binutils.texi	2021-10-21 16:56:20.324761349 +0100
2b5b46
+++ binutils-2.36.1/binutils/doc/binutils.texi	2021-10-21 16:56:29.694696225 +0100
2b5b46
@@ -799,6 +799,7 @@ nm [@option{-A}|@option{-o}|@option{--pr
2b5b46
    [@option{-g}|@option{--extern-only}] [@option{-h}|@option{--help}]
2b5b46
    [@option{--ifunc-chars=@var{CHARS}}]
2b5b46
    [@option{-l}|@option{--line-numbers}] [@option{--inlines}]
2b5b46
+   [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
2b5b46
    [@option{-n}|@option{-v}|@option{--numeric-sort}]
2b5b46
    [@option{-P}|@option{--portability}] [@option{-p}|@option{--no-sort}]
2b5b46
    [@option{-r}|@option{--reverse-sort}] [@option{-S}|@option{--print-size}]
2b5b46
@@ -1114,6 +1115,21 @@ Use @var{radix} as the radix for printin
2b5b46
 @cindex undefined symbols
2b5b46
 Display only undefined symbols (those external to each object file).
2b5b46
 
2b5b46
+@item -U @var{[d|i|l|e|x|h]}
2b5b46
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
2b5b46
+Controls the display of UTF-8 encoded multibyte characters in strings.
2b5b46
+The default (@option{--unicode=default}) is to give them no special
2b5b46
+treatment.  The @option{--unicode=locale} option displays the sequence
2b5b46
+in the current locale, which may or may not support them.  The options
2b5b46
+@option{--unicode=hex} and @option{--unicode=invalid} display them as
2b5b46
+hex byte sequences enclosed by either angle brackets or curly braces.
2b5b46
+
2b5b46
+The @option{--unicode=escape} option displays them as escape sequences
2b5b46
+(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
2b5b46
+them as escape sequences highlighted in red (if supported by the
2b5b46
+output device).  The colouring is intended to draw attention to the
2b5b46
+presence of unicode sequences where they might not be expected.
2b5b46
+
2b5b46
 @item -V
2b5b46
 @itemx --version
2b5b46
 Show the version number of @command{nm} and exit.
2b5b46
@@ -2210,6 +2226,7 @@ objdump [@option{-a}|@option{--archive-h
2b5b46
         [@option{--prefix-strip=}@var{level}]
2b5b46
         [@option{--insn-width=}@var{width}]
2b5b46
         [@option{--visualize-jumps[=color|=extended-color|=off]}
2b5b46
+        [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
2b5b46
         [@option{-V}|@option{--version}]
2b5b46
         [@option{-H}|@option{--help}]
2b5b46
         @var{objfile}@dots{}
2b5b46
@@ -2877,6 +2894,21 @@ When displaying symbols include those wh
2b5b46
 special in some way and which would not normally be of interest to the
2b5b46
 user.
2b5b46
 
2b5b46
+@item -U @var{[d|i|l|e|x|h]}
2b5b46
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
2b5b46
+Controls the display of UTF-8 encoded multibyte characters in strings.
2b5b46
+The default (@option{--unicode=default}) is to give them no special
2b5b46
+treatment.  The @option{--unicode=locale} option displays the sequence
2b5b46
+in the current locale, which may or may not support them.  The options
2b5b46
+@option{--unicode=hex} and @option{--unicode=invalid} display them as
2b5b46
+hex byte sequences enclosed by either angle brackets or curly braces.
2b5b46
+
2b5b46
+The @option{--unicode=escape} option displays them as escape sequences
2b5b46
+(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
2b5b46
+them as escape sequences highlighted in red (if supported by the
2b5b46
+output device).  The colouring is intended to draw attention to the
2b5b46
+presence of unicode sequences where they might not be expected.
2b5b46
+
2b5b46
 @item -V
2b5b46
 @itemx --version
2b5b46
 Print the version number of @command{objdump} and exit.
2b5b46
@@ -3153,6 +3185,7 @@ strings [@option{-afovV}] [@option{-}@va
2b5b46
         [@option{-n} @var{min-len}] [@option{--bytes=}@var{min-len}]
2b5b46
         [@option{-t} @var{radix}] [@option{--radix=}@var{radix}]
2b5b46
         [@option{-e} @var{encoding}] [@option{--encoding=}@var{encoding}]
2b5b46
+        [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
2b5b46
         [@option{-}] [@option{--all}] [@option{--print-file-name}]
2b5b46
         [@option{-T} @var{bfdname}] [@option{--target=}@var{bfdname}]
2b5b46
         [@option{-w}] [@option{--include-all-whitespace}]
2b5b46
@@ -3244,6 +3277,28 @@ single-8-bit-byte characters, @samp{b} =
2b5b46
 littleendian.  Useful for finding wide character strings. (@samp{l}
2b5b46
 and @samp{b} apply to, for example, Unicode UTF-16/UCS-2 encodings).
2b5b46
 
2b5b46
+@item -U @var{[d|i|l|e|x|h]}
2b5b46
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
2b5b46
+Controls the display of UTF-8 encoded multibyte characters in strings.
2b5b46
+The default (@option{--unicode=default}) is to give them no special
2b5b46
+treatment, and instead rely upon the setting of the
2b5b46
+@option{--encoding} option.  The other values for this option
2b5b46
+automatically enable @option{--encoding=S}.
2b5b46
+
2b5b46
+The @option{--unicode=invalid} option treats them as non-graphic
2b5b46
+characters and hence not part of a valid string.  All the remaining
2b5b46
+options treat them as valid string characters.
2b5b46
+
2b5b46
+The @option{--unicode=locale} option displays them in the current
2b5b46
+locale, which may or may not support UTF-8 encoding.  The
2b5b46
+@option{--unicode=hex} option displays them as hex byte sequences
2b5b46
+enclosed between @var{<>} characters.  The @option{--unicode=escape}
2b5b46
+option displays them as escape sequences (@var{\uxxxx}) and the
2b5b46
+@option{--unicode=highlight} option displays them as escape sequences
2b5b46
+highlighted in red (if supported by the output device).  The colouring
2b5b46
+is intended to draw attention to the presence of unicode sequences
2b5b46
+where they might not be expected.
2b5b46
+
2b5b46
 @item -T @var{bfdname}
2b5b46
 @itemx --target=@var{bfdname}
2b5b46
 @cindex object code format
2b5b46
@@ -4766,6 +4821,7 @@ readelf [@option{-a}|@option{--all}]
2b5b46
         [@option{-W}|@option{--wide}]
2b5b46
         [@option{-T}|@option{--silent-truncation}]
2b5b46
         [@option{-H}|@option{--help}]
2b5b46
+        [@option{-U} @var{method}|@option{--unicode=}@var{method}]
2b5b46
         @var{elffile}@dots{}
2b5b46
 @c man end
2b5b46
 @end smallexample
2b5b46
@@ -4887,6 +4943,28 @@ necessary in order to demangle truly com
2b5b46
 that if the recursion limit is disabled then stack exhaustion is
2b5b46
 possible and any bug reports about such an event will be rejected.
2b5b46
 
2b5b46
+@item -U @var{[d|i|l|e|x|h]}
2b5b46
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
2b5b46
+Controls the display of non-ASCII characters in identifier names.
2b5b46
+The default (@option{--unicode=locale} or @option{--unicode=default}) is
2b5b46
+to treat them as multibyte characters and display them in the current
2b5b46
+locale.  All other versions of this option treat the bytes as UTF-8
2b5b46
+encoded values and attempt to interpret them.  If they cannot be
2b5b46
+interpreted or if the @option{--unicode=invalid} option is used then
2b5b46
+they are displayed as a sequence of hex bytes, enclosed in curly
2b5b46
+brace characters.
2b5b46
+
2b5b46
+Using the @option{--unicode=escape} option will display the characters
2b5b46
+as unicode escape sequences (@var{\uxxxx}).  Using the
2b5b46
+@option{--unicode=hex} will display the characters as hex byte
2b5b46
+sequences enclosed between angle brackets.
2b5b46
+
2b5b46
+Using the @option{--unicode=highlight} will display the characters as 
2b5b46
+unicode escape sequences but it will also highlight them in red,
2b5b46
+assuming that colouring is supported by the output device.  The
2b5b46
+colouring is intended to draw attention to the presence of unicode
2b5b46
+sequences when they might not be expected.
2b5b46
+
2b5b46
 @item -e
2b5b46
 @itemx --headers
2b5b46
 Display all the headers in the file.  Equivalent to @option{-h -l -S}.
2b5b46
Only in binutils-2.36.1/binutils/doc: binutils.texi.orig
2b5b46
diff -rup binutils.orig/binutils/nm.c binutils-2.36.1/binutils/nm.c
2b5b46
--- binutils.orig/binutils/nm.c	2021-10-21 16:56:20.318761391 +0100
2b5b46
+++ binutils-2.36.1/binutils/nm.c	2021-10-21 16:59:56.105261602 +0100
2b5b46
@@ -38,6 +38,11 @@
2b5b46
 #include "bucomm.h"
2b5b46
 #include "plugin-api.h"
2b5b46
 #include "plugin.h"
2b5b46
+#include "safe-ctype.h"
2b5b46
+
2b5b46
+#ifndef streq
2b5b46
+#define streq(a,b) (strcmp ((a),(b)) == 0)
2b5b46
+#endif
2b5b46
 
2b5b46
 /* When sorting by size, we use this structure to hold the size and a
2b5b46
    pointer to the minisymbol.  */
2b5b46
@@ -192,6 +197,18 @@ static const char *plugin_target = NULL;
2b5b46
 static bfd *lineno_cache_bfd;
2b5b46
 static bfd *lineno_cache_rel_bfd;
2b5b46
 
2b5b46
+typedef enum unicode_display_type
2b5b46
+{
2b5b46
+  unicode_default = 0,
2b5b46
+  unicode_locale,
2b5b46
+  unicode_escape,
2b5b46
+  unicode_hex,
2b5b46
+  unicode_highlight,
2b5b46
+  unicode_invalid
2b5b46
+} unicode_display_type;
2b5b46
+
2b5b46
+static unicode_display_type unicode_display = unicode_default;
2b5b46
+
2b5b46
 enum long_option_values
2b5b46
 {
2b5b46
   OPTION_TARGET = 200,
2b5b46
@@ -234,6 +251,7 @@ static struct option long_options[] =
2b5b46
   {"target", required_argument, 0, OPTION_TARGET},
2b5b46
   {"defined-only", no_argument, &defined_only, 1},
2b5b46
   {"undefined-only", no_argument, &undefined_only, 1},
2b5b46
+  {"unicode", required_argument, NULL, 'U'},
2b5b46
   {"version", no_argument, &show_version, 1},
2b5b46
   {"with-symbol-versions", no_argument, NULL,
2b5b46
    OPTION_WITH_SYMBOL_VERSIONS},
2b5b46
@@ -285,6 +303,8 @@ usage (FILE *stream, int status)
2b5b46
   -t, --radix=RADIX      Use RADIX for printing symbol values\n\
2b5b46
       --target=BFDNAME   Specify the target object format as BFDNAME\n\
2b5b46
   -u, --undefined-only   Display only undefined symbols\n\
2b5b46
+  -U {d|l|i|x|e|h}       Specify how to treat UTF-8 encoded unicode characters\n\
2b5b46
+      --unicode={default|locale|invalid|hex|escape|highlight}\n\
2b5b46
       --with-symbol-versions  Display version strings after symbol names\n\
2b5b46
   -X 32_64               (ignored)\n\
2b5b46
   @FILE                  Read options from FILE\n\
2b5b46
@@ -400,6 +420,189 @@ get_coff_symbol_type (const struct inter
2b5b46
   return bufp;
2b5b46
 }
2b5b46
 
2b5b46
+/* Convert a potential UTF-8 encoded sequence in IN into characters in OUT.
2b5b46
+   The conversion format is controlled by the unicode_display variable.
2b5b46
+   Returns the number of characters added to OUT.
2b5b46
+   Returns the number of bytes consumed from IN in CONSUMED.
2b5b46
+   Always consumes at least one byte and displays at least one character.  */
2b5b46
+   
2b5b46
+static unsigned int
2b5b46
+display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
2b5b46
+{
2b5b46
+  char *        orig_out = out;
2b5b46
+  unsigned int  nchars = 0;
2b5b46
+
2b5b46
+  if (unicode_display == unicode_default)
2b5b46
+    goto invalid;
2b5b46
+
2b5b46
+  if (in[0] < 0xc0)
2b5b46
+    goto invalid;
2b5b46
+
2b5b46
+  if ((in[1] & 0xc0) != 0x80)
2b5b46
+    goto invalid;
2b5b46
+
2b5b46
+  if ((in[0] & 0x20) == 0)
2b5b46
+    {
2b5b46
+      nchars = 2;
2b5b46
+      goto valid;
2b5b46
+    }
2b5b46
+
2b5b46
+  if ((in[2] & 0xc0) != 0x80)
2b5b46
+    goto invalid;
2b5b46
+
2b5b46
+  if ((in[0] & 0x10) == 0)
2b5b46
+    {
2b5b46
+      nchars = 3;
2b5b46
+      goto valid;
2b5b46
+    }
2b5b46
+
2b5b46
+  if ((in[3] & 0xc0) != 0x80)
2b5b46
+    goto invalid;
2b5b46
+
2b5b46
+  nchars = 4;
2b5b46
+
2b5b46
+ valid:
2b5b46
+  switch (unicode_display)
2b5b46
+    {
2b5b46
+    case unicode_locale:
2b5b46
+      /* Copy the bytes into the output buffer as is.  */
2b5b46
+      memcpy (out, in, nchars);
2b5b46
+      out += nchars;
2b5b46
+      break;
2b5b46
+
2b5b46
+    case unicode_invalid:
2b5b46
+    case unicode_hex:
2b5b46
+      {
2b5b46
+      unsigned int j;
2b5b46
+
2b5b46
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '<' : '{');
2b5b46
+      for (j = 0; j < nchars; j++)
2b5b46
+	out += sprintf (out, "%02x", in [j]);
2b5b46
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '>' : '}');
2b5b46
+      }
2b5b46
+      break;
2b5b46
+      
2b5b46
+    case unicode_highlight:
2b5b46
+      if (isatty (1))
2b5b46
+	out += sprintf (out, "\x1B[31;47m"); /* Red.  */
2b5b46
+      /* Fall through.  */
2b5b46
+    case unicode_escape:
2b5b46
+      switch (nchars)
2b5b46
+	{
2b5b46
+	case 2:
2b5b46
+	  out += sprintf (out, "\\u%02x%02x",
2b5b46
+		  ((in[0] & 0x1c) >> 2), 
2b5b46
+		  ((in[0] & 0x03) << 6) | (in[1] & 0x3f));
2b5b46
+	  break;
2b5b46
+
2b5b46
+	case 3:
2b5b46
+	  out += sprintf (out, "\\u%02x%02x",
2b5b46
+		  ((in[0] & 0x0f) << 4) | ((in[1] & 0x3c) >> 2),
2b5b46
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3f)));
2b5b46
+	  break;
2b5b46
+
2b5b46
+	case 4:
2b5b46
+	  out += sprintf (out, "\\u%02x%02x%02x",
2b5b46
+		  ((in[0] & 0x07) << 6) | ((in[1] & 0x3c) >> 2),
2b5b46
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3c) >> 2),
2b5b46
+		  ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
2b5b46
+	  break;
2b5b46
+	default:
2b5b46
+	  /* URG.  */
2b5b46
+	  break;
2b5b46
+	}
2b5b46
+
2b5b46
+      if (unicode_display == unicode_highlight && isatty (1))
2b5b46
+	out += sprintf (out, "\033[0m"); /* Default colour.  */
2b5b46
+      break;
2b5b46
+
2b5b46
+    default:
2b5b46
+      /* URG */
2b5b46
+      break;
2b5b46
+    }
2b5b46
+
2b5b46
+  * consumed = nchars;
2b5b46
+  return out - orig_out;
2b5b46
+
2b5b46
+ invalid:
2b5b46
+  /* Not a valid UTF-8 sequence.  */
2b5b46
+  *out = *in;
2b5b46
+  * consumed = 1;
2b5b46
+  return 1;
2b5b46
+}
2b5b46
+
2b5b46
+/* Convert any UTF-8 encoded characters in NAME into the form specified by
2b5b46
+   unicode_display.  Also converts control characters.  Returns a static
2b5b46
+   buffer if conversion was necessary.
2b5b46
+   Code stolen from objdump.c:sanitize_string().  */
2b5b46
+
2b5b46
+static const char *
2b5b46
+convert_utf8 (const char * in)
2b5b46
+{
2b5b46
+  static char *  buffer = NULL;
2b5b46
+  static size_t  buffer_len = 0;
2b5b46
+  const char *   original = in;
2b5b46
+  char *         out;
2b5b46
+
2b5b46
+  /* Paranoia.  */
2b5b46
+  if (in == NULL)
2b5b46
+    return "";
2b5b46
+
2b5b46
+  /* See if any conversion is necessary.
2b5b46
+     In the majority of cases it will not be needed.  */
2b5b46
+  do
2b5b46
+    {
2b5b46
+      unsigned char c = *in++;
2b5b46
+
2b5b46
+      if (c == 0)
2b5b46
+	return original;
2b5b46
+
2b5b46
+      if (ISCNTRL (c))
2b5b46
+	break;
2b5b46
+
2b5b46
+      if (unicode_display != unicode_default && c >= 0xc0)
2b5b46
+	break;
2b5b46
+    }
2b5b46
+  while (1);
2b5b46
+
2b5b46
+  /* Copy the input, translating as needed.  */
2b5b46
+  in = original;
2b5b46
+  if (buffer_len < (strlen (in) * 9))
2b5b46
+    {
2b5b46
+      free ((void *) buffer);
2b5b46
+      buffer_len = strlen (in) * 9;
2b5b46
+      buffer = xmalloc (buffer_len + 1);
2b5b46
+    }
2b5b46
+
2b5b46
+  out = buffer;
2b5b46
+  do
2b5b46
+    {
2b5b46
+      unsigned char c = *in++;
2b5b46
+
2b5b46
+      if (c == 0)
2b5b46
+	break;
2b5b46
+
2b5b46
+      if (ISCNTRL (c))
2b5b46
+	{
2b5b46
+	  *out++ = '^';
2b5b46
+	  *out++ = c + 0x40;
2b5b46
+	}
2b5b46
+      else if (unicode_display != unicode_default && c >= 0xc0)
2b5b46
+	{
2b5b46
+	  unsigned int num_consumed;
2b5b46
+
2b5b46
+	  out += display_utf8 ((const unsigned char *)(in - 1), out, & num_consumed);
2b5b46
+	  in += num_consumed - 1;
2b5b46
+	}
2b5b46
+      else
2b5b46
+	*out++ = c;
2b5b46
+    }
2b5b46
+  while (1);
2b5b46
+
2b5b46
+  *out = 0;
2b5b46
+  return buffer;
2b5b46
+}
2b5b46
+
2b5b46
 /* Print symbol name NAME, read from ABFD, with printf format FORM,
2b5b46
    demangling it if requested.  */
2b5b46
 
2b5b46
@@ -418,6 +621,9 @@ print_symname (const char *form, struct
2b5b46
 	name = alloc;
2b5b46
     }
2b5b46
 
2b5b46
+  if (unicode_display != unicode_default)
2b5b46
+    name = convert_utf8 (name);
2b5b46
+
2b5b46
   if (info != NULL && info->elfinfo)
2b5b46
     {
2b5b46
       const char *version_string;
2b5b46
@@ -1738,7 +1944,7 @@ main (int argc, char **argv)
2b5b46
     fatal (_("fatal error: libbfd ABI mismatch"));
2b5b46
   set_default_bfd_target ();
2b5b46
 
2b5b46
-  while ((c = getopt_long (argc, argv, "aABCDef:gHhlnopPrSst:uvVvX:",
2b5b46
+  while ((c = getopt_long (argc, argv, "aABCDef:gHhlnopPrSst:uU:vVvX:",
2b5b46
 			   long_options, (int *) 0)) != EOF)
2b5b46
     {
2b5b46
       switch (c)
2b5b46
@@ -1828,6 +2034,24 @@ main (int argc, char **argv)
2b5b46
 	case 'u':
2b5b46
 	  undefined_only = 1;
2b5b46
 	  break;
2b5b46
+
2b5b46
+	case 'U':
2b5b46
+	  if (streq (optarg, "default") || streq (optarg, "d"))
2b5b46
+	    unicode_display = unicode_default;
2b5b46
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
2b5b46
+	    unicode_display = unicode_locale;
2b5b46
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
2b5b46
+	    unicode_display = unicode_escape;
2b5b46
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
2b5b46
+	    unicode_display = unicode_invalid;
2b5b46
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
2b5b46
+	    unicode_display = unicode_hex;
2b5b46
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
2b5b46
+	    unicode_display = unicode_highlight;
2b5b46
+	  else
2b5b46
+	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
2b5b46
+	  break;
2b5b46
+
2b5b46
 	case 'V':
2b5b46
 	  show_version = 1;
2b5b46
 	  break;
2b5b46
Only in binutils-2.36.1/binutils/: nm.c.orig
2b5b46
Only in binutils-2.36.1/binutils/: nm.c.rej
2b5b46
diff -rup binutils.orig/binutils/objdump.c binutils-2.36.1/binutils/objdump.c
2b5b46
--- binutils.orig/binutils/objdump.c	2021-10-21 16:56:20.320761377 +0100
2b5b46
+++ binutils-2.36.1/binutils/objdump.c	2021-10-21 16:56:29.695696218 +0100
2b5b46
@@ -205,6 +205,18 @@ static const struct objdump_private_desc
2b5b46
 
2b5b46
 /* The list of detected jumps inside a function.  */
2b5b46
 static struct jump_info *detected_jumps = NULL;
2b5b46
+
2b5b46
+typedef enum unicode_display_type
2b5b46
+{
2b5b46
+  unicode_default = 0,
2b5b46
+  unicode_locale,
2b5b46
+  unicode_escape,
2b5b46
+  unicode_hex,
2b5b46
+  unicode_highlight,
2b5b46
+  unicode_invalid
2b5b46
+} unicode_display_type;
2b5b46
+
2b5b46
+static unicode_display_type unicode_display = unicode_default;
2b5b46
 
2b5b46
 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
2b5b46
 static void
2b5b46
@@ -247,6 +259,9 @@ usage (FILE *stream, int status)
2b5b46
   -r, --reloc              Display the relocation entries in the file\n\
2b5b46
   -R, --dynamic-reloc      Display the dynamic relocation entries in the file\n\
2b5b46
   @<file>                  Read options from <file>\n\
2b5b46
+  -U[d|l|i|x|e|h]          Controls the display of UTF-8 unicode characters\n\
2b5b46
+  --unicode=[default|locale|invalid|hex|escape|highlight]\n"));
2b5b46
+      fprintf (stream, _("\
2b5b46
   -v, --version            Display this program's version number\n\
2b5b46
   -i, --info               List object formats and architectures supported\n\
2b5b46
   -H, --help               Display this information\n\
2b5b46
@@ -395,6 +410,7 @@ static struct option long_options[]=
2b5b46
   {"stop-address", required_argument, NULL, OPTION_STOP_ADDRESS},
2b5b46
   {"syms", no_argument, NULL, 't'},
2b5b46
   {"target", required_argument, NULL, 'b'},
2b5b46
+  {"unicode", required_argument, NULL, 'U'},
2b5b46
   {"version", no_argument, NULL, 'V'},
2b5b46
   {"wide", no_argument, NULL, 'w'},
2b5b46
   {"prefix", required_argument, NULL, OPTION_PREFIX},
2b5b46
@@ -414,10 +430,124 @@ nonfatal (const char *msg)
2b5b46
   bfd_nonfatal (msg);
2b5b46
   exit_status = 1;
2b5b46
 }
2b5b46
+
2b5b46
+/* Convert a potential UTF-8 encoded sequence in IN into characters in OUT.
2b5b46
+   The conversion format is controlled by the unicode_display variable.
2b5b46
+   Returns the number of characters added to OUT.
2b5b46
+   Returns the number of bytes consumed from IN in CONSUMED.
2b5b46
+   Always consumes at least one byte and displays at least one character.  */
2b5b46
+   
2b5b46
+static unsigned int
2b5b46
+display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
2b5b46
+{
2b5b46
+  char *        orig_out = out;
2b5b46
+  unsigned int  nchars = 0;
2b5b46
+
2b5b46
+  if (unicode_display == unicode_default)
2b5b46
+    goto invalid;
2b5b46
+
2b5b46
+  if (in[0] < 0xc0)
2b5b46
+    goto invalid;
2b5b46
+
2b5b46
+  if ((in[1] & 0xc0) != 0x80)
2b5b46
+    goto invalid;
2b5b46
+
2b5b46
+  if ((in[0] & 0x20) == 0)
2b5b46
+    {
2b5b46
+      nchars = 2;
2b5b46
+      goto valid;
2b5b46
+    }
2b5b46
+
2b5b46
+  if ((in[2] & 0xc0) != 0x80)
2b5b46
+    goto invalid;
2b5b46
+
2b5b46
+  if ((in[0] & 0x10) == 0)
2b5b46
+    {
2b5b46
+      nchars = 3;
2b5b46
+      goto valid;
2b5b46
+    }
2b5b46
+
2b5b46
+  if ((in[3] & 0xc0) != 0x80)
2b5b46
+    goto invalid;
2b5b46
+
2b5b46
+  nchars = 4;
2b5b46
+
2b5b46
+ valid:
2b5b46
+  switch (unicode_display)
2b5b46
+    {
2b5b46
+    case unicode_locale:
2b5b46
+      /* Copy the bytes into the output buffer as is.  */
2b5b46
+      memcpy (out, in, nchars);
2b5b46
+      out += nchars;
2b5b46
+      break;
2b5b46
+
2b5b46
+    case unicode_invalid:
2b5b46
+    case unicode_hex:
2b5b46
+      {
2b5b46
+      unsigned int j;
2b5b46
+
2b5b46
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '<' : '{');
2b5b46
+      for (j = 0; j < nchars; j++)
2b5b46
+	out += sprintf (out, "%02x", in [j]);
2b5b46
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '>' : '}');
2b5b46
+      }
2b5b46
+      break;
2b5b46
+      
2b5b46
+    case unicode_highlight:
2b5b46
+      if (isatty (1))
2b5b46
+	out += sprintf (out, "\x1B[31;47m"); /* Red.  */
2b5b46
+      /* Fall through.  */
2b5b46
+    case unicode_escape:
2b5b46
+      switch (nchars)
2b5b46
+	{
2b5b46
+	case 2:
2b5b46
+	  out += sprintf (out, "\\u%02x%02x",
2b5b46
+		  ((in[0] & 0x1c) >> 2), 
2b5b46
+		  ((in[0] & 0x03) << 6) | (in[1] & 0x3f));
2b5b46
+	  break;
2b5b46
+
2b5b46
+	case 3:
2b5b46
+	  out += sprintf (out, "\\u%02x%02x",
2b5b46
+		  ((in[0] & 0x0f) << 4) | ((in[1] & 0x3c) >> 2),
2b5b46
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3f)));
2b5b46
+	  break;
2b5b46
+
2b5b46
+	case 4:
2b5b46
+	  out += sprintf (out, "\\u%02x%02x%02x",
2b5b46
+		  ((in[0] & 0x07) << 6) | ((in[1] & 0x3c) >> 2),
2b5b46
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3c) >> 2),
2b5b46
+		  ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
2b5b46
+	  break;
2b5b46
+	default:
2b5b46
+	  /* URG.  */
2b5b46
+	  break;
2b5b46
+	}
2b5b46
+
2b5b46
+      if (unicode_display == unicode_highlight && isatty (1))
2b5b46
+	out += sprintf (out, "\033[0m"); /* Default colour.  */
2b5b46
+      break;
2b5b46
+
2b5b46
+    default:
2b5b46
+      /* URG */
2b5b46
+      break;
2b5b46
+    }
2b5b46
+
2b5b46
+  * consumed = nchars;
2b5b46
+  return out - orig_out;
2b5b46
+
2b5b46
+ invalid:
2b5b46
+  /* Not a valid UTF-8 sequence.  */
2b5b46
+  *out = *in;
2b5b46
+  * consumed = 1;
2b5b46
+  return 1;
2b5b46
+}
2b5b46
 
2b5b46
 /* Returns a version of IN with any control characters
2b5b46
    replaced by escape sequences.  Uses a static buffer
2b5b46
-   if necessary.  */
2b5b46
+   if necessary.
2b5b46
+
2b5b46
+   If unicode display is enabled, then also handles the
2b5b46
+   conversion of unicode characters.  */
2b5b46
 
2b5b46
 static const char *
2b5b46
 sanitize_string (const char * in)
2b5b46
@@ -435,40 +565,50 @@ sanitize_string (const char * in)
2b5b46
      of cases it will not be needed.  */
2b5b46
   do
2b5b46
     {
2b5b46
-      char c = *in++;
2b5b46
+      unsigned char c = *in++;
2b5b46
 
2b5b46
       if (c == 0)
2b5b46
 	return original;
2b5b46
 
2b5b46
       if (ISCNTRL (c))
2b5b46
 	break;
2b5b46
+
2b5b46
+      if (unicode_display != unicode_default && c >= 0xc0)
2b5b46
+	break;
2b5b46
     }
2b5b46
   while (1);
2b5b46
 
2b5b46
   /* Copy the input, translating as needed.  */
2b5b46
   in = original;
2b5b46
-  if (buffer_len < (strlen (in) * 2))
2b5b46
+  if (buffer_len < (strlen (in) * 9))
2b5b46
     {
2b5b46
       free ((void *) buffer);
2b5b46
-      buffer_len = strlen (in) * 2;
2b5b46
+      buffer_len = strlen (in) * 9;
2b5b46
       buffer = xmalloc (buffer_len + 1);
2b5b46
     }
2b5b46
 
2b5b46
   out = buffer;
2b5b46
   do
2b5b46
     {
2b5b46
-      char c = *in++;
2b5b46
+      unsigned char c = *in++;
2b5b46
 
2b5b46
       if (c == 0)
2b5b46
 	break;
2b5b46
 
2b5b46
-      if (!ISCNTRL (c))
2b5b46
-	*out++ = c;
2b5b46
-      else
2b5b46
+      if (ISCNTRL (c))
2b5b46
 	{
2b5b46
 	  *out++ = '^';
2b5b46
 	  *out++ = c + 0x40;
2b5b46
 	}
2b5b46
+      else if (unicode_display != unicode_default && c >= 0xc0)
2b5b46
+	{
2b5b46
+	  unsigned int num_consumed;
2b5b46
+
2b5b46
+	  out += display_utf8 ((const unsigned char *)(in - 1), out, & num_consumed);
2b5b46
+	  in += num_consumed - 1;
2b5b46
+	}
2b5b46
+      else
2b5b46
+	*out++ = c;
2b5b46
     }
2b5b46
   while (1);
2b5b46
 
2b5b46
@@ -476,7 +616,6 @@ sanitize_string (const char * in)
2b5b46
   return buffer;
2b5b46
 }
2b5b46
 
2b5b46
-
2b5b46
 /* Returns TRUE if the specified section should be dumped.  */
2b5b46
 
2b5b46
 static bfd_boolean
2b5b46
@@ -1055,6 +1194,8 @@ objdump_print_symname (bfd *abfd, struct
2b5b46
 
2b5b46
   name = sanitize_string (name);
2b5b46
 
2b5b46
+  name = sanitize_string (name);
2b5b46
+
2b5b46
   if (inf != NULL)
2b5b46
     {
2b5b46
       (*inf->fprintf_func) (inf->stream, "%s", name);
2b5b46
@@ -3136,7 +3277,7 @@ disassemble_section (bfd *abfd, asection
2b5b46
   if (!bfd_malloc_and_get_section (abfd, section, &data))
2b5b46
     {
2b5b46
       non_fatal (_("Reading section %s failed because: %s"),
2b5b46
-		 section->name, bfd_errmsg (bfd_get_error ()));
2b5b46
+		 sanitize_string (section->name), bfd_errmsg (bfd_get_error ()));
2b5b46
       return;
2b5b46
     }
2b5b46
 
2b5b46
@@ -4341,7 +4482,7 @@ dump_section (bfd *abfd, asection *secti
2b5b46
   if (!bfd_get_full_section_contents (abfd, section, &data))
2b5b46
     {
2b5b46
       non_fatal (_("Reading section %s failed because: %s"),
2b5b46
-		 section->name, bfd_errmsg (bfd_get_error ()));
2b5b46
+		 sanitize_string (section->name), bfd_errmsg (bfd_get_error ()));
2b5b46
       return;
2b5b46
     }
2b5b46
 
2b5b46
@@ -4481,6 +4622,24 @@ dump_symbols (bfd *abfd ATTRIBUTE_UNUSED
2b5b46
 		  free (alloc);
2b5b46
 		}
2b5b46
 	    }
2b5b46
+	  else if (unicode_display != unicode_default
2b5b46
+		   && name != NULL && *name != '\0')
2b5b46
+	    {
2b5b46
+	      const char * sanitized_name;
2b5b46
+
2b5b46
+	      /* If we want to sanitize the name, we do it here, and
2b5b46
+		 temporarily clobber it while calling bfd_print_symbol.
2b5b46
+		 FIXME: This is a gross hack.  */
2b5b46
+	      sanitized_name = sanitize_string (name);
2b5b46
+	      if (sanitized_name != name)
2b5b46
+		(*current)->name = sanitized_name;
2b5b46
+	      else
2b5b46
+		sanitized_name = NULL;
2b5b46
+	      bfd_print_symbol (cur_bfd, stdout, *current,
2b5b46
+				bfd_print_symbol_all);
2b5b46
+	      if (sanitized_name != NULL)
2b5b46
+		(*current)->name = name;
2b5b46
+	    }
2b5b46
 	  else
2b5b46
 	    bfd_print_symbol (cur_bfd, stdout, *current,
2b5b46
 			      bfd_print_symbol_all);
2b5b46
@@ -5162,7 +5321,7 @@ main (int argc, char **argv)
2b5b46
   set_default_bfd_target ();
2b5b46
 
2b5b46
   while ((c = getopt_long (argc, argv,
2b5b46
-			   "pP:ib:m:M:VvCdDlfFaHhrRtTxsSI:j:wE:zgeGW::",
2b5b46
+			   "pP:ib:m:M:VvCdDlfFaHhrRtTxsSI:j:wE:zgeGW::U:",
2b5b46
 			   long_options, (int *) 0))
2b5b46
 	 != EOF)
2b5b46
     {
2b5b46
@@ -5441,6 +5600,23 @@ main (int argc, char **argv)
2b5b46
 	  seenflag = TRUE;
2b5b46
 	  break;
2b5b46
 
2b5b46
+	case 'U':
2b5b46
+	  if (streq (optarg, "default") || streq (optarg, "d"))
2b5b46
+	    unicode_display = unicode_default;
2b5b46
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
2b5b46
+	    unicode_display = unicode_locale;
2b5b46
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
2b5b46
+	    unicode_display = unicode_escape;
2b5b46
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
2b5b46
+	    unicode_display = unicode_invalid;
2b5b46
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
2b5b46
+	    unicode_display = unicode_hex;
2b5b46
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
2b5b46
+	    unicode_display = unicode_highlight;
2b5b46
+	  else
2b5b46
+	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
2b5b46
+	  break;
2b5b46
+
2b5b46
 	case 'H':
2b5b46
 	  usage (stdout, 0);
2b5b46
 	  /* No need to set seenflag or to break - usage() does not return.  */
2b5b46
Only in binutils-2.36.1/binutils/: objdump.c.orig
2b5b46
diff -rup binutils.orig/binutils/readelf.c binutils-2.36.1/binutils/readelf.c
2b5b46
--- binutils.orig/binutils/readelf.c	2021-10-21 16:56:20.323761356 +0100
2b5b46
+++ binutils-2.36.1/binutils/readelf.c	2021-10-21 17:00:54.169858044 +0100
2b5b46
@@ -321,6 +321,18 @@ typedef enum print_mode
2b5b46
 }
2b5b46
 print_mode;
2b5b46
 
2b5b46
+typedef enum unicode_display_type
2b5b46
+{
2b5b46
+  unicode_locale,
2b5b46
+  unicode_escape,
2b5b46
+  unicode_hex,
2b5b46
+  unicode_highlight,
2b5b46
+  unicode_invalid
2b5b46
+} unicode_display_type;
2b5b46
+
2b5b46
+static unicode_display_type unicode_display = unicode_locale;
2b5b46
+
2b5b46
+  
2b5b46
 /* Versioned symbol info.  */
2b5b46
 enum versioned_symbol_info
2b5b46
 {
2b5b46
@@ -613,11 +625,18 @@ print_symbol (signed int width, const ch
2b5b46
       if (c == 0)
2b5b46
 	break;
2b5b46
 
2b5b46
-      /* Do not print control characters directly as they can affect terminal
2b5b46
-	 settings.  Such characters usually appear in the names generated
2b5b46
-	 by the assembler for local labels.  */
2b5b46
-      if (ISCNTRL (c))
2b5b46
+      if (ISPRINT (c))
2b5b46
+	{
2b5b46
+	  putchar (c);
2b5b46
+	  width_remaining --;
2b5b46
+	  num_printed ++;
2b5b46
+	}
2b5b46
+      else if (ISCNTRL (c))
2b5b46
 	{
2b5b46
+	  /* Do not print control characters directly as they can affect terminal
2b5b46
+	     settings.  Such characters usually appear in the names generated
2b5b46
+	     by the assembler for local labels.  */
2b5b46
+
2b5b46
 	  if (width_remaining < 2)
2b5b46
 	    break;
2b5b46
 
2b5b46
@@ -625,11 +644,135 @@ print_symbol (signed int width, const ch
2b5b46
 	  width_remaining -= 2;
2b5b46
 	  num_printed += 2;
2b5b46
 	}
2b5b46
-      else if (ISPRINT (c))
2b5b46
+      else if (c == 0x7f)
2b5b46
 	{
2b5b46
-	  putchar (c);
2b5b46
-	  width_remaining --;
2b5b46
-	  num_printed ++;
2b5b46
+	  if (width_remaining < 5)
2b5b46
+	    break;
2b5b46
+	  printf ("<DEL>");
2b5b46
+	  width_remaining -= 5;
2b5b46
+	  num_printed += 5;
2b5b46
+	}
2b5b46
+      else if (unicode_display != unicode_locale)
2b5b46
+	{
2b5b46
+	  /* Display unicode characters as something else.  */
2b5b46
+	  unsigned char bytes[4];
2b5b46
+	  bfd_boolean   is_utf8;
2b5b46
+	  unsigned int  nbytes;
2b5b46
+
2b5b46
+	  bytes[0] = c;
2b5b46
+
2b5b46
+	  if (bytes[0] < 0xc0)
2b5b46
+	    {
2b5b46
+	      nbytes = 1;
2b5b46
+	      is_utf8 = FALSE;
2b5b46
+	    }
2b5b46
+	  else
2b5b46
+	    {
2b5b46
+	      bytes[1] = *symbol++;
2b5b46
+
2b5b46
+	      if ((bytes[1] & 0xc0) != 0x80)
2b5b46
+		{
2b5b46
+		  is_utf8 = FALSE;
2b5b46
+		  /* Do not consume this character.  It may only
2b5b46
+		     be the first byte in the sequence that was
2b5b46
+		     corrupt.  */
2b5b46
+		  --symbol;
2b5b46
+		  nbytes = 1;
2b5b46
+		}
2b5b46
+	      else if ((bytes[0] & 0x20) == 0)
2b5b46
+		{
2b5b46
+		  is_utf8 = TRUE;
2b5b46
+		  nbytes = 2;
2b5b46
+		}
2b5b46
+	      else
2b5b46
+		{
2b5b46
+		  bytes[2] = *symbol++;
2b5b46
+
2b5b46
+		  if ((bytes[2] & 0xc0) != 0x80)
2b5b46
+		    {
2b5b46
+		      is_utf8 = FALSE;
2b5b46
+		      symbol -= 2;
2b5b46
+		      nbytes = 1;
2b5b46
+		    }
2b5b46
+		  else if ((bytes[0] & 0x10) == 0)
2b5b46
+		    {
2b5b46
+		      is_utf8 = TRUE;
2b5b46
+		      nbytes = 3;
2b5b46
+		    }
2b5b46
+		  else
2b5b46
+		    {
2b5b46
+		      bytes[3] = *symbol++;
2b5b46
+
2b5b46
+		      nbytes = 4;
2b5b46
+
2b5b46
+		      if ((bytes[3] & 0xc0) != 0x80)
2b5b46
+			{
2b5b46
+			  is_utf8 = FALSE;
2b5b46
+			  symbol -= 3;
2b5b46
+			  nbytes = 1;
2b5b46
+			}
2b5b46
+		      else
2b5b46
+			is_utf8 = TRUE;
2b5b46
+		    }
2b5b46
+		}
2b5b46
+	    }
2b5b46
+
2b5b46
+	  if (unicode_display == unicode_invalid)
2b5b46
+	    is_utf8 = FALSE;
2b5b46
+
2b5b46
+	  if (unicode_display == unicode_hex || ! is_utf8)
2b5b46
+	    {
2b5b46
+	      unsigned int i;
2b5b46
+
2b5b46
+	      if (width_remaining < (nbytes * 2) + 2)
2b5b46
+		break;
2b5b46
+	  
2b5b46
+	      putchar (is_utf8 ? '<' : '{');
2b5b46
+	      for (i = 0; i < nbytes; i++)
2b5b46
+		printf ("%02x", bytes[i]);
2b5b46
+	      putchar (is_utf8 ? '>' : '}');
2b5b46
+	    }
2b5b46
+	  else
2b5b46
+	    {
2b5b46
+	      if (unicode_display == unicode_highlight && isatty (1))
2b5b46
+		printf ("\x1B[31;47m"); /* Red.  */
2b5b46
+	      
2b5b46
+	      switch (nbytes)
2b5b46
+		{
2b5b46
+		case 2:
2b5b46
+		  if (width_remaining < 6)
2b5b46
+		    break;
2b5b46
+		  printf ("\\u%02x%02x",
2b5b46
+			  (bytes[0] & 0x1c) >> 2, 
2b5b46
+			  ((bytes[0] & 0x03) << 6) | (bytes[1] & 0x3f));
2b5b46
+		  break;
2b5b46
+		case 3:
2b5b46
+		  if (width_remaining < 6)
2b5b46
+		    break;
2b5b46
+		  printf ("\\u%02x%02x",
2b5b46
+			  ((bytes[0] & 0x0f) << 4) | ((bytes[1] & 0x3c) >> 2),
2b5b46
+			  ((bytes[1] & 0x03) << 6) | (bytes[2] & 0x3f));
2b5b46
+		  break;
2b5b46
+		case 4:
2b5b46
+		  if (width_remaining < 8)
2b5b46
+		    break;
2b5b46
+		  printf ("\\u%02x%02x%02x",
2b5b46
+			  ((bytes[0] & 0x07) << 6) | ((bytes[1] & 0x3c) >> 2),
2b5b46
+			  ((bytes[1] & 0x03) << 6) | ((bytes[2] & 0x3c) >> 2),
2b5b46
+			  ((bytes[2] & 0x03) << 6) | (bytes[3] & 0x3f));
2b5b46
+		  
2b5b46
+		  break;
2b5b46
+		default:
2b5b46
+		  /* URG.  */
2b5b46
+		  break;
2b5b46
+		}
2b5b46
+
2b5b46
+	      if (unicode_display == unicode_highlight && isatty (1))
2b5b46
+		printf ("\033[0m"); /* Default colour.  */
2b5b46
+	    }
2b5b46
+	  
2b5b46
+	  if (bytes[nbytes - 1] == 0)
2b5b46
+	    break;
2b5b46
 	}
2b5b46
       else
2b5b46
 	{
2b5b46
@@ -4555,6 +4698,7 @@ static struct option options[] =
2b5b46
   {"syms",	       no_argument, 0, 's'},
2b5b46
   {"silent-truncation",no_argument, 0, 'T'},
2b5b46
   {"section-details",  no_argument, 0, 't'},
2b5b46
+  {"unicode",          required_argument, 0, 'U'},
2b5b46
   {"unwind",	       no_argument, 0, 'u'},
2b5b46
   {"version-info",     no_argument, 0, 'V'},
2b5b46
   {"version",	       no_argument, 0, 'v'},
2b5b46
@@ -4652,6 +4796,11 @@ usage (FILE * stream)
2b5b46
 #endif
2b5b46
   fprintf (stream, _("\
2b5b46
   -I --histogram         Display histogram of bucket list lengths\n\
2b5b46
+  -U --unicode=[locale|escape|hex|highlight|invalid]\n\
2b5b46
+                         Display unicode characters as determined by the current locale\n\
2b5b46
+                          (default), escape sequences, \"<hex sequences>\", highlighted\n\
2b5b46
+                          escape sequences, or treat them as invalid and display as\n\
2b5b46
+                          \"{hex sequences}\"\n\
2b5b46
   -W --wide              Allow output width to exceed 80 characters\n\
2b5b46
   -T --silent-truncation If a symbol name is truncated, do not add a suffix [...]\n\
2b5b46
   @<file>                Read options from <file>\n\
2b5b46
@@ -4748,7 +4897,7 @@ parse_args (struct dump_data *dumpdata,
2b5b46
     usage (stderr);
2b5b46
 
2b5b46
   while ((c = getopt_long
2b5b46
-	  (argc, argv, "ACDHILNR:STVWacdeghi:lnp:rstuvw::x:z", options, NULL)) != EOF)
2b5b46
+	  (argc, argv, "ACDHILNR:STU:VWacdeghi:lnp:rstuvw::x:z", options, NULL)) != EOF)
2b5b46
     {
2b5b46
       switch (c)
2b5b46
 	{
2b5b46
@@ -4905,6 +5054,25 @@ parse_args (struct dump_data *dumpdata,
2b5b46
 	  request_dump (dumpdata, DISASS_DUMP);
2b5b46
 	  break;
2b5b46
 #endif
2b5b46
+	case 'U':
2b5b46
+	  if (optarg == NULL)
2b5b46
+	    error (_("Missing arg to -U/--unicode")); /* Can this happen ?  */
2b5b46
+	  else if (streq (optarg, "default") || streq (optarg, "d"))
2b5b46
+	    unicode_display = unicode_locale;
2b5b46
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
2b5b46
+	    unicode_display = unicode_locale;
2b5b46
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
2b5b46
+	    unicode_display = unicode_escape;
2b5b46
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
2b5b46
+	    unicode_display = unicode_invalid;
2b5b46
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
2b5b46
+	    unicode_display = unicode_hex;
2b5b46
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
2b5b46
+	    unicode_display = unicode_highlight;
2b5b46
+	  else
2b5b46
+	    error (_("unknown argument to -U/--unicode: %s"), optarg);
2b5b46
+	  break;
2b5b46
+
2b5b46
 	case 'v':
2b5b46
 	  print_version (program_name);
2b5b46
 	  break;
2b5b46
Only in binutils-2.36.1/binutils/: readelf.c.orig
2b5b46
Only in binutils-2.36.1/binutils/: readelf.c.rej
2b5b46
diff -rup binutils.orig/binutils/strings.c binutils-2.36.1/binutils/strings.c
2b5b46
--- binutils.orig/binutils/strings.c	2021-10-21 16:56:20.321761370 +0100
2b5b46
+++ binutils-2.36.1/binutils/strings.c	2021-10-21 16:56:29.698696197 +0100
2b5b46
@@ -55,6 +55,19 @@
2b5b46
    -T {bfdname}
2b5b46
 		Specify a non-default object file format.
2b5b46
 
2b5b46
+  --unicode={default|locale|invalid|hex|escape|highlight}
2b5b46
+  -U {d|l|i|x|e|h}
2b5b46
+                Determine how to handle UTF-8 unicode characters.  The default
2b5b46
+		is no special treatment.  All other versions of this option
2b5b46
+		only apply if the encoding is valid and enabling the option
2b5b46
+		implies --encoding=S.
2b5b46
+		The 'locale' option displays the characters according to the
2b5b46
+		current locale.  The 'invalid' option treats them as
2b5b46
+		non-string characters.  The 'hex' option displays them as hex
2b5b46
+		byte sequences.  The 'escape' option displays them as escape
2b5b46
+		sequences and the 'highlight' option displays them as
2b5b46
+		coloured escape sequences.
2b5b46
+
2b5b46
   --output-separator=sep_string
2b5b46
   -s sep_string	String used to separate parsed strings in output.
2b5b46
 		Default is newline.
2b5b46
@@ -76,6 +89,22 @@
2b5b46
 #include "safe-ctype.h"
2b5b46
 #include "bucomm.h"
2b5b46
 
2b5b46
+#ifndef streq
2b5b46
+#define streq(a,b) (strcmp ((a),(b)) == 0)
2b5b46
+#endif
2b5b46
+
2b5b46
+typedef enum unicode_display_type
2b5b46
+{
2b5b46
+  unicode_default = 0,
2b5b46
+  unicode_locale,
2b5b46
+  unicode_escape,
2b5b46
+  unicode_hex,
2b5b46
+  unicode_highlight,
2b5b46
+  unicode_invalid
2b5b46
+} unicode_display_type;
2b5b46
+
2b5b46
+static unicode_display_type unicode_display = unicode_default;
2b5b46
+
2b5b46
 #define STRING_ISGRAPHIC(c) \
2b5b46
       (   (c) >= 0 \
2b5b46
        && (c) <= 255 \
2b5b46
@@ -94,7 +123,7 @@ extern int errno;
2b5b46
 static int address_radix;
2b5b46
 
2b5b46
 /* Minimum length of sequence of graphic chars to trigger output.  */
2b5b46
-static int string_min;
2b5b46
+static unsigned int string_min;
2b5b46
 
2b5b46
 /* Whether or not we include all whitespace as a graphic char.   */
2b5b46
 static bfd_boolean include_all_whitespace;
2b5b46
@@ -130,6 +159,7 @@ static struct option long_options[] =
2b5b46
   {"target", required_argument, NULL, 'T'},
2b5b46
   {"output-separator", required_argument, NULL, 's'},
2b5b46
   {"help", no_argument, NULL, 'h'},
2b5b46
+  {"unicode", required_argument, NULL, 'U'},
2b5b46
   {"version", no_argument, NULL, 'v'},
2b5b46
   {NULL, 0, NULL, 0}
2b5b46
 };
2b5b46
@@ -173,7 +203,7 @@ main (int argc, char **argv)
2b5b46
   encoding = 's';
2b5b46
   output_separator = NULL;
2b5b46
 
2b5b46
-  while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:Vv0123456789",
2b5b46
+  while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
2b5b46
 			      long_options, (int *) 0)) != EOF)
2b5b46
     {
2b5b46
       switch (optc)
2b5b46
@@ -246,6 +276,23 @@ main (int argc, char **argv)
2b5b46
 	  output_separator = optarg;
2b5b46
           break;
2b5b46
 
2b5b46
+	case 'U':
2b5b46
+	  if (streq (optarg, "default") || streq (optarg, "d"))
2b5b46
+	    unicode_display = unicode_default;
2b5b46
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
2b5b46
+	    unicode_display = unicode_locale;
2b5b46
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
2b5b46
+	    unicode_display = unicode_escape;
2b5b46
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
2b5b46
+	    unicode_display = unicode_invalid;
2b5b46
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
2b5b46
+	    unicode_display = unicode_hex;
2b5b46
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
2b5b46
+	    unicode_display = unicode_highlight;
2b5b46
+	  else
2b5b46
+	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
2b5b46
+	  break;
2b5b46
+
2b5b46
 	case 'V':
2b5b46
 	case 'v':
2b5b46
 	  print_version ("strings");
2b5b46
@@ -260,6 +307,9 @@ main (int argc, char **argv)
2b5b46
 	}
2b5b46
     }
2b5b46
 
2b5b46
+  if (unicode_display != unicode_default)
2b5b46
+    encoding = 'S';
2b5b46
+
2b5b46
   if (numeric_opt != 0)
2b5b46
     {
2b5b46
       string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
2b5b46
@@ -553,11 +603,629 @@ unget_part_char (long c, file_ptr *addre
2b5b46
 	}
2b5b46
     }
2b5b46
 }
2b5b46
+
2b5b46
+static void
2b5b46
+print_filename_and_address (const char * filename, file_ptr address)
2b5b46
+{
2b5b46
+  if (print_filenames)
2b5b46
+    printf ("%s: ", filename);
2b5b46
+
2b5b46
+  if (! print_addresses)
2b5b46
+    return;
2b5b46
+
2b5b46
+  switch (address_radix)
2b5b46
+    {
2b5b46
+    case 8:
2b5b46
+      if (sizeof (address) > sizeof (long))
2b5b46
+	{
2b5b46
+#ifndef __MSVCRT__
2b5b46
+	  printf ("%7llo ", (unsigned long long) address);
2b5b46
+#else
2b5b46
+	  printf ("%7I64o ", (unsigned long long) address);
2b5b46
+#endif
2b5b46
+	}
2b5b46
+      else
2b5b46
+	printf ("%7lo ", (unsigned long) address);
2b5b46
+      break;
2b5b46
+
2b5b46
+    case 10:
2b5b46
+      if (sizeof (address) > sizeof (long))
2b5b46
+	{
2b5b46
+#ifndef __MSVCRT__
2b5b46
+	  printf ("%7llu ", (unsigned long long) address);
2b5b46
+#else
2b5b46
+	  printf ("%7I64d ", (unsigned long long) address);
2b5b46
+#endif
2b5b46
+	}
2b5b46
+      else
2b5b46
+	printf ("%7ld ", (long) address);
2b5b46
+      break;
2b5b46
+
2b5b46
+    case 16:
2b5b46
+      if (sizeof (address) > sizeof (long))
2b5b46
+	{
2b5b46
+#ifndef __MSVCRT__
2b5b46
+	  printf ("%7llx ", (unsigned long long) address);
2b5b46
+#else
2b5b46
+	  printf ("%7I64x ", (unsigned long long) address);
2b5b46
+#endif
2b5b46
+	}
2b5b46
+      else
2b5b46
+	printf ("%7lx ", (unsigned long) address);
2b5b46
+      break;
2b5b46
+    }
2b5b46
+}
2b5b46
+
2b5b46
+/* Return non-zero if the bytes starting at BUFFER form a valid UTF-8 encoding.
2b5b46
+   If the encoding is valid then returns the number of bytes it uses.  */
2b5b46
+
2b5b46
+static unsigned int
2b5b46
+is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
2b5b46
+{
2b5b46
+  if (buffer[0] < 0xc0)
2b5b46
+    return 0;
2b5b46
+
2b5b46
+  if (buflen < 2)
2b5b46
+    return 0;
2b5b46
+
2b5b46
+  if ((buffer[1] & 0xc0) != 0x80)
2b5b46
+    return 0;
2b5b46
+
2b5b46
+  if ((buffer[0] & 0x20) == 0)
2b5b46
+    return 2;
2b5b46
+
2b5b46
+  if (buflen < 3)
2b5b46
+    return 0;
2b5b46
+
2b5b46
+  if ((buffer[2] & 0xc0) != 0x80)
2b5b46
+    return 0;
2b5b46
+  
2b5b46
+  if ((buffer[0] & 0x10) == 0)
2b5b46
+    return 3;
2b5b46
+
2b5b46
+  if (buflen < 4)
2b5b46
+    return 0;
2b5b46
+
2b5b46
+  if ((buffer[3] & 0xc0) != 0x80)
2b5b46
+    return 0;
2b5b46
+
2b5b46
+  return 4;
2b5b46
+}
2b5b46
+
2b5b46
+/* Display the UTF-8 encoded character at BUFFER according to the setting
2b5b46
+   of unicode_display.  The caller must already have validated the sequence.
2b5b46
+   Returns the number of bytes in the sequence (2, 3 or 4).  */
2b5b46
+
2b5b46
+static unsigned int
2b5b46
+display_utf8_char (const unsigned char * buffer)
2b5b46
+{
2b5b46
+  unsigned int j;
2b5b46
+  unsigned int utf8_len;
2b5b46
+
2b5b46
+  switch (buffer[0] & 0x30)
2b5b46
+    {
2b5b46
+    case 0x00:
2b5b46
+    case 0x10:
2b5b46
+      utf8_len = 2;	/* 110xxxxx lead byte.  */
2b5b46
+      break;
2b5b46
+    case 0x20:
2b5b46
+      utf8_len = 3;	/* 1110xxxx lead byte.  */
2b5b46
+      break;
2b5b46
+    default:
2b5b46
+      utf8_len = 4;	/* 1111xxxx lead byte.  */
2b5b46
+    }
2b5b46
+
2b5b46
+  switch (unicode_display)
2b5b46
+    {
2b5b46
+    default:
2b5b46
+      fprintf (stderr, "ICE: unexpected unicode display type\n");
2b5b46
+      break;
2b5b46
+
2b5b46
+    case unicode_escape:
2b5b46
+    case unicode_highlight:
2b5b46
+      if (unicode_display == unicode_highlight && isatty (1))
2b5b46
+	printf ("\x1B[31;47m"); /* Red.  */
2b5b46
+
2b5b46
+      switch (utf8_len)
2b5b46
+	{
2b5b46
+	case 2:	/* 11-bit code point: 5 + 6 payload bits.  */
2b5b46
+	  printf ("\\u%02x%02x",
2b5b46
+		  ((buffer[0] & 0x1c) >> 2),
2b5b46
+		  ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
2b5b46
+	  break;
2b5b46
+
2b5b46
+	case 3:	/* 16-bit code point: 4 + 6 + 6 payload bits.  */
2b5b46
+	  printf ("\\u%02x%02x",
2b5b46
+		  ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
2b5b46
+		  ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
2b5b46
+	  break;
2b5b46
+
2b5b46
+	case 4:	/* 21-bit code point: 3 + 6 + 6 + 6 payload bits.  */
2b5b46
+	  printf ("\\u%02x%02x%02x",
2b5b46
+		  ((buffer[0] & 0x07) << 2) | ((buffer[1] & 0x30) >> 4),
2b5b46
+		  ((buffer[1] & 0x0f) << 4) | ((buffer[2] & 0x3c) >> 2),
2b5b46
+		  ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
2b5b46
+	  break;
2b5b46
+	default:
2b5b46
+	  /* Not reached: utf8_len is always 2, 3 or 4.  */
2b5b46
+	  break;
2b5b46
+	}
2b5b46
+
2b5b46
+      if (unicode_display == unicode_highlight && isatty (1))
2b5b46
+	printf ("\033[0m"); /* Default colour.  */
2b5b46
+      break;
2b5b46
+
2b5b46
+    case unicode_hex:
2b5b46
+      putchar ('<');
2b5b46
+      for (j = 0; j < utf8_len; j++)
2b5b46
+	printf ("%02x", buffer [j]);
2b5b46
+      putchar ('>');
2b5b46
+      break;
2b5b46
+
2b5b46
+    case unicode_locale:	/* Emit every byte of the sequence unmodified.  */
2b5b46
+      printf ("%.*s", (int) utf8_len, (const char *) buffer);
2b5b46
+      break;
2b5b46
+    }
2b5b46
+
2b5b46
+  return utf8_len;
2b5b46
+}
2b5b46
+
2b5b46
+/* Display strings in BUFFER.  Treat any UTF-8 encoded characters encountered
2b5b46
+   according to the setting of the unicode_display variable.  The buffer
2b5b46
+   contains BUFLEN bytes.
2b5b46
+
2b5b46
+   Display the characters as if they started at ADDRESS and are contained in
2b5b46
+   FILENAME.  */
2b5b46
+
2b5b46
+static void
2b5b46
+print_unicode_buffer (const char *            filename,
2b5b46
+		      file_ptr                address,
2b5b46
+		      const unsigned char *   buffer,
2b5b46
+		      unsigned long           buflen)
2b5b46
+{
2b5b46
+  /* Paranoia checks...  */
2b5b46
+  if (filename == NULL
2b5b46
+      || buffer == NULL
2b5b46
+      || unicode_display == unicode_default
2b5b46
+      || encoding != 'S'
2b5b46
+      || encoding_bytes != 1)
2b5b46
+    {
2b5b46
+      fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
2b5b46
+      return;
2b5b46
+    }
2b5b46
+
2b5b46
+  if (buflen == 0)
2b5b46
+    return;
2b5b46
+
2b5b46
+  /* We must only display strings that are at least string_min *characters*
2b5b46
+     long.  So we scan the buffer in two stages.  First we locate the start
2b5b46
+     of a potential string.  Then we walk along it until we have found
2b5b46
+     string_min characters.  Then we go back to the start point and start
2b5b46
+     displaying characters according to the unicode_display setting.  */
2b5b46
+
2b5b46
+  unsigned long start_point = 0;
2b5b46
+  unsigned long i = 0;
2b5b46
+  unsigned int char_len = 1;
2b5b46
+  unsigned int num_found = 0;
2b5b46
+
2b5b46
+  for (i = 0; i < buflen; i += char_len)
2b5b46
+    {
2b5b46
+      int c = buffer[i];
2b5b46
+
2b5b46
+      char_len = 1;
2b5b46
+
2b5b46
+      /* Find the first potential character of a string.  */
2b5b46
+      if (! STRING_ISGRAPHIC (c))
2b5b46
+	{
2b5b46
+	  num_found = 0;
2b5b46
+	  continue;
2b5b46
+	}
2b5b46
+
2b5b46
+      if (c > 126)
2b5b46
+	{
2b5b46
+	  if (c < 0xc0)
2b5b46
+	    {
2b5b46
+	      num_found = 0;
2b5b46
+	      continue;
2b5b46
+	    }
2b5b46
+
2b5b46
+	  if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
2b5b46
+	    {
2b5b46
+	      char_len = 1;
2b5b46
+	      num_found = 0;
2b5b46
+	      continue;
2b5b46
+	    }
2b5b46
+
2b5b46
+	  if (unicode_display == unicode_invalid)
2b5b46
+	    {
2b5b46
+	      /* We have found a valid UTF-8 character, but we treat it as non-graphic.  */
2b5b46
+	      num_found = 0;
2b5b46
+	      continue;
2b5b46
+	    }
2b5b46
+	}
2b5b46
+
2b5b46
+      if (num_found == 0)
2b5b46
+	/* We have found a potential starting point for a string.  */
2b5b46
+	start_point = i;
2b5b46
+
2b5b46
+      ++ num_found;
2b5b46
+
2b5b46
+      if (num_found >= string_min)
2b5b46
+	break;
2b5b46
+    }
2b5b46
+
2b5b46
+  if (num_found < string_min)
2b5b46
+    return;
2b5b46
+
2b5b46
+  print_filename_and_address (filename, address + start_point);
2b5b46
+  
2b5b46
+  /* We have found string_min characters.  Display them and any
2b5b46
+     more that follow.  */
2b5b46
+  for (i = start_point; i < buflen; i += char_len)
2b5b46
+    {
2b5b46
+      int c = buffer[i];
2b5b46
+
2b5b46
+      char_len = 1;
2b5b46
+
2b5b46
+      if (! STRING_ISGRAPHIC (c))
2b5b46
+	break;
2b5b46
+      else if (c < 127)
2b5b46
+	putchar (c);
2b5b46
+      else if (! is_valid_utf8 (buffer + i, buflen - i))
2b5b46
+	break;
2b5b46
+      else if (unicode_display == unicode_invalid)
2b5b46
+	break;
2b5b46
+      else
2b5b46
+	char_len = display_utf8_char (buffer + i);
2b5b46
+    }
2b5b46
+
2b5b46
+  if (output_separator)
2b5b46
+    fputs (output_separator, stdout);
2b5b46
+  else
2b5b46
+    putchar ('\n');
2b5b46
+
2b5b46
+  /* FIXME: Using tail recursion here is lazy programming...  */
2b5b46
+  print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
2b5b46
+}
2b5b46
+
2b5b46
+static int
2b5b46
+get_unicode_byte (FILE *           stream,
2b5b46
+		  unsigned char *  putback,
2b5b46
+		  unsigned int *   num_putback,
2b5b46
+		  unsigned int *   num_read)
2b5b46
+{
2b5b46
+  if (* num_putback > 0)
2b5b46
+    {
2b5b46
+      * num_putback = * num_putback - 1;
2b5b46
+      return putback [* num_putback];
2b5b46
+    }
2b5b46
+
2b5b46
+  * num_read = * num_read + 1;
2b5b46
+
2b5b46
+#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
2b5b46
+  return getc_unlocked (stream);
2b5b46
+#else
2b5b46
+  return getc (stream);
2b5b46
+#endif
2b5b46
+}
2b5b46
+
2b5b46
+/* Helper function for print_unicode_stream.  */
2b5b46
+
2b5b46
+static void
2b5b46
+print_unicode_stream_body (const char *     filename,
2b5b46
+			   file_ptr         address,
2b5b46
+			   FILE *           stream,
2b5b46
+			   unsigned char *  putback_buf,
2b5b46
+			   unsigned int     num_putback,
2b5b46
+			   unsigned char *  print_buf)
2b5b46
+{
2b5b46
+  /* It would be nice if we could just read the stream into a buffer
2b5b46
+     and then process it with print_unicode_buffer.  But the input
2b5b46
+     might be huge or it might be time-locked (eg stdin).  So instead
2b5b46
+     we go one byte at a time...  */
2b5b46
+
2b5b46
+  file_ptr start_point = 0;
2b5b46
+  unsigned int num_read = 0;
2b5b46
+  unsigned int num_chars = 0;
2b5b46
+  unsigned int num_print = 0;
2b5b46
+  int c;
2b5b46
+
2b5b46
+  /* Find a series of string_min characters.  Put them into print_buf.  */
2b5b46
+  do
2b5b46
+    {
2b5b46
+      if (num_chars >= string_min)
2b5b46
+	break;
2b5b46
+
2b5b46
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
2b5b46
+      if (c == EOF)
2b5b46
+	break;
2b5b46
+
2b5b46
+      if (! STRING_ISGRAPHIC (c))
2b5b46
+	{
2b5b46
+	  num_chars = num_print = 0;
2b5b46
+	  continue;
2b5b46
+	}
2b5b46
+
2b5b46
+      if (num_chars == 0)
2b5b46
+	start_point = num_read - 1;
2b5b46
+
2b5b46
+      if (c < 127)
2b5b46
+	{
2b5b46
+	  print_buf[num_print] = c;
2b5b46
+	  num_chars ++;
2b5b46
+	  num_print ++;
2b5b46
+	  continue;
2b5b46
+	}
2b5b46
+
2b5b46
+      if (c < 0xc0)
2b5b46
+	{
2b5b46
+	  num_chars = num_print = 0;
2b5b46
+	  continue;
2b5b46
+	}
2b5b46
+
2b5b46
+      /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
2b5b46
+      char utf8[4];
2b5b46
+
2b5b46
+      utf8[0] = c;
2b5b46
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
2b5b46
+      if (c == EOF)
2b5b46
+	break;
2b5b46
+      utf8[1] = c;
2b5b46
+
2b5b46
+      if ((utf8[1] & 0xc0) != 0x80)
2b5b46
+	{
2b5b46
+	  /* Invalid UTF-8.  */
2b5b46
+	  putback_buf[num_putback++] = utf8[1];
2b5b46
+	  num_chars = num_print = 0;
2b5b46
+	  continue;
2b5b46
+	}
2b5b46
+      else if ((utf8[0] & 0x20) == 0)
2b5b46
+	{
2b5b46
+	  /* A valid 2-byte UTF-8 encoding.  */
2b5b46
+	  if (unicode_display == unicode_invalid)
2b5b46
+	    {
2b5b46
+	      putback_buf[num_putback++] = utf8[1];
2b5b46
+	      num_chars = num_print = 0;
2b5b46
+	    }
2b5b46
+	  else
2b5b46
+	    {
2b5b46
+	      print_buf[num_print ++] = utf8[0];
2b5b46
+	      print_buf[num_print ++] = utf8[1];
2b5b46
+	      num_chars ++;
2b5b46
+	    }
2b5b46
+	  continue;
2b5b46
+	}
2b5b46
+
2b5b46
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
2b5b46
+      if (c == EOF)
2b5b46
+	break;
2b5b46
+      utf8[2] = c;
2b5b46
+
2b5b46
+      if ((utf8[2] & 0xc0) != 0x80)
2b5b46
+	{
2b5b46
+	  /* Invalid UTF-8.  */
2b5b46
+	  putback_buf[num_putback++] = utf8[2];
2b5b46
+	  putback_buf[num_putback++] = utf8[1];
2b5b46
+	  num_chars = num_print = 0;
2b5b46
+	  continue;
2b5b46
+	}
2b5b46
+      else if ((utf8[0] & 0x10) == 0)
2b5b46
+	{
2b5b46
+	  /* A valid 3-byte UTF-8 encoding.  */
2b5b46
+	  if (unicode_display == unicode_invalid)
2b5b46
+	    {
2b5b46
+	      putback_buf[num_putback++] = utf8[2];
2b5b46
+	      putback_buf[num_putback++] = utf8[1];
2b5b46
+	      num_chars = num_print = 0;
2b5b46
+	    }
2b5b46
+	  else
2b5b46
+	    {
2b5b46
+	      print_buf[num_print ++] = utf8[0];
2b5b46
+	      print_buf[num_print ++] = utf8[1];
2b5b46
+	      print_buf[num_print ++] = utf8[2];
2b5b46
+	      num_chars ++;
2b5b46
+	    }
2b5b46
+	  continue;
2b5b46
+	}
2b5b46
+
2b5b46
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
2b5b46
+      if (c == EOF)
2b5b46
+	break;
2b5b46
+      utf8[3] = c;
2b5b46
+
2b5b46
+      if ((utf8[3] & 0xc0) != 0x80)
2b5b46
+	{
2b5b46
+	  /* Invalid UTF-8.  */
2b5b46
+	  putback_buf[num_putback++] = utf8[3];
2b5b46
+	  putback_buf[num_putback++] = utf8[2];
2b5b46
+	  putback_buf[num_putback++] = utf8[1];
2b5b46
+	  num_chars = num_print = 0;
2b5b46
+	}
2b5b46
+      /* We have a valid 4-byte UTF-8 encoding.  */
2b5b46
+      else if (unicode_display == unicode_invalid)
2b5b46
+	{
2b5b46
+	  putback_buf[num_putback++] = utf8[3];
2b5b46
+	  putback_buf[num_putback++] = utf8[1];
2b5b46
+	  putback_buf[num_putback++] = utf8[2];
2b5b46
+	  num_chars = num_print = 0;
2b5b46
+	}
2b5b46
+      else
2b5b46
+	{
2b5b46
+	  print_buf[num_print ++] = utf8[0];
2b5b46
+	  print_buf[num_print ++] = utf8[1];
2b5b46
+	  print_buf[num_print ++] = utf8[2];
2b5b46
+	  print_buf[num_print ++] = utf8[3];
2b5b46
+	  num_chars ++;
2b5b46
+	}
2b5b46
+    }
2b5b46
+  while (1);
2b5b46
+
2b5b46
+  if (num_chars >= string_min)
2b5b46
+    {
2b5b46
+      /* We know that we have string_min valid characters in print_buf,
2b5b46
+	 and there may be more to come in the stream.  Start displaying
2b5b46
+	 them.  */
2b5b46
+
2b5b46
+      print_filename_and_address (filename, address + start_point);
2b5b46
+
2b5b46
+      unsigned int i;
2b5b46
+      for (i = 0; i < num_print;)
2b5b46
+	{
2b5b46
+	  if (print_buf[i] < 127)
2b5b46
+	    putchar (print_buf[i++]);
2b5b46
+	  else
2b5b46
+	    i += display_utf8_char (print_buf + i);
2b5b46
+	}
2b5b46
+
2b5b46
+      /* OK so now we have to start reading unchecked bytes.  */
2b5b46
+
2b5b46
+      /* Display each character as it arrives; stop when the string ends.  */
2b5b46
+      do
2b5b46
+	{
2b5b46
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
2b5b46
+	  if (c == EOF)
2b5b46
+	    break;
2b5b46
+
2b5b46
+	  if (! STRING_ISGRAPHIC (c))
2b5b46
+	    break;
2b5b46
+
2b5b46
+	  if (c < 127)
2b5b46
+	    {
2b5b46
+	      putchar (c);
2b5b46
+	      continue;
2b5b46
+	    }
2b5b46
+
2b5b46
+	  if (c < 0xc0)
2b5b46
+	    break;
2b5b46
+
2b5b46
+	  /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
2b5b46
+	  unsigned char utf8[4];
2b5b46
+
2b5b46
+	  utf8[0] = c;
2b5b46
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
2b5b46
+	  if (c == EOF)
2b5b46
+	    break;
2b5b46
+	  utf8[1] = c;
2b5b46
+
2b5b46
+	  if ((utf8[1] & 0xc0) != 0x80)
2b5b46
+	    {
2b5b46
+	      /* Invalid UTF-8.  */
2b5b46
+	      putback_buf[num_putback++] = utf8[1];
2b5b46
+	      break;
2b5b46
+	    }
2b5b46
+	  else if ((utf8[0] & 0x20) == 0)
2b5b46
+	    {
2b5b46
+	      /* Valid 2-byte UTF-8.  */
2b5b46
+	      if (unicode_display == unicode_invalid)
2b5b46
+		{
2b5b46
+		  putback_buf[num_putback++] = utf8[1];
2b5b46
+		  break;
2b5b46
+		}
2b5b46
+	      else
2b5b46
+		{
2b5b46
+		  (void) display_utf8_char (utf8);
2b5b46
+		  continue;
2b5b46
+		}
2b5b46
+	    }
2b5b46
+
2b5b46
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
2b5b46
+	  if (c == EOF)
2b5b46
+	    break;
2b5b46
+	  utf8[2] = c;
2b5b46
+
2b5b46
+	  if ((utf8[2] & 0xc0) != 0x80)
2b5b46
+	    {
2b5b46
+	      /* Invalid UTF-8.  */
2b5b46
+	      putback_buf[num_putback++] = utf8[2];
2b5b46
+	      putback_buf[num_putback++] = utf8[1];
2b5b46
+	      break;
2b5b46
+	    }
2b5b46
+	  else if ((utf8[0] & 0x10) == 0)
2b5b46
+	    {
2b5b46
+	      /* Valid 3-byte UTF-8.  */
2b5b46
+	      if (unicode_display == unicode_invalid)
2b5b46
+		{
2b5b46
+		  putback_buf[num_putback++] = utf8[2];
2b5b46
+		  putback_buf[num_putback++] = utf8[1];
2b5b46
+		  break;
2b5b46
+		}
2b5b46
+	      else
2b5b46
+		{
2b5b46
+		  (void) display_utf8_char (utf8);
2b5b46
+		  continue;
2b5b46
+		}
2b5b46
+	    }
2b5b46
+
2b5b46
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
2b5b46
+	  if (c == EOF)
2b5b46
+	    break;
2b5b46
+	  utf8[3] = c;
2b5b46
+
2b5b46
+	  if ((utf8[3] & 0xc0) != 0x80)
2b5b46
+	    {
2b5b46
+	      /* Invalid UTF-8.  */
2b5b46
+	      putback_buf[num_putback++] = utf8[3];
2b5b46
+	      putback_buf[num_putback++] = utf8[2];
2b5b46
+	      putback_buf[num_putback++] = utf8[1];
2b5b46
+	      break;
2b5b46
+	    }
2b5b46
+	  else if (unicode_display == unicode_invalid)
2b5b46
+	    {
2b5b46
+	      putback_buf[num_putback++] = utf8[3];
2b5b46
+	      putback_buf[num_putback++] = utf8[2];
2b5b46
+	      putback_buf[num_putback++] = utf8[1];
2b5b46
+	      break;
2b5b46
+	    }
2b5b46
+	  else
2b5b46
+	    /* A valid 4-byte UTF-8 encoding.  */
2b5b46
+	    (void) display_utf8_char (utf8);
2b5b46
+	}
2b5b46
+      while (1);
2b5b46
+
2b5b46
+      if (output_separator)
2b5b46
+	fputs (output_separator, stdout);
2b5b46
+      else
2b5b46
+	putchar ('\n');
2b5b46
+    }
2b5b46
+
2b5b46
+  if (c != EOF)
2b5b46
+    /* FIXME: Using tail recursion here is lazy, but it works.  */
2b5b46
+    print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
2b5b46
+}
2b5b46
+
2b5b46
+/* Display strings read in from STREAM.  Treat any UTF-8 encoded characters
2b5b46
+   encountered according to the setting of the unicode_display variable.
2b5b46
+   The stream is positioned at ADDRESS and is attached to FILENAME.  */
2b5b46
+
2b5b46
+static void
2b5b46
+print_unicode_stream (const char * filename,
2b5b46
+		      file_ptr     address,
2b5b46
+		      FILE *       stream)
2b5b46
+{
2b5b46
+  /* Paranoia checks...  */
2b5b46
+  if (filename == NULL
2b5b46
+      || stream == NULL
2b5b46
+      || unicode_display == unicode_default
2b5b46
+      || encoding != 'S'
2b5b46
+      || encoding_bytes != 1)
2b5b46
+    {
2b5b46
+      fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
2b5b46
+      return;
2b5b46
+    }
2b5b46
+
2b5b46
+  /* Allocate space for string_min 4-byte utf-8 characters.  */
2b5b46
+  unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
2b5b46
+  /* We should never have to put back more than 4 bytes.  */
2b5b46
+  unsigned char putback_buf[5];
2b5b46
+  unsigned int num_putback = 0;
2b5b46
+
2b5b46
+  print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
2b5b46
+  free (print_buf);
2b5b46
+}
2b5b46
 
2b5b46
 /* Find the strings in file FILENAME, read from STREAM.
2b5b46
    Assume that STREAM is positioned so that the next byte read
2b5b46
    is at address ADDRESS in the file.
2b5b46
-   Stop reading at address STOP_POINT in the file, if nonzero.
2b5b46
 
2b5b46
    If STREAM is NULL, do not read from it.
2b5b46
    The caller can supply a buffer of characters
2b5b46
@@ -570,18 +1238,27 @@ static void
2b5b46
 print_strings (const char *filename, FILE *stream, file_ptr address,
2b5b46
 	       int stop_point, int magiccount, char *magic)
2b5b46
 {
2b5b46
+  if (unicode_display != unicode_default)
2b5b46
+    {
2b5b46
+      if (magic != NULL)
2b5b46
+	print_unicode_buffer (filename, address,
2b5b46
+			      (const unsigned char *) magic, magiccount);
2b5b46
+
2b5b46
+      if (stream != NULL)
2b5b46
+	print_unicode_stream (filename, address, stream);
2b5b46
+      return;
2b5b46
+    }
2b5b46
+
2b5b46
   char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
2b5b46
 
2b5b46
   while (1)
2b5b46
     {
2b5b46
       file_ptr start;
2b5b46
-      int i;
2b5b46
+      unsigned int i;
2b5b46
       long c;
2b5b46
 
2b5b46
       /* See if the next `string_min' chars are all graphic chars.  */
2b5b46
     tryline:
2b5b46
-      if (stop_point && address >= stop_point)
2b5b46
-	break;
2b5b46
       start = address;
2b5b46
       for (i = 0; i < string_min; i++)
2b5b46
 	{
2b5b46
@@ -718,6 +1395,8 @@ usage (FILE *stream, int status)
2b5b46
   -T --target=<BFDNAME>     Specify the binary file format\n\
2b5b46
   -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
2b5b46
                             s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
2b5b46
+  --unicode={default|locale|invalid|hex|escape|highlight}\n\
2b5b46
+  -U {d|l|i|x|e|h}          Specify how to treat UTF-8 encoded unicode characters\n\
2b5b46
   -s --output-separator=<string> String used to separate strings in output.\n\
2b5b46
   @<file>                   Read options from <file>\n\
2b5b46
   -h --help                 Display this information\n\