Blame SOURCES/binutils.unicode.patch

b1e5cb
diff -rup binutils.orig/binutils/NEWS binutils-2.36.1/binutils/NEWS
b1e5cb
--- binutils.orig/binutils/NEWS	2021-10-21 16:56:20.322761363 +0100
b1e5cb
+++ binutils-2.36.1/binutils/NEWS	2021-10-21 16:56:29.692696238 +0100
b1e5cb
@@ -151,6 +151,15 @@ Changes in 2.32:
b1e5cb
 
b1e5cb
 Changes in 2.31:
b1e5cb
 
b1e5cb
+* Tools which display names or strings (readelf, strings, nm, objdump)
b1e5cb
+  have a new command line option which controls how unicode characters are
b1e5cb
+  handled.  By default they are treated as normal for the tool.  Using
b1e5cb
+  --unicode=locale will display them according to the current locale.
b1e5cb
+  Using --unicode=hex will display them as hex byte values, whilst
b1e5cb
+  --unicode=escape will display them as escape sequences.  In addition
b1e5cb
+  using --unicode=highlight will display them as unicode escape sequences
b1e5cb
+  highlighted in red (if supported by the output device).
b1e5cb
+
b1e5cb
 * Add support for disassembling netronome Flow Processor (NFP) firmware files.
b1e5cb
 
b1e5cb
 * The AArch64 port now supports showing disassembly notes which are emitted
b1e5cb
Only in binutils-2.36.1/binutils/: NEWS.orig
b1e5cb
diff -rup binutils.orig/binutils/doc/binutils.texi binutils-2.36.1/binutils/doc/binutils.texi
b1e5cb
--- binutils.orig/binutils/doc/binutils.texi	2021-10-21 16:56:20.324761349 +0100
b1e5cb
+++ binutils-2.36.1/binutils/doc/binutils.texi	2021-10-21 16:56:29.694696225 +0100
b1e5cb
@@ -799,6 +799,7 @@ nm [@option{-A}|@option{-o}|@option{--pr
b1e5cb
    [@option{-g}|@option{--extern-only}] [@option{-h}|@option{--help}]
b1e5cb
    [@option{--ifunc-chars=@var{CHARS}}]
b1e5cb
    [@option{-l}|@option{--line-numbers}] [@option{--inlines}]
b1e5cb
+   [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
b1e5cb
    [@option{-n}|@option{-v}|@option{--numeric-sort}]
b1e5cb
    [@option{-P}|@option{--portability}] [@option{-p}|@option{--no-sort}]
b1e5cb
    [@option{-r}|@option{--reverse-sort}] [@option{-S}|@option{--print-size}]
b1e5cb
@@ -1114,6 +1115,21 @@ Use @var{radix} as the radix for printin
b1e5cb
 @cindex undefined symbols
b1e5cb
 Display only undefined symbols (those external to each object file).
b1e5cb
 
b1e5cb
+@item -U @var{[d|i|l|e|x|h]}
b1e5cb
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
b1e5cb
+Controls the display of UTF-8 encoded multibyte characters in strings.
b1e5cb
+The default (@option{--unicode=default}) is to give them no special
b1e5cb
+treatment.  The @option{--unicode=locale} option displays the sequence
b1e5cb
+in the current locale, which may or may not support them.  The options
b1e5cb
+@option{--unicode=hex} and @option{--unicode=invalid} display them as
b1e5cb
+hex byte sequences enclosed by either angle brackets or curly braces.
b1e5cb
+
b1e5cb
+The @option{--unicode=escape} option displays them as escape sequences
b1e5cb
+(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
b1e5cb
+them as escape sequences highlighted in red (if supported by the
b1e5cb
+output device).  The colouring is intended to draw attention to the
b1e5cb
+presence of unicode sequences where they might not be expected.
b1e5cb
+
b1e5cb
 @item -V
b1e5cb
 @itemx --version
b1e5cb
 Show the version number of @command{nm} and exit.
b1e5cb
@@ -2210,6 +2226,7 @@ objdump [@option{-a}|@option{--archive-h
b1e5cb
         [@option{--prefix-strip=}@var{level}]
b1e5cb
         [@option{--insn-width=}@var{width}]
b1e5cb
         [@option{--visualize-jumps[=color|=extended-color|=off]}
b1e5cb
+        [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
b1e5cb
         [@option{-V}|@option{--version}]
b1e5cb
         [@option{-H}|@option{--help}]
b1e5cb
         @var{objfile}@dots{}
b1e5cb
@@ -2877,6 +2894,21 @@ When displaying symbols include those wh
b1e5cb
 special in some way and which would not normally be of interest to the
b1e5cb
 user.
b1e5cb
 
b1e5cb
+@item -U @var{[d|i|l|e|x|h]}
b1e5cb
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
b1e5cb
+Controls the display of UTF-8 encoded multibyte characters in strings.
b1e5cb
+The default (@option{--unicode=default}) is to give them no special
b1e5cb
+treatment.  The @option{--unicode=locale} option displays the sequence
b1e5cb
+in the current locale, which may or may not support them.  The options
b1e5cb
+@option{--unicode=hex} and @option{--unicode=invalid} display them as
b1e5cb
+hex byte sequences enclosed by either angle brackets or curly braces.
b1e5cb
+
b1e5cb
+The @option{--unicode=escape} option displays them as escape sequences
b1e5cb
+(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
b1e5cb
+them as escape sequences highlighted in red (if supported by the
b1e5cb
+output device).  The colouring is intended to draw attention to the
b1e5cb
+presence of unicode sequences where they might not be expected.
b1e5cb
+
b1e5cb
 @item -V
b1e5cb
 @itemx --version
b1e5cb
 Print the version number of @command{objdump} and exit.
b1e5cb
@@ -3153,6 +3185,7 @@ strings [@option{-afovV}] [@option{-}@va
b1e5cb
         [@option{-n} @var{min-len}] [@option{--bytes=}@var{min-len}]
b1e5cb
         [@option{-t} @var{radix}] [@option{--radix=}@var{radix}]
b1e5cb
         [@option{-e} @var{encoding}] [@option{--encoding=}@var{encoding}]
b1e5cb
+        [@option{-U} @var{method}] [@option{--unicode=}@var{method}]
b1e5cb
         [@option{-}] [@option{--all}] [@option{--print-file-name}]
b1e5cb
         [@option{-T} @var{bfdname}] [@option{--target=}@var{bfdname}]
b1e5cb
         [@option{-w}] [@option{--include-all-whitespace}]
b1e5cb
@@ -3244,6 +3277,28 @@ single-8-bit-byte characters, @samp{b} =
b1e5cb
 littleendian.  Useful for finding wide character strings. (@samp{l}
b1e5cb
 and @samp{b} apply to, for example, Unicode UTF-16/UCS-2 encodings).
b1e5cb
 
b1e5cb
+@item -U @var{[d|i|l|e|x|h]}
b1e5cb
+@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]}
b1e5cb
+Controls the display of UTF-8 encoded multibyte characters in strings.
b1e5cb
+The default (@option{--unicode=default}) is to give them no special
b1e5cb
+treatment, and instead rely upon the setting of the
b1e5cb
+@option{--encoding} option.  The other values for this option
b1e5cb
+automatically enable @option{--encoding=S}.
b1e5cb
+
b1e5cb
+The @option{--unicode=invalid} option treats them as non-graphic
b1e5cb
+characters and hence not part of a valid string.  All the remaining
b1e5cb
+options treat them as valid string characters.
b1e5cb
+
b1e5cb
+The @option{--unicode=locale} option displays them in the current
b1e5cb
+locale, which may or may not support UTF-8 encoding.  The
b1e5cb
+@option{--unicode=hex} option displays them as hex byte sequences
b1e5cb
+enclosed between @var{<>} characters.  The @option{--unicode=escape}
b1e5cb
+option displays them as escape sequences (@var{\uxxxx}) and the
b1e5cb
+@option{--unicode=highlight} option displays them as escape sequences
b1e5cb
+highlighted in red (if supported by the output device).  The colouring
b1e5cb
+is intended to draw attention to the presence of unicode sequences
b1e5cb
+where they might not be expected.
b1e5cb
+
b1e5cb
 @item -T @var{bfdname}
b1e5cb
 @itemx --target=@var{bfdname}
b1e5cb
 @cindex object code format
b1e5cb
@@ -4766,6 +4821,7 @@ readelf [@option{-a}|@option{--all}]
b1e5cb
         [@option{-W}|@option{--wide}]
b1e5cb
         [@option{-T}|@option{--silent-truncation}]
b1e5cb
         [@option{-H}|@option{--help}]
b1e5cb
+        [@option{-U} @var{method}|@option{--unicode=}@var{method}]
b1e5cb
         @var{elffile}@dots{}
b1e5cb
 @c man end
b1e5cb
 @end smallexample
b1e5cb
@@ -4887,6 +4943,28 @@ necessary in order to demangle truly com
b1e5cb
 that if the recursion limit is disabled then stack exhaustion is
b1e5cb
 possible and any bug reports about such an event will be rejected.
b1e5cb
 
b1e5cb
+@item -U @var{[d|i|l|e|x|h]}
b1e5cb
+@itemx --unicode=[default|invalid|locale|escape|hex|highlight]
b1e5cb
+Controls the display of non-ASCII characters in identifier names.
b1e5cb
+The default (@option{--unicode=locale} or @option{--unicode=default}) is
b1e5cb
+to treat them as multibyte characters and display them in the current
b1e5cb
+locale.  All other versions of this option treat the bytes as UTF-8
b1e5cb
+encoded values and attempt to interpret them.  If they cannot be
b1e5cb
+interpreted or if the @option{--unicode=invalid} option is used then
b1e5cb
+they are displayed as a sequence of hex bytes, enclosed in curly
b1e5cb
+brace characters.
b1e5cb
+
b1e5cb
+Using the @option{--unicode=escape} option will display the characters
b1e5cb
+as unicode escape sequences (@var{\uxxxx}).  Using the
b1e5cb
+@option{--unicode=hex} will display the characters as hex byte
b1e5cb
+sequences enclosed between angle brackets.
b1e5cb
+
b1e5cb
+Using the @option{--unicode=highlight} will display the characters as 
b1e5cb
+unicode escape sequences but it will also highlight them in red,
b1e5cb
+assuming that colouring is supported by the output device.  The
b1e5cb
+colouring is intended to draw attention to the presence of unicode
b1e5cb
+sequences when they might not be expected.
b1e5cb
+
b1e5cb
 @item -e
b1e5cb
 @itemx --headers
b1e5cb
 Display all the headers in the file.  Equivalent to @option{-h -l -S}.
b1e5cb
Only in binutils-2.36.1/binutils/doc: binutils.texi.orig
b1e5cb
diff -rup binutils.orig/binutils/nm.c binutils-2.36.1/binutils/nm.c
b1e5cb
--- binutils.orig/binutils/nm.c	2021-10-21 16:56:20.318761391 +0100
b1e5cb
+++ binutils-2.36.1/binutils/nm.c	2021-10-21 16:59:56.105261602 +0100
b1e5cb
@@ -38,6 +38,11 @@
b1e5cb
 #include "bucomm.h"
b1e5cb
 #include "plugin-api.h"
b1e5cb
 #include "plugin.h"
b1e5cb
+#include "safe-ctype.h"
b1e5cb
+
b1e5cb
+#ifndef streq
b1e5cb
+#define streq(a,b) (strcmp ((a),(b)) == 0)
b1e5cb
+#endif
b1e5cb
 
b1e5cb
 /* When sorting by size, we use this structure to hold the size and a
b1e5cb
    pointer to the minisymbol.  */
b1e5cb
@@ -192,6 +197,18 @@ static const char *plugin_target = NULL;
b1e5cb
 static bfd *lineno_cache_bfd;
b1e5cb
 static bfd *lineno_cache_rel_bfd;
b1e5cb
 
b1e5cb
+typedef enum unicode_display_type
b1e5cb
+{
b1e5cb
+  unicode_default = 0,
b1e5cb
+  unicode_locale,
b1e5cb
+  unicode_escape,
b1e5cb
+  unicode_hex,
b1e5cb
+  unicode_highlight,
b1e5cb
+  unicode_invalid
b1e5cb
+} unicode_display_type;
b1e5cb
+
b1e5cb
+static unicode_display_type unicode_display = unicode_default;
b1e5cb
+
b1e5cb
 enum long_option_values
b1e5cb
 {
b1e5cb
   OPTION_TARGET = 200,
b1e5cb
@@ -234,6 +251,7 @@ static struct option long_options[] =
b1e5cb
   {"target", required_argument, 0, OPTION_TARGET},
b1e5cb
   {"defined-only", no_argument, &defined_only, 1},
b1e5cb
   {"undefined-only", no_argument, &undefined_only, 1},
b1e5cb
+  {"unicode", required_argument, NULL, 'U'},
b1e5cb
   {"version", no_argument, &show_version, 1},
b1e5cb
   {"with-symbol-versions", no_argument, NULL,
b1e5cb
    OPTION_WITH_SYMBOL_VERSIONS},
b1e5cb
@@ -285,6 +303,8 @@ usage (FILE *stream, int status)
b1e5cb
   -t, --radix=RADIX      Use RADIX for printing symbol values\n\
b1e5cb
       --target=BFDNAME   Specify the target object format as BFDNAME\n\
b1e5cb
   -u, --undefined-only   Display only undefined symbols\n\
b1e5cb
+  -U {d|l|i|x|e|h}       Specify how to treat UTF-8 encoded unicode characters\n\
b1e5cb
+      --unicode={default|locale|invalid|hex|escape|highlight}\n\
b1e5cb
       --with-symbol-versions  Display version strings after symbol names\n\
b1e5cb
   -X 32_64               (ignored)\n\
b1e5cb
   @FILE                  Read options from FILE\n\
b1e5cb
@@ -400,6 +420,189 @@ get_coff_symbol_type (const struct inter
b1e5cb
   return bufp;
b1e5cb
 }
b1e5cb
 
b1e5cb
+/* Convert a potential UTF-8 encoded sequence in IN into characters in OUT.
b1e5cb
+   The conversion format is controlled by the unicode_display variable.
b1e5cb
+   Returns the number of characters added to OUT.
b1e5cb
+   Returns the number of bytes consumed from IN in CONSUMED.
b1e5cb
+   Always consumes at least one byte and displays at least one character.  */
b1e5cb
+   
b1e5cb
+static unsigned int
b1e5cb
+display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
b1e5cb
+{
b1e5cb
+  char *        orig_out = out;
b1e5cb
+  unsigned int  nchars = 0;
b1e5cb
+
b1e5cb
+  if (unicode_display == unicode_default)
b1e5cb
+    goto invalid;
b1e5cb
+
b1e5cb
+  if (in[0] < 0xc0)
b1e5cb
+    goto invalid;
b1e5cb
+
b1e5cb
+  if ((in[1] & 0xc0) != 0x80)
b1e5cb
+    goto invalid;
b1e5cb
+
b1e5cb
+  if ((in[0] & 0x20) == 0)
b1e5cb
+    {
b1e5cb
+      nchars = 2;
b1e5cb
+      goto valid;
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  if ((in[2] & 0xc0) != 0x80)
b1e5cb
+    goto invalid;
b1e5cb
+
b1e5cb
+  if ((in[0] & 0x10) == 0)
b1e5cb
+    {
b1e5cb
+      nchars = 3;
b1e5cb
+      goto valid;
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  if ((in[3] & 0xc0) != 0x80)
b1e5cb
+    goto invalid;
b1e5cb
+
b1e5cb
+  nchars = 4;
b1e5cb
+
b1e5cb
+ valid:
b1e5cb
+  switch (unicode_display)
b1e5cb
+    {
b1e5cb
+    case unicode_locale:
b1e5cb
+      /* Copy the bytes into the output buffer as is.  */
b1e5cb
+      memcpy (out, in, nchars);
b1e5cb
+      out += nchars;
b1e5cb
+      break;
b1e5cb
+
b1e5cb
+    case unicode_invalid:
b1e5cb
+    case unicode_hex:
b1e5cb
+      {
b1e5cb
+      unsigned int j;
b1e5cb
+
b1e5cb
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '<' : '{');
b1e5cb
+      for (j = 0; j < nchars; j++)
b1e5cb
+	out += sprintf (out, "%02x", in [j]);
b1e5cb
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '>' : '}');
b1e5cb
+      }
b1e5cb
+      break;
b1e5cb
+      
b1e5cb
+    case unicode_highlight:
b1e5cb
+      if (isatty (1))
b1e5cb
+	out += sprintf (out, "\x1B[31;47m"); /* Red.  */
b1e5cb
+      /* Fall through.  */
b1e5cb
+    case unicode_escape:
b1e5cb
+      switch (nchars)
b1e5cb
+	{
b1e5cb
+	case 2:
b1e5cb
+	  out += sprintf (out, "\\u%02x%02x",
b1e5cb
+		  ((in[0] & 0x1c) >> 2), 
b1e5cb
+		  ((in[0] & 0x03) << 6) | (in[1] & 0x3f));
b1e5cb
+	  break;
b1e5cb
+
b1e5cb
+	case 3:
b1e5cb
+	  out += sprintf (out, "\\u%02x%02x",
b1e5cb
+		  ((in[0] & 0x0f) << 4) | ((in[1] & 0x3c) >> 2),
b1e5cb
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3f)));
b1e5cb
+	  break;
b1e5cb
+
b1e5cb
+	case 4:
b1e5cb
+	  out += sprintf (out, "\\u%02x%02x%02x",
b1e5cb
+		  ((in[0] & 0x07) << 6) | ((in[1] & 0x3c) >> 2),
b1e5cb
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3c) >> 2),
b1e5cb
+		  ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
b1e5cb
+	  break;
b1e5cb
+	default:
b1e5cb
+	  /* URG.  */
b1e5cb
+	  break;
b1e5cb
+	}
b1e5cb
+
b1e5cb
+      if (unicode_display == unicode_highlight && isatty (1))
b1e5cb
+	out += sprintf (out, "\033[0m"); /* Default colour.  */
b1e5cb
+      break;
b1e5cb
+
b1e5cb
+    default:
b1e5cb
+      /* URG */
b1e5cb
+      break;
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  * consumed = nchars;
b1e5cb
+  return out - orig_out;
b1e5cb
+
b1e5cb
+ invalid:
b1e5cb
+  /* Not a valid UTF-8 sequence.  */
b1e5cb
+  *out = *in;
b1e5cb
+  * consumed = 1;
b1e5cb
+  return 1;
b1e5cb
+}
b1e5cb
+
b1e5cb
+/* Convert any UTF-8 encoded characters in NAME into the form specified by
b1e5cb
+   unicode_display.  Also converts control characters.  Returns a static
b1e5cb
+   buffer if conversion was necessary.
b1e5cb
+   Code stolen from objdump.c:sanitize_string().  */
b1e5cb
+
b1e5cb
+static const char *
b1e5cb
+convert_utf8 (const char * in)
b1e5cb
+{
b1e5cb
+  static char *  buffer = NULL;
b1e5cb
+  static size_t  buffer_len = 0;
b1e5cb
+  const char *   original = in;
b1e5cb
+  char *         out;
b1e5cb
+
b1e5cb
+  /* Paranoia.  */
b1e5cb
+  if (in == NULL)
b1e5cb
+    return "";
b1e5cb
+
b1e5cb
+  /* See if any conversion is necessary.
b1e5cb
+     In the majority of cases it will not be needed.  */
b1e5cb
+  do
b1e5cb
+    {
b1e5cb
+      unsigned char c = *in++;
b1e5cb
+
b1e5cb
+      if (c == 0)
b1e5cb
+	return original;
b1e5cb
+
b1e5cb
+      if (ISCNTRL (c))
b1e5cb
+	break;
b1e5cb
+
b1e5cb
+      if (unicode_display != unicode_default && c >= 0xc0)
b1e5cb
+	break;
b1e5cb
+    }
b1e5cb
+  while (1);
b1e5cb
+
b1e5cb
+  /* Copy the input, translating as needed.  */
b1e5cb
+  in = original;
b1e5cb
+  if (buffer_len < (strlen (in) * 9))
b1e5cb
+    {
b1e5cb
+      free ((void *) buffer);
b1e5cb
+      buffer_len = strlen (in) * 9;
b1e5cb
+      buffer = xmalloc (buffer_len + 1);
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  out = buffer;
b1e5cb
+  do
b1e5cb
+    {
b1e5cb
+      unsigned char c = *in++;
b1e5cb
+
b1e5cb
+      if (c == 0)
b1e5cb
+	break;
b1e5cb
+
b1e5cb
+      if (ISCNTRL (c))
b1e5cb
+	{
b1e5cb
+	  *out++ = '^';
b1e5cb
+	  *out++ = c + 0x40;
b1e5cb
+	}
b1e5cb
+      else if (unicode_display != unicode_default && c >= 0xc0)
b1e5cb
+	{
b1e5cb
+	  unsigned int num_consumed;
b1e5cb
+
b1e5cb
+	  out += display_utf8 ((const unsigned char *)(in - 1), out, & num_consumed);
b1e5cb
+	  in += num_consumed - 1;
b1e5cb
+	}
b1e5cb
+      else
b1e5cb
+	*out++ = c;
b1e5cb
+    }
b1e5cb
+  while (1);
b1e5cb
+
b1e5cb
+  *out = 0;
b1e5cb
+  return buffer;
b1e5cb
+}
b1e5cb
+
b1e5cb
 /* Print symbol name NAME, read from ABFD, with printf format FORM,
b1e5cb
    demangling it if requested.  */
b1e5cb
 
b1e5cb
@@ -418,6 +621,9 @@ print_symname (const char *form, struct
b1e5cb
 	name = alloc;
b1e5cb
     }
b1e5cb
 
b1e5cb
+  if (unicode_display != unicode_default)
b1e5cb
+    name = convert_utf8 (name);
b1e5cb
+
b1e5cb
   if (info != NULL && info->elfinfo)
b1e5cb
     {
b1e5cb
       const char *version_string;
b1e5cb
@@ -1738,7 +1944,7 @@ main (int argc, char **argv)
b1e5cb
     fatal (_("fatal error: libbfd ABI mismatch"));
b1e5cb
   set_default_bfd_target ();
b1e5cb
 
b1e5cb
-  while ((c = getopt_long (argc, argv, "aABCDef:gHhlnopPrSst:uvVvX:",
b1e5cb
+  while ((c = getopt_long (argc, argv, "aABCDef:gHhlnopPrSst:uU:vVvX:",
b1e5cb
 			   long_options, (int *) 0)) != EOF)
b1e5cb
     {
b1e5cb
       switch (c)
b1e5cb
@@ -1828,6 +2034,24 @@ main (int argc, char **argv)
b1e5cb
 	case 'u':
b1e5cb
 	  undefined_only = 1;
b1e5cb
 	  break;
b1e5cb
+
b1e5cb
+	case 'U':
b1e5cb
+	  if (streq (optarg, "default") || streq (optarg, "d"))
b1e5cb
+	    unicode_display = unicode_default;
b1e5cb
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
b1e5cb
+	    unicode_display = unicode_locale;
b1e5cb
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
b1e5cb
+	    unicode_display = unicode_escape;
b1e5cb
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
b1e5cb
+	    unicode_display = unicode_invalid;
b1e5cb
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
b1e5cb
+	    unicode_display = unicode_hex;
b1e5cb
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
b1e5cb
+	    unicode_display = unicode_highlight;
b1e5cb
+	  else
b1e5cb
+	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
b1e5cb
+	  break;
b1e5cb
+
b1e5cb
 	case 'V':
b1e5cb
 	  show_version = 1;
b1e5cb
 	  break;
b1e5cb
Only in binutils-2.36.1/binutils/: nm.c.orig
b1e5cb
Only in binutils-2.36.1/binutils/: nm.c.rej
b1e5cb
diff -rup binutils.orig/binutils/objdump.c binutils-2.36.1/binutils/objdump.c
b1e5cb
--- binutils.orig/binutils/objdump.c	2021-10-21 16:56:20.320761377 +0100
b1e5cb
+++ binutils-2.36.1/binutils/objdump.c	2021-10-21 16:56:29.695696218 +0100
b1e5cb
@@ -205,6 +205,18 @@ static const struct objdump_private_desc
b1e5cb
 
b1e5cb
 /* The list of detected jumps inside a function.  */
b1e5cb
 static struct jump_info *detected_jumps = NULL;
b1e5cb
+
b1e5cb
+typedef enum unicode_display_type
b1e5cb
+{
b1e5cb
+  unicode_default = 0,
b1e5cb
+  unicode_locale,
b1e5cb
+  unicode_escape,
b1e5cb
+  unicode_hex,
b1e5cb
+  unicode_highlight,
b1e5cb
+  unicode_invalid
b1e5cb
+} unicode_display_type;
b1e5cb
+
b1e5cb
+static unicode_display_type unicode_display = unicode_default;
b1e5cb
 
b1e5cb
 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
b1e5cb
 static void
b1e5cb
@@ -247,6 +259,9 @@ usage (FILE *stream, int status)
b1e5cb
   -r, --reloc              Display the relocation entries in the file\n\
b1e5cb
   -R, --dynamic-reloc      Display the dynamic relocation entries in the file\n\
b1e5cb
   @<file>                  Read options from <file>\n\
b1e5cb
+  -U[d|l|i|x|e|h]          Controls the display of UTF-8 unicode characters\n\
b1e5cb
+  --unicode=[default|locale|invalid|hex|escape|highlight]\n"));
b1e5cb
+      fprintf (stream, _("\
b1e5cb
   -v, --version            Display this program's version number\n\
b1e5cb
   -i, --info               List object formats and architectures supported\n\
b1e5cb
   -H, --help               Display this information\n\
b1e5cb
@@ -395,6 +410,7 @@ static struct option long_options[]=
b1e5cb
   {"stop-address", required_argument, NULL, OPTION_STOP_ADDRESS},
b1e5cb
   {"syms", no_argument, NULL, 't'},
b1e5cb
   {"target", required_argument, NULL, 'b'},
b1e5cb
+  {"unicode", required_argument, NULL, 'U'},
b1e5cb
   {"version", no_argument, NULL, 'V'},
b1e5cb
   {"wide", no_argument, NULL, 'w'},
b1e5cb
   {"prefix", required_argument, NULL, OPTION_PREFIX},
b1e5cb
@@ -414,10 +430,124 @@ nonfatal (const char *msg)
b1e5cb
   bfd_nonfatal (msg);
b1e5cb
   exit_status = 1;
b1e5cb
 }
b1e5cb
+
b1e5cb
+/* Convert a potential UTF-8 encoded sequence in IN into characters in OUT.
b1e5cb
+   The conversion format is controlled by the unicode_display variable.
b1e5cb
+   Returns the number of characters added to OUT.
b1e5cb
+   Returns the number of bytes consumed from IN in CONSUMED.
b1e5cb
+   Always consumes at least one byte and displays at least one character.  */
b1e5cb
+   
b1e5cb
+static unsigned int
b1e5cb
+display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
b1e5cb
+{
b1e5cb
+  char *        orig_out = out;
b1e5cb
+  unsigned int  nchars = 0;
b1e5cb
+
b1e5cb
+  if (unicode_display == unicode_default)
b1e5cb
+    goto invalid;
b1e5cb
+
b1e5cb
+  if (in[0] < 0xc0)
b1e5cb
+    goto invalid;
b1e5cb
+
b1e5cb
+  if ((in[1] & 0xc0) != 0x80)
b1e5cb
+    goto invalid;
b1e5cb
+
b1e5cb
+  if ((in[0] & 0x20) == 0)
b1e5cb
+    {
b1e5cb
+      nchars = 2;
b1e5cb
+      goto valid;
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  if ((in[2] & 0xc0) != 0x80)
b1e5cb
+    goto invalid;
b1e5cb
+
b1e5cb
+  if ((in[0] & 0x10) == 0)
b1e5cb
+    {
b1e5cb
+      nchars = 3;
b1e5cb
+      goto valid;
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  if ((in[3] & 0xc0) != 0x80)
b1e5cb
+    goto invalid;
b1e5cb
+
b1e5cb
+  nchars = 4;
b1e5cb
+
b1e5cb
+ valid:
b1e5cb
+  switch (unicode_display)
b1e5cb
+    {
b1e5cb
+    case unicode_locale:
b1e5cb
+      /* Copy the bytes into the output buffer as is.  */
b1e5cb
+      memcpy (out, in, nchars);
b1e5cb
+      out += nchars;
b1e5cb
+      break;
b1e5cb
+
b1e5cb
+    case unicode_invalid:
b1e5cb
+    case unicode_hex:
b1e5cb
+      {
b1e5cb
+      unsigned int j;
b1e5cb
+
b1e5cb
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '<' : '{');
b1e5cb
+      for (j = 0; j < nchars; j++)
b1e5cb
+	out += sprintf (out, "%02x", in [j]);
b1e5cb
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '>' : '}');
b1e5cb
+      }
b1e5cb
+      break;
b1e5cb
+      
b1e5cb
+    case unicode_highlight:
b1e5cb
+      if (isatty (1))
b1e5cb
+	out += sprintf (out, "\x1B[31;47m"); /* Red.  */
b1e5cb
+      /* Fall through.  */
b1e5cb
+    case unicode_escape:
b1e5cb
+      switch (nchars)
b1e5cb
+	{
b1e5cb
+	case 2:
b1e5cb
+	  out += sprintf (out, "\\u%02x%02x",
b1e5cb
+		  ((in[0] & 0x1c) >> 2), 
b1e5cb
+		  ((in[0] & 0x03) << 6) | (in[1] & 0x3f));
b1e5cb
+	  break;
b1e5cb
+
b1e5cb
+	case 3:
b1e5cb
+	  out += sprintf (out, "\\u%02x%02x",
b1e5cb
+		  ((in[0] & 0x0f) << 4) | ((in[1] & 0x3c) >> 2),
b1e5cb
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3f)));
b1e5cb
+	  break;
b1e5cb
+
b1e5cb
+	case 4:
b1e5cb
+	  out += sprintf (out, "\\u%02x%02x%02x",
b1e5cb
+		  ((in[0] & 0x07) << 6) | ((in[1] & 0x3c) >> 2),
b1e5cb
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3c) >> 2),
b1e5cb
+		  ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
b1e5cb
+	  break;
b1e5cb
+	default:
b1e5cb
+	  /* URG.  */
b1e5cb
+	  break;
b1e5cb
+	}
b1e5cb
+
b1e5cb
+      if (unicode_display == unicode_highlight && isatty (1))
b1e5cb
+	out += sprintf (out, "\033[0m"); /* Default colour.  */
b1e5cb
+      break;
b1e5cb
+
b1e5cb
+    default:
b1e5cb
+      /* URG */
b1e5cb
+      break;
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  * consumed = nchars;
b1e5cb
+  return out - orig_out;
b1e5cb
+
b1e5cb
+ invalid:
b1e5cb
+  /* Not a valid UTF-8 sequence.  */
b1e5cb
+  *out = *in;
b1e5cb
+  * consumed = 1;
b1e5cb
+  return 1;
b1e5cb
+}
b1e5cb
 
b1e5cb
 /* Returns a version of IN with any control characters
b1e5cb
    replaced by escape sequences.  Uses a static buffer
b1e5cb
-   if necessary.  */
b1e5cb
+   if necessary.
b1e5cb
+
b1e5cb
+   If unicode display is enabled, then also handles the
b1e5cb
+   conversion of unicode characters.  */
b1e5cb
 
b1e5cb
 static const char *
b1e5cb
 sanitize_string (const char * in)
b1e5cb
@@ -435,40 +565,50 @@ sanitize_string (const char * in)
b1e5cb
      of cases it will not be needed.  */
b1e5cb
   do
b1e5cb
     {
b1e5cb
-      char c = *in++;
b1e5cb
+      unsigned char c = *in++;
b1e5cb
 
b1e5cb
       if (c == 0)
b1e5cb
 	return original;
b1e5cb
 
b1e5cb
       if (ISCNTRL (c))
b1e5cb
 	break;
b1e5cb
+
b1e5cb
+      if (unicode_display != unicode_default && c >= 0xc0)
b1e5cb
+	break;
b1e5cb
     }
b1e5cb
   while (1);
b1e5cb
 
b1e5cb
   /* Copy the input, translating as needed.  */
b1e5cb
   in = original;
b1e5cb
-  if (buffer_len < (strlen (in) * 2))
b1e5cb
+  if (buffer_len < (strlen (in) * 9))
b1e5cb
     {
b1e5cb
       free ((void *) buffer);
b1e5cb
-      buffer_len = strlen (in) * 2;
b1e5cb
+      buffer_len = strlen (in) * 9;
b1e5cb
       buffer = xmalloc (buffer_len + 1);
b1e5cb
     }
b1e5cb
 
b1e5cb
   out = buffer;
b1e5cb
   do
b1e5cb
     {
b1e5cb
-      char c = *in++;
b1e5cb
+      unsigned char c = *in++;
b1e5cb
 
b1e5cb
       if (c == 0)
b1e5cb
 	break;
b1e5cb
 
b1e5cb
-      if (!ISCNTRL (c))
b1e5cb
-	*out++ = c;
b1e5cb
-      else
b1e5cb
+      if (ISCNTRL (c))
b1e5cb
 	{
b1e5cb
 	  *out++ = '^';
b1e5cb
 	  *out++ = c + 0x40;
b1e5cb
 	}
b1e5cb
+      else if (unicode_display != unicode_default && c >= 0xc0)
b1e5cb
+	{
b1e5cb
+	  unsigned int num_consumed;
b1e5cb
+
b1e5cb
+	  out += display_utf8 ((const unsigned char *)(in - 1), out, & num_consumed);
b1e5cb
+	  in += num_consumed - 1;
b1e5cb
+	}
b1e5cb
+      else
b1e5cb
+	*out++ = c;
b1e5cb
     }
b1e5cb
   while (1);
b1e5cb
 
b1e5cb
@@ -476,7 +616,6 @@ sanitize_string (const char * in)
b1e5cb
   return buffer;
b1e5cb
 }
b1e5cb
 
b1e5cb
-
b1e5cb
 /* Returns TRUE if the specified section should be dumped.  */
b1e5cb
 
b1e5cb
 static bfd_boolean
b1e5cb
@@ -1055,6 +1194,8 @@ objdump_print_symname (bfd *abfd, struct
b1e5cb
 
b1e5cb
   name = sanitize_string (name);
b1e5cb
 
b1e5cb
+  name = sanitize_string (name);
b1e5cb
+
b1e5cb
   if (inf != NULL)
b1e5cb
     {
b1e5cb
       (*inf->fprintf_func) (inf->stream, "%s", name);
b1e5cb
@@ -3136,7 +3277,7 @@ disassemble_section (bfd *abfd, asection
b1e5cb
   if (!bfd_malloc_and_get_section (abfd, section, &data))
b1e5cb
     {
b1e5cb
       non_fatal (_("Reading section %s failed because: %s"),
b1e5cb
-		 section->name, bfd_errmsg (bfd_get_error ()));
b1e5cb
+		 sanitize_string (section->name), bfd_errmsg (bfd_get_error ()));
b1e5cb
       return;
b1e5cb
     }
b1e5cb
 
b1e5cb
@@ -4341,7 +4482,7 @@ dump_section (bfd *abfd, asection *secti
b1e5cb
   if (!bfd_get_full_section_contents (abfd, section, &data))
b1e5cb
     {
b1e5cb
       non_fatal (_("Reading section %s failed because: %s"),
b1e5cb
-		 section->name, bfd_errmsg (bfd_get_error ()));
b1e5cb
+		 sanitize_string (section->name), bfd_errmsg (bfd_get_error ()));
b1e5cb
       return;
b1e5cb
     }
b1e5cb
 
b1e5cb
@@ -4481,6 +4622,24 @@ dump_symbols (bfd *abfd ATTRIBUTE_UNUSED
b1e5cb
 		  free (alloc);
b1e5cb
 		}
b1e5cb
 	    }
b1e5cb
+	  else if (unicode_display != unicode_default
b1e5cb
+		   && name != NULL && *name != '\0')
b1e5cb
+	    {
b1e5cb
+	      const char * sanitized_name;
b1e5cb
+
b1e5cb
+	      /* If we want to sanitize the name, we do it here, and
b1e5cb
+		 temporarily clobber it while calling bfd_print_symbol.
b1e5cb
+		 FIXME: This is a gross hack.  */
b1e5cb
+	      sanitized_name = sanitize_string (name);
b1e5cb
+	      if (sanitized_name != name)
b1e5cb
+		(*current)->name = sanitized_name;
b1e5cb
+	      else
b1e5cb
+		sanitized_name = NULL;
b1e5cb
+	      bfd_print_symbol (cur_bfd, stdout, *current,
b1e5cb
+				bfd_print_symbol_all);
b1e5cb
+	      if (sanitized_name != NULL)
b1e5cb
+		(*current)->name = name;
b1e5cb
+	    }
b1e5cb
 	  else
b1e5cb
 	    bfd_print_symbol (cur_bfd, stdout, *current,
b1e5cb
 			      bfd_print_symbol_all);
b1e5cb
@@ -5162,7 +5321,7 @@ main (int argc, char **argv)
b1e5cb
   set_default_bfd_target ();
b1e5cb
 
b1e5cb
   while ((c = getopt_long (argc, argv,
b1e5cb
-			   "pP:ib:m:M:VvCdDlfFaHhrRtTxsSI:j:wE:zgeGW::",
b1e5cb
+			   "pP:ib:m:M:VvCdDlfFaHhrRtTxsSI:j:wE:zgeGW::U:",
b1e5cb
 			   long_options, (int *) 0))
b1e5cb
 	 != EOF)
b1e5cb
     {
b1e5cb
@@ -5441,6 +5600,23 @@ main (int argc, char **argv)
b1e5cb
 	  seenflag = TRUE;
b1e5cb
 	  break;
b1e5cb
 
b1e5cb
+	case 'U':
b1e5cb
+	  if (streq (optarg, "default") || streq (optarg, "d"))
b1e5cb
+	    unicode_display = unicode_default;
b1e5cb
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
b1e5cb
+	    unicode_display = unicode_locale;
b1e5cb
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
b1e5cb
+	    unicode_display = unicode_escape;
b1e5cb
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
b1e5cb
+	    unicode_display = unicode_invalid;
b1e5cb
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
b1e5cb
+	    unicode_display = unicode_hex;
b1e5cb
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
b1e5cb
+	    unicode_display = unicode_highlight;
b1e5cb
+	  else
b1e5cb
+	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
b1e5cb
+	  break;
b1e5cb
+
b1e5cb
 	case 'H':
b1e5cb
 	  usage (stdout, 0);
b1e5cb
 	  /* No need to set seenflag or to break - usage() does not return.  */
b1e5cb
Only in binutils-2.36.1/binutils/: objdump.c.orig
b1e5cb
diff -rup binutils.orig/binutils/readelf.c binutils-2.36.1/binutils/readelf.c
b1e5cb
--- binutils.orig/binutils/readelf.c	2021-10-21 16:56:20.323761356 +0100
b1e5cb
+++ binutils-2.36.1/binutils/readelf.c	2021-10-21 17:00:54.169858044 +0100
b1e5cb
@@ -321,6 +321,18 @@ typedef enum print_mode
b1e5cb
 }
b1e5cb
 print_mode;
b1e5cb
 
b1e5cb
+typedef enum unicode_display_type
b1e5cb
+{
b1e5cb
+  unicode_locale,
b1e5cb
+  unicode_escape,
b1e5cb
+  unicode_hex,
b1e5cb
+  unicode_highlight,
b1e5cb
+  unicode_invalid
b1e5cb
+} unicode_display_type;
b1e5cb
+
b1e5cb
+static unicode_display_type unicode_display = unicode_locale;
b1e5cb
+
b1e5cb
+  
b1e5cb
 /* Versioned symbol info.  */
b1e5cb
 enum versioned_symbol_info
b1e5cb
 {
b1e5cb
@@ -613,11 +625,18 @@ print_symbol (signed int width, const ch
b1e5cb
       if (c == 0)
b1e5cb
 	break;
b1e5cb
 
b1e5cb
-      /* Do not print control characters directly as they can affect terminal
b1e5cb
-	 settings.  Such characters usually appear in the names generated
b1e5cb
-	 by the assembler for local labels.  */
b1e5cb
-      if (ISCNTRL (c))
b1e5cb
+      if (ISPRINT (c))
b1e5cb
+	{
b1e5cb
+	  putchar (c);
b1e5cb
+	  width_remaining --;
b1e5cb
+	  num_printed ++;
b1e5cb
+	}
b1e5cb
+      else if (ISCNTRL (c))
b1e5cb
 	{
b1e5cb
+	  /* Do not print control characters directly as they can affect terminal
b1e5cb
+	     settings.  Such characters usually appear in the names generated
b1e5cb
+	     by the assembler for local labels.  */
b1e5cb
+
b1e5cb
 	  if (width_remaining < 2)
b1e5cb
 	    break;
b1e5cb
 
b1e5cb
@@ -625,11 +644,135 @@ print_symbol (signed int width, const ch
b1e5cb
 	  width_remaining -= 2;
b1e5cb
 	  num_printed += 2;
b1e5cb
 	}
b1e5cb
-      else if (ISPRINT (c))
b1e5cb
+      else if (c == 0x7f)
b1e5cb
 	{
b1e5cb
-	  putchar (c);
b1e5cb
-	  width_remaining --;
b1e5cb
-	  num_printed ++;
b1e5cb
+	  if (width_remaining < 5)
b1e5cb
+	    break;
b1e5cb
+	  printf ("<DEL>"); /* 5 columns, matching the accounting below.  */
b1e5cb
+	  width_remaining -= 5;
b1e5cb
+	  num_printed += 5;
b1e5cb
+	}
b1e5cb
+      else if (unicode_display != unicode_locale)
b1e5cb
+	{
b1e5cb
+	  /* Display unicode characters as something else.  */
b1e5cb
+	  unsigned char bytes[4];
b1e5cb
+	  bfd_boolean   is_utf8;
b1e5cb
+	  unsigned int  nbytes;
b1e5cb
+
b1e5cb
+	  bytes[0] = c;
b1e5cb
+
b1e5cb
+	  if (bytes[0] < 0xc0)
b1e5cb
+	    {
b1e5cb
+	      nbytes = 1;
b1e5cb
+	      is_utf8 = FALSE;
b1e5cb
+	    }
b1e5cb
+	  else
b1e5cb
+	    {
b1e5cb
+	      bytes[1] = *symbol++;
b1e5cb
+
b1e5cb
+	      if ((bytes[1] & 0xc0) != 0x80)
b1e5cb
+		{
b1e5cb
+		  is_utf8 = FALSE;
b1e5cb
+		  /* Do not consume this character.  It may only
b1e5cb
+		     be the first byte in the sequence that was
b1e5cb
+		     corrupt.  */
b1e5cb
+		  --symbol;
b1e5cb
+		  nbytes = 1;
b1e5cb
+		}
b1e5cb
+	      else if ((bytes[0] & 0x20) == 0)
b1e5cb
+		{
b1e5cb
+		  is_utf8 = TRUE;
b1e5cb
+		  nbytes = 2;
b1e5cb
+		}
b1e5cb
+	      else
b1e5cb
+		{
b1e5cb
+		  bytes[2] = *symbol++;
b1e5cb
+
b1e5cb
+		  if ((bytes[2] & 0xc0) != 0x80)
b1e5cb
+		    {
b1e5cb
+		      is_utf8 = FALSE;
b1e5cb
+		      symbol -= 2;
b1e5cb
+		      nbytes = 1;
b1e5cb
+		    }
b1e5cb
+		  else if ((bytes[0] & 0x10) == 0)
b1e5cb
+		    {
b1e5cb
+		      is_utf8 = TRUE;
b1e5cb
+		      nbytes = 3;
b1e5cb
+		    }
b1e5cb
+		  else
b1e5cb
+		    {
b1e5cb
+		      bytes[3] = *symbol++;
b1e5cb
+
b1e5cb
+		      nbytes = 4;
b1e5cb
+
b1e5cb
+		      if ((bytes[3] & 0xc0) != 0x80)
b1e5cb
+			{
b1e5cb
+			  is_utf8 = FALSE;
b1e5cb
+			  symbol -= 3;
b1e5cb
+			  nbytes = 1;
b1e5cb
+			}
b1e5cb
+		      else
b1e5cb
+			is_utf8 = TRUE;
b1e5cb
+		    }
b1e5cb
+		}
b1e5cb
+	    }
b1e5cb
+
b1e5cb
+	  if (unicode_display == unicode_invalid)
b1e5cb
+	    is_utf8 = FALSE;
b1e5cb
+
b1e5cb
+	  if (unicode_display == unicode_hex || ! is_utf8)
b1e5cb
+	    {
b1e5cb
+	      unsigned int i;
b1e5cb
+
b1e5cb
+	      if (width_remaining < (nbytes * 2) + 2)
b1e5cb
+		break;
b1e5cb
+	  
b1e5cb
+	      putchar (is_utf8 ? '<' : '{');
b1e5cb
+	      for (i = 0; i < nbytes; i++)
b1e5cb
+		printf ("%02x", bytes[i]);
b1e5cb
+	      putchar (is_utf8 ? '>' : '}');
b1e5cb
+	    }
b1e5cb
+	  else
b1e5cb
+	    {
b1e5cb
+	      if (unicode_display == unicode_highlight && isatty (1))
b1e5cb
+		printf ("\x1B[31;47m"); /* Red.  */
b1e5cb
+	      
b1e5cb
+	      switch (nbytes)
b1e5cb
+		{
b1e5cb
+		case 2:
b1e5cb
+		  if (width_remaining < 6)
b1e5cb
+		    break;
b1e5cb
+		  printf ("\\u%02x%02x",
b1e5cb
+			  (bytes[0] & 0x1c) >> 2, 
b1e5cb
+			  ((bytes[0] & 0x03) << 6) | (bytes[1] & 0x3f));
b1e5cb
+		  break;
b1e5cb
+		case 3:
b1e5cb
+		  if (width_remaining < 6)
b1e5cb
+		    break;
b1e5cb
+		  printf ("\\u%02x%02x",
b1e5cb
+			  ((bytes[0] & 0x0f) << 4) | ((bytes[1] & 0x3c) >> 2),
b1e5cb
+			  ((bytes[1] & 0x03) << 6) | (bytes[2] & 0x3f));
b1e5cb
+		  break;
b1e5cb
+		case 4:
b1e5cb
+		  if (width_remaining < 8)
b1e5cb
+		    break;
b1e5cb
+		  printf ("\\u%02x%02x%02x", /* 21-bit code point: 5 + 8 + 8 bits.  */
b1e5cb
+			  ((bytes[0] & 0x07) << 2) | ((bytes[1] & 0x30) >> 4),
b1e5cb
+			  ((bytes[1] & 0x0f) << 4) | ((bytes[2] & 0x3c) >> 2),
b1e5cb
+			  ((bytes[2] & 0x03) << 6) | (bytes[3] & 0x3f));
b1e5cb
+		  
b1e5cb
+		  break;
b1e5cb
+		default:
b1e5cb
+		  /* URG.  */
b1e5cb
+		  break;
b1e5cb
+		}
b1e5cb
+
b1e5cb
+	      if (unicode_display == unicode_highlight && isatty (1))
b1e5cb
+		printf ("\033[0m"); /* Default colour.  */
b1e5cb
+	    }
b1e5cb
+	  
b1e5cb
+	  if (bytes[nbytes - 1] == 0)
b1e5cb
+	    break;
b1e5cb
 	}
b1e5cb
       else
b1e5cb
 	{
b1e5cb
@@ -4555,6 +4698,7 @@ static struct option options[] =
b1e5cb
   {"syms",	       no_argument, 0, 's'},
b1e5cb
   {"silent-truncation",no_argument, 0, 'T'},
b1e5cb
   {"section-details",  no_argument, 0, 't'},
b1e5cb
+  {"unicode",          required_argument, 0, 'U'},
b1e5cb
   {"unwind",	       no_argument, 0, 'u'},
b1e5cb
   {"version-info",     no_argument, 0, 'V'},
b1e5cb
   {"version",	       no_argument, 0, 'v'},
b1e5cb
@@ -4652,6 +4796,11 @@ usage (FILE * stream)
b1e5cb
 #endif
b1e5cb
   fprintf (stream, _("\
b1e5cb
   -I --histogram         Display histogram of bucket list lengths\n\
b1e5cb
+  -U --unicode=[locale|escape|hex|highlight|invalid]\n\
b1e5cb
+                         Display unicode characters as determined by the current locale\n\
b1e5cb
+                          (default), escape sequences, \"<hex sequences>\", highlighted\n\
b1e5cb
+                          escape sequences, or treat them as invalid and display as\n\
b1e5cb
+                          \"{hex sequences}\"\n\
b1e5cb
   -W --wide              Allow output width to exceed 80 characters\n\
b1e5cb
   -T --silent-truncation If a symbol name is truncated, do not add a suffix [...]\n\
b1e5cb
   @<file>                Read options from <file>\n\
b1e5cb
@@ -4748,7 +4897,7 @@ parse_args (struct dump_data *dumpdata,
b1e5cb
     usage (stderr);
b1e5cb
 
b1e5cb
   while ((c = getopt_long
b1e5cb
-	  (argc, argv, "ACDHILNR:STVWacdeghi:lnp:rstuvw::x:z", options, NULL)) != EOF)
b1e5cb
+	  (argc, argv, "ACDHILNR:STU:VWacdeghi:lnp:rstuvw::x:z", options, NULL)) != EOF)
b1e5cb
     {
b1e5cb
       switch (c)
b1e5cb
 	{
b1e5cb
@@ -4905,6 +5054,25 @@ parse_args (struct dump_data *dumpdata,
b1e5cb
 	  request_dump (dumpdata, DISASS_DUMP);
b1e5cb
 	  break;
b1e5cb
 #endif
b1e5cb
+	case 'U':
b1e5cb
+	  if (optarg == NULL)
b1e5cb
+	    error (_("Missing arg to -U/--unicode")); /* Can this happen ?  */
b1e5cb
+	  else if (streq (optarg, "default") || streq (optarg, "d"))
b1e5cb
+	    unicode_display = unicode_locale;
b1e5cb
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
b1e5cb
+	    unicode_display = unicode_locale;
b1e5cb
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
b1e5cb
+	    unicode_display = unicode_escape;
b1e5cb
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
b1e5cb
+	    unicode_display = unicode_invalid;
b1e5cb
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
b1e5cb
+	    unicode_display = unicode_hex;
b1e5cb
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
b1e5cb
+	    unicode_display = unicode_highlight;
b1e5cb
+	  else
b1e5cb
+	    error (_("unknown argument to -U/--unicode: %s"), optarg);
b1e5cb
+	  break;
b1e5cb
+
b1e5cb
 	case 'v':
b1e5cb
 	  print_version (program_name);
b1e5cb
 	  break;
b1e5cb
Only in binutils-2.36.1/binutils/: readelf.c.orig
b1e5cb
Only in binutils-2.36.1/binutils/: readelf.c.rej
b1e5cb
diff -rup binutils.orig/binutils/strings.c binutils-2.36.1/binutils/strings.c
b1e5cb
--- binutils.orig/binutils/strings.c	2021-10-21 16:56:20.321761370 +0100
b1e5cb
+++ binutils-2.36.1/binutils/strings.c	2021-10-21 16:56:29.698696197 +0100
b1e5cb
@@ -55,6 +55,19 @@
b1e5cb
    -T {bfdname}
b1e5cb
 		Specify a non-default object file format.
b1e5cb
 
b1e5cb
+  --unicode={default|locale|invalid|hex|escape|highlight}
b1e5cb
+  -U {d|l|i|x|e|h}
b1e5cb
+                Determine how to handle UTF-8 unicode characters.  The default
b1e5cb
+		is no special treatment.  All other versions of this option
b1e5cb
+		only apply if the encoding is valid and enabling the option
b1e5cb
+		implies --encoding=S.
b1e5cb
+		The 'locale' option displays the characters according to the
b1e5cb
+		current locale.  The 'invalid' option treats them as
b1e5cb
+		non-string characters.  The 'hex' option displays them as hex
b1e5cb
+		byte sequences.  The 'escape' option displays them as escape
b1e5cb
+		sequences and the 'highlight' option displays them as
b1e5cb
+		coloured escape sequences.
b1e5cb
+
b1e5cb
   --output-separator=sep_string
b1e5cb
   -s sep_string	String used to separate parsed strings in output.
b1e5cb
 		Default is newline.
b1e5cb
@@ -76,6 +89,22 @@
b1e5cb
 #include "safe-ctype.h"
b1e5cb
 #include "bucomm.h"
b1e5cb
 
b1e5cb
+#ifndef streq
b1e5cb
+#define streq(a,b) (strcmp ((a),(b)) == 0)
b1e5cb
+#endif
b1e5cb
+
b1e5cb
+typedef enum unicode_display_type
b1e5cb
+{
b1e5cb
+  unicode_default = 0,
b1e5cb
+  unicode_locale,
b1e5cb
+  unicode_escape,
b1e5cb
+  unicode_hex,
b1e5cb
+  unicode_highlight,
b1e5cb
+  unicode_invalid
b1e5cb
+} unicode_display_type;
b1e5cb
+
b1e5cb
+static unicode_display_type unicode_display = unicode_default;
b1e5cb
+
b1e5cb
 #define STRING_ISGRAPHIC(c) \
b1e5cb
       (   (c) >= 0 \
b1e5cb
        && (c) <= 255 \
b1e5cb
@@ -94,7 +123,7 @@ extern int errno;
b1e5cb
 static int address_radix;
b1e5cb
 
b1e5cb
 /* Minimum length of sequence of graphic chars to trigger output.  */
b1e5cb
-static int string_min;
b1e5cb
+static unsigned int string_min;
b1e5cb
 
b1e5cb
 /* Whether or not we include all whitespace as a graphic char.   */
b1e5cb
 static bfd_boolean include_all_whitespace;
b1e5cb
@@ -130,6 +159,7 @@ static struct option long_options[] =
b1e5cb
   {"target", required_argument, NULL, 'T'},
b1e5cb
   {"output-separator", required_argument, NULL, 's'},
b1e5cb
   {"help", no_argument, NULL, 'h'},
b1e5cb
+  {"unicode", required_argument, NULL, 'U'},
b1e5cb
   {"version", no_argument, NULL, 'v'},
b1e5cb
   {NULL, 0, NULL, 0}
b1e5cb
 };
b1e5cb
@@ -173,7 +203,7 @@ main (int argc, char **argv)
b1e5cb
   encoding = 's';
b1e5cb
   output_separator = NULL;
b1e5cb
 
b1e5cb
-  while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:Vv0123456789",
b1e5cb
+  while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
b1e5cb
 			      long_options, (int *) 0)) != EOF)
b1e5cb
     {
b1e5cb
       switch (optc)
b1e5cb
@@ -246,6 +276,23 @@ main (int argc, char **argv)
b1e5cb
 	  output_separator = optarg;
b1e5cb
           break;
b1e5cb
 
b1e5cb
+	case 'U':
b1e5cb
+	  if (streq (optarg, "default") || streq (optarg, "d"))
b1e5cb
+	    unicode_display = unicode_default;
b1e5cb
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
b1e5cb
+	    unicode_display = unicode_locale;
b1e5cb
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
b1e5cb
+	    unicode_display = unicode_escape;
b1e5cb
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
b1e5cb
+	    unicode_display = unicode_invalid;
b1e5cb
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
b1e5cb
+	    unicode_display = unicode_hex;
b1e5cb
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
b1e5cb
+	    unicode_display = unicode_highlight;
b1e5cb
+	  else
b1e5cb
+	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
b1e5cb
+	  break;
b1e5cb
+
b1e5cb
 	case 'V':
b1e5cb
 	case 'v':
b1e5cb
 	  print_version ("strings");
b1e5cb
@@ -260,6 +307,9 @@ main (int argc, char **argv)
b1e5cb
 	}
b1e5cb
     }
b1e5cb
 
b1e5cb
+  if (unicode_display != unicode_default)
b1e5cb
+    encoding = 'S';
b1e5cb
+
b1e5cb
   if (numeric_opt != 0)
b1e5cb
     {
b1e5cb
       string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
b1e5cb
@@ -553,11 +603,629 @@ unget_part_char (long c, file_ptr *addre
b1e5cb
 	}
b1e5cb
     }
b1e5cb
 }
b1e5cb
+
b1e5cb
+static void
b1e5cb
+print_filename_and_address (const char * filename, file_ptr address)
b1e5cb
+{
b1e5cb
+  if (print_filenames)
b1e5cb
+    printf ("%s: ", filename);
b1e5cb
+
b1e5cb
+  if (! print_addresses)
b1e5cb
+    return;
b1e5cb
+
b1e5cb
+  switch (address_radix)
b1e5cb
+    {
b1e5cb
+    case 8:
b1e5cb
+      if (sizeof (address) > sizeof (long))
b1e5cb
+	{
b1e5cb
+#ifndef __MSVCRT__
b1e5cb
+	  printf ("%7llo ", (unsigned long long) address);
b1e5cb
+#else
b1e5cb
+	  printf ("%7I64o ", (unsigned long long) address);
b1e5cb
+#endif
b1e5cb
+	}
b1e5cb
+      else
b1e5cb
+	printf ("%7lo ", (unsigned long) address);
b1e5cb
+      break;
b1e5cb
+
b1e5cb
+    case 10:
b1e5cb
+      if (sizeof (address) > sizeof (long))
b1e5cb
+	{
b1e5cb
+#ifndef __MSVCRT__
b1e5cb
+	  printf ("%7llu ", (unsigned long long) address);
b1e5cb
+#else
b1e5cb
+	  printf ("%7I64d ", (unsigned long long) address);
b1e5cb
+#endif
b1e5cb
+	}
b1e5cb
+      else
b1e5cb
+	printf ("%7ld ", (long) address);
b1e5cb
+      break;
b1e5cb
+
b1e5cb
+    case 16:
b1e5cb
+      if (sizeof (address) > sizeof (long))
b1e5cb
+	{
b1e5cb
+#ifndef __MSVCRT__
b1e5cb
+	  printf ("%7llx ", (unsigned long long) address);
b1e5cb
+#else
b1e5cb
+	  printf ("%7I64x ", (unsigned long long) address);
b1e5cb
+#endif
b1e5cb
+	}
b1e5cb
+      else
b1e5cb
+	printf ("%7lx ", (unsigned long) address);
b1e5cb
+      break;
b1e5cb
+    }
b1e5cb
+}
b1e5cb
+
b1e5cb
+/* Return non-zero if the bytes starting at BUFFER form a valid UTF-8 encoding.
b1e5cb
+   If the encoding is valid then returns the number of bytes it uses.  */
b1e5cb
+
b1e5cb
+static unsigned int
b1e5cb
+is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
b1e5cb
+{
b1e5cb
+  if (buffer[0] < 0xc0)
b1e5cb
+    return 0;
b1e5cb
+
b1e5cb
+  if (buflen < 2)
b1e5cb
+    return 0;
b1e5cb
+
b1e5cb
+  if ((buffer[1] & 0xc0) != 0x80)
b1e5cb
+    return 0;
b1e5cb
+
b1e5cb
+  if ((buffer[0] & 0x20) == 0)
b1e5cb
+    return 2;
b1e5cb
+
b1e5cb
+  if (buflen < 3)
b1e5cb
+    return 0;
b1e5cb
+
b1e5cb
+  if ((buffer[2] & 0xc0) != 0x80)
b1e5cb
+    return 0;
b1e5cb
+  
b1e5cb
+  if ((buffer[0] & 0x10) == 0)
b1e5cb
+    return 3;
b1e5cb
+
b1e5cb
+  if (buflen < 4)
b1e5cb
+    return 0;
b1e5cb
+
b1e5cb
+  if ((buffer[3] & 0xc0) != 0x80)
b1e5cb
+    return 0;
b1e5cb
+
b1e5cb
+  return 4;
b1e5cb
+}
b1e5cb
+
b1e5cb
+/* Display a UTF-8 encoded character in BUFFER according to the setting
b1e5cb
+   of unicode_display.  The character is known to be valid.
b1e5cb
+   Returns the number of bytes consumed.  */
b1e5cb
+
b1e5cb
+static unsigned int
b1e5cb
+display_utf8_char (const unsigned char * buffer)
b1e5cb
+{
b1e5cb
+  unsigned int j;
b1e5cb
+  unsigned int utf8_len;
b1e5cb
+  /* Bits 5-4 of the lead byte select the length of the sequence.  */
b1e5cb
+  switch (buffer[0] & 0x30)
b1e5cb
+    {
b1e5cb
+    case 0x00:
b1e5cb
+    case 0x10:
b1e5cb
+      utf8_len = 2;
b1e5cb
+      break;
b1e5cb
+    case 0x20:
b1e5cb
+      utf8_len = 3;
b1e5cb
+      break;
b1e5cb
+    default:
b1e5cb
+      utf8_len = 4;
b1e5cb
+    }
b1e5cb
+  /* Emit the character in the style selected by unicode_display.  */
b1e5cb
+  switch (unicode_display)
b1e5cb
+    {
b1e5cb
+    default:
b1e5cb
+      fprintf (stderr, "ICE: unexpected unicode display type\n");
b1e5cb
+      break;
b1e5cb
+
b1e5cb
+    case unicode_escape:
b1e5cb
+    case unicode_highlight:
b1e5cb
+      if (unicode_display == unicode_highlight && isatty (1))
b1e5cb
+	printf ("\x1B[31;47m"); /* Red.  */
b1e5cb
+
b1e5cb
+      switch (utf8_len)
b1e5cb
+	{
b1e5cb
+	case 2:
b1e5cb
+	  printf ("\\u%02x%02x",
b1e5cb
+		  ((buffer[0] & 0x1c) >> 2),
b1e5cb
+		  ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
b1e5cb
+	  break;
b1e5cb
+
b1e5cb
+	case 3:
b1e5cb
+	  printf ("\\u%02x%02x",
b1e5cb
+		  ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
b1e5cb
+		  ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
b1e5cb
+	  break;
b1e5cb
+
b1e5cb
+	case 4:
b1e5cb
+	  printf ("\\u%02x%02x%02x", /* 21-bit code point: 5 + 8 + 8 bits.  */
b1e5cb
+		  ((buffer[0] & 0x07) << 2) | ((buffer[1] & 0x30) >> 4),
b1e5cb
+		  ((buffer[1] & 0x0f) << 4) | ((buffer[2] & 0x3c) >> 2),
b1e5cb
+		  ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
b1e5cb
+	  break;
b1e5cb
+	default:
b1e5cb
+	  /* Not reached: utf8_len is always 2, 3 or 4 here.  */
b1e5cb
+	  break;
b1e5cb
+	}
b1e5cb
+
b1e5cb
+      if (unicode_display == unicode_highlight && isatty (1))
b1e5cb
+	printf ("\033[0m"); /* Default colour.  */
b1e5cb
+      break;
b1e5cb
+
b1e5cb
+    case unicode_hex:
b1e5cb
+      putchar ('<');
b1e5cb
+      for (j = 0; j < utf8_len; j++)
b1e5cb
+	printf ("%02x", buffer [j]);
b1e5cb
+      putchar ('>');
b1e5cb
+      break;
b1e5cb
+
b1e5cb
+    case unicode_locale:
b1e5cb
+      printf ("%.*s", (int) utf8_len, (const char *) buffer); /* Whole sequence, not just the lead byte.  */
b1e5cb
+      break;
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  return utf8_len;
b1e5cb
+}
b1e5cb
+
b1e5cb
+/* Display strings in BUFFER.  Treat any UTF-8 encoded characters encountered
b1e5cb
+   according to the setting of the unicode_display variable.  The buffer
b1e5cb
+   contains BUFLEN bytes.
b1e5cb
+
b1e5cb
+   Display the characters as if they started at ADDRESS and are contained in
b1e5cb
+   FILENAME.  */
b1e5cb
+
b1e5cb
+static void
b1e5cb
+print_unicode_buffer (const char *            filename,
b1e5cb
+		      file_ptr                address,
b1e5cb
+		      const unsigned char *   buffer,
b1e5cb
+		      unsigned long           buflen)
b1e5cb
+{
b1e5cb
+  /* Paranoia checks...  */
b1e5cb
+  if (filename == NULL
b1e5cb
+      || buffer == NULL
b1e5cb
+      || unicode_display == unicode_default
b1e5cb
+      || encoding != 'S'
b1e5cb
+      || encoding_bytes != 1)
b1e5cb
+    {
b1e5cb
+      fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
b1e5cb
+      return;
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  if (buflen == 0)
b1e5cb
+    return;
b1e5cb
+
b1e5cb
+  /* We must only display strings that are at least string_min *characters*
b1e5cb
+     long.  So we scan the buffer in two stages.  First we locate the start
b1e5cb
+     of a potential string.  Then we walk along it until we have found
b1e5cb
+     string_min characters.  Then we go back to the start point and start
b1e5cb
+     displaying characters according to the unicode_display setting.  */
b1e5cb
+
b1e5cb
+  unsigned long start_point = 0;
b1e5cb
+  unsigned long i = 0;
b1e5cb
+  unsigned int char_len = 1;
b1e5cb
+  unsigned int num_found = 0;
b1e5cb
+
b1e5cb
+  for (i = 0; i < buflen; i += char_len)
b1e5cb
+    {
b1e5cb
+      int c = buffer[i];
b1e5cb
+
b1e5cb
+      char_len = 1;
b1e5cb
+
b1e5cb
+      /* Find the first potential character of a string.  */
b1e5cb
+      if (! STRING_ISGRAPHIC (c))
b1e5cb
+	{
b1e5cb
+	  num_found = 0;
b1e5cb
+	  continue;
b1e5cb
+	}
b1e5cb
+
b1e5cb
+      if (c > 126)
b1e5cb
+	{
b1e5cb
+	  if (c < 0xc0)
b1e5cb
+	    {
b1e5cb
+	      num_found = 0;
b1e5cb
+	      continue;
b1e5cb
+	    }
b1e5cb
+
b1e5cb
+	  if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
b1e5cb
+	    {
b1e5cb
+	      char_len = 1;
b1e5cb
+	      num_found = 0;
b1e5cb
+	      continue;
b1e5cb
+	    }
b1e5cb
+
b1e5cb
+	  if (unicode_display == unicode_invalid)
b1e5cb
+	    {
b1e5cb
+	      /* We have found a valid UTF-8 character, but we treat it as non-graphic.  */
b1e5cb
+	      num_found = 0;
b1e5cb
+	      continue;
b1e5cb
+	    }
b1e5cb
+	}
b1e5cb
+
b1e5cb
+      if (num_found == 0)
b1e5cb
+	/* We have found a potential starting point for a string.  */
b1e5cb
+	start_point = i;
b1e5cb
+
b1e5cb
+      ++ num_found;
b1e5cb
+
b1e5cb
+      if (num_found >= string_min)
b1e5cb
+	break;
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  if (num_found < string_min)
b1e5cb
+    return;
b1e5cb
+
b1e5cb
+  print_filename_and_address (filename, address + start_point);
b1e5cb
+  
b1e5cb
+  /* We have found string_min characters.  Display them and any
b1e5cb
+     more that follow.  */
b1e5cb
+  for (i = start_point; i < buflen; i += char_len)
b1e5cb
+    {
b1e5cb
+      int c = buffer[i];
b1e5cb
+
b1e5cb
+      char_len = 1;
b1e5cb
+
b1e5cb
+      if (! STRING_ISGRAPHIC (c))
b1e5cb
+	break;
b1e5cb
+      else if (c < 127)
b1e5cb
+	putchar (c);
b1e5cb
+      else if (! is_valid_utf8 (buffer + i, buflen - i))
b1e5cb
+	break;
b1e5cb
+      else if (unicode_display == unicode_invalid)
b1e5cb
+	break;
b1e5cb
+      else
b1e5cb
+	char_len = display_utf8_char (buffer + i);
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  if (output_separator)
b1e5cb
+    fputs (output_separator, stdout);
b1e5cb
+  else
b1e5cb
+    putchar ('\n');
b1e5cb
+
b1e5cb
+  /* FIXME: Using tail recursion here is lazy programming...  */
b1e5cb
+  print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
b1e5cb
+}
b1e5cb
+
b1e5cb
+static int
b1e5cb
+get_unicode_byte (FILE *           stream,
b1e5cb
+		  unsigned char *  putback,
b1e5cb
+		  unsigned int *   num_putback,
b1e5cb
+		  unsigned int *   num_read)
b1e5cb
+{
b1e5cb
+  if (* num_putback > 0)
b1e5cb
+    {
b1e5cb
+      * num_putback = * num_putback - 1;
b1e5cb
+      return putback [* num_putback];
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  * num_read = * num_read + 1;
b1e5cb
+
b1e5cb
+#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
b1e5cb
+  return getc_unlocked (stream);
b1e5cb
+#else
b1e5cb
+  return getc (stream);
b1e5cb
+#endif
b1e5cb
+}
b1e5cb
+
b1e5cb
+/* Helper function for print_unicode_stream.  */
b1e5cb
+
b1e5cb
+static void
b1e5cb
+print_unicode_stream_body (const char *     filename,
b1e5cb
+			   file_ptr         address,
b1e5cb
+			   FILE *           stream,
b1e5cb
+			   unsigned char *  putback_buf,
b1e5cb
+			   unsigned int     num_putback,
b1e5cb
+			   unsigned char *  print_buf)
b1e5cb
+{
b1e5cb
+  /* It would be nice if we could just read the stream into a buffer
b1e5cb
+     and then process it with print_unicode_buffer.  But the input
b1e5cb
+     might be huge or it might be time-locked (eg stdin).  So instead
b1e5cb
+     we go one byte at a time...  */
b1e5cb
+
b1e5cb
+  file_ptr start_point = 0;
b1e5cb
+  unsigned int num_read = 0;
b1e5cb
+  unsigned int num_chars = 0;
b1e5cb
+  unsigned int num_print = 0;
b1e5cb
+  int c;
b1e5cb
+
b1e5cb
+  /* Find a series of string_min characters.  Put them into print_buf.  */
b1e5cb
+  do
b1e5cb
+    {
b1e5cb
+      if (num_chars >= string_min)
b1e5cb
+	break;
b1e5cb
+
b1e5cb
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
b1e5cb
+      if (c == EOF)
b1e5cb
+	break;
b1e5cb
+
b1e5cb
+      if (! STRING_ISGRAPHIC (c))
b1e5cb
+	{
b1e5cb
+	  num_chars = num_print = 0;
b1e5cb
+	  continue;
b1e5cb
+	}
b1e5cb
+
b1e5cb
+      if (num_chars == 0)
b1e5cb
+	start_point = num_read - 1;
b1e5cb
+
b1e5cb
+      if (c < 127)
b1e5cb
+	{
b1e5cb
+	  print_buf[num_print] = c;
b1e5cb
+	  num_chars ++;
b1e5cb
+	  num_print ++;
b1e5cb
+	  continue;
b1e5cb
+	}
b1e5cb
+
b1e5cb
+      if (c < 0xc0)
b1e5cb
+	{
b1e5cb
+	  num_chars = num_print = 0;
b1e5cb
+	  continue;
b1e5cb
+	}
b1e5cb
+
b1e5cb
+      /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
b1e5cb
+      char utf8[4];
b1e5cb
+
b1e5cb
+      utf8[0] = c;
b1e5cb
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
b1e5cb
+      if (c == EOF)
b1e5cb
+	break;
b1e5cb
+      utf8[1] = c;
b1e5cb
+
b1e5cb
+      if ((utf8[1] & 0xc0) != 0x80)
b1e5cb
+	{
b1e5cb
+	  /* Invalid UTF-8.  */
b1e5cb
+	  putback_buf[num_putback++] = utf8[1];
b1e5cb
+	  num_chars = num_print = 0;
b1e5cb
+	  continue;
b1e5cb
+	}
b1e5cb
+      else if ((utf8[0] & 0x20) == 0)
b1e5cb
+	{
b1e5cb
+	  /* A valid 2-byte UTF-8 encoding.  */
b1e5cb
+	  if (unicode_display == unicode_invalid)
b1e5cb
+	    {
b1e5cb
+	      putback_buf[num_putback++] = utf8[1];
b1e5cb
+	      num_chars = num_print = 0;
b1e5cb
+	    }
b1e5cb
+	  else
b1e5cb
+	    {
b1e5cb
+	      print_buf[num_print ++] = utf8[0];
b1e5cb
+	      print_buf[num_print ++] = utf8[1];
b1e5cb
+	      num_chars ++;
b1e5cb
+	    }
b1e5cb
+	  continue;
b1e5cb
+	}
b1e5cb
+
b1e5cb
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
b1e5cb
+      if (c == EOF)
b1e5cb
+	break;
b1e5cb
+      utf8[2] = c;
b1e5cb
+
b1e5cb
+      if ((utf8[2] & 0xc0) != 0x80)
b1e5cb
+	{
b1e5cb
+	  /* Invalid UTF-8.  */
b1e5cb
+	  putback_buf[num_putback++] = utf8[2];
b1e5cb
+	  putback_buf[num_putback++] = utf8[1];
b1e5cb
+	  num_chars = num_print = 0;
b1e5cb
+	  continue;
b1e5cb
+	}
b1e5cb
+      else if ((utf8[0] & 0x10) == 0)
b1e5cb
+	{
b1e5cb
+	  /* A valid 3-byte UTF-8 encoding.  */
b1e5cb
+	  if (unicode_display == unicode_invalid)
b1e5cb
+	    {
b1e5cb
+	      putback_buf[num_putback++] = utf8[2];
b1e5cb
+	      putback_buf[num_putback++] = utf8[1];
b1e5cb
+	      num_chars = num_print = 0;
b1e5cb
+	    }
b1e5cb
+	  else
b1e5cb
+	    {
b1e5cb
+	      print_buf[num_print ++] = utf8[0];
b1e5cb
+	      print_buf[num_print ++] = utf8[1];
b1e5cb
+	      print_buf[num_print ++] = utf8[2];
b1e5cb
+	      num_chars ++;
b1e5cb
+	    }
b1e5cb
+	  continue;
b1e5cb
+	}
b1e5cb
+
b1e5cb
+      c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
b1e5cb
+      if (c == EOF)
b1e5cb
+	break;
b1e5cb
+      utf8[3] = c;
b1e5cb
+
b1e5cb
+      if ((utf8[3] & 0xc0) != 0x80)
b1e5cb
+	{
b1e5cb
+	  /* Invalid UTF-8.  */
b1e5cb
+	  putback_buf[num_putback++] = utf8[3];
b1e5cb
+	  putback_buf[num_putback++] = utf8[2];
b1e5cb
+	  putback_buf[num_putback++] = utf8[1];
b1e5cb
+	  num_chars = num_print = 0;
b1e5cb
+	}
b1e5cb
+      /* We have a valid 4-byte UTF-8 encoding.  */
b1e5cb
+      else if (unicode_display == unicode_invalid)
b1e5cb
+	{
b1e5cb
+	  putback_buf[num_putback++] = utf8[3]; /* Pushed last-to-first: get_unicode_byte pops from the end.  */
b1e5cb
+	  putback_buf[num_putback++] = utf8[2];
b1e5cb
+	  putback_buf[num_putback++] = utf8[1];
b1e5cb
+	  num_chars = num_print = 0;
b1e5cb
+	}
b1e5cb
+      else
b1e5cb
+	{
b1e5cb
+	  print_buf[num_print ++] = utf8[0];
b1e5cb
+	  print_buf[num_print ++] = utf8[1];
b1e5cb
+	  print_buf[num_print ++] = utf8[2];
b1e5cb
+	  print_buf[num_print ++] = utf8[3];
b1e5cb
+	  num_chars ++;
b1e5cb
+	}
b1e5cb
+    }
b1e5cb
+  while (1);
b1e5cb
+
b1e5cb
+  if (num_chars >= string_min)
b1e5cb
+    {
b1e5cb
+      /* We know that we have string_min valid characters in print_buf,
b1e5cb
+	 and there may be more to come in the stream.  Start displaying
b1e5cb
+	 them.  */
b1e5cb
+
b1e5cb
+      print_filename_and_address (filename, address + start_point);
b1e5cb
+
b1e5cb
+      unsigned int i;
b1e5cb
+      for (i = 0; i < num_print;)
b1e5cb
+	{
b1e5cb
+	  if (print_buf[i] < 127)
b1e5cb
+	    putchar (print_buf[i++]);
b1e5cb
+	  else
b1e5cb
+	    i += display_utf8_char (print_buf + i);
b1e5cb
+	}
b1e5cb
+
b1e5cb
+      /* OK so now we have to start reading unchecked bytes.  */
b1e5cb
+
b1e5cb
+        /* Keep displaying characters until one that cannot continue the string is found.  */
b1e5cb
+      do
b1e5cb
+	{
b1e5cb
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
b1e5cb
+	  if (c == EOF)
b1e5cb
+	    break;
b1e5cb
+
b1e5cb
+	  if (! STRING_ISGRAPHIC (c))
b1e5cb
+	    break;
b1e5cb
+
b1e5cb
+	  if (c < 127)
b1e5cb
+	    {
b1e5cb
+	      putchar (c);
b1e5cb
+	      continue;
b1e5cb
+	    }
b1e5cb
+
b1e5cb
+	  if (c < 0xc0)
b1e5cb
+	    break;
b1e5cb
+
b1e5cb
+	  /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
b1e5cb
+	  unsigned char utf8[4];
b1e5cb
+
b1e5cb
+	  utf8[0] = c;
b1e5cb
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
b1e5cb
+	  if (c == EOF)
b1e5cb
+	    break;
b1e5cb
+	  utf8[1] = c;
b1e5cb
+
b1e5cb
+	  if ((utf8[1] & 0xc0) != 0x80)
b1e5cb
+	    {
b1e5cb
+	      /* Invalid UTF-8.  */
b1e5cb
+	      putback_buf[num_putback++] = utf8[1];
b1e5cb
+	      break;
b1e5cb
+	    }
b1e5cb
+	  else if ((utf8[0] & 0x20) == 0)
b1e5cb
+	    {
b1e5cb
+	      /* Valid 2-byte UTF-8.  */
b1e5cb
+	      if (unicode_display == unicode_invalid)
b1e5cb
+		{
b1e5cb
+		  putback_buf[num_putback++] = utf8[1];
b1e5cb
+		  break;
b1e5cb
+		}
b1e5cb
+	      else
b1e5cb
+		{
b1e5cb
+		  (void) display_utf8_char (utf8);
b1e5cb
+		  continue;
b1e5cb
+		}
b1e5cb
+	    }
b1e5cb
+
b1e5cb
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
b1e5cb
+	  if (c == EOF)
b1e5cb
+	    break;
b1e5cb
+	  utf8[2] = c;
b1e5cb
+
b1e5cb
+	  if ((utf8[2] & 0xc0) != 0x80)
b1e5cb
+	    {
b1e5cb
+	      /* Invalid UTF-8.  */
b1e5cb
+	      putback_buf[num_putback++] = utf8[2];
b1e5cb
+	      putback_buf[num_putback++] = utf8[1];
b1e5cb
+	      break;
b1e5cb
+	    }
b1e5cb
+	  else if ((utf8[0] & 0x10) == 0)
b1e5cb
+	    {
b1e5cb
+	      /* Valid 3-byte UTF-8.  */
b1e5cb
+	      if (unicode_display == unicode_invalid)
b1e5cb
+		{
b1e5cb
+		  putback_buf[num_putback++] = utf8[2];
b1e5cb
+		  putback_buf[num_putback++] = utf8[1];
b1e5cb
+		  break;
b1e5cb
+		}
b1e5cb
+	      else
b1e5cb
+		{
b1e5cb
+		  (void) display_utf8_char (utf8);
b1e5cb
+		  continue;
b1e5cb
+		}
b1e5cb
+	    }
b1e5cb
+
b1e5cb
+	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
b1e5cb
+	  if (c == EOF)
b1e5cb
+	    break;
b1e5cb
+	  utf8[3] = c;
b1e5cb
+
b1e5cb
+	  if ((utf8[3] & 0xc0) != 0x80)
b1e5cb
+	    {
b1e5cb
+	      /* Invalid UTF-8.  */
b1e5cb
+	      putback_buf[num_putback++] = utf8[3];
b1e5cb
+	      putback_buf[num_putback++] = utf8[2];
b1e5cb
+	      putback_buf[num_putback++] = utf8[1];
b1e5cb
+	      break;
b1e5cb
+	    }
b1e5cb
+	  else if (unicode_display == unicode_invalid)
b1e5cb
+	    {
b1e5cb
+	      putback_buf[num_putback++] = utf8[3];
b1e5cb
+	      putback_buf[num_putback++] = utf8[2];
b1e5cb
+	      putback_buf[num_putback++] = utf8[1];
b1e5cb
+	      break;
b1e5cb
+	    }
b1e5cb
+	  else
b1e5cb
+	    /* A valid 4-byte UTF-8 encoding.  */
b1e5cb
+	    (void) display_utf8_char (utf8);
b1e5cb
+	}
b1e5cb
+      while (1);
b1e5cb
+
b1e5cb
+      if (output_separator)
b1e5cb
+	fputs (output_separator, stdout);
b1e5cb
+      else
b1e5cb
+	putchar ('\n');
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  if (c != EOF)
b1e5cb
+    /* FIXME: Using tail recursion here is lazy, but it works.  */
b1e5cb
+    print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
b1e5cb
+}
b1e5cb
+
b1e5cb
+/* Display strings read in from STREAM.  Treat any UTF-8 encoded characters
b1e5cb
+   encountered according to the setting of the unicode_display variable.
b1e5cb
+   The stream is positioned at ADDRESS and is attached to FILENAME.  */
b1e5cb
+
b1e5cb
+static void
b1e5cb
+print_unicode_stream (const char * filename,
b1e5cb
+		      file_ptr     address,
b1e5cb
+		      FILE *       stream)
b1e5cb
+{
b1e5cb
+  /* Paranoia checks...  */
b1e5cb
+  if (filename == NULL
b1e5cb
+      || stream == NULL
b1e5cb
+      || unicode_display == unicode_default
b1e5cb
+      || encoding != 'S'
b1e5cb
+      || encoding_bytes != 1)
b1e5cb
+    {
b1e5cb
+      fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
b1e5cb
+      return;
b1e5cb
+    }
b1e5cb
+
b1e5cb
+  /* Allocate space for string_min 4-byte utf-8 characters.  */
b1e5cb
+  unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
b1e5cb
+  /* We should never have to put back more than 4 bytes.  */
b1e5cb
+  unsigned char putback_buf[5];
b1e5cb
+  unsigned int num_putback = 0;
b1e5cb
+
b1e5cb
+  print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
b1e5cb
+  free (print_buf);
b1e5cb
+}
b1e5cb
 
b1e5cb
 /* Find the strings in file FILENAME, read from STREAM.
b1e5cb
    Assume that STREAM is positioned so that the next byte read
b1e5cb
    is at address ADDRESS in the file.
b1e5cb
-   Stop reading at address STOP_POINT in the file, if nonzero.
b1e5cb
 
b1e5cb
    If STREAM is NULL, do not read from it.
b1e5cb
    The caller can supply a buffer of characters
b1e5cb
@@ -570,18 +1238,27 @@ static void
b1e5cb
 print_strings (const char *filename, FILE *stream, file_ptr address,
b1e5cb
 	       int stop_point, int magiccount, char *magic)
b1e5cb
 {
b1e5cb
+  if (unicode_display != unicode_default)
b1e5cb
+    {
b1e5cb
+      if (magic != NULL)
b1e5cb
+	print_unicode_buffer (filename, address,
b1e5cb
+			      (const unsigned char *) magic, magiccount);
b1e5cb
+
b1e5cb
+      if (stream != NULL)
b1e5cb
+	print_unicode_stream (filename, address, stream);
b1e5cb
+      return;
b1e5cb
+    }
b1e5cb
+
b1e5cb
   char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
b1e5cb
 
b1e5cb
   while (1)
b1e5cb
     {
b1e5cb
       file_ptr start;
b1e5cb
-      int i;
b1e5cb
+      unsigned int i;
b1e5cb
       long c;
b1e5cb
 
b1e5cb
       /* See if the next `string_min' chars are all graphic chars.  */
b1e5cb
     tryline:
b1e5cb
-      if (stop_point && address >= stop_point)
b1e5cb
-	break;
b1e5cb
       start = address;
b1e5cb
       for (i = 0; i < string_min; i++)
b1e5cb
 	{
b1e5cb
@@ -718,6 +1395,8 @@ usage (FILE *stream, int status)
b1e5cb
   -T --target=<BFDNAME>     Specify the binary file format\n\
b1e5cb
   -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
b1e5cb
                             s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
b1e5cb
+  --unicode={default|locale|invalid|hex|escape|highlight}\n\
b1e5cb
+  -U {d|l|i|x|e|h}          Specify how to treat UTF-8 encoded unicode characters\n\
b1e5cb
   -s --output-separator=<string> String used to separate strings in output.\n\
b1e5cb
   @<file>                   Read options from <file>\n\
b1e5cb
   -h --help                 Display this information\n\