Blame SOURCES/expat-2.2.10-Add-missing-validation-of-encoding.patch

4e0c08
From ee2a5b50e7d1940ba8745715b62ceb9efd3a96da Mon Sep 17 00:00:00 2001
4e0c08
From: Sebastian Pipping <sebastian@pipping.org>
4e0c08
Date: Tue, 8 Feb 2022 17:37:14 +0100
4e0c08
Subject: [PATCH 1/5] lib: Drop unused macro UTF8_GET_NAMING
4e0c08
4e0c08
---
4e0c08
 expat/lib/xmltok.c | 5 -----
4e0c08
 1 file changed, 5 deletions(-)
4e0c08
4e0c08
diff --git a/lib/xmltok.c b/lib/xmltok.c
4e0c08
index a72200e8..3bddf125 100644
4e0c08
--- a/lib/xmltok.c
4e0c08
+++ b/lib/xmltok.c
4e0c08
@@ -98,11 +98,6 @@
4e0c08
         + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
4e0c08
    & (1u << (((byte)[2]) & 0x1F)))
4e0c08
 
4e0c08
-#define UTF8_GET_NAMING(pages, p, n)                                           \
4e0c08
-  ((n) == 2                                                                    \
4e0c08
-       ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p))                   \
4e0c08
-       : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0))
4e0c08
-
4e0c08
 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
4e0c08
    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
4e0c08
    with the additional restriction of not allowing the Unicode
4e0c08
4e0c08
From 3f0a0cb644438d4d8e3294cd0b1245d0edb0c6c6 Mon Sep 17 00:00:00 2001
4e0c08
From: Sebastian Pipping <sebastian@pipping.org>
4e0c08
Date: Tue, 8 Feb 2022 04:32:20 +0100
4e0c08
Subject: [PATCH 2/5] lib: Add missing validation of encoding (CVE-2022-25235)
4e0c08
4e0c08
---
4e0c08
 expat/lib/xmltok_impl.c | 8 ++++++--
4e0c08
 1 file changed, 6 insertions(+), 2 deletions(-)
4e0c08
4e0c08
diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c
4e0c08
index 0430591b..64a3b2c1 100644
4e0c08
--- a/lib/xmltok_impl.c
4e0c08
+++ b/lib/xmltok_impl.c
4e0c08
@@ -69,7 +69,7 @@
4e0c08
   case BT_LEAD##n:                                                             \
4e0c08
     if (end - ptr < n)                                                         \
4e0c08
       return XML_TOK_PARTIAL_CHAR;                                             \
4e0c08
-    if (! IS_NAME_CHAR(enc, ptr, n)) {                                         \
4e0c08
+    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {         \
4e0c08
       *nextTokPtr = ptr;                                                       \
4e0c08
       return XML_TOK_INVALID;                                                  \
4e0c08
     }                                                                          \
4e0c08
@@ -98,7 +98,7 @@
4e0c08
   case BT_LEAD##n:                                                             \
4e0c08
     if (end - ptr < n)                                                         \
4e0c08
       return XML_TOK_PARTIAL_CHAR;                                             \
4e0c08
-    if (! IS_NMSTRT_CHAR(enc, ptr, n)) {                                       \
4e0c08
+    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {       \
4e0c08
       *nextTokPtr = ptr;                                                       \
4e0c08
       return XML_TOK_INVALID;                                                  \
4e0c08
     }                                                                          \
4e0c08
@@ -1142,6 +1142,10 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
4e0c08
   case BT_LEAD##n:                                                             \
4e0c08
     if (end - ptr < n)                                                         \
4e0c08
       return XML_TOK_PARTIAL_CHAR;                                             \
4e0c08
+    if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
4e0c08
+      *nextTokPtr = ptr;                                                       \
4e0c08
+      return XML_TOK_INVALID;                                                  \
4e0c08
+    }                                                                          \
4e0c08
     if (IS_NMSTRT_CHAR(enc, ptr, n)) {                                         \
4e0c08
       ptr += n;                                                                \
4e0c08
       tok = XML_TOK_NAME;                                                      \
4e0c08
4e0c08
From c85a3025e7a1be086dc34e7559fbc543914d047f Mon Sep 17 00:00:00 2001
4e0c08
From: Sebastian Pipping <sebastian@pipping.org>
4e0c08
Date: Wed, 9 Feb 2022 01:00:38 +0100
4e0c08
Subject: [PATCH 3/5] lib: Add comments to BT_LEAD* cases where encoding has
4e0c08
 already been validated
4e0c08
4e0c08
---
4e0c08
 expat/lib/xmltok_impl.c | 10 +++++-----
4e0c08
 1 file changed, 5 insertions(+), 5 deletions(-)
4e0c08
4e0c08
diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c
4e0c08
index 64a3b2c1..84ff35f9 100644
4e0c08
--- a/lib/xmltok_impl.c
4e0c08
+++ b/lib/xmltok_impl.c
4e0c08
@@ -1274,7 +1274,7 @@ PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
4e0c08
     switch (BYTE_TYPE(enc, ptr)) {
4e0c08
 #  define LEAD_CASE(n)                                                         \
4e0c08
   case BT_LEAD##n:                                                             \
4e0c08
-    ptr += n;                                                                  \
4e0c08
+    ptr += n; /* NOTE: The encoding has already been validated. */             \
4e0c08
     break;
4e0c08
       LEAD_CASE(2)
4e0c08
       LEAD_CASE(3)
4e0c08
@@ -1343,7 +1343,7 @@ PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
4e0c08
     switch (BYTE_TYPE(enc, ptr)) {
4e0c08
 #  define LEAD_CASE(n)                                                         \
4e0c08
   case BT_LEAD##n:                                                             \
4e0c08
-    ptr += n;                                                                  \
4e0c08
+    ptr += n; /* NOTE: The encoding has already been validated. */             \
4e0c08
     break;
4e0c08
       LEAD_CASE(2)
4e0c08
       LEAD_CASE(3)
4e0c08
@@ -1522,7 +1522,7 @@ PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
4e0c08
       state = inName;                                                          \
4e0c08
     }
4e0c08
 #  define LEAD_CASE(n)                                                         \
4e0c08
-  case BT_LEAD##n:                                                             \
4e0c08
+  case BT_LEAD##n: /* NOTE: The encoding has already been validated. */        \
4e0c08
     START_NAME ptr += (n - MINBPC(enc));                                       \
4e0c08
     break;
4e0c08
       LEAD_CASE(2)
4e0c08
@@ -1734,7 +1734,7 @@ PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
4e0c08
     switch (BYTE_TYPE(enc, ptr)) {
4e0c08
 #  define LEAD_CASE(n)                                                         \
4e0c08
   case BT_LEAD##n:                                                             \
4e0c08
-    ptr += n;                                                                  \
4e0c08
+    ptr += n; /* NOTE: The encoding has already been validated. */             \
4e0c08
     break;
4e0c08
       LEAD_CASE(2)
4e0c08
       LEAD_CASE(3)
4e0c08
@@ -1779,7 +1779,7 @@ PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
4e0c08
     switch (BYTE_TYPE(enc, ptr)) {
4e0c08
 #  define LEAD_CASE(n)                                                         \
4e0c08
   case BT_LEAD##n:                                                             \
4e0c08
-    ptr += n;                                                                  \
4e0c08
+    ptr += n; /* NOTE: The encoding has already been validated. */             \
4e0c08
     pos->columnNumber++;                                                       \
4e0c08
     break;
4e0c08
       LEAD_CASE(2)
4e0c08
4e0c08
From 6a5510bc6b7efe743356296724e0b38300f05379 Mon Sep 17 00:00:00 2001
4e0c08
From: Sebastian Pipping <sebastian@pipping.org>
4e0c08
Date: Tue, 8 Feb 2022 04:06:21 +0100
4e0c08
Subject: [PATCH 4/5] tests: Cover missing validation of encoding
4e0c08
 (CVE-2022-25235)
4e0c08
4e0c08
---
4e0c08
 expat/tests/runtests.c | 109 +++++++++++++++++++++++++++++++++++++++++
4e0c08
 1 file changed, 109 insertions(+)
4e0c08
4e0c08
diff --git a/tests/runtests.c b/tests/runtests.c
4e0c08
index bc5344b1..9b155b82 100644
4e0c08
--- a/tests/runtests.c
4e0c08
+++ b/tests/runtests.c
4e0c08
@@ -5998,6 +5998,105 @@ START_TEST(test_utf8_in_cdata_section_2) {
4e0c08
 }
4e0c08
 END_TEST
4e0c08
 
4e0c08
+START_TEST(test_utf8_in_start_tags) {
4e0c08
+  struct test_case {
4e0c08
+    bool goodName;
4e0c08
+    bool goodNameStart;
4e0c08
+    const char *tagName;
4e0c08
+  };
4e0c08
+
4e0c08
+  // The idea with the tests below is this:
4e0c08
+  // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
4e0c08
+  // go to isNever and are hence not a concern.
4e0c08
+  //
4e0c08
+  // We start with a character that is a valid name character
4e0c08
+  // (or even name-start character, see XML 1.0r4 spec) and then we flip
4e0c08
+  // single bits at places where (1) the result leaves the UTF-8 encoding space
4e0c08
+  // and (2) we stay in the same n-byte sequence family.
4e0c08
+  //
4e0c08
+  // The flipped bits are highlighted in angle brackets in comments,
4e0c08
+  // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
4e0c08
+  // the most significant bit to 1 to leave UTF-8 encoding space.
4e0c08
+  struct test_case cases[] = {
4e0c08
+      // 1-byte UTF-8: [0xxx xxxx]
4e0c08
+      {true, true, "\x3A"},   // [0011 1010] = ASCII colon ':'
4e0c08
+      {false, false, "\xBA"}, // [<1>011 1010]
4e0c08
+      {true, false, "\x39"},  // [0011 1001] = ASCII nine '9'
4e0c08
+      {false, false, "\xB9"}, // [<1>011 1001]
4e0c08
+
4e0c08
+      // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
4e0c08
+      {true, true, "\xDB\xA5"},   // [1101 1011] [1010 0101] =
4e0c08
+                                  // Arabic small waw U+06E5
4e0c08
+      {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
4e0c08
+      {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
4e0c08
+      {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
4e0c08
+      {true, false, "\xCC\x81"},  // [1100 1100] [1000 0001] =
4e0c08
+                                  // combining char U+0301
4e0c08
+      {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
4e0c08
+      {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
4e0c08
+      {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
4e0c08
+
4e0c08
+      // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
4e0c08
+      {true, true, "\xE0\xA4\x85"},   // [1110 0000] [1010 0100] [1000 0101] =
4e0c08
+                                      // Devanagari Letter A U+0905
4e0c08
+      {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
4e0c08
+      {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
4e0c08
+      {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
4e0c08
+      {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
4e0c08
+      {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
4e0c08
+      {true, false, "\xE0\xA4\x81"},  // [1110 0000] [1010 0100] [1000 0001] =
4e0c08
+                                      // combining char U+0901
4e0c08
+      {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
4e0c08
+      {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
4e0c08
+      {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
4e0c08
+      {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
4e0c08
+      {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
4e0c08
+  };
4e0c08
+  const bool atNameStart[] = {true, false};
4e0c08
+
4e0c08
+  size_t i = 0;
4e0c08
+  char doc[1024];
4e0c08
+  size_t failCount = 0;
4e0c08
+
4e0c08
+  for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
4e0c08
+    size_t j = 0;
4e0c08
+    for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
4e0c08
+      const bool expectedSuccess
4e0c08
+          = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
4e0c08
+      sprintf(doc, "<%s%s><!--", atNameStart[j] ? "" : "a", cases[i].tagName);
4e0c08
+      XML_Parser parser = XML_ParserCreate(NULL);
4e0c08
+
4e0c08
+      const enum XML_Status status
4e0c08
+          = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
4e0c08
+
4e0c08
+      bool success = true;
4e0c08
+      if ((status == XML_STATUS_OK) != expectedSuccess) {
4e0c08
+        success = false;
4e0c08
+      }
4e0c08
+      if ((status == XML_STATUS_ERROR)
4e0c08
+          && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
4e0c08
+        success = false;
4e0c08
+      }
4e0c08
+
4e0c08
+      if (! success) {
4e0c08
+        fprintf(
4e0c08
+            stderr,
4e0c08
+            "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
4e0c08
+            (unsigned)i + 1u, atNameStart[j] ? "    " : "not ",
4e0c08
+            (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
4e0c08
+        failCount++;
4e0c08
+      }
4e0c08
+
4e0c08
+      XML_ParserFree(parser);
4e0c08
+    }
4e0c08
+  }
4e0c08
+
4e0c08
+  if (failCount > 0) {
4e0c08
+    fail("UTF-8 regression detected");
4e0c08
+  }
4e0c08
+}
4e0c08
+END_TEST
4e0c08
+
4e0c08
 /* Test trailing spaces in elements are accepted */
4e0c08
 static void XMLCALL
4e0c08
 record_element_end_handler(void *userData, const XML_Char *name) {
4e0c08
@@ -6175,6 +6274,14 @@ START_TEST(test_bad_doctype) {
4e0c08
 }
4e0c08
 END_TEST
4e0c08
 
4e0c08
+START_TEST(test_bad_doctype_utf8) {
4e0c08
+  const char *text = "<!DOCTYPE \xDB\x25"
4e0c08
+                     "doc><doc/>"; // [1101 1011] [<0>010 0101]
4e0c08
+  expect_failure(text, XML_ERROR_INVALID_TOKEN,
4e0c08
+                 "Invalid UTF-8 in DOCTYPE not faulted");
4e0c08
+}
4e0c08
+END_TEST
4e0c08
+
4e0c08
 START_TEST(test_bad_doctype_utf16) {
4e0c08
   const char text[] =
4e0c08
       /* <!DOCTYPE doc [ \x06f2 ]><doc/>
4e0c08
@@ -11870,6 +11977,7 @@ make_suite(void) {
4e0c08
   tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);
4e0c08
   tcase_add_test(tc_basic, test_utf8_in_cdata_section);
4e0c08
   tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);
4e0c08
+  tcase_add_test(tc_basic, test_utf8_in_start_tags);
4e0c08
   tcase_add_test(tc_basic, test_trailing_spaces_in_elements);
4e0c08
   tcase_add_test(tc_basic, test_utf16_attribute);
4e0c08
   tcase_add_test(tc_basic, test_utf16_second_attr);
4e0c08
@@ -11878,6 +11986,7 @@ make_suite(void) {
4e0c08
   tcase_add_test(tc_basic, test_bad_attr_desc_keyword);
4e0c08
   tcase_add_test(tc_basic, test_bad_attr_desc_keyword_utf16);
4e0c08
   tcase_add_test(tc_basic, test_bad_doctype);
4e0c08
+  tcase_add_test(tc_basic, test_bad_doctype_utf8);
4e0c08
   tcase_add_test(tc_basic, test_bad_doctype_utf16);
4e0c08
   tcase_add_test(tc_basic, test_bad_doctype_plus);
4e0c08
   tcase_add_test(tc_basic, test_bad_doctype_star);
4e0c08