bpo-31170: Write unit test for Expat 2.2.4 UTF-8 bug (#3570)

Non-regression tests for the Expat 2.2.3 UTF-8 decoder bug.
2025-11-02 03:01:58 +00:00 · 2017-09-25 01:27:34 -07:00 · 2017-09-25 01:27:34 -07:00 · e6d9fcbb8d
commit e6d9fcbb8d
parent 49392c63a2
2 changed files with 34 additions and 0 deletions
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@ -34,6 +34,7 @@ try:
 except UnicodeEncodeError:
    raise unittest.SkipTest("filename is not encodable to utf8")
 SIMPLE_NS_XMLFILE = findfile("simple-ns.xml", subdir="xmltestdata")
+# Sample XML document read by the bpo-31170 (Expat UTF-8 decoder) test.
+UTF8_BUG_XMLFILE = findfile("expat224_utf8_bug.xml", subdir="xmltestdata")

 SAMPLE_XML = """\
 <body>
@ -1739,6 +1740,37 @@ class BugsTest(unittest.TestCase):
        self.assertIsInstance(e[0].tag, str)
        self.assertEqual(e[0].tag, 'changed')

+    def check_expat224_utf8_bug(self, text):
+        """Parse *text* as an attribute value and check the decoded result.
+
+        *text* is a bytes object holding UTF-8 encoded data.  It is embedded
+        as the value of the 'b' attribute of an <a/> element; the attribute
+        value returned by the parser must equal *text* decoded from UTF-8.
+        """
+        document = b'<a b="%s"/>' % text
+        element = ET.XML(document)
+        expected = text.decode('utf-8')
+        self.assertEqual(element.get('b'), expected)
+
+    def test_expat224_utf8_bug(self):
+        # bpo-31170: the UTF-8 decoder of Expat 2.2.3 had a bug.
+        # Check that Expat 2.2.4 fixed it.
+        #
+        # A long run of 2-byte UTF-8 sequences is checked twice: as is, and
+        # shifted by one ASCII byte, so that buffer bounds are tested at
+        # both even and odd positions.
+        payload = b'\xc3\xa0' * 1024
+        for prefix in (b'', b'x'):
+            self.check_expat224_utf8_bug(prefix + payload)
+
+    def test_expat224_utf8_bug_file(self):
+        # bpo-31170: same check as above, but on a sample document stored on
+        # disk.  The value of the 'b' attribute returned by the parser must
+        # match the value extracted by hand from the raw file content.
+        with open(UTF8_BUG_XMLFILE, 'rb') as fp:
+            raw = fp.read()
+        root = ET.fromstring(raw)
+        xmlattr = root.get('b')
+
+        # "Parse" manually the XML file to extract the value of the 'b'
+        # attribute of the <a b='xxx' /> XML element
+        text = raw.decode('utf-8').strip()
+        # An XML parser reports a line break inside an attribute value as a
+        # single space, so mirror that for the CRLF sequences of the file.
+        text = text.replace('\r\n', ' ')
+        # Drop the 6-character opening "<a b='" and the 4-character
+        # closing "' />" to keep only the attribute value.
+        text = text[6:-4]
+        self.assertEqual(xmlattr, text)
+
+

 # --------------------------------------------------------------------