[3.10] gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464) (GH-135485)

End-of-file errors are now handled according to the HTML5 specs -- comments and declarations are automatically closed, tags are ignored. (cherry picked from commit 6eb6c5dbfb)
2025-07-07 19:35:27 +00:00 · 2025-07-04 00:05:53 +03:00 · 2025-07-04 00:05:53 +03:00 · fdc9d214c0
commit fdc9d214c0
parent f297a2292c
3 changed files with 117 additions and 23 deletions
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -25,6 +25,7 @@ entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
 starttagopen = re.compile('<[a-zA-Z]')
+endtagopen = re.compile('</[a-zA-Z]')
 piclose = re.compile('>')
 commentclose = re.compile(r'--\s*>')
 # Note:
@@ -176,7 +177,7 @@ class HTMLParser(_markupbase.ParserBase):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
-                elif (i + 1) < n:
+                elif (i + 1) < n or end:
                    self.handle_data("<")
                    k = i + 1
                else:
@@ -184,17 +185,35 @@ class HTMLParser(_markupbase.ParserBase):
                if k < 0:
                    if not end:
                        break
-                    k = rawdata.find('>', i + 1)
+                    if starttagopen.match(rawdata, i):  # < + letter
-                    if k < 0:
+                        pass
-                        k = rawdata.find('<', i + 1)
+                    elif startswith("</", i):
-                        if k < 0:
+                        if i + 2 == n:
-                            k = i + 1
+                            self.handle_data("</")
+                        elif endtagopen.match(rawdata, i):  # </ + letter
+                            pass
+                        else:
+                            # bogus comment
+                            self.handle_comment(rawdata[i+2:])
+                    elif startswith("<!--", i):
+                        j = n
+                        for suffix in ("--!", "--", "-"):
+                            if rawdata.endswith(suffix, i+4):
+                                j -= len(suffix)
+                                break
+                        self.handle_comment(rawdata[i+4:j])
+                    elif startswith("<![CDATA[", i):
+                        self.unknown_decl(rawdata[i+3:])
+                    elif rawdata[i:i+9].lower() == '<!doctype':
+                        self.handle_decl(rawdata[i+2:])
+                    elif startswith("<!", i):
+                        # bogus comment
+                        self.handle_comment(rawdata[i+2:])
+                    elif startswith("<?", i):
+                        self.handle_pi(rawdata[i+2:])
                    else:
-                        k += 1
+                        raise AssertionError("we should not get here!")
-                    if self.convert_charrefs and not self.cdata_elem:
+                    k = n
-                        self.handle_data(unescape(rawdata[i:k]))
-                    else:
-                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -4,6 +4,8 @@ import html.parser
 import pprint
 import unittest
+from test import support
 class EventCollector(html.parser.HTMLParser):
@@ -391,28 +393,34 @@ text
                            ('data', '<'),
                            ('starttag', 'bc<', [('a', None)]),
                            ('endtag', 'html'),
-                            ('data', '\n<img src="URL>'),
+                            ('data', '\n')])
-                            ('comment', '/img'),
-                            ('endtag', 'html<')])
    def test_starttag_junk_chars(self):
        self._run_check("<", [('data', '<')])
        self._run_check("<>", [('data', '<>')])
        self._run_check("< >", [('data', '< >')])
        self._run_check("< ", [('data', '< ')])
        self._run_check("</>", [])
        self._run_check("<$>", [('data', '<$>')])
        self._run_check("</$>", [('comment', '$')])
        self._run_check("</", [('data', '</')])
-        self._run_check("</a", [('data', '</a')])
+        self._run_check("</a", [])
        self._run_check("</ a>", [('endtag', 'a')])
        self._run_check("</ a", [('comment', ' a')])
        self._run_check("<a<a>", [('starttag', 'a<a', [])])
        self._run_check("</a<a>", [('endtag', 'a<a')])
-        self._run_check("<!", [('data', '<!')])
+        self._run_check("<!", [('comment', '')])
-        self._run_check("<a", [('data', '<a')])
+        self._run_check("<a", [])
-        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
+        self._run_check("<a foo='bar'", [])
-        self._run_check("<a foo='bar", [('data', "<a foo='bar")])
+        self._run_check("<a foo='bar", [])
-        self._run_check("<a foo='>'", [('data', "<a foo='>'")])
+        self._run_check("<a foo='>'", [])
-        self._run_check("<a foo='>", [('data', "<a foo='>")])
+        self._run_check("<a foo='>", [])
        self._run_check("<a$>", [('starttag', 'a$', [])])
        self._run_check("<a$b>", [('starttag', 'a$b', [])])
        self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
        self._run_check("<a$b  >", [('starttag', 'a$b', [])])
        self._run_check("<a$b  />", [('startendtag', 'a$b', [])])
        self._run_check("</a$b>", [('endtag', 'a$b')])
    def test_slashes_in_starttag(self):
        self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
@@ -537,13 +545,56 @@ text
        for html, expected in data:
            self._run_check(html, expected)
-    def test_broken_comments(self):
+    def test_eof_in_comments(self):
-        html = ('<! not really a comment >'
+        data = [
+            ('<!--', [('comment', '')]),
+            ('<!---', [('comment', '')]),
+            ('<!----', [('comment', '')]),
+            ('<!-----', [('comment', '-')]),
+            ('<!------', [('comment', '--')]),
+            ('<!----!', [('comment', '')]),
+            ('<!---!', [('comment', '-!')]),
+            ('<!---!>', [('comment', '-!>')]),
+            ('<!--foo', [('comment', 'foo')]),
+            ('<!--foo-', [('comment', 'foo')]),
+            ('<!--foo--', [('comment', 'foo')]),
+            ('<!--foo--!', [('comment', 'foo')]),
+            ('<!--<!--', [('comment', '<!')]),
+            ('<!--<!--!', [('comment', '<!')]),
+        ]
+        for html, expected in data:
+            self._run_check(html, expected)
+    def test_eof_in_declarations(self):
+        data = [
+            ('<!', [('comment', '')]),
+            ('<!-', [('comment', '-')]),
+            ('<![', [('comment', '[')]),
+            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
+            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
+            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
+            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
+            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
+            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
+            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
+            ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
+            ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
+            ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
+            ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
+             [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
+        ]
+        for html, expected in data:
+            self._run_check(html, expected)
+    def test_bogus_comments(self):
+        html = ('<!ELEMENT br EMPTY>'
+                '<! not really a comment >'
                '<! not a comment either -->'
                '<! -- close enough -->'
                '<!><!<-- this was an empty comment>'
                '<!!! another bogus comment !!!>')
        expected = [
+            ('comment', 'ELEMENT br EMPTY'),
            ('comment', ' not really a comment '),
            ('comment', ' not a comment either --'),
            ('comment', ' -- close enough --'),
@@ -598,6 +649,26 @@ text
             ('endtag', 'a'), ('data', ' bar & baz')]
        )
+    @support.requires_resource('cpu')
+    def test_eof_no_quadratic_complexity(self):
+        # Each of these examples used to take about an hour.
+        # Now they take a fraction of a second.
+        def check(source):
+            parser = html.parser.HTMLParser()
+            parser.feed(source)
+            parser.close()
+        n = 120_000
+        check("<a " * n)
+        check("<a a=" * n)
+        check("</a " * 14 * n)
+        check("</a a=" * 11 * n)
+        check("<!--" * 4 * n)
+        check("<!" * 60 * n)
+        check("<?" * 19 * n)
+        check("</$" * 15 * n)
+        check("<![CDATA[" * 9 * n)
+        check("<!doctype" * 35 * n)
 class AttributesTestCase(TestCaseBase):
--- a/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst
+++ b/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst
@@ -0,0 +1,4 @@
+Fix quadratic complexity in processing specially crafted input in
+:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
+to the HTML5 specs -- comments and declarations are automatically closed,
+tags are ignored.