[3.14] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (#137772)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
2025-11-23 20:07:19 +00:00 · 2025-09-08 17:31:41 +02:00 · 2025-09-08 17:31:41 +02:00 · 61f7156965
commit 61f7156965
parent 75c2d9f7c4
3 changed files with 90 additions and 27 deletions
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@ -146,6 +146,7 @@ class HTMLParser(_markupbase.ParserBase):
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
+        self._support_cdata = True
        self._escapable = True
        super().reset()

@ -183,6 +184,19 @@ class HTMLParser(_markupbase.ParserBase):
        self.cdata_elem = None
        self._escapable = True

+    def _set_support_cdata(self, flag=True):
+        """Enable or disable support of the CDATA sections.
+        If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
+        If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
+
+        This method is not called by default. Its purpose is to be called
+        in custom handle_starttag() and handle_endtag() methods, with a
+        value that depends on the adjusted current node.
+        See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+        for details.
+        """
+        self._support_cdata = flag
+
    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
@ -257,7 +271,7 @@ class HTMLParser(_markupbase.ParserBase):
                                j -= len(suffix)
                                break
                        self.handle_comment(rawdata[i+4:j])
-                    elif startswith("<![CDATA[", i):
+                    elif startswith("<![CDATA[", i) and self._support_cdata:
                        self.unknown_decl(rawdata[i+3:])
                    elif rawdata[i:i+9].lower() == '<!doctype':
                        self.handle_decl(rawdata[i+2:])
@ -333,8 +347,12 @@ class HTMLParser(_markupbase.ParserBase):
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
-        elif rawdata[i:i+9] == '<![CDATA[':
-            return self.parse_marked_section(i)
+        elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
+            j = rawdata.find(']]>', i+9)
+            if j < 0:
+                return -1
+            self.unknown_decl(rawdata[i+3: j])
+            return j + 3
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)