[3.14] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (#137772)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Miss Islington (bot) 2025-09-08 17:31:41 +02:00 committed by GitHub
parent 75c2d9f7c4
commit 61f7156965
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 90 additions and 27 deletions

View file

@ -146,6 +146,7 @@ class HTMLParser(_markupbase.ParserBase):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
self._support_cdata = True
self._escapable = True
super().reset()
@ -183,6 +184,19 @@ class HTMLParser(_markupbase.ParserBase):
self.cdata_elem = None
self._escapable = True
def _set_support_cdata(self, flag=True):
"""Enable or disable support of the CDATA sections.
If enabled, "<[CDATA[" starts a CDATA section which ends with "]]>".
If disabled, "<[CDATA[" starts a bogus comments which ends with ">".
This method is not called by default. Its purpose is to be called
in custom handle_starttag() and handle_endtag() methods, with
value that depends on the adjusted current node.
See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
for details.
"""
self._support_cdata = flag
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
@ -257,7 +271,7 @@ class HTMLParser(_markupbase.ParserBase):
j -= len(suffix)
break
self.handle_comment(rawdata[i+4:j])
elif startswith("<![CDATA[", i):
elif startswith("<![CDATA[", i) and self._support_cdata:
self.unknown_decl(rawdata[i+3:])
elif rawdata[i:i+9].lower() == '<!doctype':
self.handle_decl(rawdata[i+2:])
@ -333,8 +347,12 @@ class HTMLParser(_markupbase.ParserBase):
if rawdata[i:i+4] == '<!--':
# this case is actually already handled in goahead()
return self.parse_comment(i)
elif rawdata[i:i+9] == '<![CDATA[':
return self.parse_marked_section(i)
elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
j = rawdata.find(']]>', i+9)
if j < 0:
return -1
self.unknown_decl(rawdata[i+3: j])
return j + 3
elif rawdata[i:i+9].lower() == '<!doctype':
# find the closing >
gtpos = rawdata.find('>', i+9)