mirror of
https://github.com/python/cpython.git
synced 2025-11-23 20:07:19 +00:00
[3.14] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (#137772)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
parent
75c2d9f7c4
commit
61f7156965
3 changed files with 90 additions and 27 deletions
|
|
@ -146,6 +146,7 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
self.lasttag = '???'
|
||||
self.interesting = interesting_normal
|
||||
self.cdata_elem = None
|
||||
self._support_cdata = True
|
||||
self._escapable = True
|
||||
super().reset()
|
||||
|
||||
|
|
@ -183,6 +184,19 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
self.cdata_elem = None
|
||||
self._escapable = True
|
||||
|
||||
def _set_support_cdata(self, flag=True):
|
||||
"""Enable or disable support of the CDATA sections.
|
||||
If enabled, "<[CDATA[" starts a CDATA section which ends with "]]>".
|
||||
If disabled, "<[CDATA[" starts a bogus comments which ends with ">".
|
||||
|
||||
This method is not called by default. Its purpose is to be called
|
||||
in custom handle_starttag() and handle_endtag() methods, with
|
||||
value that depends on the adjusted current node.
|
||||
See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
||||
for details.
|
||||
"""
|
||||
self._support_cdata = flag
|
||||
|
||||
# Internal -- handle data as far as reasonable. May leave state
|
||||
# and data to be processed by a subsequent call. If 'end' is
|
||||
# true, force handling all data as if followed by EOF marker.
|
||||
|
|
@ -257,7 +271,7 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
j -= len(suffix)
|
||||
break
|
||||
self.handle_comment(rawdata[i+4:j])
|
||||
elif startswith("<![CDATA[", i):
|
||||
elif startswith("<![CDATA[", i) and self._support_cdata:
|
||||
self.unknown_decl(rawdata[i+3:])
|
||||
elif rawdata[i:i+9].lower() == '<!doctype':
|
||||
self.handle_decl(rawdata[i+2:])
|
||||
|
|
@ -333,8 +347,12 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
if rawdata[i:i+4] == '<!--':
|
||||
# this case is actually already handled in goahead()
|
||||
return self.parse_comment(i)
|
||||
elif rawdata[i:i+9] == '<![CDATA[':
|
||||
return self.parse_marked_section(i)
|
||||
elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
|
||||
j = rawdata.find(']]>', i+9)
|
||||
if j < 0:
|
||||
return -1
|
||||
self.unknown_decl(rawdata[i+3: j])
|
||||
return j + 3
|
||||
elif rawdata[i:i+9].lower() == '<!doctype':
|
||||
# find the closing >
|
||||
gtpos = rawdata.find('>', i+9)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue