mirror of
https://github.com/python/cpython.git
synced 2025-09-11 11:17:16 +00:00
[3.13] gh-118350: Fix support of elements "textarea" and "title" in HTMLParser (GH-135310) (GH-136985)
(cherry picked from commit 4d02f31cdd)
Co-authored-by: Timon Viola <44016238+timonviola@users.noreply.github.com>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Łukasz Langa <lukasz@langa.pl>
This commit is contained in:
parent 4999cdbced
commit 8de88e0840
3 changed files with 113 additions and 5 deletions
|
@@ -128,6 +128,7 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
"""
|
||||
|
||||
CDATA_CONTENT_ELEMENTS = ("script", "style")
|
||||
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
|
||||
|
||||
def __init__(self, *, convert_charrefs=True):
|
||||
"""Initialize and reset this instance.
|
||||
|
@@ -145,6 +146,7 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
self.lasttag = '???'
|
||||
self.interesting = interesting_normal
|
||||
self.cdata_elem = None
|
||||
self._escapable = True
|
||||
super().reset()
|
||||
|
||||
def feed(self, data):
|
||||
|
@@ -166,14 +168,20 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
"""Return full source of start tag: '<...>'."""
|
||||
return self.__starttag_text
|
||||
|
||||
def set_cdata_mode(self, elem):
|
||||
def set_cdata_mode(self, elem, *, escapable=False):
|
||||
self.cdata_elem = elem.lower()
|
||||
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
|
||||
re.IGNORECASE|re.ASCII)
|
||||
self._escapable = escapable
|
||||
if escapable and not self.convert_charrefs:
|
||||
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
|
||||
re.IGNORECASE|re.ASCII)
|
||||
else:
|
||||
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
|
||||
re.IGNORECASE|re.ASCII)
|
||||
|
||||
def clear_cdata_mode(self):
|
||||
self.interesting = interesting_normal
|
||||
self.cdata_elem = None
|
||||
self._escapable = True
|
||||
|
||||
# Internal -- handle data as far as reasonable. May leave state
|
||||
# and data to be processed by a subsequent call. If 'end' is
|
||||
|
@@ -206,7 +214,7 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
break
|
||||
j = n
|
||||
if i < j:
|
||||
if self.convert_charrefs and not self.cdata_elem:
|
||||
if self.convert_charrefs and self._escapable:
|
||||
self.handle_data(unescape(rawdata[i:j]))
|
||||
else:
|
||||
self.handle_data(rawdata[i:j])
|
||||
|
@@ -308,7 +316,7 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
assert 0, "interesting.search() lied"
|
||||
# end while
|
||||
if end and i < n:
|
||||
if self.convert_charrefs and not self.cdata_elem:
|
||||
if self.convert_charrefs and self._escapable:
|
||||
self.handle_data(unescape(rawdata[i:n]))
|
||||
else:
|
||||
self.handle_data(rawdata[i:n])
|
||||
|
@@ -420,6 +428,8 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
self.handle_starttag(tag, attrs)
|
||||
if tag in self.CDATA_CONTENT_ELEMENTS:
|
||||
self.set_cdata_mode(tag)
|
||||
elif tag in self.RCDATA_CONTENT_ELEMENTS:
|
||||
self.set_cdata_mode(tag, escapable=True)
|
||||
return endpos
|
||||
|
||||
# Internal -- check to see if we have a complete starttag; return end
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue