mirror of
https://github.com/python/cpython.git
synced 2025-08-25 11:15:02 +00:00
[3.12] gh-118350: Fix support of elements "textarea" and "title" in HTMLParser (GH-135310) (GH-136986)
Some checks failed
Lint / lint (push) Has been cancelled
Tests / Change detection (push) Has been cancelled
Tests / All required checks pass (push) Has been cancelled
Tests / (push) Has been cancelled
Tests / Docs (push) Has been cancelled
Tests / Check if the ABI has changed (push) Has been cancelled
Tests / Check if Autoconf files are up to date (push) Has been cancelled
Tests / Check if generated files are up to date (push) Has been cancelled
Tests / Windows MSI (push) Has been cancelled
Tests / Ubuntu SSL tests with OpenSSL (push) Has been cancelled
Tests / Hypothesis tests on Ubuntu (push) Has been cancelled
Tests / Address sanitizer (push) Has been cancelled
Some checks failed
Lint / lint (push) Has been cancelled
Tests / Change detection (push) Has been cancelled
Tests / All required checks pass (push) Has been cancelled
Tests / (push) Has been cancelled
Tests / Docs (push) Has been cancelled
Tests / Check if the ABI has changed (push) Has been cancelled
Tests / Check if Autoconf files are up to date (push) Has been cancelled
Tests / Check if generated files are up to date (push) Has been cancelled
Tests / Windows MSI (push) Has been cancelled
Tests / Ubuntu SSL tests with OpenSSL (push) Has been cancelled
Tests / Hypothesis tests on Ubuntu (push) Has been cancelled
Tests / Address sanitizer (push) Has been cancelled
(cherry picked from commit 4d02f31cdd)
Co-authored-by: Timon Viola <44016238+timonviola@users.noreply.github.com>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Łukasz Langa <lukasz@langa.pl>
This commit is contained in:
parent ad695f5328
commit f66c75f11d
3 changed files with 113 additions and 5 deletions
|
@@ -110,6 +110,7 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
CDATA_CONTENT_ELEMENTS = ("script", "style")
|
CDATA_CONTENT_ELEMENTS = ("script", "style")
|
||||||
|
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
|
||||||
|
|
||||||
def __init__(self, *, convert_charrefs=True):
|
def __init__(self, *, convert_charrefs=True):
|
||||||
"""Initialize and reset this instance.
|
"""Initialize and reset this instance.
|
||||||
|
@@ -127,6 +128,7 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
self.lasttag = '???'
|
self.lasttag = '???'
|
||||||
self.interesting = interesting_normal
|
self.interesting = interesting_normal
|
||||||
self.cdata_elem = None
|
self.cdata_elem = None
|
||||||
|
self._escapable = True
|
||||||
super().reset()
|
super().reset()
|
||||||
|
|
||||||
def feed(self, data):
|
def feed(self, data):
|
||||||
|
@@ -148,14 +150,20 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
"""Return full source of start tag: '<...>'."""
|
"""Return full source of start tag: '<...>'."""
|
||||||
return self.__starttag_text
|
return self.__starttag_text
|
||||||
|
|
||||||
def set_cdata_mode(self, elem):
|
def set_cdata_mode(self, elem, *, escapable=False):
|
||||||
self.cdata_elem = elem.lower()
|
self.cdata_elem = elem.lower()
|
||||||
|
self._escapable = escapable
|
||||||
|
if escapable and not self.convert_charrefs:
|
||||||
|
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
|
||||||
|
re.IGNORECASE|re.ASCII)
|
||||||
|
else:
|
||||||
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
|
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
|
||||||
re.IGNORECASE|re.ASCII)
|
re.IGNORECASE|re.ASCII)
|
||||||
|
|
||||||
def clear_cdata_mode(self):
|
def clear_cdata_mode(self):
|
||||||
self.interesting = interesting_normal
|
self.interesting = interesting_normal
|
||||||
self.cdata_elem = None
|
self.cdata_elem = None
|
||||||
|
self._escapable = True
|
||||||
|
|
||||||
# Internal -- handle data as far as reasonable. May leave state
|
# Internal -- handle data as far as reasonable. May leave state
|
||||||
# and data to be processed by a subsequent call. If 'end' is
|
# and data to be processed by a subsequent call. If 'end' is
|
||||||
|
@@ -188,7 +196,7 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
break
|
break
|
||||||
j = n
|
j = n
|
||||||
if i < j:
|
if i < j:
|
||||||
if self.convert_charrefs and not self.cdata_elem:
|
if self.convert_charrefs and self._escapable:
|
||||||
self.handle_data(unescape(rawdata[i:j]))
|
self.handle_data(unescape(rawdata[i:j]))
|
||||||
else:
|
else:
|
||||||
self.handle_data(rawdata[i:j])
|
self.handle_data(rawdata[i:j])
|
||||||
|
@@ -290,7 +298,7 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
assert 0, "interesting.search() lied"
|
assert 0, "interesting.search() lied"
|
||||||
# end while
|
# end while
|
||||||
if end and i < n:
|
if end and i < n:
|
||||||
if self.convert_charrefs and not self.cdata_elem:
|
if self.convert_charrefs and self._escapable:
|
||||||
self.handle_data(unescape(rawdata[i:n]))
|
self.handle_data(unescape(rawdata[i:n]))
|
||||||
else:
|
else:
|
||||||
self.handle_data(rawdata[i:n])
|
self.handle_data(rawdata[i:n])
|
||||||
|
@@ -402,6 +410,8 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
self.handle_starttag(tag, attrs)
|
self.handle_starttag(tag, attrs)
|
||||||
if tag in self.CDATA_CONTENT_ELEMENTS:
|
if tag in self.CDATA_CONTENT_ELEMENTS:
|
||||||
self.set_cdata_mode(tag)
|
self.set_cdata_mode(tag)
|
||||||
|
elif tag in self.RCDATA_CONTENT_ELEMENTS:
|
||||||
|
self.set_cdata_mode(tag, escapable=True)
|
||||||
return endpos
|
return endpos
|
||||||
|
|
||||||
# Internal -- check to see if we have a complete starttag; return end
|
# Internal -- check to see if we have a complete starttag; return end
|
||||||
|
|
|
@@ -317,6 +317,49 @@ text
|
||||||
("data", content),
|
("data", content),
|
||||||
("endtag", "style")])
|
("endtag", "style")])
|
||||||
|
|
||||||
|
@support.subTests('content', [
|
||||||
|
'<!-- not a comment -->',
|
||||||
|
"<not a='start tag'>",
|
||||||
|
'<![CDATA[not a cdata]]>',
|
||||||
|
'<!not a bogus comment>',
|
||||||
|
'</not a bogus comment>',
|
||||||
|
'\u2603',
|
||||||
|
'< /title>',
|
||||||
|
'</ title>',
|
||||||
|
'</titled>',
|
||||||
|
'</title\v>',
|
||||||
|
'</title\xa0>',
|
||||||
|
'</tıtle>',
|
||||||
|
])
|
||||||
|
def test_title_content(self, content):
|
||||||
|
source = f"<title>{content}</title>"
|
||||||
|
self._run_check(source, [
|
||||||
|
("starttag", "title", []),
|
||||||
|
("data", content),
|
||||||
|
("endtag", "title"),
|
||||||
|
])
|
||||||
|
|
||||||
|
@support.subTests('content', [
|
||||||
|
'<!-- not a comment -->',
|
||||||
|
"<not a='start tag'>",
|
||||||
|
'<![CDATA[not a cdata]]>',
|
||||||
|
'<!not a bogus comment>',
|
||||||
|
'</not a bogus comment>',
|
||||||
|
'\u2603',
|
||||||
|
'< /textarea>',
|
||||||
|
'</ textarea>',
|
||||||
|
'</textareable>',
|
||||||
|
'</textarea\v>',
|
||||||
|
'</textarea\xa0>',
|
||||||
|
])
|
||||||
|
def test_textarea_content(self, content):
|
||||||
|
source = f"<textarea>{content}</textarea>"
|
||||||
|
self._run_check(source, [
|
||||||
|
("starttag", "textarea", []),
|
||||||
|
("data", content),
|
||||||
|
("endtag", "textarea"),
|
||||||
|
])
|
||||||
|
|
||||||
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
|
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
|
||||||
'script/', 'script foo=bar', 'script foo=">"'])
|
'script/', 'script foo=bar', 'script foo=">"'])
|
||||||
def test_script_closing_tag(self, endtag):
|
def test_script_closing_tag(self, endtag):
|
||||||
|
@@ -346,6 +389,38 @@ text
|
||||||
("endtag", "style")],
|
("endtag", "style")],
|
||||||
collector=EventCollectorNoNormalize(convert_charrefs=False))
|
collector=EventCollectorNoNormalize(convert_charrefs=False))
|
||||||
|
|
||||||
|
@support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
|
||||||
|
'title/', 'title foo=bar', 'title foo=">"'])
|
||||||
|
def test_title_closing_tag(self, endtag):
|
||||||
|
content = "<!-- not a comment --><i>Egg & Spam</i>"
|
||||||
|
s = f'<TitLe>{content}</{endtag}>'
|
||||||
|
self._run_check(s, [("starttag", "title", []),
|
||||||
|
('data', '<!-- not a comment --><i>Egg & Spam</i>'),
|
||||||
|
("endtag", "title")],
|
||||||
|
collector=EventCollectorNoNormalize(convert_charrefs=True))
|
||||||
|
self._run_check(s, [("starttag", "title", []),
|
||||||
|
('data', '<!-- not a comment --><i>Egg '),
|
||||||
|
('entityref', 'amp'),
|
||||||
|
('data', ' Spam</i>'),
|
||||||
|
("endtag", "title")],
|
||||||
|
collector=EventCollectorNoNormalize(convert_charrefs=False))
|
||||||
|
|
||||||
|
@support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n',
|
||||||
|
'textarea/', 'textarea foo=bar', 'textarea foo=">"'])
|
||||||
|
def test_textarea_closing_tag(self, endtag):
|
||||||
|
content = "<!-- not a comment --><i>Egg & Spam</i>"
|
||||||
|
s = f'<TexTarEa>{content}</{endtag}>'
|
||||||
|
self._run_check(s, [("starttag", "textarea", []),
|
||||||
|
('data', '<!-- not a comment --><i>Egg & Spam</i>'),
|
||||||
|
("endtag", "textarea")],
|
||||||
|
collector=EventCollectorNoNormalize(convert_charrefs=True))
|
||||||
|
self._run_check(s, [("starttag", "textarea", []),
|
||||||
|
('data', '<!-- not a comment --><i>Egg '),
|
||||||
|
('entityref', 'amp'),
|
||||||
|
('data', ' Spam</i>'),
|
||||||
|
("endtag", "textarea")],
|
||||||
|
collector=EventCollectorNoNormalize(convert_charrefs=False))
|
||||||
|
|
||||||
@support.subTests('tail,end', [
|
@support.subTests('tail,end', [
|
||||||
('', False),
|
('', False),
|
||||||
('<', False),
|
('<', False),
|
||||||
|
@@ -363,6 +438,27 @@ text
|
||||||
("data", content if end else content + tail)],
|
("data", content if end else content + tail)],
|
||||||
collector=EventCollectorNoNormalize(convert_charrefs=False))
|
collector=EventCollectorNoNormalize(convert_charrefs=False))
|
||||||
|
|
||||||
|
@support.subTests('tail,end', [
|
||||||
|
('', False),
|
||||||
|
('<', False),
|
||||||
|
('</', False),
|
||||||
|
('</t', False),
|
||||||
|
('</title', False),
|
||||||
|
('</title ', True),
|
||||||
|
('</title foo=bar', True),
|
||||||
|
('</title foo=">', True),
|
||||||
|
])
|
||||||
|
def test_eof_in_title(self, tail, end):
|
||||||
|
s = f'<TitLe>Egg & Spam{tail}'
|
||||||
|
self._run_check(s, [("starttag", "title", []),
|
||||||
|
("data", "Egg & Spam" + ('' if end else tail))],
|
||||||
|
collector=EventCollectorNoNormalize(convert_charrefs=True))
|
||||||
|
self._run_check(s, [("starttag", "title", []),
|
||||||
|
('data', 'Egg '),
|
||||||
|
('entityref', 'amp'),
|
||||||
|
('data', ' Spam' + ('' if end else tail))],
|
||||||
|
collector=EventCollectorNoNormalize(convert_charrefs=False))
|
||||||
|
|
||||||
def test_comments(self):
|
def test_comments(self):
|
||||||
html = ("<!-- I'm a valid comment -->"
|
html = ("<!-- I'm a valid comment -->"
|
||||||
'<!--me too!-->'
|
'<!--me too!-->'
|
||||||
|
|
|
@@ -0,0 +1,2 @@
|
||||||
|
Fix support of escapable raw text mode (elements "textarea" and "title")
|
||||||
|
in :class:`html.parser.HTMLParser`.
|
Loading…
Add table
Add a link
Reference in a new issue