mirror of
https://github.com/python/cpython.git
synced 2025-07-08 03:45:36 +00:00
gh-69426: HTMLParser: only unescape properly terminated character entities in attribute values (GH-95215)
According to the HTML5 spec, a named character reference in an attribute value that is not terminated by a semicolon should only be processed if it is not followed by an ASCII alphanumeric or an equals sign; references terminated by a semicolon are always processed. See https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
This commit is contained in:
parent
3dfed23092
commit
77b14a6d58
3 changed files with 57 additions and 9 deletions
|
@ -348,18 +348,16 @@ text
|
|||
collector = lambda: EventCollectorCharrefs()
|
||||
self.assertTrue(collector().convert_charrefs)
|
||||
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
|
||||
# check charrefs in the middle of the text/attributes
|
||||
expected = [('starttag', 'a', [('href', 'foo"zar')]),
|
||||
('data', 'a"z'), ('endtag', 'a')]
|
||||
# check charrefs in the middle of the text
|
||||
expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
|
||||
for charref in charrefs:
|
||||
self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
|
||||
self._run_check('<a>a{0}z</a>'.format(charref),
|
||||
expected, collector=collector())
|
||||
# check charrefs at the beginning/end of the text/attributes
|
||||
expected = [('data', '"'),
|
||||
('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
|
||||
# check charrefs at the beginning/end of the text
|
||||
expected = [('data', '"'), ('starttag', 'a', []),
|
||||
('data', '"'), ('endtag', 'a'), ('data', '"')]
|
||||
for charref in charrefs:
|
||||
self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
|
||||
self._run_check('{0}<a>'
|
||||
'{0}</a>{0}'.format(charref),
|
||||
expected, collector=collector())
|
||||
# check charrefs in <script>/<style> elements
|
||||
|
@ -382,6 +380,35 @@ text
|
|||
self._run_check('no charrefs here', [('data', 'no charrefs here')],
|
||||
collector=collector())
|
||||
|
||||
def test_convert_charrefs_in_attribute_values(self):
|
||||
# default value for convert_charrefs is now True
|
||||
collector = lambda: EventCollectorCharrefs()
|
||||
self.assertTrue(collector().convert_charrefs)
|
||||
|
||||
# always unescape terminated entity refs, numeric and hex char refs:
|
||||
# - regardless whether they are at start, middle, end of attribute
|
||||
# - or followed by alphanumeric, non-alphanumeric, or equals char
|
||||
charrefs = ['&cent;', '&#162;', '&#xa2;', '&#162', '&#xa2']
|
||||
expected = [('starttag', 'a',
|
||||
[('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
|
||||
('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
|
||||
('endtag', 'a')]
|
||||
for charref in charrefs:
|
||||
self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
|
||||
' x="z{0}z" x="{0} z" x="{0}=z"></a>'
|
||||
.format(charref), expected, collector=collector())
|
||||
|
||||
# only unescape unterminated entity matches if they are not followed by
|
||||
# an alphanumeric or an equals sign
|
||||
charref = '&cent'
|
||||
expected = [('starttag', 'a',
|
||||
[('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
|
||||
('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
|
||||
('endtag', 'a')]
|
||||
self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
|
||||
' x="z{0}z" x="{0} z" x="{0}=z"></a>'
|
||||
.format(charref), expected, collector=collector())
|
||||
|
||||
# the remaining tests were for the "tolerant" parser (which is now
|
||||
# the default), and check various kind of broken markup
|
||||
def test_tolerant_parsing(self):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue