"""Tests for HTMLParser.py."""
import html.parser
import pprint
import unittest
from unittest.mock import patch
from test import support
SAMPLE_RCDATA = (
''
"sample
text
“
""", [
("data", "\n"),
("decl", "DOCTYPE html PUBLIC 'foo'"),
("data", "\n"),
("starttag", "html", []),
("entityref", "entity"),
("charref", "32"),
("data", "\n"),
("comment", "comment1a\n->
',
'foo = "";',
'foo = "";',
'foo = <\n/script> ',
'',
('\n//<\\/s\'+\'cript>\');\n//]]>'),
'\n\n',
'',
])
def test_script_content(self, content):
# Pass the markup in `s` to self._run_check and expect exactly three events:
# a "script" start tag with no attributes, a single "data" event equal to
# `content`, and the matching "script" end tag.
# NOTE(review): the f-string below is empty in this copy of the file even
# though the expected events refer to `content`; the markup inside the
# literal was presumably lost -- confirm against the upstream test.
s = f''
self._run_check(s, [
("starttag", "script", []),
("data", content),
("endtag", "script"),
])
@support.subTests('content', [
'a::before { content: ""; }',
'a::before { content: "¬-an-entity-ref;"; }',
'a::before { content: "
''"""
s = f'', True),
])
def test_eof_in_script(self, tail, end):
content = "a = 123"
s = f'{1}'
'{1}'.format(text, charref),
expected, collector=collector())
# check truncated charrefs at the end of the file
html = '&quo '
for x in range(1, len(html)):
self._run_check(html[:x], [('data', html[:x])],
collector=collector())
# check a string with no charrefs
self._run_check('no charrefs here', [('data', 'no charrefs here')],
collector=collector())
def test_convert_charrefs_in_attribute_values(self):
# Checks the attribute values reported for an "a" start tag when the
# collector has convert_charrefs enabled (asserted below).
# default value for convert_charrefs is now True
# `collector` is a factory: each call builds a fresh EventCollectorCharrefs,
# so every _run_check below receives its own collector instance.
collector = lambda: EventCollectorCharrefs()
self.assertTrue(collector().convert_charrefs)
# always unescape terminated entity refs, numeric and hex char refs:
# - regardless whether they are at start, middle, end of attribute
# - or followed by alphanumeric, non-alphanumeric, or equals char
# NOTE(review): all five entries below are the same literal character in this
# copy; presumably they were distinct spellings of the reference (named,
# decimal, hex) before being decoded -- confirm against upstream.
charrefs = ['¢', '¢', '¢', '¢', '¢']
expected = [('starttag', 'a',
[('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
('endtag', 'a')]
# NOTE(review): the template literal passed to .format() is empty here, so
# there is no placeholder for `charref` to fill; the markup was presumably
# stripped -- confirm against upstream.
for charref in charrefs:
self._run_check(''
.format(charref), expected, collector=collector())
# only unescape unterminated entity matches if they are not followed by
# an alphanumeric or an equals sign
# NOTE(review): `charref` and `expected` below are textually identical to the
# terminated case above, which does not match the comment; the original
# literals were presumably altered by decoding -- confirm against upstream.
charref = '¢'
expected = [('starttag', 'a',
[('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
('endtag', 'a')]
self._run_check(''
.format(charref), expected, collector=collector())
# the remaining tests were for the "tolerant" parser (which is now
# the default), and check various kinds of broken markup
def test_tolerant_parsing(self):
self._run_check('te>>xt&a<", [('data', '<>')])
self._run_check("< >", [('data', '< >')])
self._run_check("< ", [('data', '< ')])
self._run_check(">", [])
self._run_check("<$>", [('data', '<$>')])
self._run_check("$>", [('comment', '$')])
self._run_check("", [('data', '')])
self._run_check("
'
'foo'
'
')
expected = [
('comment', '[if !(IE)]'),
('data', 'broken condcom'),
('comment', '[endif]'),
('comment', '[if ! IE]'),
('startendtag', 'link', [('href', 'favicon.tiff')]),
('comment', '[endif]'),
('comment', '[if !IE 6]'),
('startendtag', 'img', [('src', 'firefox.png')]),
('comment', '[endif]'),
('comment', '[if !ie 6]'),
('starttag', 'b', []),
('data', 'foo'),
('endtag', 'b'),
('comment', '[endif]'),
('comment', '[if (!IE)|(lt IE 9)]'),
('startendtag', 'img', [('src', 'mammoth.bmp')]),
('comment', '[endif]')
]
self._run_check(html, expected)
@support.subTests('content', [
'just some plain text',
'',
'¬-an-entity-ref;',
"
",
[("starttag", "img", [("src", "/foo/bar.png"),
("alt", "\u4e2d\u6587")])])
self._run_check(
"",
[("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
("href", "\u30c6\u30b9\u30c8.html")])])
self._run_check(
'',
[("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
("href", "\u30c6\u30b9\u30c8.html")])])
def test_attr_entity_replacement(self):
# Expects a single start tag "a" whose attribute "b" has the five-character
# value: ampersand, greater-than, less-than, double quote, single quote.
# NOTE(review): the input literal is empty in this copy; presumably it held
# markup spelling those characters as entity references -- confirm against
# upstream.
self._run_check(
"",
[("starttag", "a", [("b", "&><\"'")])])
def test_attr_funky_names(self):
# Expects a single start tag "a" carrying three attributes whose names
# contain a dot, a colon and a hyphen ("a.b", "c:d", "e-f"), each with the
# value "v".
# NOTE(review): the input literal is empty in this copy; the markup was
# presumably stripped -- confirm against upstream.
self._run_check(
"",
[("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])])
def test_entityrefs_in_attributes(self):
# Expects a single start tag "html" whose attribute "foo" is U+20AC (the
# euro sign) followed by the literal text "&aa&unsupported;".
# Presumably this verifies that a recognised entity reference is replaced
# while unrecognised ones are passed through unchanged -- the input literal
# is empty in this copy, so confirm against upstream.
self._run_check(
"",
[("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
def test_attr_funky_names2(self):
self._run_check(
r"| " "- software-and-i" "- library |