mirror of
https://github.com/python/cpython.git
synced 2025-08-22 01:35:16 +00:00
#13960: HTMLParser is now able to handle broken comments.
This commit is contained in:
parent
32b6371460
commit
4b92cc3f79
3 changed files with 74 additions and 22 deletions
|
@ -160,7 +160,7 @@ class HTMLParser(markupbase.ParserBase):
|
|||
elif startswith("<?", i):
|
||||
k = self.parse_pi(i)
|
||||
elif startswith("<!", i):
|
||||
k = self.parse_declaration(i)
|
||||
k = self.parse_html_declaration(i)
|
||||
elif (i + 1) < n:
|
||||
self.handle_data("<")
|
||||
k = i + 1
|
||||
|
@ -218,6 +218,40 @@ class HTMLParser(markupbase.ParserBase):
|
|||
i = self.updatepos(i, n)
|
||||
self.rawdata = rawdata[i:]
|
||||
|
||||
# Internal -- parse html declarations, return end index or -1 if not terminated
|
||||
# See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
|
||||
# See also parse_declaration in _markupbase
|
||||
def parse_html_declaration(self, i):
    """Parse the ``<!...`` construct that starts at ``self.rawdata[i]``.

    Dispatches on what follows ``<!``:

    * ``<!--``  -> delegated to ``self.parse_comment(i)``
    * ``<![``   -> delegated to ``self.parse_marked_section(i)``
    * ``<!doctype`` (any letter case) -> the text between ``<!`` and the
      next ``>`` is passed to ``self.handle_decl()``
    * anything else -> delegated to ``self.parse_bogus_comment(i)``

    Returns the index just past the closing ``>`` for a doctype, -1 if
    the doctype is not terminated yet, or whatever the delegated parser
    returns for the other branches.
    """
    rawdata = self.rawdata
    if rawdata[i:i+2] != '<!':
        self.error('unexpected call to parse_html_declaration()')
    if rawdata[i:i+4] == '<!--':
        return self.parse_comment(i)
    elif rawdata[i:i+3] == '<![':
        return self.parse_marked_section(i)
    elif rawdata[i:i+9].lower() == '<!doctype':
        # Find the closing '>' *after* this declaration's keyword.  The
        # search must be relative to i: starting at a fixed offset would
        # match a '>' that belongs to earlier markup whenever i > 0,
        # yielding an empty declaration and an end index before i.
        gtpos = rawdata.find('>', i+9)
        if gtpos == -1:
            return -1
        self.handle_decl(rawdata[i+2:gtpos])
        return gtpos+1
    else:
        return self.parse_bogus_comment(i)
|
||||
|
||||
# Internal -- parse bogus comment, return end index or -1 if not terminated
|
||||
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
|
||||
def parse_bogus_comment(self, i, report=1):
    """Parse a ``<!...>`` construct that is not a real comment or doctype.

    Everything between ``<!`` at index ``i`` and the next ``>`` is
    treated as the comment text.

    i      -- index in ``self.rawdata`` where ``<!`` starts
    report -- when true, pass the text to ``self.handle_comment()``

    Returns the index just past the closing ``>``, or -1 if no ``>``
    has been seen yet.
    """
    rawdata = self.rawdata
    if rawdata[i:i+2] != '<!':
        # Name this method in the message; it previously named
        # parse_comment(), which pointed at the wrong caller.
        self.error('unexpected call to parse_bogus_comment()')
    pos = rawdata.find('>', i+2)
    if pos == -1:
        return -1
    if report:
        self.handle_comment(rawdata[i+2:pos])
    return pos + 1
|
||||
|
||||
# Internal -- parse processing instr, return end or -1 if not terminated
|
||||
def parse_pi(self, i):
|
||||
rawdata = self.rawdata
|
||||
|
|
|
@ -114,7 +114,7 @@ comment1b-->
|
|||
<Img sRc='Bar' isMAP>sample
|
||||
text
|
||||
“
|
||||
<!--comment2a-- --comment2b--><!>
|
||||
<!--comment2a-- --comment2b-->
|
||||
</Html>
|
||||
""", [
|
||||
("data", "\n"),
|
||||
|
@ -142,24 +142,6 @@ text
|
|||
("data", " foo"),
|
||||
])
|
||||
|
||||
def test_doctype_decl(self):
    """A doctype with an internal subset is reported as one decl event."""
    # NOTE(review): the hunk header for this region (-142,24 +142,6)
    # suggests these lines are the removed side of the diff; the
    # declaration body contains '>' characters -- confirm whether this
    # test is still meant to exist.
    decl_body = """\
DOCTYPE html [
<!ELEMENT html - O EMPTY>
<!ATTLIST html
version CDATA #IMPLIED
profile CDATA 'DublinCore'>
<!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
<!ENTITY myEntity 'internal parsed entity'>
<!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
<!ENTITY % paramEntity 'name|name|name'>
%paramEntity;
<!-- comment -->
]"""
    markup = "<!%s>" % decl_body
    events = [("decl", decl_body)]
    self._run_check(markup, events)
|
||||
|
||||
def test_bad_nesting(self):
|
||||
# Strangely, this *is* supposed to test that overlapping
|
||||
# elements are allowed. HTMLParser is more geared toward
|
||||
|
@ -182,7 +164,8 @@ DOCTYPE html [
|
|||
])
|
||||
|
||||
def test_illegal_declarations(self):
    """An unknown ``<!...>`` declaration is reported as a comment."""
    # '<!spacer ...>' is not a comment, marked section or doctype, so
    # parse_html_declaration() hands it to parse_bogus_comment(), which
    # reports the text between '<!' and '>'.  The former expectation
    # that the same input is a parse error contradicted this check and
    # has been dropped.
    self._run_check('<!spacer type="block" height="25">',
                    [('comment', 'spacer type="block" height="25"')])
|
||||
|
||||
def test_starttag_end_boundary(self):
|
||||
self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
|
||||
|
@ -233,7 +216,7 @@ DOCTYPE html [
|
|||
self._parse_error("<a foo='>")
|
||||
|
||||
def test_declaration_junk_chars(self):
    """Junk characters inside a doctype are passed through verbatim."""
    # parse_html_declaration() forwards everything between '<!' and the
    # first '>' to handle_decl() without validating it.  The former
    # expectation that the same input is a parse error contradicted this
    # check and has been dropped.
    self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
|
||||
|
||||
def test_startendtag(self):
|
||||
self._run_check("<p/>", [
|
||||
|
@ -449,6 +432,39 @@ class AttributesTestCase(TestCaseBase):
|
|||
[("href", "http://www.example.org/\">;")]),
|
||||
("data", "spam"), ("endtag", "a")])
|
||||
|
||||
def test_comments(self):
    """Each well-formed ``<!--...-->`` comment yields one comment event."""
    # (markup, text expected in the resulting comment event)
    cases = [
        ("<!-- I'm a valid comment -->", " I'm a valid comment "),
        ('<!--me too!-->', 'me too!'),
        ('<!------>', '--'),
        ('<!---->', ''),
        ('<!----I have many hyphens---->', '--I have many hyphens--'),
        ('<!-- I have a > in the middle -->', ' I have a > in the middle '),
        ('<!-- and I have -- in the middle! -->',
         ' and I have -- in the middle! '),
    ]
    # The comments are fed to the parser back to back as one document.
    markup = ''.join(source for source, _ in cases)
    events = [('comment', text) for _, text in cases]
    self._run_check(markup, events)
|
||||
|
||||
def test_broken_comments(self):
    """Malformed ``<!...>`` constructs are reported as comment events."""
    # (markup, texts of the comment events that markup should produce)
    cases = [
        ('<! not really a comment >', (' not really a comment ',)),
        ('<! not a comment either -->', (' not a comment either --',)),
        ('<! -- close enough -->', (' -- close enough --',)),
        # Two bogus comments in one chunk: an empty one, then a second.
        ('<!><!<-- this was an empty comment>',
         ('', '<-- this was an empty comment')),
        ('<!!! another bogus comment !!!>',
         ('!! another bogus comment !!!',)),
    ]
    markup = ''.join(source for source, _ in cases)
    events = []
    for _, texts in cases:
        for text in texts:
            events.append(('comment', text))
    self._run_check(markup, events)
|
||||
|
||||
def test_condcoms(self):
|
||||
html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->'
|
||||
'<!--[if IE 8]>condcoms<![endif]-->'
|
||||
|
|
|
@ -90,6 +90,8 @@ Core and Builtins
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #13960: HTMLParser is now able to handle broken comments.
|
||||
|
||||
- Issue #9750: Fix sqlite3.Connection.iterdump on tables and fields
|
||||
with a name that is a keyword or contains quotes. Patch by Marko
|
||||
Kohtala.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue