mirror of
https://github.com/python/cpython.git
synced 2025-08-22 09:45:06 +00:00
#13960: HTMLParser is now able to handle broken comments.
This commit is contained in:
parent
32b6371460
commit
4b92cc3f79
3 changed files with 74 additions and 22 deletions
|
@ -160,7 +160,7 @@ class HTMLParser(markupbase.ParserBase):
|
||||||
elif startswith("<?", i):
|
elif startswith("<?", i):
|
||||||
k = self.parse_pi(i)
|
k = self.parse_pi(i)
|
||||||
elif startswith("<!", i):
|
elif startswith("<!", i):
|
||||||
k = self.parse_declaration(i)
|
k = self.parse_html_declaration(i)
|
||||||
elif (i + 1) < n:
|
elif (i + 1) < n:
|
||||||
self.handle_data("<")
|
self.handle_data("<")
|
||||||
k = i + 1
|
k = i + 1
|
||||||
|
@ -218,6 +218,40 @@ class HTMLParser(markupbase.ParserBase):
|
||||||
i = self.updatepos(i, n)
|
i = self.updatepos(i, n)
|
||||||
self.rawdata = rawdata[i:]
|
self.rawdata = rawdata[i:]
|
||||||
|
|
||||||
|
# Internal -- parse html declarations, return end or -1 if not terminated
|
||||||
|
# See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
|
||||||
|
# See also parse_declaration in _markupbase
|
||||||
|
def parse_html_declaration(self, i):
    """Parse the ``<!...`` construct starting at index ``i`` of ``self.rawdata``.

    Dispatches on the text that follows ``<!``:

    * ``<!--``       -> ``self.parse_comment(i)``
    * ``<![``        -> ``self.parse_marked_section(i)``
    * ``<!doctype``  (any case) -> handled here; the text between ``<!``
      and the closing ``>`` is passed to ``self.handle_decl()``
    * anything else  -> ``self.parse_bogus_comment(i)``

    For the doctype case, returns the index just past the closing ``>``,
    or -1 if no ``>`` has been seen yet.  The other cases return whatever
    the delegated parser returns.  ``self.error()`` is called if the text
    at ``i`` does not start with ``<!``.
    """
    rawdata = self.rawdata
    if rawdata[i:i+2] != '<!':
        self.error('unexpected call to parse_html_declaration()')
    if rawdata[i:i+4] == '<!--':
        return self.parse_comment(i)
    elif rawdata[i:i+3] == '<![':
        return self.parse_marked_section(i)
    elif rawdata[i:i+9].lower() == '<!doctype':
        # Find the closing '>' of *this* declaration.  The search must
        # start right after the '<!doctype' prefix at i, not at absolute
        # offset 9: otherwise a '>' that precedes the declaration in the
        # buffer would be picked up, yielding an empty declaration and an
        # end index that lies before i.
        gtpos = rawdata.find('>', i+9)
        if gtpos == -1:
            return -1
        self.handle_decl(rawdata[i+2:gtpos])
        return gtpos+1
    else:
        return self.parse_bogus_comment(i)
|
||||||
|
|
||||||
|
# Internal -- parse bogus comment, return end or -1 if not terminated
|
||||||
|
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
|
||||||
|
def parse_bogus_comment(self, i, report=1):
    """Parse a bogus comment ``<!...>`` starting at index ``i`` of ``self.rawdata``.

    Everything between ``<!`` and the next ``>`` is treated as comment
    text.  When ``report`` is true the text is passed to
    ``self.handle_comment()``.

    Returns the index just past the closing ``>``, or -1 if no ``>``
    has been seen yet.  ``self.error()`` is called if the text at ``i``
    does not start with ``<!``.
    """
    rawdata = self.rawdata
    if rawdata[i:i+2] != '<!':
        # Name this method in the message (it previously named
        # parse_comment(), which made the report misleading).
        self.error('unexpected call to parse_bogus_comment()')
    pos = rawdata.find('>', i+2)
    if pos == -1:
        return -1
    if report:
        self.handle_comment(rawdata[i+2:pos])
    return pos + 1
|
||||||
|
|
||||||
# Internal -- parse processing instr, return end or -1 if not terminated
|
# Internal -- parse processing instr, return end or -1 if not terminated
|
||||||
def parse_pi(self, i):
|
def parse_pi(self, i):
|
||||||
rawdata = self.rawdata
|
rawdata = self.rawdata
|
||||||
|
|
|
@ -114,7 +114,7 @@ comment1b-->
|
||||||
<Img sRc='Bar' isMAP>sample
|
<Img sRc='Bar' isMAP>sample
|
||||||
text
|
text
|
||||||
“
|
“
|
||||||
<!--comment2a-- --comment2b--><!>
|
<!--comment2a-- --comment2b-->
|
||||||
</Html>
|
</Html>
|
||||||
""", [
|
""", [
|
||||||
("data", "\n"),
|
("data", "\n"),
|
||||||
|
@ -142,24 +142,6 @@ text
|
||||||
("data", " foo"),
|
("data", " foo"),
|
||||||
])
|
])
|
||||||
|
|
||||||
def test_doctype_decl(self):
|
|
||||||
inside = """\
|
|
||||||
DOCTYPE html [
|
|
||||||
<!ELEMENT html - O EMPTY>
|
|
||||||
<!ATTLIST html
|
|
||||||
version CDATA #IMPLIED
|
|
||||||
profile CDATA 'DublinCore'>
|
|
||||||
<!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
|
|
||||||
<!ENTITY myEntity 'internal parsed entity'>
|
|
||||||
<!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
|
|
||||||
<!ENTITY % paramEntity 'name|name|name'>
|
|
||||||
%paramEntity;
|
|
||||||
<!-- comment -->
|
|
||||||
]"""
|
|
||||||
self._run_check("<!%s>" % inside, [
|
|
||||||
("decl", inside),
|
|
||||||
])
|
|
||||||
|
|
||||||
def test_bad_nesting(self):
|
def test_bad_nesting(self):
|
||||||
# Strangely, this *is* supposed to test that overlapping
|
# Strangely, this *is* supposed to test that overlapping
|
||||||
# elements are allowed. HTMLParser is more geared toward
|
# elements are allowed. HTMLParser is more geared toward
|
||||||
|
@ -182,7 +164,8 @@ DOCTYPE html [
|
||||||
])
|
])
|
||||||
|
|
||||||
def test_illegal_declarations(self):
|
def test_illegal_declarations(self):
|
||||||
self._parse_error('<!spacer type="block" height="25">')
|
self._run_check('<!spacer type="block" height="25">',
|
||||||
|
[('comment', 'spacer type="block" height="25"')])
|
||||||
|
|
||||||
def test_starttag_end_boundary(self):
|
def test_starttag_end_boundary(self):
|
||||||
self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
|
self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
|
||||||
|
@ -233,7 +216,7 @@ DOCTYPE html [
|
||||||
self._parse_error("<a foo='>")
|
self._parse_error("<a foo='>")
|
||||||
|
|
||||||
def test_declaration_junk_chars(self):
|
def test_declaration_junk_chars(self):
|
||||||
self._parse_error("<!DOCTYPE foo $ >")
|
self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
|
||||||
|
|
||||||
def test_startendtag(self):
|
def test_startendtag(self):
|
||||||
self._run_check("<p/>", [
|
self._run_check("<p/>", [
|
||||||
|
@ -449,6 +432,39 @@ class AttributesTestCase(TestCaseBase):
|
||||||
[("href", "http://www.example.org/\">;")]),
|
[("href", "http://www.example.org/\">;")]),
|
||||||
("data", "spam"), ("endtag", "a")])
|
("data", "spam"), ("endtag", "a")])
|
||||||
|
|
||||||
|
def test_comments(self):
|
||||||
|
html = ("<!-- I'm a valid comment -->"
|
||||||
|
'<!--me too!-->'
|
||||||
|
'<!------>'
|
||||||
|
'<!---->'
|
||||||
|
'<!----I have many hyphens---->'
|
||||||
|
'<!-- I have a > in the middle -->'
|
||||||
|
'<!-- and I have -- in the middle! -->')
|
||||||
|
expected = [('comment', " I'm a valid comment "),
|
||||||
|
('comment', 'me too!'),
|
||||||
|
('comment', '--'),
|
||||||
|
('comment', ''),
|
||||||
|
('comment', '--I have many hyphens--'),
|
||||||
|
('comment', ' I have a > in the middle '),
|
||||||
|
('comment', ' and I have -- in the middle! ')]
|
||||||
|
self._run_check(html, expected)
|
||||||
|
|
||||||
|
def test_broken_comments(self):
|
||||||
|
html = ('<! not really a comment >'
|
||||||
|
'<! not a comment either -->'
|
||||||
|
'<! -- close enough -->'
|
||||||
|
'<!><!<-- this was an empty comment>'
|
||||||
|
'<!!! another bogus comment !!!>')
|
||||||
|
expected = [
|
||||||
|
('comment', ' not really a comment '),
|
||||||
|
('comment', ' not a comment either --'),
|
||||||
|
('comment', ' -- close enough --'),
|
||||||
|
('comment', ''),
|
||||||
|
('comment', '<-- this was an empty comment'),
|
||||||
|
('comment', '!! another bogus comment !!!'),
|
||||||
|
]
|
||||||
|
self._run_check(html, expected)
|
||||||
|
|
||||||
def test_condcoms(self):
|
def test_condcoms(self):
|
||||||
html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->'
|
html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->'
|
||||||
'<!--[if IE 8]>condcoms<![endif]-->'
|
'<!--[if IE 8]>condcoms<![endif]-->'
|
||||||
|
|
|
@ -90,6 +90,8 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #13960: HTMLParser is now able to handle broken comments.
|
||||||
|
|
||||||
- Issue #9750: Fix sqlite3.Connection.iterdump on tables and fields
|
- Issue #9750: Fix sqlite3.Connection.iterdump on tables and fields
|
||||||
with a name that is a keyword or contains quotes. Patch by Marko
|
with a name that is a keyword or contains quotes. Patch by Marko
|
||||||
Kohtala.
|
Kohtala.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue