[3.11] gh-102555: Fix comment parsing in HTMLParser according to the HTML5 standard (GH-135664) (GH-136274)

* "--!>" now ends the comment.
* "-- >" no longer ends the comment.
* Support abnormally ended empty comments "<-->" and "<--->".

---------
(cherry picked from commit 8ac7613dc8)


Co-author: Kerim Kabirov <the.privat33r+gh@pm.me>

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
This commit is contained in:
Miss Islington (bot) 2025-07-12 14:24:39 +02:00 committed by GitHub
parent f3c6f882cd
commit d8f6297e6d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 50 additions and 3 deletions

View file

@ -27,7 +27,8 @@ charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
starttagopen = re.compile('<[a-zA-Z]')
endtagopen = re.compile('</[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
commentclose = re.compile(r'--!?>')
commentabruptclose = re.compile(r'-?>')
# Note:
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
@ -290,6 +291,21 @@ class HTMLParser(_markupbase.ParserBase):
else:
return self.parse_bogus_comment(i)
# Internal -- parse comment, return length or -1 if not terminated
# see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
def parse_comment(self, i, report=True):
rawdata = self.rawdata
assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
match = commentclose.search(rawdata, i+4)
if not match:
match = commentabruptclose.match(rawdata, i+4)
if not match:
return -1
if report:
j = match.start()
self.handle_comment(rawdata[i+4: j])
return match.end()
# Internal -- parse bogus comment, return length or -1 if not terminated
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
def parse_bogus_comment(self, i, report=1):

View file

@ -321,17 +321,45 @@ text
html = ("<!-- I'm a valid comment -->"
'<!--me too!-->'
'<!------>'
'<!----->'
'<!---->'
# abrupt-closing-of-empty-comment
'<!--->'
'<!-->'
'<!----I have many hyphens---->'
'<!-- I have a > in the middle -->'
'<!-- and I have -- in the middle! -->')
'<!-- and I have -- in the middle! -->'
'<!--incorrectly-closed-comment--!>'
'<!----!>'
'<!----!-->'
'<!---- >-->'
'<!---!>-->'
'<!--!>-->'
# nested-comment
'<!-- <!-- nested --> -->'
'<!--<!-->'
'<!--<!--!>'
)
expected = [('comment', " I'm a valid comment "),
('comment', 'me too!'),
('comment', '--'),
('comment', '-'),
('comment', ''),
('comment', ''),
('comment', ''),
('comment', '--I have many hyphens--'),
('comment', ' I have a > in the middle '),
('comment', ' and I have -- in the middle! ')]
('comment', ' and I have -- in the middle! '),
('comment', 'incorrectly-closed-comment'),
('comment', ''),
('comment', '--!'),
('comment', '-- >'),
('comment', '-!>'),
('comment', '!>'),
('comment', ' <!-- nested '), ('data', ' -->'),
('comment', '<!'),
('comment', '<!'),
]
self._run_check(html, expected)
def test_condcoms(self):

View file

@ -0,0 +1,3 @@
Fix comment parsing in :class:`html.parser.HTMLParser` according to the
HTML5 standard. ``--!>`` now ends the comment. ``-- >`` no longer ends the
comment. Support abnormally ended empty comments ``<-->`` and ``<--->``.