mirror of
https://github.com/python/cpython.git
synced 2025-07-07 19:35:27 +00:00
gh-102555: Fix comment parsing in HTMLParser according to the HTML5 standard (GH-135664)
Some checks failed
Tests / (push) Blocked by required conditions
Tests / Ubuntu SSL tests with OpenSSL (push) Blocked by required conditions
Tests / Change detection (push) Waiting to run
Tests / Docs (push) Blocked by required conditions
Tests / Windows MSI (push) Blocked by required conditions
Tests / Check if Autoconf files are up to date (push) Blocked by required conditions
Tests / Check if generated files are up to date (push) Blocked by required conditions
Tests / WASI (push) Blocked by required conditions
Tests / Hypothesis tests on Ubuntu (push) Blocked by required conditions
Tests / Address sanitizer (push) Blocked by required conditions
Tests / Undefined behavior sanitizer (push) Blocked by required conditions
Tests / Cross build Linux (push) Blocked by required conditions
Tests / CIFuzz (push) Blocked by required conditions
Tests / All required checks pass (push) Blocked by required conditions
Lint / lint (push) Waiting to run
mypy / Run mypy on Lib/_pyrepl (push) Waiting to run
mypy / Run mypy on Lib/test/libregrtest (push) Waiting to run
mypy / Run mypy on Lib/tomllib (push) Waiting to run
mypy / Run mypy on Tools/build (push) Waiting to run
mypy / Run mypy on Tools/cases_generator (push) Waiting to run
mypy / Run mypy on Tools/clinic (push) Waiting to run
mypy / Run mypy on Tools/jit (push) Waiting to run
mypy / Run mypy on Tools/peg_generator (push) Waiting to run
JIT / Interpreter (Debug) (push) Has been cancelled
Tail calling interpreter / aarch64-apple-darwin/clang (push) Has been cancelled
Tail calling interpreter / aarch64-unknown-linux-gnu/gcc (push) Has been cancelled
Tail calling interpreter / x86_64-pc-windows-msvc/msvc (push) Has been cancelled
Tail calling interpreter / x86_64-apple-darwin/clang (push) Has been cancelled
Tail calling interpreter / free-threading (push) Has been cancelled
Tail calling interpreter / x86_64-unknown-linux-gnu/gcc (push) Has been cancelled
JIT / aarch64-pc-windows-msvc/msvc (Release) (push) Has been cancelled
JIT / aarch64-pc-windows-msvc/msvc (Debug) (push) Has been cancelled
JIT / i686-pc-windows-msvc/msvc (Release) (push) Has been cancelled
JIT / i686-pc-windows-msvc/msvc (Debug) (push) Has been cancelled
JIT / aarch64-apple-darwin/clang (Release) (push) Has been cancelled
JIT / aarch64-unknown-linux-gnu/gcc (Release) (push) Has been cancelled
JIT / aarch64-apple-darwin/clang (Debug) (push) Has been cancelled
JIT / aarch64-unknown-linux-gnu/gcc (Debug) (push) Has been cancelled
JIT / x86_64-pc-windows-msvc/msvc (Release) (push) Has been cancelled
JIT / x86_64-pc-windows-msvc/msvc (Debug) (push) Has been cancelled
JIT / x86_64-apple-darwin/clang (Release) (push) Has been cancelled
JIT / x86_64-unknown-linux-gnu/gcc (Release) (push) Has been cancelled
JIT / x86_64-apple-darwin/clang (Debug) (push) Has been cancelled
JIT / x86_64-unknown-linux-gnu/gcc (Debug) (push) Has been cancelled
Some checks failed
Tests / (push) Blocked by required conditions
Tests / Ubuntu SSL tests with OpenSSL (push) Blocked by required conditions
Tests / Change detection (push) Waiting to run
Tests / Docs (push) Blocked by required conditions
Tests / Windows MSI (push) Blocked by required conditions
Tests / Check if Autoconf files are up to date (push) Blocked by required conditions
Tests / Check if generated files are up to date (push) Blocked by required conditions
Tests / WASI (push) Blocked by required conditions
Tests / Hypothesis tests on Ubuntu (push) Blocked by required conditions
Tests / Address sanitizer (push) Blocked by required conditions
Tests / Undefined behavior sanitizer (push) Blocked by required conditions
Tests / Cross build Linux (push) Blocked by required conditions
Tests / CIFuzz (push) Blocked by required conditions
Tests / All required checks pass (push) Blocked by required conditions
Lint / lint (push) Waiting to run
mypy / Run mypy on Lib/_pyrepl (push) Waiting to run
mypy / Run mypy on Lib/test/libregrtest (push) Waiting to run
mypy / Run mypy on Lib/tomllib (push) Waiting to run
mypy / Run mypy on Tools/build (push) Waiting to run
mypy / Run mypy on Tools/cases_generator (push) Waiting to run
mypy / Run mypy on Tools/clinic (push) Waiting to run
mypy / Run mypy on Tools/jit (push) Waiting to run
mypy / Run mypy on Tools/peg_generator (push) Waiting to run
JIT / Interpreter (Debug) (push) Has been cancelled
Tail calling interpreter / aarch64-apple-darwin/clang (push) Has been cancelled
Tail calling interpreter / aarch64-unknown-linux-gnu/gcc (push) Has been cancelled
Tail calling interpreter / x86_64-pc-windows-msvc/msvc (push) Has been cancelled
Tail calling interpreter / x86_64-apple-darwin/clang (push) Has been cancelled
Tail calling interpreter / free-threading (push) Has been cancelled
Tail calling interpreter / x86_64-unknown-linux-gnu/gcc (push) Has been cancelled
JIT / aarch64-pc-windows-msvc/msvc (Release) (push) Has been cancelled
JIT / aarch64-pc-windows-msvc/msvc (Debug) (push) Has been cancelled
JIT / i686-pc-windows-msvc/msvc (Release) (push) Has been cancelled
JIT / i686-pc-windows-msvc/msvc (Debug) (push) Has been cancelled
JIT / aarch64-apple-darwin/clang (Release) (push) Has been cancelled
JIT / aarch64-unknown-linux-gnu/gcc (Release) (push) Has been cancelled
JIT / aarch64-apple-darwin/clang (Debug) (push) Has been cancelled
JIT / aarch64-unknown-linux-gnu/gcc (Debug) (push) Has been cancelled
JIT / x86_64-pc-windows-msvc/msvc (Release) (push) Has been cancelled
JIT / x86_64-pc-windows-msvc/msvc (Debug) (push) Has been cancelled
JIT / x86_64-apple-darwin/clang (Release) (push) Has been cancelled
JIT / x86_64-unknown-linux-gnu/gcc (Release) (push) Has been cancelled
JIT / x86_64-apple-darwin/clang (Debug) (push) Has been cancelled
JIT / x86_64-unknown-linux-gnu/gcc (Debug) (push) Has been cancelled
* "--!>" now ends the comment. * "-- >" no longer ends the comment. * Support abnormally ended empty comments "<!-->" and "<!--->". --------- Co-author: Kerim Kabirov <the.privat33r+gh@pm.me> Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
This commit is contained in:
parent
b582d751b4
commit
8ac7613dc8
3 changed files with 50 additions and 3 deletions
|
@ -29,7 +29,8 @@ attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=
|
||||||
starttagopen = re.compile('<[a-zA-Z]')
|
starttagopen = re.compile('<[a-zA-Z]')
|
||||||
endtagopen = re.compile('</[a-zA-Z]')
|
endtagopen = re.compile('</[a-zA-Z]')
|
||||||
piclose = re.compile('>')
|
piclose = re.compile('>')
|
||||||
commentclose = re.compile(r'--\s*>')
|
commentclose = re.compile(r'--!?>')
|
||||||
|
commentabruptclose = re.compile(r'-?>')
|
||||||
# Note:
|
# Note:
|
||||||
# 1) if you change tagfind/attrfind remember to update locatetagend too;
|
# 1) if you change tagfind/attrfind remember to update locatetagend too;
|
||||||
# 2) if you change tagfind/attrfind and/or locatetagend the parser will
|
# 2) if you change tagfind/attrfind and/or locatetagend the parser will
|
||||||
|
@ -336,6 +337,21 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
else:
|
else:
|
||||||
return self.parse_bogus_comment(i)
|
return self.parse_bogus_comment(i)
|
||||||
|
|
||||||
|
# Internal -- parse comment, return end index or -1 if not terminated
# see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
def parse_comment(self, i, report=True):
    """Parse the comment that starts with '<!--' at index *i* of rawdata.

    If *report* is true, handle_comment() is called with the text between
    the opening '<!--' and the start of the terminator.

    Return the index just past the terminator, or -1 if no terminator is
    present in rawdata yet.
    """
    rawdata = self.rawdata
    assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
    # An abruptly closed empty comment ('<!-->' or '<!--->') ends right
    # after the opening delimiter.  It has to be tried before searching
    # for a regular terminator: otherwise a '-->' or '--!>' occurring
    # later in the buffer would be found first and everything up to it
    # would be reported as comment text, making the result depend on how
    # the input was split across feed() calls.
    match = commentabruptclose.match(rawdata, i+4)
    if not match:
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
    if report:
        j = match.start()
        self.handle_comment(rawdata[i+4: j])
    return match.end()
|
||||||
|
|
||||||
# Internal -- parse bogus comment, return length or -1 if not terminated
|
# Internal -- parse bogus comment, return length or -1 if not terminated
|
||||||
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
|
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
|
||||||
def parse_bogus_comment(self, i, report=1):
|
def parse_bogus_comment(self, i, report=1):
|
||||||
|
|
|
@ -367,17 +367,45 @@ text
|
||||||
html = ("<!-- I'm a valid comment -->"
|
html = ("<!-- I'm a valid comment -->"
|
||||||
'<!--me too!-->'
|
'<!--me too!-->'
|
||||||
'<!------>'
|
'<!------>'
|
||||||
|
'<!----->'
|
||||||
'<!---->'
|
'<!---->'
|
||||||
|
# abrupt-closing-of-empty-comment
|
||||||
|
'<!--->'
|
||||||
|
'<!-->'
|
||||||
'<!----I have many hyphens---->'
|
'<!----I have many hyphens---->'
|
||||||
'<!-- I have a > in the middle -->'
|
'<!-- I have a > in the middle -->'
|
||||||
'<!-- and I have -- in the middle! -->')
|
'<!-- and I have -- in the middle! -->'
|
||||||
|
'<!--incorrectly-closed-comment--!>'
|
||||||
|
'<!----!>'
|
||||||
|
'<!----!-->'
|
||||||
|
'<!---- >-->'
|
||||||
|
'<!---!>-->'
|
||||||
|
'<!--!>-->'
|
||||||
|
# nested-comment
|
||||||
|
'<!-- <!-- nested --> -->'
|
||||||
|
'<!--<!-->'
|
||||||
|
'<!--<!--!>'
|
||||||
|
)
|
||||||
expected = [('comment', " I'm a valid comment "),
|
expected = [('comment', " I'm a valid comment "),
|
||||||
('comment', 'me too!'),
|
('comment', 'me too!'),
|
||||||
('comment', '--'),
|
('comment', '--'),
|
||||||
|
('comment', '-'),
|
||||||
|
('comment', ''),
|
||||||
|
('comment', ''),
|
||||||
('comment', ''),
|
('comment', ''),
|
||||||
('comment', '--I have many hyphens--'),
|
('comment', '--I have many hyphens--'),
|
||||||
('comment', ' I have a > in the middle '),
|
('comment', ' I have a > in the middle '),
|
||||||
('comment', ' and I have -- in the middle! ')]
|
('comment', ' and I have -- in the middle! '),
|
||||||
|
('comment', 'incorrectly-closed-comment'),
|
||||||
|
('comment', ''),
|
||||||
|
('comment', '--!'),
|
||||||
|
('comment', '-- >'),
|
||||||
|
('comment', '-!>'),
|
||||||
|
('comment', '!>'),
|
||||||
|
('comment', ' <!-- nested '), ('data', ' -->'),
|
||||||
|
('comment', '<!'),
|
||||||
|
('comment', '<!'),
|
||||||
|
]
|
||||||
self._run_check(html, expected)
|
self._run_check(html, expected)
|
||||||
|
|
||||||
def test_condcoms(self):
|
def test_condcoms(self):
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
Fix comment parsing in :class:`html.parser.HTMLParser` according to the
|
||||||
|
HTML5 standard. ``--!>`` now ends the comment. ``-- >`` no longer ends the
|
||||||
|
comment. Support abnormally ended empty comments ``<!-->`` and ``<!--->``.
|
Loading…
Add table
Add a link
Reference in a new issue