Mirror of https://github.com/python/cpython.git (synced 2025-07-07 19:35:27 +00:00)
[3.10] gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464) (GH-135485)
Some checks failed
Tests / Check for source changes (push) Has been cancelled
Tests / Ubuntu (push) Has been cancelled
Tests / Check if the ABI has changed (push) Has been cancelled
Tests / Check if generated files are up to date (push) Has been cancelled
Tests / Windows (x86) (push) Has been cancelled
Tests / Windows (x64) (push) Has been cancelled
Tests / macOS (push) Has been cancelled
Tests / Ubuntu SSL tests with OpenSSL (push) Has been cancelled
Some checks failed
Tests / Check for source changes (push) Has been cancelled
Tests / Ubuntu (push) Has been cancelled
Tests / Check if the ABI has changed (push) Has been cancelled
Tests / Check if generated files are up to date (push) Has been cancelled
Tests / Windows (x86) (push) Has been cancelled
Tests / Windows (x64) (push) Has been cancelled
Tests / macOS (push) Has been cancelled
Tests / Ubuntu SSL tests with OpenSSL (push) Has been cancelled
End-of-file errors are now handled according to the HTML5 specs --
comments and declarations are automatically closed, tags are ignored.
(cherry picked from commit 6eb6c5dbfb)
This commit is contained in:
parent f297a2292c
commit fdc9d214c0
3 changed files with 117 additions and 23 deletions
|
@ -25,6 +25,7 @@ entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
|
||||||
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
|
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
|
||||||
|
|
||||||
starttagopen = re.compile('<[a-zA-Z]')
|
starttagopen = re.compile('<[a-zA-Z]')
|
||||||
|
endtagopen = re.compile('</[a-zA-Z]')
|
||||||
piclose = re.compile('>')
|
piclose = re.compile('>')
|
||||||
commentclose = re.compile(r'--\s*>')
|
commentclose = re.compile(r'--\s*>')
|
||||||
# Note:
|
# Note:
|
||||||
|
@ -176,7 +177,7 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
k = self.parse_pi(i)
|
k = self.parse_pi(i)
|
||||||
elif startswith("<!", i):
|
elif startswith("<!", i):
|
||||||
k = self.parse_html_declaration(i)
|
k = self.parse_html_declaration(i)
|
||||||
elif (i + 1) < n:
|
elif (i + 1) < n or end:
|
||||||
self.handle_data("<")
|
self.handle_data("<")
|
||||||
k = i + 1
|
k = i + 1
|
||||||
else:
|
else:
|
||||||
|
@ -184,17 +185,35 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
if k < 0:
|
if k < 0:
|
||||||
if not end:
|
if not end:
|
||||||
break
|
break
|
||||||
k = rawdata.find('>', i + 1)
|
if starttagopen.match(rawdata, i): # < + letter
|
||||||
if k < 0:
|
pass
|
||||||
k = rawdata.find('<', i + 1)
|
elif startswith("</", i):
|
||||||
if k < 0:
|
if i + 2 == n:
|
||||||
k = i + 1
|
self.handle_data("</")
|
||||||
|
elif endtagopen.match(rawdata, i): # </ + letter
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
# bogus comment
|
||||||
|
self.handle_comment(rawdata[i+2:])
|
||||||
|
elif startswith("<!--", i):
|
||||||
|
j = n
|
||||||
|
for suffix in ("--!", "--", "-"):
|
||||||
|
if rawdata.endswith(suffix, i+4):
|
||||||
|
j -= len(suffix)
|
||||||
|
break
|
||||||
|
self.handle_comment(rawdata[i+4:j])
|
||||||
|
elif startswith("<![CDATA[", i):
|
||||||
|
self.unknown_decl(rawdata[i+3:])
|
||||||
|
elif rawdata[i:i+9].lower() == '<!doctype':
|
||||||
|
self.handle_decl(rawdata[i+2:])
|
||||||
|
elif startswith("<!", i):
|
||||||
|
# bogus comment
|
||||||
|
self.handle_comment(rawdata[i+2:])
|
||||||
|
elif startswith("<?", i):
|
||||||
|
self.handle_pi(rawdata[i+2:])
|
||||||
else:
|
else:
|
||||||
k += 1
|
raise AssertionError("we should not get here!")
|
||||||
if self.convert_charrefs and not self.cdata_elem:
|
k = n
|
||||||
self.handle_data(unescape(rawdata[i:k]))
|
|
||||||
else:
|
|
||||||
self.handle_data(rawdata[i:k])
|
|
||||||
i = self.updatepos(i, k)
|
i = self.updatepos(i, k)
|
||||||
elif startswith("&#", i):
|
elif startswith("&#", i):
|
||||||
match = charref.match(rawdata, i)
|
match = charref.match(rawdata, i)
|
||||||
|
|
|
@ -4,6 +4,8 @@ import html.parser
|
||||||
import pprint
|
import pprint
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
|
from test import support
|
||||||
|
|
||||||
|
|
||||||
class EventCollector(html.parser.HTMLParser):
|
class EventCollector(html.parser.HTMLParser):
|
||||||
|
|
||||||
|
@ -391,28 +393,34 @@ text
|
||||||
('data', '<'),
|
('data', '<'),
|
||||||
('starttag', 'bc<', [('a', None)]),
|
('starttag', 'bc<', [('a', None)]),
|
||||||
('endtag', 'html'),
|
('endtag', 'html'),
|
||||||
('data', '\n<img src="URL>'),
|
('data', '\n')])
|
||||||
('comment', '/img'),
|
|
||||||
('endtag', 'html<')])
|
|
||||||
|
|
||||||
def test_starttag_junk_chars(self):
|
def test_starttag_junk_chars(self):
|
||||||
|
self._run_check("<", [('data', '<')])
|
||||||
|
self._run_check("<>", [('data', '<>')])
|
||||||
|
self._run_check("< >", [('data', '< >')])
|
||||||
|
self._run_check("< ", [('data', '< ')])
|
||||||
self._run_check("</>", [])
|
self._run_check("</>", [])
|
||||||
|
self._run_check("<$>", [('data', '<$>')])
|
||||||
self._run_check("</$>", [('comment', '$')])
|
self._run_check("</$>", [('comment', '$')])
|
||||||
self._run_check("</", [('data', '</')])
|
self._run_check("</", [('data', '</')])
|
||||||
self._run_check("</a", [('data', '</a')])
|
self._run_check("</a", [])
|
||||||
|
self._run_check("</ a>", [('endtag', 'a')])
|
||||||
|
self._run_check("</ a", [('comment', ' a')])
|
||||||
self._run_check("<a<a>", [('starttag', 'a<a', [])])
|
self._run_check("<a<a>", [('starttag', 'a<a', [])])
|
||||||
self._run_check("</a<a>", [('endtag', 'a<a')])
|
self._run_check("</a<a>", [('endtag', 'a<a')])
|
||||||
self._run_check("<!", [('data', '<!')])
|
self._run_check("<!", [('comment', '')])
|
||||||
self._run_check("<a", [('data', '<a')])
|
self._run_check("<a", [])
|
||||||
self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
|
self._run_check("<a foo='bar'", [])
|
||||||
self._run_check("<a foo='bar", [('data', "<a foo='bar")])
|
self._run_check("<a foo='bar", [])
|
||||||
self._run_check("<a foo='>'", [('data', "<a foo='>'")])
|
self._run_check("<a foo='>'", [])
|
||||||
self._run_check("<a foo='>", [('data', "<a foo='>")])
|
self._run_check("<a foo='>", [])
|
||||||
self._run_check("<a$>", [('starttag', 'a$', [])])
|
self._run_check("<a$>", [('starttag', 'a$', [])])
|
||||||
self._run_check("<a$b>", [('starttag', 'a$b', [])])
|
self._run_check("<a$b>", [('starttag', 'a$b', [])])
|
||||||
self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
|
self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
|
||||||
self._run_check("<a$b >", [('starttag', 'a$b', [])])
|
self._run_check("<a$b >", [('starttag', 'a$b', [])])
|
||||||
self._run_check("<a$b />", [('startendtag', 'a$b', [])])
|
self._run_check("<a$b />", [('startendtag', 'a$b', [])])
|
||||||
|
self._run_check("</a$b>", [('endtag', 'a$b')])
|
||||||
|
|
||||||
def test_slashes_in_starttag(self):
|
def test_slashes_in_starttag(self):
|
||||||
self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
|
self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
|
||||||
|
@ -537,13 +545,56 @@ text
|
||||||
for html, expected in data:
|
for html, expected in data:
|
||||||
self._run_check(html, expected)
|
self._run_check(html, expected)
|
||||||
|
|
||||||
def test_broken_comments(self):
|
def test_eof_in_comments(self):
|
||||||
html = ('<! not really a comment >'
|
data = [
|
||||||
|
('<!--', [('comment', '')]),
|
||||||
|
('<!---', [('comment', '')]),
|
||||||
|
('<!----', [('comment', '')]),
|
||||||
|
('<!-----', [('comment', '-')]),
|
||||||
|
('<!------', [('comment', '--')]),
|
||||||
|
('<!----!', [('comment', '')]),
|
||||||
|
('<!---!', [('comment', '-!')]),
|
||||||
|
('<!---!>', [('comment', '-!>')]),
|
||||||
|
('<!--foo', [('comment', 'foo')]),
|
||||||
|
('<!--foo-', [('comment', 'foo')]),
|
||||||
|
('<!--foo--', [('comment', 'foo')]),
|
||||||
|
('<!--foo--!', [('comment', 'foo')]),
|
||||||
|
('<!--<!--', [('comment', '<!')]),
|
||||||
|
('<!--<!--!', [('comment', '<!')]),
|
||||||
|
]
|
||||||
|
for html, expected in data:
|
||||||
|
self._run_check(html, expected)
|
||||||
|
|
||||||
|
def test_eof_in_declarations(self):
|
||||||
|
data = [
|
||||||
|
('<!', [('comment', '')]),
|
||||||
|
('<!-', [('comment', '-')]),
|
||||||
|
('<![', [('comment', '[')]),
|
||||||
|
('<![CDATA[', [('unknown decl', 'CDATA[')]),
|
||||||
|
('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
|
||||||
|
('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
|
||||||
|
('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
|
||||||
|
('<!DOCTYPE', [('decl', 'DOCTYPE')]),
|
||||||
|
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
|
||||||
|
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
|
||||||
|
('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
|
||||||
|
('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
|
||||||
|
('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
|
||||||
|
('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
|
||||||
|
[('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
|
||||||
|
]
|
||||||
|
for html, expected in data:
|
||||||
|
self._run_check(html, expected)
|
||||||
|
|
||||||
|
def test_bogus_comments(self):
|
||||||
|
html = ('<!ELEMENT br EMPTY>'
|
||||||
|
'<! not really a comment >'
|
||||||
'<! not a comment either -->'
|
'<! not a comment either -->'
|
||||||
'<! -- close enough -->'
|
'<! -- close enough -->'
|
||||||
'<!><!<-- this was an empty comment>'
|
'<!><!<-- this was an empty comment>'
|
||||||
'<!!! another bogus comment !!!>')
|
'<!!! another bogus comment !!!>')
|
||||||
expected = [
|
expected = [
|
||||||
|
('comment', 'ELEMENT br EMPTY'),
|
||||||
('comment', ' not really a comment '),
|
('comment', ' not really a comment '),
|
||||||
('comment', ' not a comment either --'),
|
('comment', ' not a comment either --'),
|
||||||
('comment', ' -- close enough --'),
|
('comment', ' -- close enough --'),
|
||||||
|
@ -598,6 +649,26 @@ text
|
||||||
('endtag', 'a'), ('data', ' bar & baz')]
|
('endtag', 'a'), ('data', ' bar & baz')]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@support.requires_resource('cpu')
|
||||||
|
def test_eof_no_quadratic_complexity(self):
|
||||||
|
# Each of these examples used to take about an hour.
|
||||||
|
# Now they take a fraction of a second.
|
||||||
|
def check(source):
|
||||||
|
parser = html.parser.HTMLParser()
|
||||||
|
parser.feed(source)
|
||||||
|
parser.close()
|
||||||
|
n = 120_000
|
||||||
|
check("<a " * n)
|
||||||
|
check("<a a=" * n)
|
||||||
|
check("</a " * 14 * n)
|
||||||
|
check("</a a=" * 11 * n)
|
||||||
|
check("<!--" * 4 * n)
|
||||||
|
check("<!" * 60 * n)
|
||||||
|
check("<?" * 19 * n)
|
||||||
|
check("</$" * 15 * n)
|
||||||
|
check("<![CDATA[" * 9 * n)
|
||||||
|
check("<!doctype" * 35 * n)
|
||||||
|
|
||||||
|
|
||||||
class AttributesTestCase(TestCaseBase):
|
class AttributesTestCase(TestCaseBase):
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
Fix quadratic complexity in processing specially crafted input in
|
||||||
|
:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
|
||||||
|
to the HTML5 specs -- comments and declarations are automatically closed,
|
||||||
|
tags are ignored.
|
Loading…
Add table
Add a link
Reference in a new issue