mirror of
https://github.com/python/cpython.git
synced 2025-11-23 20:07:19 +00:00
[3.14] gh-140875: Fix handling of unclosed charrefs before EOF in HTMLParser (GH-140904) (GH-141745)
(cherry picked from commit 95296a9d40)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
parent
36ad4be1d9
commit
562e23f9d5
3 changed files with 109 additions and 33 deletions
|
|
@@ -24,6 +24,7 @@ incomplete = re.compile('&[a-zA-Z#]')
|
||||||
|
|
||||||
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
|
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
|
||||||
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
|
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
|
||||||
|
incomplete_charref = re.compile('&#(?:[0-9]|[xX][0-9a-fA-F])')
|
||||||
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
|
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
|
||||||
|
|
||||||
starttagopen = re.compile('<[a-zA-Z]')
|
starttagopen = re.compile('<[a-zA-Z]')
|
||||||
|
|
@@ -304,10 +305,20 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
k = k - 1
|
k = k - 1
|
||||||
i = self.updatepos(i, k)
|
i = self.updatepos(i, k)
|
||||||
continue
|
continue
|
||||||
else:
|
match = incomplete_charref.match(rawdata, i)
|
||||||
if ";" in rawdata[i:]: # bail by consuming &#
|
if match:
|
||||||
self.handle_data(rawdata[i:i+2])
|
if end:
|
||||||
|
self.handle_charref(rawdata[i+2:])
|
||||||
|
i = self.updatepos(i, n)
|
||||||
|
break
|
||||||
|
# incomplete
|
||||||
|
break
|
||||||
|
elif i + 3 < n: # larger than "&#x"
|
||||||
|
# not the end of the buffer, and can't be confused
|
||||||
|
# with some other construct
|
||||||
|
self.handle_data("&#")
|
||||||
i = self.updatepos(i, i + 2)
|
i = self.updatepos(i, i + 2)
|
||||||
|
else:
|
||||||
break
|
break
|
||||||
elif startswith('&', i):
|
elif startswith('&', i):
|
||||||
match = entityref.match(rawdata, i)
|
match = entityref.match(rawdata, i)
|
||||||
|
|
@@ -321,15 +332,13 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
continue
|
continue
|
||||||
match = incomplete.match(rawdata, i)
|
match = incomplete.match(rawdata, i)
|
||||||
if match:
|
if match:
|
||||||
# match.group() will contain at least 2 chars
|
if end:
|
||||||
if end and match.group() == rawdata[i:]:
|
self.handle_entityref(rawdata[i+1:])
|
||||||
k = match.end()
|
i = self.updatepos(i, n)
|
||||||
if k <= i:
|
break
|
||||||
k = n
|
|
||||||
i = self.updatepos(i, i + 1)
|
|
||||||
# incomplete
|
# incomplete
|
||||||
break
|
break
|
||||||
elif (i + 1) < n:
|
elif i + 1 < n:
|
||||||
# not the end of the buffer, and can't be confused
|
# not the end of the buffer, and can't be confused
|
||||||
# with some other construct
|
# with some other construct
|
||||||
self.handle_data("&")
|
self.handle_data("&")
|
||||||
|
|
|
||||||
|
|
@@ -109,12 +109,13 @@ class EventCollectorNoNormalize(EventCollector):
|
||||||
|
|
||||||
class TestCaseBase(unittest.TestCase):
|
class TestCaseBase(unittest.TestCase):
|
||||||
|
|
||||||
def get_collector(self):
|
def get_collector(self, convert_charrefs=False):
|
||||||
return EventCollector(convert_charrefs=False)
|
return EventCollector(convert_charrefs=convert_charrefs)
|
||||||
|
|
||||||
def _run_check(self, source, expected_events, collector=None):
|
def _run_check(self, source, expected_events,
|
||||||
|
*, collector=None, convert_charrefs=False):
|
||||||
if collector is None:
|
if collector is None:
|
||||||
collector = self.get_collector()
|
collector = self.get_collector(convert_charrefs=convert_charrefs)
|
||||||
parser = collector
|
parser = collector
|
||||||
for s in source:
|
for s in source:
|
||||||
parser.feed(s)
|
parser.feed(s)
|
||||||
|
|
@@ -128,7 +129,7 @@ class TestCaseBase(unittest.TestCase):
|
||||||
|
|
||||||
def _run_check_extra(self, source, events):
|
def _run_check_extra(self, source, events):
|
||||||
self._run_check(source, events,
|
self._run_check(source, events,
|
||||||
EventCollectorExtra(convert_charrefs=False))
|
collector=EventCollectorExtra(convert_charrefs=False))
|
||||||
|
|
||||||
|
|
||||||
class HTMLParserTestCase(TestCaseBase):
|
class HTMLParserTestCase(TestCaseBase):
|
||||||
|
|
@@ -187,10 +188,87 @@ text
|
||||||
])
|
])
|
||||||
|
|
||||||
def test_unclosed_entityref(self):
|
def test_unclosed_entityref(self):
|
||||||
self._run_check("&entityref foo", [
|
self._run_check('&gt &lt', [('entityref', 'gt'), ('data', ' '), ('entityref', 'lt')],
|
||||||
("entityref", "entityref"),
|
convert_charrefs=False)
|
||||||
("data", " foo"),
|
self._run_check('&gt &lt', [('data', '> <')], convert_charrefs=True)
|
||||||
])
|
|
||||||
|
self._run_check('&undefined &lt',
|
||||||
|
[('entityref', 'undefined'), ('data', ' '), ('entityref', 'lt')],
|
||||||
|
convert_charrefs=False)
|
||||||
|
self._run_check('&undefined &lt', [('data', '&undefined <')],
|
||||||
|
convert_charrefs=True)
|
||||||
|
|
||||||
|
self._run_check('&gtundefined &lt',
|
||||||
|
[('entityref', 'gtundefined'), ('data', ' '), ('entityref', 'lt')],
|
||||||
|
convert_charrefs=False)
|
||||||
|
self._run_check('&gtundefined &lt', [('data', '>undefined <')],
|
||||||
|
convert_charrefs=True)
|
||||||
|
|
||||||
|
self._run_check('& &lt', [('data', '& '), ('entityref', 'lt')],
|
||||||
|
convert_charrefs=False)
|
||||||
|
self._run_check('& &lt', [('data', '& <')], convert_charrefs=True)
|
||||||
|
|
||||||
|
def test_eof_in_entityref(self):
|
||||||
|
self._run_check('&gt', [('entityref', 'gt')], convert_charrefs=False)
|
||||||
|
self._run_check('&gt', [('data', '>')], convert_charrefs=True)
|
||||||
|
|
||||||
|
self._run_check('&g', [('entityref', 'g')], convert_charrefs=False)
|
||||||
|
self._run_check('&g', [('data', '&g')], convert_charrefs=True)
|
||||||
|
|
||||||
|
self._run_check('&undefined', [('entityref', 'undefined')],
|
||||||
|
convert_charrefs=False)
|
||||||
|
self._run_check('&undefined', [('data', '&undefined')],
|
||||||
|
convert_charrefs=True)
|
||||||
|
|
||||||
|
self._run_check('&gtundefined', [('entityref', 'gtundefined')],
|
||||||
|
convert_charrefs=False)
|
||||||
|
self._run_check('&gtundefined', [('data', '>undefined')],
|
||||||
|
convert_charrefs=True)
|
||||||
|
|
||||||
|
self._run_check('&', [('data', '&')], convert_charrefs=False)
|
||||||
|
self._run_check('&', [('data', '&')], convert_charrefs=True)
|
||||||
|
|
||||||
|
def test_unclosed_charref(self):
|
||||||
|
self._run_check('&#123 &lt', [('charref', '123'), ('data', ' '), ('entityref', 'lt')],
|
||||||
|
convert_charrefs=False)
|
||||||
|
self._run_check('&#123 &lt', [('data', '{ <')], convert_charrefs=True)
|
||||||
|
self._run_check('&#xab &lt', [('charref', 'xab'), ('data', ' '), ('entityref', 'lt')],
|
||||||
|
convert_charrefs=False)
|
||||||
|
self._run_check('&#xab &lt', [('data', '\xab <')], convert_charrefs=True)
|
||||||
|
|
||||||
|
self._run_check('&#123456789 &lt',
|
||||||
|
[('charref', '123456789'), ('data', ' '), ('entityref', 'lt')],
|
||||||
|
convert_charrefs=False)
|
||||||
|
self._run_check('&#123456789 &lt', [('data', '\ufffd <')],
|
||||||
|
convert_charrefs=True)
|
||||||
|
self._run_check('&#x123456789 &lt',
|
||||||
|
[('charref', 'x123456789'), ('data', ' '), ('entityref', 'lt')],
|
||||||
|
convert_charrefs=False)
|
||||||
|
self._run_check('&#x123456789 &lt', [('data', '\ufffd <')],
|
||||||
|
convert_charrefs=True)
|
||||||
|
|
||||||
|
self._run_check('&# &lt', [('data', '&# '), ('entityref', 'lt')], convert_charrefs=False)
|
||||||
|
self._run_check('&# &lt', [('data', '&# <')], convert_charrefs=True)
|
||||||
|
self._run_check('&#x &lt', [('data', '&#x '), ('entityref', 'lt')], convert_charrefs=False)
|
||||||
|
self._run_check('&#x &lt', [('data', '&#x <')], convert_charrefs=True)
|
||||||
|
|
||||||
|
def test_eof_in_charref(self):
|
||||||
|
self._run_check('&#123', [('charref', '123')], convert_charrefs=False)
|
||||||
|
self._run_check('&#123', [('data', '{')], convert_charrefs=True)
|
||||||
|
self._run_check('&#xab', [('charref', 'xab')], convert_charrefs=False)
|
||||||
|
self._run_check('&#xab', [('data', '\xab')], convert_charrefs=True)
|
||||||
|
|
||||||
|
self._run_check('&#123456789', [('charref', '123456789')],
|
||||||
|
convert_charrefs=False)
|
||||||
|
self._run_check('&#123456789', [('data', '\ufffd')], convert_charrefs=True)
|
||||||
|
self._run_check('&#x123456789', [('charref', 'x123456789')],
|
||||||
|
convert_charrefs=False)
|
||||||
|
self._run_check('&#x123456789', [('data', '\ufffd')], convert_charrefs=True)
|
||||||
|
|
||||||
|
self._run_check('&#', [('data', '&#')], convert_charrefs=False)
|
||||||
|
self._run_check('&#', [('data', '&#')], convert_charrefs=True)
|
||||||
|
self._run_check('&#x', [('data', '&#x')], convert_charrefs=False)
|
||||||
|
self._run_check('&#x', [('data', '&#x')], convert_charrefs=True)
|
||||||
|
|
||||||
def test_bad_nesting(self):
|
def test_bad_nesting(self):
|
||||||
# Strangely, this *is* supposed to test that overlapping
|
# Strangely, this *is* supposed to test that overlapping
|
||||||
|
|
@@ -762,20 +840,6 @@ text
|
||||||
]
|
]
|
||||||
self._run_check(html, expected)
|
self._run_check(html, expected)
|
||||||
|
|
||||||
def test_EOF_in_charref(self):
|
|
||||||
# see #17802
|
|
||||||
# This test checks that the UnboundLocalError reported in the issue
|
|
||||||
# is not raised, however I'm not sure the returned values are correct.
|
|
||||||
# Maybe HTMLParser should use self.unescape for these
|
|
||||||
data = [
|
|
||||||
('a&', [('data', 'a&')]),
|
|
||||||
('a&b', [('data', 'ab')]),
|
|
||||||
('a&b ', [('data', 'a'), ('entityref', 'b'), ('data', ' ')]),
|
|
||||||
('a&b;', [('data', 'a'), ('entityref', 'b')]),
|
|
||||||
]
|
|
||||||
for html, expected in data:
|
|
||||||
self._run_check(html, expected)
|
|
||||||
|
|
||||||
def test_eof_in_comments(self):
|
def test_eof_in_comments(self):
|
||||||
data = [
|
data = [
|
||||||
('<!--', [('comment', '')]),
|
('<!--', [('comment', '')]),
|
||||||
|
|
|
||||||
|
|
@@ -0,0 +1,3 @@
|
||||||
|
Fix handling of unclosed character references (named and numerical)
|
||||||
|
followed by the end of file in :class:`html.parser.HTMLParser` with
|
||||||
|
``convert_charrefs=False``.
|
||||||
Loading…
Add table
Add a link
Reference in a new issue