From 554a3b82e40573846f893ffdfff230e1d908af57 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 24 May 2010 21:33:24 +0000 Subject: [PATCH] Issue #6662: Fix parsing of malformatted charref (&#bad;) --- Lib/HTMLParser.py | 3 +++ Lib/test/test_htmlparser.py | 7 +++++++ Misc/NEWS | 2 ++ 3 files changed, 12 insertions(+) diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 2cbc2ecbc73..7cee47a7c5d 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -175,6 +175,9 @@ class HTMLParser(markupbase.ParserBase): i = self.updatepos(i, k) continue else: + if ";" in rawdata[i:]: #bail by consuming &# + self.handle_data(rawdata[0:2]) + i = self.updatepos(i, 2) break elif startswith('&', i): match = entityref.match(rawdata, i) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 810af6c8cbc..c45cf00ecea 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -313,6 +313,13 @@ DOCTYPE html [ ("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")]) ]) + def test_malformatted_charref(self): + self._run_check("

<p>&#bad;</p>

", [ + ("starttag", "p", []), + ("data", "&#bad;"), + ("endtag", "p"), + ]) + def test_main(): test_support.run_unittest(HTMLParserTestCase) diff --git a/Misc/NEWS b/Misc/NEWS index b0941268897..e2f0f0ccf8e 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -29,6 +29,8 @@ C-API Library ------- +- Issue #6662: Fix parsing of malformatted charref (&#bad;) + - Issue #8016: Add the CP858 codec. - Issue #3924: Ignore cookies with invalid "version" field in cookielib.