Deal more appropriately with bare ampersands and pointy brackets; this

module has to deal with "classic" HTML-as-deployed as well as XHTML, so we cannot be as strict as XHTML allows. This closes SF bug #453059, but uses a different fix than suggested in the bug comments.
2025-08-04 17:08:35 +00:00 · 2001-08-20 21:24:19 +00:00 · 2001-08-20 21:24:19 +00:00 · 029acfb922
commit 029acfb922
parent 18da1e1e7f
2 changed files with 39 additions and 19 deletions
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -1,6 +1,7 @@
 """Tests for HTMLParser.py."""

 import HTMLParser
+import pprint
 import sys
 import test_support
 import unittest
@@ -83,9 +84,10 @@ class TestCaseBase(unittest.TestCase):
        for c in self.epilogue:
            parser.feed(c)
        parser.close()
-        self.assert_(parser.get_events() ==
-                     self.initial_events + events + self.final_events,
-                     parser.get_events())
+        events = parser.get_events()
+        self.assertEqual(events,
+                         self.initial_events + events + self.final_events,
+                         "got events:\n" + pprint.pformat(events))

    def _run_check_extra(self, source, events):
        self._run_check(source, events, EventCollectorExtra)
@@ -137,6 +139,18 @@ text
    ("data", "\n"),
    ])

+    def test_doctype_decl(self):
+        inside = """\
+DOCTYPE html [
+  <!ELEMENT html - O EMPTY>
+  <!ATTLIST html
+      version CDATA #IMPLIED '4.0'>
+  <!-- comment -->
+]"""
+        self._run_check("<!%s>" % inside, [
+            ("decl", inside),
+            ])
+
    def test_bad_nesting(self):
        # Strangely, this *is* supposed to test that overlapping
        # elements are allowed.  HTMLParser is more geared toward
@@ -148,6 +162,16 @@ text
            ("endtag", "b"),
            ])

+    def test_bare_ampersands(self):
+        self._run_check("this text & contains & ampersands &", [
+            ("data", "this text & contains & ampersands &"),
+            ])
+
+    def test_bare_pointy_brackets(self):
+        self._run_check("this < text > contains < bare>pointy< brackets", [
+            ("data", "this < text > contains < bare>pointy< brackets"),
+            ])
+
    def test_attr_syntax(self):
        output = [
          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
@@ -199,16 +223,12 @@ text
        self._run_check(["<a b='>'", ">"], output)

    def test_starttag_junk_chars(self):
-        self._parse_error("<")
-        self._parse_error("<>")
        self._parse_error("</>")
        self._parse_error("</$>")
        self._parse_error("</")
        self._parse_error("</a")
        self._parse_error("<a<a>")
        self._parse_error("</a<a>")
-        self._parse_error("<$")
-        self._parse_error("<$>")
        self._parse_error("<!")
        self._parse_error("<a $>")
        self._parse_error("<a")