#13358: HTMLParser now calls handle_data only once for each CDATA.

2025-09-17 22:20:23 +00:00 · 2011-11-18 18:00:40 +02:00 · 2011-11-18 18:00:40 +02:00 · 00dc60beee
commit 00dc60beee
parent 93bbb6a9a6
3 changed files with 27 additions and 3 deletions
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@ -14,7 +14,6 @@ import re
 # Regular expressions used for parsing
 interesting_normal = re.compile('[&<]')
 interesting_cdata = re.compile(r'<(/|\Z)')
 incomplete = re.compile('&[a-zA-Z#]')
 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
@ -125,8 +124,8 @@ class HTMLParser(markupbase.ParserBase):
        return self.__starttag_text
    def set_cdata_mode(self, elem):
        """Switch the parser into CDATA mode for the element *elem*.

        Records the lower-cased element name in ``self.cdata_elem`` and
        points ``self.interesting`` at a case-insensitive pattern that
        matches only that element's closing tag; whitespace is allowed
        around the name (for example ``</ script >``).
        """
        # NOTE(review): this assignment is overwritten by the element-specific
        # pattern compiled two statements below.  In this diff listing it looks
        # like the line the commit removes -- confirm against the hunk markers.
        self.interesting = interesting_cdata
        self.cdata_elem = elem.lower()
        # The element name is interpolated into the pattern without escaping.
        # NOTE(review): presumably elem is always a plain tag name -- confirm
        # at the call sites.
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
    def clear_cdata_mode(self):
        self.interesting = interesting_normal
@ -144,6 +143,8 @@ class HTMLParser(markupbase.ParserBase):
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    break
                j = n
            if i < j: self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
@ -212,7 +213,7 @@ class HTMLParser(markupbase.ParserBase):
            else:
                assert 0, "interesting.search() lied"
        # end while
-        if end and i < n:
+        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@ -286,6 +286,27 @@ DOCTYPE html [
                                    ("data", content),
                                    ("endtag", element_lower)])
    def test_cdata_with_closing_tags(self):
        # see issue #13358
        # make sure that HTMLParser calls handle_data only once for each CDATA.
        # The normal event collector normalizes the events in get_events,
        # so we override it to return the original list of events.
        class Collector(EventCollector):
            def get_events(self):
                return self.events
        # The script body is filled with markup look-alikes (a comment, an
        # entity reference, start tags, and end tags for other elements,
        # including the near-miss </scripter>).  The expected event list below
        # requires all of it to arrive as one single "data" event.
        content = """<!-- not a comment --> &not-an-entity-ref;
                  <a href="" /> </p><p> &amp; <span></span></style>
                  '</script' + '>' </html> </head> </scripter>!"""
        # Each variant places spaces or newlines around the element name in
        # the closing tag; every one is still expected to produce a plain
        # ("endtag", "script") event.
        for element in [' script', 'script ', ' script ',
                        '\nscript', 'script\n', '\nscript\n']:
            s = u'<script>{content}</{element}>'.format(element=element,
                                                        content=content)
            # Exactly three events are expected: start tag, one data event
            # carrying the whole content unchanged, and the end tag.
            self._run_check(s, [("starttag", "script", []),
                                ("data", content),
                                ("endtag", "script")],
                            collector=Collector)
    def test_malformatted_charref(self):
        self._run_check("<p>&#bad;</p>", [
            ("starttag", "p", []),
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -79,6 +79,8 @@ Core and Builtins
 Library
 -------
 - Issue #13358: HTMLParser now calls handle_data only once for each CDATA.
 - Issue #4147: minidom's toprettyxml no longer adds whitespace around a text
  node when it is the only child of an element.  Initial patch by Dan
  Kenigsberg.