#13358: HTMLParser now calls handle_data only once for each CDATA.

This commit is contained in:
Ezio Melotti 2011-11-18 18:00:40 +02:00
parent 93bbb6a9a6
commit 00dc60beee
3 changed files with 27 additions and 3 deletions

View file

@ -14,7 +14,6 @@ import re
# Regular expressions used for parsing # Regular expressions used for parsing
interesting_normal = re.compile('[&<]') interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
incomplete = re.compile('&[a-zA-Z#]') incomplete = re.compile('&[a-zA-Z#]')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
@ -125,8 +124,8 @@ class HTMLParser(markupbase.ParserBase):
return self.__starttag_text return self.__starttag_text
def set_cdata_mode(self, elem):
    """Switch the parser into CDATA-content mode for element *elem*.

    Stores the lower-cased element name in ``self.cdata_elem`` and points
    ``self.interesting`` at a case-insensitive pattern that matches only
    the closing tag of that element (whitespace is tolerated on either
    side of the name).
    """
    self.cdata_elem = elem.lower()
    # Escape the name before interpolating it, so that any regex
    # metacharacter it contains is matched literally instead of being
    # interpreted as pattern syntax.
    self.interesting = re.compile(
        r'</\s*%s\s*>' % re.escape(self.cdata_elem), re.I)
def clear_cdata_mode(self): def clear_cdata_mode(self):
self.interesting = interesting_normal self.interesting = interesting_normal
@ -144,6 +143,8 @@ class HTMLParser(markupbase.ParserBase):
if match: if match:
j = match.start() j = match.start()
else: else:
if self.cdata_elem:
break
j = n j = n
if i < j: self.handle_data(rawdata[i:j]) if i < j: self.handle_data(rawdata[i:j])
i = self.updatepos(i, j) i = self.updatepos(i, j)
@ -212,7 +213,7 @@ class HTMLParser(markupbase.ParserBase):
else: else:
assert 0, "interesting.search() lied" assert 0, "interesting.search() lied"
# end while # end while
if end and i < n: if end and i < n and not self.cdata_elem:
self.handle_data(rawdata[i:n]) self.handle_data(rawdata[i:n])
i = self.updatepos(i, n) i = self.updatepos(i, n)
self.rawdata = rawdata[i:] self.rawdata = rawdata[i:]

View file

@ -286,6 +286,27 @@ DOCTYPE html [
("data", content), ("data", content),
("endtag", element_lower)]) ("endtag", element_lower)])
def test_cdata_with_closing_tags(self):
    """Check that CDATA content is reported as one single data event.

    Regression test for issue #13358: everything between <script> and its
    closing tag must be delivered in one handle_data call, even when the
    content contains markup-like text or other closing tags, and even when
    the closing tag has whitespace around the element name.
    """
    # The stock collector normalizes the recorded events in get_events;
    # this subclass hands back the raw list so the test sees the events
    # exactly as they were recorded.
    class Collector(EventCollector):
        def get_events(self):
            return self.events

    content = """<!-- not a comment --> &not-an-entity-ref;
<a href="" /> </p><p> &amp; <span></span></style>
'</script' + '>' </html> </head> </scripter>!"""

    # Closing-tag names padded with spaces and newlines on either side.
    closing_names = [
        ' script', 'script ', ' script ',
        '\nscript', 'script\n', '\nscript\n',
    ]
    for closing_name in closing_names:
        source = u'<script>{content}</{element}>'.format(
            content=content, element=closing_name)
        expected_events = [
            ("starttag", "script", []),
            ("data", content),
            ("endtag", "script"),
        ]
        self._run_check(source, expected_events, collector=Collector)
def test_malformatted_charref(self): def test_malformatted_charref(self):
self._run_check("<p>&#bad;</p>", [ self._run_check("<p>&#bad;</p>", [
("starttag", "p", []), ("starttag", "p", []),

View file

@ -79,6 +79,8 @@ Core and Builtins
Library Library
------- -------
- Issue #13358: HTMLParser now calls handle_data only once for each CDATA.
- Issue #4147: minidom's toprettyxml no longer adds whitespace around a text - Issue #4147: minidom's toprettyxml no longer adds whitespace around a text
node when it is the only child of an element. Initial patch by Dan node when it is the only child of an element. Initial patch by Dan
Kenigsberg. Kenigsberg.