#13273: merge with 3.2.

This commit is contained in:
Ezio Melotti 2011-10-28 13:23:57 +03:00
commit 91ec2e8a31
3 changed files with 38 additions and 3 deletions

View file

@ -30,7 +30,7 @@ attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile( attrfind_tolerant = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?') r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
locatestarttagend = re.compile(r""" locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
@ -277,12 +277,11 @@ class HTMLParser(_markupbase.ParserBase):
assert match, 'unexpected call to parse_starttag()' assert match, 'unexpected call to parse_starttag()'
k = match.end() k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower() self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos: while k < endpos:
if self.strict: if self.strict:
m = attrfind.match(rawdata, k) m = attrfind.match(rawdata, k)
else: else:
m = attrfind_tolerant.search(rawdata, k) m = attrfind_tolerant.match(rawdata, k)
if not m: if not m:
break break
attrname, rest, attrvalue = m.group(1, 2, 3) attrname, rest, attrvalue = m.group(1, 2, 3)

View file

@ -373,6 +373,39 @@ class HTMLParserTolerantTestCase(TestCaseBase):
[('action', 'bogus|&#()value')])], [('action', 'bogus|&#()value')])],
collector = self.collector) collector = self.collector)
def test_issue13273(self):
html = ('<div style="" ><b>The <a href="some_url">rain</a> '
'<br /> in <span>Spain</span></b></div>')
expected = [
('starttag', 'div', [('style', '')]),
('starttag', 'b', []),
('data', 'The '),
('starttag', 'a', [('href', 'some_url')]),
('data', 'rain'),
('endtag', 'a'),
('data', ' '),
('startendtag', 'br', []),
('data', ' in '),
('starttag', 'span', []),
('data', 'Spain'),
('endtag', 'span'),
('endtag', 'b'),
('endtag', 'div')
]
self._run_check(html, expected, collector=self.collector)
def test_issue13273_2(self):
html = '<div style="", foo = "bar" ><b>The <a href="some_url">rain</a>'
expected = [
('starttag', 'div', [('style', ''), ('foo', 'bar')]),
('starttag', 'b', []),
('data', 'The '),
('starttag', 'a', [('href', 'some_url')]),
('data', 'rain'),
('endtag', 'a'),
]
self._run_check(html, expected, collector=self.collector)
def test_unescape_function(self): def test_unescape_function(self):
p = html.parser.HTMLParser() p = html.parser.HTMLParser()
self.assertEqual(p.unescape('&#bad;'),'&#bad;') self.assertEqual(p.unescape('&#bad;'),'&#bad;')

View file

@ -341,6 +341,9 @@ Core and Builtins
Library Library
------- -------
- Issue #13273: fix a bug that prevented HTMLParser from properly detecting
some tags when strict=False.
- Issue #11183: Add finer-grained exceptions to the ssl module, so that - Issue #11183: Add finer-grained exceptions to the ssl module, so that
you don't have to inspect the exception's attributes in the common case. you don't have to inspect the exception's attributes in the common case.