This commit is contained in:
Hugo van Kemenade 2025-05-07 18:53:08 +03:00
commit 5ea24116b0
4 changed files with 94 additions and 23 deletions

View file

@ -43,7 +43,9 @@ Example HTML Parser Application
As a basic example, below is a simple HTML parser that uses the As a basic example, below is a simple HTML parser that uses the
:class:`HTMLParser` class to print out start tags, end tags, and data :class:`HTMLParser` class to print out start tags, end tags, and data
as they are encountered:: as they are encountered:
.. testcode::
from html.parser import HTMLParser from html.parser import HTMLParser
@ -63,7 +65,7 @@ as they are encountered::
The output will then be: The output will then be:
.. code-block:: none .. testoutput::
Encountered a start tag: html Encountered a start tag: html
Encountered a start tag: head Encountered a start tag: head
@ -230,7 +232,9 @@ Examples
-------- --------
The following class implements a parser that will be used to illustrate more The following class implements a parser that will be used to illustrate more
examples:: examples:
.. testcode::
from html.parser import HTMLParser from html.parser import HTMLParser
from html.entities import name2codepoint from html.entities import name2codepoint
@ -266,13 +270,17 @@ examples::
parser = MyHTMLParser() parser = MyHTMLParser()
Parsing a doctype:: Parsing a doctype:
.. doctest::
>>> parser.feed('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" ' >>> parser.feed('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
... '"http://www.w3.org/TR/html4/strict.dtd">') ... '"http://www.w3.org/TR/html4/strict.dtd">')
Decl : DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd" Decl : DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"
Parsing an element with a few attributes and a title:: Parsing an element with a few attributes and a title:
.. doctest::
>>> parser.feed('<img src="python-logo.png" alt="The Python logo">') >>> parser.feed('<img src="python-logo.png" alt="The Python logo">')
Start tag: img Start tag: img
@ -285,7 +293,9 @@ Parsing an element with a few attributes and a title::
End tag : h1 End tag : h1
The content of ``script`` and ``style`` elements is returned as is, without The content of ``script`` and ``style`` elements is returned as is, without
further parsing:: further parsing:
.. doctest::
>>> parser.feed('<style type="text/css">#python { color: green }</style>') >>> parser.feed('<style type="text/css">#python { color: green }</style>')
Start tag: style Start tag: style
@ -300,16 +310,25 @@ further parsing::
Data : alert("<strong>hello!</strong>"); Data : alert("<strong>hello!</strong>");
End tag : script End tag : script
Parsing comments:: Parsing comments:
>>> parser.feed('<!-- a comment -->' .. doctest::
>>> parser.feed('<!--a comment-->'
... '<!--[if IE 9]>IE-specific content<![endif]-->') ... '<!--[if IE 9]>IE-specific content<![endif]-->')
Comment : a comment Comment : a comment
Comment : [if IE 9]>IE-specific content<![endif] Comment : [if IE 9]>IE-specific content<![endif]
Parsing named and numeric character references and converting them to the Parsing named and numeric character references and converting them to the
correct char (note: these 3 references are all equivalent to ``'>'``):: correct char (note: these 3 references are all equivalent to ``'>'``):
.. doctest::
>>> parser = MyHTMLParser()
>>> parser.feed('&gt;&#62;&#x3E;')
Data : >>>
>>> parser = MyHTMLParser(convert_charrefs=False)
>>> parser.feed('&gt;&#62;&#x3E;') >>> parser.feed('&gt;&#62;&#x3E;')
Named ent: > Named ent: >
Num ent : > Num ent : >
@ -317,18 +336,22 @@ correct char (note: these 3 references are all equivalent to ``'>'``)::
Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
:meth:`~HTMLParser.handle_data` might be called more than once :meth:`~HTMLParser.handle_data` might be called more than once
(unless *convert_charrefs* is set to ``True``):: (unless *convert_charrefs* is set to ``True``):
>>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']: .. doctest::
>>> for chunk in ['<sp', 'an>buff', 'ered', ' text</s', 'pan>']:
... parser.feed(chunk) ... parser.feed(chunk)
... ...
Start tag: span Start tag: span
Data : buff Data : buff
Data : ered Data : ered
Data : text Data : text
End tag : span End tag : span
Parsing invalid HTML (e.g. unquoted attributes) also works:: Parsing invalid HTML (e.g. unquoted attributes) also works:
.. doctest::
>>> parser.feed('<p><a class=link href=#main>tag soup</p ></a>') >>> parser.feed('<p><a class=link href=#main>tag soup</p ></a>')
Start tag: p Start tag: p

View file

@ -12,6 +12,7 @@ import re
import _markupbase import _markupbase
from html import unescape from html import unescape
from html.entities import html5 as html5_entities
__all__ = ['HTMLParser'] __all__ = ['HTMLParser']
@ -23,6 +24,7 @@ incomplete = re.compile('&[a-zA-Z#]')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
# Matches one character reference inside an attribute value: "&" followed by
# a decimal reference (#NNN), a hexadecimal reference (#xHHH), or an ASCII
# alphanumeric name, plus an optional trailing ";" or "=".
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
starttagopen = re.compile('<[a-zA-Z]') starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>') piclose = re.compile('>')
@ -57,6 +59,22 @@ endendtag = re.compile('>')
# </ and the tag name, so maybe this should be fixed # </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
# Character reference processing logic specific to attribute values
# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
def _replace_attr_charref(match):
    """Return the replacement text for one reference in an attribute value.

    *match* is a regex match whose group 0 is the whole character
    reference, starting with ``&`` and optionally ending in ``;`` or ``=``
    (the shape matched by ``attr_charref``).

    Numeric (decimal or hexadecimal) references are always unescaped.
    Named references are unescaped only when the text after the ``&`` is
    exactly a key of ``html5_entities`` and the match does not end with an
    equals sign.  Any other match is returned unchanged.
    """
    ref = match.group(0)
    # Numeric / hex char refs must always be unescaped
    if ref.startswith('&#'):
        return unescape(ref)
    # Named character / entity references must only be unescaped
    # if they are an exact match, and they are not followed by an equals sign
    if not ref.endswith('=') and ref[1:] in html5_entities:
        return unescape(ref)
    # Otherwise do not unescape
    return ref
def _unescape_attrvalue(s):
    """Return attribute value *s* with its character references processed.

    Every match of ``attr_charref`` in *s* is handed to
    ``_replace_attr_charref``, which decides whether that reference is
    unescaped or kept verbatim.
    """
    return attr_charref.sub(_replace_attr_charref, s)
class HTMLParser(_markupbase.ParserBase): class HTMLParser(_markupbase.ParserBase):
@ -323,7 +341,7 @@ class HTMLParser(_markupbase.ParserBase):
attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1] attrvalue = attrvalue[1:-1]
if attrvalue: if attrvalue:
attrvalue = unescape(attrvalue) attrvalue = _unescape_attrvalue(attrvalue)
attrs.append((attrname.lower(), attrvalue)) attrs.append((attrname.lower(), attrvalue))
k = m.end() k = m.end()

View file

@ -348,18 +348,16 @@ text
collector = lambda: EventCollectorCharrefs() collector = lambda: EventCollectorCharrefs()
self.assertTrue(collector().convert_charrefs) self.assertTrue(collector().convert_charrefs)
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22'] charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
# check charrefs in the middle of the text/attributes # check charrefs in the middle of the text
expected = [('starttag', 'a', [('href', 'foo"zar')]), expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
('data', 'a"z'), ('endtag', 'a')]
for charref in charrefs: for charref in charrefs:
self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref), self._run_check('<a>a{0}z</a>'.format(charref),
expected, collector=collector()) expected, collector=collector())
# check charrefs at the beginning/end of the text/attributes # check charrefs at the beginning/end of the text
expected = [('data', '"'), expected = [('data', '"'), ('starttag', 'a', []),
('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
('data', '"'), ('endtag', 'a'), ('data', '"')] ('data', '"'), ('endtag', 'a'), ('data', '"')]
for charref in charrefs: for charref in charrefs:
self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">' self._run_check('{0}<a>'
'{0}</a>{0}'.format(charref), '{0}</a>{0}'.format(charref),
expected, collector=collector()) expected, collector=collector())
# check charrefs in <script>/<style> elements # check charrefs in <script>/<style> elements
@ -382,6 +380,35 @@ text
self._run_check('no charrefs here', [('data', 'no charrefs here')], self._run_check('no charrefs here', [('data', 'no charrefs here')],
collector=collector()) collector=collector())
def test_convert_charrefs_in_attribute_values(self):
    """Check which character references get unescaped in attribute values.

    The same markup is used for every reference: six ``x`` attributes that
    place the reference alone, at the end, at the start, in the middle,
    before a space, and before an equals sign.
    """
    # default value for convert_charrefs is now True
    collector = lambda: EventCollectorCharrefs()
    self.assertTrue(collector().convert_charrefs)

    # always unescape terminated entity refs, numeric and hex char refs:
    # - regardless whether they are at start, middle, end of attribute
    # - or followed by alphanumeric, non-alphanumeric, or equals char
    charrefs = ['&cent;', '&#xa2;', '&#xa2', '&#162;', '&#162']
    # The second attribute is x="z{0}", so its value is 'z' followed by
    # the unescaped character.
    expected = [('starttag', 'a',
                 [('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
                  ('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
                ('endtag', 'a')]
    for charref in charrefs:
        self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
                        '   x="z{0}z" x="{0} z" x="{0}=z"></a>'
                        .format(charref), expected, collector=collector())

    # only unescape unterminated entity matches if they are not followed by
    # an alphanumeric or an equals sign
    charref = '&cent'
    # At the very end of a value ('z&cent') nothing follows the reference,
    # so it is still unescaped.
    expected = [('starttag', 'a',
                 [('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
                  ('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
                ('endtag', 'a')]
    self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
                    '   x="z{0}z" x="{0} z" x="{0}=z"></a>'
                    .format(charref), expected, collector=collector())
# the remaining tests were for the "tolerant" parser (which is now # the remaining tests were for the "tolerant" parser (which is now
# the default), and check various kind of broken markup # the default), and check various kind of broken markup
def test_tolerant_parsing(self): def test_tolerant_parsing(self):

View file

@ -0,0 +1,3 @@
Fix :class:`html.parser.HTMLParser` to not unescape named character
references in attribute values if they are not terminated by a semicolon and
are followed by an ASCII alphanumeric or an equals sign.