diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
index 6d433b5a04f..dd67fc34e85 100644
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -43,7 +43,9 @@ Example HTML Parser Application
As a basic example, below is a simple HTML parser that uses the
:class:`HTMLParser` class to print out start tags, end tags, and data
-as they are encountered::
+as they are encountered:
+
+.. testcode::
from html.parser import HTMLParser
@@ -63,7 +65,7 @@ as they are encountered::
The output will then be:
-.. code-block:: none
+.. testoutput::
Encountered a start tag: html
Encountered a start tag: head
@@ -230,7 +232,9 @@ Examples
--------
The following class implements a parser that will be used to illustrate more
-examples::
+examples:
+
+.. testcode::
from html.parser import HTMLParser
from html.entities import name2codepoint
@@ -266,13 +270,17 @@ examples::
parser = MyHTMLParser()
-Parsing a doctype::
+Parsing a doctype:
+
+.. doctest::
>>> parser.feed('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">')
Decl : DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"
-Parsing an element with a few attributes and a title::
+Parsing an element with a few attributes and a title:
+
+.. doctest::
>>> parser.feed('
')
Start tag: img
@@ -285,7 +293,9 @@ Parsing an element with a few attributes and a title::
End tag : h1
The content of ``script`` and ``style`` elements is returned as is, without
-further parsing::
+further parsing:
+
+.. doctest::
>>> parser.feed('')
Start tag: style
@@ -300,16 +310,25 @@ further parsing::
Data : alert("hello!");
End tag : script
-Parsing comments::
+Parsing comments:
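- >>> parser.feed('<!-- a comment -->'
+.. doctest::
+
+ >>> parser.feed('<!--a comment-->'
... '<!--[if IE 9]>IE-specific content<![endif]-->')
- Comment :  a comment
+ Comment : a comment
Comment : [if IE 9]>IE-specific content<![endif]
Parsing named and numeric character references and converting them to the
-correct char (note: these 3 references are all equivalent to ``'>'``)::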
+correct char (note: these 3 references are all equivalent to ``'>'``):
+.. doctest::
+
+ >>> parser = MyHTMLParser()
+ >>> parser.feed('&gt;&#62;&#x3E;')
+ Data : >>>
+
+ >>> parser = MyHTMLParser(convert_charrefs=False)
>>> parser.feed('&gt;&#62;&#x3E;')
Named ent: >
Num ent : >
@@ -317,18 +336,22 @@ correct char (note: these 3 references are all equivalent to ``'>'``)::
Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
:meth:`~HTMLParser.handle_data` might be called more than once
-(unless *convert_charrefs* is set to ``True``)::
+(unless *convert_charrefs* is set to ``True``):
- >>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']:
+.. doctest::
+
+ >>> for chunk in ['<sp', 'an>buff', 'ered', ' text</s', 'pan>']:
... parser.feed(chunk)
...
Start tag: span
Data : buff
Data : ered
- Data : text
+ Data : text
End tag : span
-Parsing invalid HTML (e.g. unquoted attributes) also works::
+Parsing invalid HTML (e.g. unquoted attributes) also works:
+
+.. doctest::
>>> parser.feed('tag soup
')
Start tag: p
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 13c95c34e50..0a1dd3b7d3b 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -12,6 +12,7 @@ import re
import _markupbase
from html import unescape
+from html.entities import html5 as html5_entities
__all__ = ['HTMLParser']
@@ -23,6 +24,7 @@ incomplete = re.compile('&[a-zA-Z#]')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
@@ -57,6 +59,22 @@ endendtag = re.compile('>')
# and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
+# Character reference processing logic specific to attribute values
+# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
+def _replace_attr_charref(match):
+    """Return the replacement text for one character reference match.
+
+    *match* is a match object whose full text (``match.group(0)``) is a
+    candidate character reference starting with ``&`` and optionally
+    ending in ``;`` or ``=``.  The unescaped text is returned when the
+    reference should be resolved inside an attribute value; otherwise
+    the matched text is returned unchanged.
+    """
+    ref = match.group(0)
+    # Numeric / hex char refs must always be unescaped
+    if ref.startswith('&#'):
+        return unescape(ref)
+    # Named character / entity references must only be unescaped
+    # if they are an exact match, and they are not followed by an equals sign
+    if not ref.endswith('=') and ref[1:] in html5_entities:
+        return unescape(ref)
+    # Otherwise do not unescape
+    return ref
+
+def _unescape_attrvalue(s):
+    """Return *s* with each ``attr_charref`` match passed through
+    ``_replace_attr_charref`` and replaced by its result."""
+    unescaped = re.sub(attr_charref, _replace_attr_charref, s)
+    return unescaped
class HTMLParser(_markupbase.ParserBase):
@@ -323,7 +341,7 @@ class HTMLParser(_markupbase.ParserBase):
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
- attrvalue = unescape(attrvalue)
+ attrvalue = _unescape_attrvalue(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index b42a611c62c..4fdba06cf4c 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -348,18 +348,16 @@ text
collector = lambda: EventCollectorCharrefs()
self.assertTrue(collector().convert_charrefs)
charrefs = ['"', '"', '"', '"', '"', '"']
- # check charrefs in the middle of the text/attributes
- expected = [('starttag', 'a', [('href', 'foo"zar')]),
- ('data', 'a"z'), ('endtag', 'a')]
+ # check charrefs in the middle of the text
+ expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
for charref in charrefs:
- self._run_check('a{0}z'.format(charref),
+ self._run_check('a{0}z'.format(charref),
expected, collector=collector())
- # check charrefs at the beginning/end of the text/attributes
- expected = [('data', '"'),
- ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
+ # check charrefs at the beginning/end of the text
+ expected = [('data', '"'), ('starttag', 'a', []),
('data', '"'), ('endtag', 'a'), ('data', '"')]
for charref in charrefs:
- self._run_check('{0}'
+ self._run_check('{0}'
'{0}{0}'.format(charref),
expected, collector=collector())
# check charrefs in