mirror of
https://github.com/python/cpython.git
synced 2025-09-26 18:29:57 +00:00
Merge branch 'main' of https://github.com/python/cpython
This commit is contained in:
commit
5ea24116b0
4 changed files with 94 additions and 23 deletions
|
@ -43,7 +43,9 @@ Example HTML Parser Application
|
||||||
|
|
||||||
As a basic example, below is a simple HTML parser that uses the
|
As a basic example, below is a simple HTML parser that uses the
|
||||||
:class:`HTMLParser` class to print out start tags, end tags, and data
|
:class:`HTMLParser` class to print out start tags, end tags, and data
|
||||||
as they are encountered::
|
as they are encountered:
|
||||||
|
|
||||||
|
.. testcode::
|
||||||
|
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
|
@ -63,7 +65,7 @@ as they are encountered::
|
||||||
|
|
||||||
The output will then be:
|
The output will then be:
|
||||||
|
|
||||||
.. code-block:: none
|
.. testoutput::
|
||||||
|
|
||||||
Encountered a start tag: html
|
Encountered a start tag: html
|
||||||
Encountered a start tag: head
|
Encountered a start tag: head
|
||||||
|
@ -230,7 +232,9 @@ Examples
|
||||||
--------
|
--------
|
||||||
|
|
||||||
The following class implements a parser that will be used to illustrate more
|
The following class implements a parser that will be used to illustrate more
|
||||||
examples::
|
examples:
|
||||||
|
|
||||||
|
.. testcode::
|
||||||
|
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
from html.entities import name2codepoint
|
from html.entities import name2codepoint
|
||||||
|
@ -266,13 +270,17 @@ examples::
|
||||||
|
|
||||||
parser = MyHTMLParser()
|
parser = MyHTMLParser()
|
||||||
|
|
||||||
Parsing a doctype::
|
Parsing a doctype:
|
||||||
|
|
||||||
|
.. doctest::
|
||||||
|
|
||||||
>>> parser.feed('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
|
>>> parser.feed('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
|
||||||
... '"http://www.w3.org/TR/html4/strict.dtd">')
|
... '"http://www.w3.org/TR/html4/strict.dtd">')
|
||||||
Decl : DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"
|
Decl : DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"
|
||||||
|
|
||||||
Parsing an element with a few attributes and a title::
|
Parsing an element with a few attributes and a title:
|
||||||
|
|
||||||
|
.. doctest::
|
||||||
|
|
||||||
>>> parser.feed('<img src="python-logo.png" alt="The Python logo">')
|
>>> parser.feed('<img src="python-logo.png" alt="The Python logo">')
|
||||||
Start tag: img
|
Start tag: img
|
||||||
|
@ -285,7 +293,9 @@ Parsing an element with a few attributes and a title::
|
||||||
End tag : h1
|
End tag : h1
|
||||||
|
|
||||||
The content of ``script`` and ``style`` elements is returned as is, without
|
The content of ``script`` and ``style`` elements is returned as is, without
|
||||||
further parsing::
|
further parsing:
|
||||||
|
|
||||||
|
.. doctest::
|
||||||
|
|
||||||
>>> parser.feed('<style type="text/css">#python { color: green }</style>')
|
>>> parser.feed('<style type="text/css">#python { color: green }</style>')
|
||||||
Start tag: style
|
Start tag: style
|
||||||
|
@ -300,7 +310,9 @@ further parsing::
|
||||||
Data : alert("<strong>hello!</strong>");
|
Data : alert("<strong>hello!</strong>");
|
||||||
End tag : script
|
End tag : script
|
||||||
|
|
||||||
Parsing comments::
|
Parsing comments:
|
||||||
|
|
||||||
|
.. doctest::
|
||||||
|
|
||||||
>>> parser.feed('<!--a comment-->'
|
>>> parser.feed('<!--a comment-->'
|
||||||
... '<!--[if IE 9]>IE-specific content<![endif]-->')
|
... '<!--[if IE 9]>IE-specific content<![endif]-->')
|
||||||
|
@ -308,8 +320,15 @@ Parsing comments::
|
||||||
Comment : [if IE 9]>IE-specific content<![endif]
|
Comment : [if IE 9]>IE-specific content<![endif]
|
||||||
|
|
||||||
Parsing named and numeric character references and converting them to the
|
Parsing named and numeric character references and converting them to the
|
||||||
correct char (note: these 3 references are all equivalent to ``'>'``)::
|
correct char (note: these 3 references are all equivalent to ``'>'``):
|
||||||
|
|
||||||
|
.. doctest::
|
||||||
|
|
||||||
|
>>> parser = MyHTMLParser()
|
||||||
|
>>> parser.feed('&gt;&#62;&#x3E;')
|
||||||
|
Data : >>>
|
||||||
|
|
||||||
|
>>> parser = MyHTMLParser(convert_charrefs=False)
|
||||||
>>> parser.feed('&gt;&#62;&#x3E;')
|
>>> parser.feed('&gt;&#62;&#x3E;')
|
||||||
Named ent: >
|
Named ent: >
|
||||||
Num ent : >
|
Num ent : >
|
||||||
|
@ -317,7 +336,9 @@ correct char (note: these 3 references are all equivalent to ``'>'``)::
|
||||||
|
|
||||||
Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
|
Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
|
||||||
:meth:`~HTMLParser.handle_data` might be called more than once
|
:meth:`~HTMLParser.handle_data` might be called more than once
|
||||||
(unless *convert_charrefs* is set to ``True``)::
|
(unless *convert_charrefs* is set to ``True``):
|
||||||
|
|
||||||
|
.. doctest::
|
||||||
|
|
||||||
>>> for chunk in ['<sp', 'an>buff', 'ered', ' text</s', 'pan>']:
|
>>> for chunk in ['<sp', 'an>buff', 'ered', ' text</s', 'pan>']:
|
||||||
... parser.feed(chunk)
|
... parser.feed(chunk)
|
||||||
|
@ -328,7 +349,9 @@ Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
|
||||||
Data : text
|
Data : text
|
||||||
End tag : span
|
End tag : span
|
||||||
|
|
||||||
Parsing invalid HTML (e.g. unquoted attributes) also works::
|
Parsing invalid HTML (e.g. unquoted attributes) also works:
|
||||||
|
|
||||||
|
.. doctest::
|
||||||
|
|
||||||
>>> parser.feed('<p><a class=link href=#main>tag soup</p ></a>')
|
>>> parser.feed('<p><a class=link href=#main>tag soup</p ></a>')
|
||||||
Start tag: p
|
Start tag: p
|
||||||
|
|
|
@ -12,6 +12,7 @@ import re
|
||||||
import _markupbase
|
import _markupbase
|
||||||
|
|
||||||
from html import unescape
|
from html import unescape
|
||||||
|
from html.entities import html5 as html5_entities
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['HTMLParser']
|
__all__ = ['HTMLParser']
|
||||||
|
@ -23,6 +24,7 @@ incomplete = re.compile('&[a-zA-Z#]')
|
||||||
|
|
||||||
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
|
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
|
||||||
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
|
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
|
||||||
|
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
|
||||||
|
|
||||||
starttagopen = re.compile('<[a-zA-Z]')
|
starttagopen = re.compile('<[a-zA-Z]')
|
||||||
piclose = re.compile('>')
|
piclose = re.compile('>')
|
||||||
|
@ -57,6 +59,22 @@ endendtag = re.compile('>')
|
||||||
# </ and the tag name, so maybe this should be fixed
|
# </ and the tag name, so maybe this should be fixed
|
||||||
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
|
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
|
||||||
|
|
||||||
|
# Character reference processing logic specific to attribute values
|
||||||
|
# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
|
||||||
|
def _replace_attr_charref(match):
|
||||||
|
ref = match.group(0)
|
||||||
|
# Numeric / hex char refs must always be unescaped
|
||||||
|
if ref.startswith('&#'):
|
||||||
|
return unescape(ref)
|
||||||
|
# Named character / entity references must only be unescaped
|
||||||
|
# if they are an exact match, and they are not followed by an equals sign
|
||||||
|
if not ref.endswith('=') and ref[1:] in html5_entities:
|
||||||
|
return unescape(ref)
|
||||||
|
# Otherwise do not unescape
|
||||||
|
return ref
|
||||||
|
|
||||||
|
def _unescape_attrvalue(s):
|
||||||
|
return attr_charref.sub(_replace_attr_charref, s)
|
||||||
|
|
||||||
|
|
||||||
class HTMLParser(_markupbase.ParserBase):
|
class HTMLParser(_markupbase.ParserBase):
|
||||||
|
@ -323,7 +341,7 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
attrvalue[:1] == '"' == attrvalue[-1:]:
|
attrvalue[:1] == '"' == attrvalue[-1:]:
|
||||||
attrvalue = attrvalue[1:-1]
|
attrvalue = attrvalue[1:-1]
|
||||||
if attrvalue:
|
if attrvalue:
|
||||||
attrvalue = unescape(attrvalue)
|
attrvalue = _unescape_attrvalue(attrvalue)
|
||||||
attrs.append((attrname.lower(), attrvalue))
|
attrs.append((attrname.lower(), attrvalue))
|
||||||
k = m.end()
|
k = m.end()
|
||||||
|
|
||||||
|
|
|
@ -348,18 +348,16 @@ text
|
||||||
collector = lambda: EventCollectorCharrefs()
|
collector = lambda: EventCollectorCharrefs()
|
||||||
self.assertTrue(collector().convert_charrefs)
|
self.assertTrue(collector().convert_charrefs)
|
||||||
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
|
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
|
||||||
# check charrefs in the middle of the text/attributes
|
# check charrefs in the middle of the text
|
||||||
expected = [('starttag', 'a', [('href', 'foo"zar')]),
|
expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
|
||||||
('data', 'a"z'), ('endtag', 'a')]
|
|
||||||
for charref in charrefs:
|
for charref in charrefs:
|
||||||
self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
|
self._run_check('<a>a{0}z</a>'.format(charref),
|
||||||
expected, collector=collector())
|
expected, collector=collector())
|
||||||
# check charrefs at the beginning/end of the text/attributes
|
# check charrefs at the beginning/end of the text
|
||||||
expected = [('data', '"'),
|
expected = [('data', '"'), ('starttag', 'a', []),
|
||||||
('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
|
|
||||||
('data', '"'), ('endtag', 'a'), ('data', '"')]
|
('data', '"'), ('endtag', 'a'), ('data', '"')]
|
||||||
for charref in charrefs:
|
for charref in charrefs:
|
||||||
self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
|
self._run_check('{0}<a>'
|
||||||
'{0}</a>{0}'.format(charref),
|
'{0}</a>{0}'.format(charref),
|
||||||
expected, collector=collector())
|
expected, collector=collector())
|
||||||
# check charrefs in <script>/<style> elements
|
# check charrefs in <script>/<style> elements
|
||||||
|
@ -382,6 +380,35 @@ text
|
||||||
self._run_check('no charrefs here', [('data', 'no charrefs here')],
|
self._run_check('no charrefs here', [('data', 'no charrefs here')],
|
||||||
collector=collector())
|
collector=collector())
|
||||||
|
|
||||||
|
def test_convert_charrefs_in_attribute_values(self):
|
||||||
|
# default value for convert_charrefs is now True
|
||||||
|
collector = lambda: EventCollectorCharrefs()
|
||||||
|
self.assertTrue(collector().convert_charrefs)
|
||||||
|
|
||||||
|
# always unescape terminated entity refs, numeric and hex char refs:
|
||||||
|
# - regardless whether they are at start, middle, end of attribute
|
||||||
|
# - or followed by alphanumeric, non-alphanumeric, or equals char
|
||||||
|
charrefs = ['&cent;', '&#xa2;', '&#xa2', '&#162;', '&#162']
|
||||||
|
expected = [('starttag', 'a',
|
||||||
|
[('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
|
||||||
|
('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
|
||||||
|
('endtag', 'a')]
|
||||||
|
for charref in charrefs:
|
||||||
|
self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
|
||||||
|
' x="z{0}z" x="{0} z" x="{0}=z"></a>'
|
||||||
|
.format(charref), expected, collector=collector())
|
||||||
|
|
||||||
|
# only unescape unterminated entity matches if they are not followed by
|
||||||
|
# an alphanumeric or an equals sign
|
||||||
|
charref = '&cent'
|
||||||
|
expected = [('starttag', 'a',
|
||||||
|
[('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
|
||||||
|
('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
|
||||||
|
('endtag', 'a')]
|
||||||
|
self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
|
||||||
|
' x="z{0}z" x="{0} z" x="{0}=z"></a>'
|
||||||
|
.format(charref), expected, collector=collector())
|
||||||
|
|
||||||
# the remaining tests were for the "tolerant" parser (which is now
|
# the remaining tests were for the "tolerant" parser (which is now
|
||||||
# the default), and check various kind of broken markup
|
# the default), and check various kind of broken markup
|
||||||
def test_tolerant_parsing(self):
|
def test_tolerant_parsing(self):
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
Fix :class:`html.parser.HTMLParser` to not unescape character entities in
|
||||||
|
attribute values if they are followed by an ASCII alphanumeric or an equals
|
||||||
|
sign.
|
Loading…
Add table
Add a link
Reference in a new issue