Merge branch 'main' of https://github.com/python/cpython

2025-09-26 18:29:57 +00:00 · 2025-05-07 18:53:08 +03:00 · 2025-05-07 18:53:08 +03:00 · 5ea24116b0
commit 5ea24116b0
parent 9748fb3867 ee76e36d76
4 changed files with 94 additions and 23 deletions
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@ -43,7 +43,9 @@ Example HTML Parser Application
 As a basic example, below is a simple HTML parser that uses the
 :class:`HTMLParser` class to print out start tags, end tags, and data
-as they are encountered::
+as they are encountered:
 .. testcode::
   from html.parser import HTMLParser
@ -63,7 +65,7 @@ as they are encountered::
 The output will then be:
-.. code-block:: none
+.. testoutput::
   Encountered a start tag: html
   Encountered a start tag: head
@ -230,7 +232,9 @@ Examples
 --------
 The following class implements a parser that will be used to illustrate more
-examples::
+examples:
 .. testcode::
   from html.parser import HTMLParser
   from html.entities import name2codepoint
@ -266,13 +270,17 @@ examples::
   parser = MyHTMLParser()
-Parsing a doctype::
+Parsing a doctype:
 .. doctest::
   >>> parser.feed('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
   ...             '"http://www.w3.org/TR/html4/strict.dtd">')
   Decl     : DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"
-Parsing an element with a few attributes and a title::
+Parsing an element with a few attributes and a title:
 .. doctest::
   >>> parser.feed('<img src="python-logo.png" alt="The Python logo">')
   Start tag: img
@ -285,7 +293,9 @@ Parsing an element with a few attributes and a title::
   End tag  : h1
 The content of ``script`` and ``style`` elements is returned as is, without
-further parsing::
+further parsing:
 .. doctest::
   >>> parser.feed('<style type="text/css">#python { color: green }</style>')
   Start tag: style
@ -300,16 +310,25 @@ further parsing::
   Data     : alert("<strong>hello!</strong>");
   End tag  : script
-Parsing comments::
+Parsing comments:
-   >>> parser.feed('<!-- a comment -->'
+.. doctest::
   >>> parser.feed('<!--a comment-->'
   ...             '<!--[if IE 9]>IE-specific content<![endif]-->')
-   Comment  :  a comment
+   Comment  : a comment
   Comment  : [if IE 9]>IE-specific content<![endif]
 Parsing named and numeric character references and converting them to the
-correct char (note: these 3 references are all equivalent to ``'>'``)::
+correct char (note: these 3 references are all equivalent to ``'>'``):
 .. doctest::
   >>> parser = MyHTMLParser()
   >>> parser.feed('&gt;&#62;&#x3E;')
   Data     : >>>
   >>> parser = MyHTMLParser(convert_charrefs=False)
   >>> parser.feed('&gt;&#62;&#x3E;')
   Named ent: >
   Num ent  : >
@ -317,18 +336,22 @@ correct char (note: these 3 references are all equivalent to ``'>'``)::
 Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
 :meth:`~HTMLParser.handle_data` might be called more than once
-(unless *convert_charrefs* is set to ``True``)::
+(unless *convert_charrefs* is set to ``True``):
-   >>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']:
+.. doctest::
   >>> for chunk in ['<sp', 'an>buff', 'ered', ' text</s', 'pan>']:
   ...     parser.feed(chunk)
   ...
   Start tag: span
   Data     : buff
   Data     : ered
-   Data     : text
+   Data     :  text
   End tag  : span
-Parsing invalid HTML (e.g. unquoted attributes) also works::
+Parsing invalid HTML (e.g. unquoted attributes) also works:
 .. doctest::
   >>> parser.feed('<p><a class=link href=#main>tag soup</p ></a>')
   Start tag: p
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@ -12,6 +12,7 @@ import re
 import _markupbase
 from html import unescape
 from html.entities import html5 as html5_entities
 __all__ = ['HTMLParser']
@ -23,6 +24,7 @@ incomplete = re.compile('&[a-zA-Z#]')
 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
 attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
 starttagopen = re.compile('<[a-zA-Z]')
 piclose = re.compile('>')
@ -57,6 +59,22 @@ endendtag = re.compile('>')
 # </ and the tag name, so maybe this should be fixed
 endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 # Character reference processing logic specific to attribute values
 # See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
 def _replace_attr_charref(match):
    ref = match.group(0)
    # Numeric / hex char refs must always be unescaped
    if ref.startswith('&#'):
        return unescape(ref)
    # Named character / entity references must only be unescaped
    # if they are an exact match, and they are not followed by an equals sign
    if not ref.endswith('=') and ref[1:] in html5_entities:
        return unescape(ref)
    # Otherwise do not unescape
    return ref
 def _unescape_attrvalue(s):
    """Return *s* with its character references processed for attributes.

    Every ``attr_charref`` match found in *s* is passed through
    ``_replace_attr_charref``, which decides whether that reference is
    unescaped or kept verbatim.
    """
    unescaped = attr_charref.sub(_replace_attr_charref, s)
    return unescaped
 class HTMLParser(_markupbase.ParserBase):
@ -323,7 +341,7 @@ class HTMLParser(_markupbase.ParserBase):
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
-                attrvalue = unescape(attrvalue)
+                attrvalue = _unescape_attrvalue(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@ -348,18 +348,16 @@ text
        collector = lambda: EventCollectorCharrefs()
        self.assertTrue(collector().convert_charrefs)
        charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
-        # check charrefs in the middle of the text/attributes
+        # check charrefs in the middle of the text
-        expected = [('starttag', 'a', [('href', 'foo"zar')]),
+        expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
-                    ('data', 'a"z'), ('endtag', 'a')]
        for charref in charrefs:
-            self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
+            self._run_check('<a>a{0}z</a>'.format(charref),
                            expected, collector=collector())
-        # check charrefs at the beginning/end of the text/attributes
+        # check charrefs at the beginning/end of the text
-        expected = [('data', '"'),
+        expected = [('data', '"'), ('starttag', 'a', []),
-                    ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
                    ('data', '"'), ('endtag', 'a'), ('data', '"')]
        for charref in charrefs:
-            self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
+            self._run_check('{0}<a>'
                            '{0}</a>{0}'.format(charref),
                            expected, collector=collector())
        # check charrefs in <script>/<style> elements
@ -382,6 +380,35 @@ text
        self._run_check('no charrefs here', [('data', 'no charrefs here')],
                        collector=collector())
    def test_convert_charrefs_in_attribute_values(self):
        """Check how character references in attribute values are unescaped."""
        # default value for convert_charrefs is now True
        make_collector = lambda: EventCollectorCharrefs()
        self.assertTrue(make_collector().convert_charrefs)

        # Six ``x`` attributes place the reference alone, at the end, at the
        # start, in the middle, before a space and before an equals sign.
        markup = ('<a x="{0}" x="z{0}" x="{0}z" '
                  '   x="z{0}z" x="{0} z" x="{0}=z"></a>')

        def events_for(values):
            # Expected events for an <a> element with one ``x`` attribute
            # per entry of *values*, immediately followed by its end tag.
            return [('starttag', 'a', [('x', value) for value in values]),
                    ('endtag', 'a')]

        # always unescape terminated entity refs, numeric and hex char refs:
        # - regardless whether they are at start, middle, end of attribute
        # - or followed by alphanumeric, non-alphanumeric, or equals char
        always_unescaped = events_for(['¢', 'z¢', '¢z', 'z¢z', '¢ z', '¢=z'])
        for reference in ['&cent;', '&#xa2;', '&#xa2', '&#162;', '&#162']:
            self._run_check(markup.format(reference), always_unescaped,
                            collector=make_collector())

        # only unescape unterminated entity matches if they are not followed by
        # an alphanumeric or an equals sign
        unterminated = events_for(
            ['¢', 'z¢', '&centz', 'z&centz', '¢ z', '&cent=z'])
        self._run_check(markup.format('&cent'), unterminated,
                        collector=make_collector())
    # the remaining tests were for the "tolerant" parser (which is now
    # the default), and check various kinds of broken markup
    def test_tolerant_parsing(self):
--- a/Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst
+++ b/Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst
@ -0,0 +1,3 @@
 Fix :class:`html.parser.HTMLParser` to not unescape unterminated named
 character references in attribute values if they are followed by an ASCII
 alphanumeric or an equals sign.