[3.12] gh-118350: Fix support of elements "textarea" and "title" in HTMLParser (GH-135310) (GH-136986)

(cherry picked from commit 4d02f31cdd) Co-authored-by: Timon Viola <44016238+timonviola@users.noreply.github.com> Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> Co-authored-by: Łukasz Langa <lukasz@langa.pl>
2025-08-25 11:15:02 +00:00 · 2025-07-22 14:31:27 +02:00 · 2025-07-22 14:31:27 +02:00 · f66c75f11d
commit f66c75f11d
parent ad695f5328
3 changed files with 113 additions and 5 deletions
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -110,6 +110,7 @@ class HTMLParser(_markupbase.ParserBase):
    """
    CDATA_CONTENT_ELEMENTS = ("script", "style")
    RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.
@@ -127,6 +128,7 @@ class HTMLParser(_markupbase.ParserBase):
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        self._escapable = True
        super().reset()
    def feed(self, data):
@@ -148,14 +150,20 @@ class HTMLParser(_markupbase.ParserBase):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text
-    def set_cdata_mode(self, elem):
+    def set_cdata_mode(self, elem, *, escapable=False):
        # Enter raw-text mode for the element named `elem`: remember the
        # lower-cased element name and install the pattern that the scanner
        # searches for while inside that element.
        #
        # `escapable` is keyword-only.  It is recorded in `self._escapable`
        # and selects between the two patterns below; it is passed as True
        # for the elements listed in RCDATA_CONTENT_ELEMENTS.
        self.cdata_elem = elem.lower()
        self._escapable = escapable
        if escapable and not self.convert_charrefs:
            # Escapable content with convert_charrefs disabled: besides the
            # closing tag, also stop at every '&' so that references inside
            # the element are not swallowed into the raw data.
            self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
                                          re.IGNORECASE|re.ASCII)
        else:
            # Only the closing tag is of interest: '</' + element name,
            # followed (lookahead, not consumed) by tab, LF, CR, FF, space,
            # '/' or '>'.  Matching is case-insensitive and ASCII-only.
            self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
                                          re.IGNORECASE|re.ASCII)
    def clear_cdata_mode(self):
        """Leave raw-text mode and restore the default scanning state."""
        # Forget the raw-text element and mark data as escapable again.
        self._escapable = True
        self.cdata_elem = None
        # Go back to the module's default search pattern.
        self.interesting = interesting_normal
    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
@@ -188,7 +196,7 @@ class HTMLParser(_markupbase.ParserBase):
                        break
                    j = n
            if i < j:
-                if self.convert_charrefs and not self.cdata_elem:
+                if self.convert_charrefs and self._escapable:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
@@ -290,7 +298,7 @@ class HTMLParser(_markupbase.ParserBase):
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
-            if self.convert_charrefs and not self.cdata_elem:
+            if self.convert_charrefs and self._escapable:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
@@ -402,6 +410,8 @@ class HTMLParser(_markupbase.ParserBase):
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            elif tag in self.RCDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag, escapable=True)
        return endpos
    # Internal -- check to see if we have a complete starttag; return end
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -317,6 +317,49 @@ text
                            ("data", content),
                            ("endtag", "style")])
    @support.subTests('content', [
        '<!-- not a comment -->',
        "<not a='start tag'>",
        '<![CDATA[not a cdata]]>',
        '<!not a bogus comment>',
        '</not a bogus comment>',
        '\u2603',
        '< /title>',
        '</ title>',
        '</titled>',
        '</title\v>',
        '</title\xa0>',
        '</tıtle>',
    ])
    def test_title_content(self, content):
        """Markup-like text inside <title> is expected as one data event.

        Each `content` value is wrapped in <title>...</title>; the expected
        event list is the start tag, the content verbatim as a single data
        event, and the end tag.
        """
        expected_events = [
            ("starttag", "title", []),
            ("data", content),
            ("endtag", "title"),
        ]
        self._run_check(f"<title>{content}</title>", expected_events)
    @support.subTests('content', [
        '<!-- not a comment -->',
        "<not a='start tag'>",
        '<![CDATA[not a cdata]]>',
        '<!not a bogus comment>',
        '</not a bogus comment>',
        '\u2603',
        '< /textarea>',
        '</ textarea>',
        '</textareable>',
        '</textarea\v>',
        '</textarea\xa0>',
    ])
    def test_textarea_content(self, content):
        """Markup-like text inside <textarea> is expected as one data event.

        Each `content` value is wrapped in <textarea>...</textarea>; the
        expected event list is the start tag, the content verbatim as a
        single data event, and the end tag.
        """
        expected_events = [
            ("starttag", "textarea", []),
            ("data", content),
            ("endtag", "textarea"),
        ]
        self._run_check(f"<textarea>{content}</textarea>", expected_events)
    @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
                                 'script/', 'script foo=bar', 'script foo=">"'])
    def test_script_closing_tag(self, endtag):
@@ -346,6 +389,38 @@ text
                            ("endtag", "style")],
                        collector=EventCollectorNoNormalize(convert_charrefs=False))
    @support.subTests('endtag', [
        'title',
        'TITLE',
        'title ',
        'title\n',
        'title/',
        'title foo=bar',
        'title foo=">"',
    ])
    def test_title_closing_tag(self, endtag):
        """Every `endtag` spelling is expected to close a <TitLe> element.

        The same input is checked twice: once with convert_charrefs=True,
        where "&amp;" is expected to appear as "&" inside one data event,
        and once with convert_charrefs=False, where it is expected as a
        separate entityref event splitting the data in two.
        """
        content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
        markup = f'<TitLe>{content}</{endtag}>'

        # Character references converted: a single, unescaped data event.
        converted_events = [
            ("starttag", "title", []),
            ('data', '<!-- not a comment --><i>Egg & Spam</i>'),
            ("endtag", "title"),
        ]
        self._run_check(
            markup,
            converted_events,
            collector=EventCollectorNoNormalize(convert_charrefs=True),
        )

        # Character references left alone: the reference is its own event.
        raw_events = [
            ("starttag", "title", []),
            ('data', '<!-- not a comment --><i>Egg '),
            ('entityref', 'amp'),
            ('data', ' Spam</i>'),
            ("endtag", "title"),
        ]
        self._run_check(
            markup,
            raw_events,
            collector=EventCollectorNoNormalize(convert_charrefs=False),
        )
    @support.subTests('endtag', [
        'textarea',
        'TEXTAREA',
        'textarea ',
        'textarea\n',
        'textarea/',
        'textarea foo=bar',
        'textarea foo=">"',
    ])
    def test_textarea_closing_tag(self, endtag):
        """Every `endtag` spelling is expected to close a <TexTarEa> element.

        The same input is checked twice: once with convert_charrefs=True,
        where "&amp;" is expected to appear as "&" inside one data event,
        and once with convert_charrefs=False, where it is expected as a
        separate entityref event splitting the data in two.
        """
        content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
        markup = f'<TexTarEa>{content}</{endtag}>'

        # Character references converted: a single, unescaped data event.
        converted_events = [
            ("starttag", "textarea", []),
            ('data', '<!-- not a comment --><i>Egg & Spam</i>'),
            ("endtag", "textarea"),
        ]
        self._run_check(
            markup,
            converted_events,
            collector=EventCollectorNoNormalize(convert_charrefs=True),
        )

        # Character references left alone: the reference is its own event.
        raw_events = [
            ("starttag", "textarea", []),
            ('data', '<!-- not a comment --><i>Egg '),
            ('entityref', 'amp'),
            ('data', ' Spam</i>'),
            ("endtag", "textarea"),
        ]
        self._run_check(
            markup,
            raw_events,
            collector=EventCollectorNoNormalize(convert_charrefs=False),
        )
    @support.subTests('tail,end', [
        ('', False),
        ('<', False),
@@ -363,6 +438,27 @@ text
                            ("data", content if end else content + tail)],
                        collector=EventCollectorNoNormalize(convert_charrefs=False))
    @support.subTests('tail,end', [
        ('', False),
        ('<', False),
        ('</', False),
        ('</t', False),
        ('</title', False),
        ('</title ', True),
        ('</title foo=bar', True),
        ('</title foo=">', True),
    ])
    def test_eof_in_title(self, tail, end):
        """Check the events expected when input ends inside a <TitLe> element.

        `tail` is appended after the title text.  When `end` is true the
        tail is expected to be absent from the reported data; otherwise it
        is expected to be reported as part of the final data event.  No
        endtag event is expected in either case.
        """
        markup = f'<TitLe>Egg &amp; Spam{tail}'
        # Text expected to follow "Spam" in the last data event.
        trailing = '' if end else tail

        converted_events = [
            ("starttag", "title", []),
            ("data", "Egg & Spam" + trailing),
        ]
        self._run_check(
            markup,
            converted_events,
            collector=EventCollectorNoNormalize(convert_charrefs=True),
        )

        raw_events = [
            ("starttag", "title", []),
            ('data', 'Egg '),
            ('entityref', 'amp'),
            ('data', ' Spam' + trailing),
        ]
        self._run_check(
            markup,
            raw_events,
            collector=EventCollectorNoNormalize(convert_charrefs=False),
        )
    def test_comments(self):
        html = ("<!-- I'm a valid comment -->"
                '<!--me too!-->'
--- a/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
+++ b/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
@@ -0,0 +1,2 @@
 Fix support of escapable raw text mode (elements "textarea" and "title")
 in :class:`html.parser.HTMLParser`.
@@ -0,0 +1,2 @@
 Fix support of escapable raw text mode (elements "textarea" and "title")
 in :class:`html.parser.HTMLParser`.