[3.13] gh-118350: Fix support of elements "textarea" and "title" in HTMLParser (GH-135310) (GH-136985)

(cherry picked from commit 4d02f31cdd)

Co-authored-by: Timon Viola <44016238+timonviola@users.noreply.github.com>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Łukasz Langa <lukasz@langa.pl>
2025-09-11 11:17:16 +00:00 · 2025-07-22 14:17:59 +02:00 · 2025-07-22 14:17:59 +02:00 · 8de88e0840
commit 8de88e0840
parent 4999cdbced
3 changed files with 113 additions and 5 deletions
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -128,6 +128,7 @@ class HTMLParser(_markupbase.ParserBase):
    """

    CDATA_CONTENT_ELEMENTS = ("script", "style")
+    RCDATA_CONTENT_ELEMENTS = ("textarea", "title")

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.
@@ -145,6 +146,7 @@ class HTMLParser(_markupbase.ParserBase):
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
+        self._escapable = True
        super().reset()

    def feed(self, data):
@@ -166,14 +168,20 @@ class HTMLParser(_markupbase.ParserBase):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

-    def set_cdata_mode(self, elem):
+    def set_cdata_mode(self, elem, *, escapable=False):
        self.cdata_elem = elem.lower()
-        self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
-                                      re.IGNORECASE|re.ASCII)
+        self._escapable = escapable
+        if escapable and not self.convert_charrefs:
+            self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
+                                          re.IGNORECASE|re.ASCII)
+        else:
+            self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
+                                          re.IGNORECASE|re.ASCII)

    def clear_cdata_mode(self):
        self.interesting = interesting_normal
        self.cdata_elem = None
+        self._escapable = True

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
@@ -206,7 +214,7 @@ class HTMLParser(_markupbase.ParserBase):
                        break
                    j = n
            if i < j:
-                if self.convert_charrefs and not self.cdata_elem:
+                if self.convert_charrefs and self._escapable:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
@@ -308,7 +316,7 @@ class HTMLParser(_markupbase.ParserBase):
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
-            if self.convert_charrefs and not self.cdata_elem:
+            if self.convert_charrefs and self._escapable:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
@@ -420,6 +428,8 @@ class HTMLParser(_markupbase.ParserBase):
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
+            elif tag in self.RCDATA_CONTENT_ELEMENTS:
+                self.set_cdata_mode(tag, escapable=True)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end