[3.14] gh-137836: Support more RAWTEXT and PLAINTEXT elements in HTMLParser (GH-137837) (GH-140841)

* the "plaintext" element
* the RAWTEXT elements "xmp", "iframe", "noembed" and "noframes"
* the "noscript" element, treated as RAWTEXT only if scripting=True
(cherry picked from commit a17c57eee5)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Miss Islington (bot) 2025-10-31 17:13:13 +01:00 committed by GitHub
parent d0c78a458b
commit 89818a5939
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 163 additions and 114 deletions

View file

@ -127,17 +127,25 @@ class HTMLParser(_markupbase.ParserBase):
argument.
"""
CDATA_CONTENT_ELEMENTS = ("script", "style")
# See the HTML5 spec, section "13.4 Parsing HTML fragments".
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
# CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
def __init__(self, *, convert_charrefs=True):
def __init__(self, *, convert_charrefs=True, scripting=False):
"""Initialize and reset this instance.
If convert_charrefs is True (the default), all character references
If convert_charrefs is true (the default), all character references
are automatically converted to the corresponding Unicode characters.
If *scripting* is false (the default), the content of the
``noscript`` element is parsed normally; if *scripting* is true,
that content is returned as is without being parsed.
"""
super().__init__()
self.convert_charrefs = convert_charrefs
self.scripting = scripting
self.reset()
def reset(self):
@ -172,7 +180,9 @@ class HTMLParser(_markupbase.ParserBase):
def set_cdata_mode(self, elem, *, escapable=False):
self.cdata_elem = elem.lower()
self._escapable = escapable
if escapable and not self.convert_charrefs:
if self.cdata_elem == 'plaintext':
self.interesting = re.compile(r'\z')
elif escapable and not self.convert_charrefs:
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
re.IGNORECASE|re.ASCII)
else:
@ -444,8 +454,10 @@ class HTMLParser(_markupbase.ParserBase):
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
if (tag in self.CDATA_CONTENT_ELEMENTS or
(self.scripting and tag == "noscript") or
tag == "plaintext"):
self.set_cdata_mode(tag, escapable=False)
elif tag in self.RCDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag, escapable=True)
return endpos