diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
index dd67fc34e85..341a8337ba2 100644
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -15,14 +15,18 @@
This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
-.. class:: HTMLParser(*, convert_charrefs=True)
+.. class:: HTMLParser(*, convert_charrefs=True, scripting=False)
Create a parser instance able to parse invalid markup.
- If *convert_charrefs* is ``True`` (the default), all character
- references (except the ones in ``script``/``style`` elements) are
+ If *convert_charrefs* is true (the default), all character
+ references (except the ones in elements like ``script`` and ``style``) are
automatically converted to the corresponding Unicode characters.
+ If *scripting* is false (the default), the content of the ``noscript``
+ element is parsed normally; if it's true, it's returned as is without
+ being parsed.
+
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
when start tags, end tags, text, comments, and other markup elements are
encountered. The user should subclass :class:`.HTMLParser` and override its
@@ -37,6 +41,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. versionchanged:: 3.5
The default value for argument *convert_charrefs* is now ``True``.
+ .. versionchanged:: 3.14.1
+ Added the *scripting* parameter.
+
Example HTML Parser Application
-------------------------------
@@ -161,15 +168,15 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
.. method:: HTMLParser.handle_data(data)
This method is called to process arbitrary data (e.g. text nodes and the
- content of ``<script>...</script>`` and ``<style>...</style>``).
+ content of elements like ``script`` and ``style``).
.. method:: HTMLParser.handle_entityref(name)
This method is called to process a named character reference of the form
``&name;`` (e.g. ``&gt;``), where *name* is a general entity reference
- (e.g. ``'gt'``). This method is never called if *convert_charrefs* is
- ``True``.
+ (e.g. ``'gt'``).
+ This method is only called if *convert_charrefs* is false.
.. method:: HTMLParser.handle_charref(name)
@@ -177,8 +184,8 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
This method is called to process decimal and hexadecimal numeric character
references of the form :samp:`&#{NNN};` and :samp:`&#x{NNN};`. For example, the decimal
equivalent for ``&gt;`` is ``&#62;``, whereas the hexadecimal is ``&#x3E;``;
- in this case the method will receive ``'62'`` or ``'x3E'``. This method
- is never called if *convert_charrefs* is ``True``.
+ in this case the method will receive ``'62'`` or ``'x3E'``.
+ This method is only called if *convert_charrefs* is false.
.. method:: HTMLParser.handle_comment(data)
@@ -292,8 +299,8 @@ Parsing an element with a few attributes and a title:
Data : Python
End tag : h1
-The content of ``script`` and ``style`` elements is returned as is, without
-further parsing:
+The content of elements like ``script`` and ``style`` is returned as is,
+without further parsing:
.. doctest::
@@ -304,10 +311,10 @@ further parsing:
End tag : style
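>>> parser.feed('<script type="text/javascript">'
- ... 'alert("<strong>hello!</strong>");</script>')
+ ... 'alert("<strong>hello! &#9786;</strong>");</script>')
Start tag: script
attr: ('type', 'text/javascript')
- Data : alert("<strong>hello!</strong>");
+ Data : alert("<strong>hello! &#9786;</strong>");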
End tag : script
Parsing comments:
@@ -336,7 +343,7 @@ correct char (note: these 3 references are all equivalent to ``'>'``):
Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
:meth:`~HTMLParser.handle_data` might be called more than once
-(unless *convert_charrefs* is set to ``True``):
+if *convert_charrefs* is false:
.. doctest::
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 5d7050dad23..e50620de800 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -127,17 +127,25 @@ class HTMLParser(_markupbase.ParserBase):
argument.
"""
- CDATA_CONTENT_ELEMENTS = ("script", "style")
+ # See the HTML5 specs section "13.4 Parsing HTML fragments".
+ # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
+ # CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
+ CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
- def __init__(self, *, convert_charrefs=True):
+ def __init__(self, *, convert_charrefs=True, scripting=False):
"""Initialize and reset this instance.
- If convert_charrefs is True (the default), all character references
+ If convert_charrefs is true (the default), all character references
are automatically converted to the corresponding Unicode characters.
+
+ If *scripting* is false (the default), the content of the
+ ``noscript`` element is parsed normally; if it's true,
+ it's returned as is without being parsed.
"""
super().__init__()
self.convert_charrefs = convert_charrefs
+ self.scripting = scripting
self.reset()
def reset(self):
@@ -172,7 +180,9 @@ class HTMLParser(_markupbase.ParserBase):
def set_cdata_mode(self, elem, *, escapable=False):
self.cdata_elem = elem.lower()
self._escapable = escapable
- if escapable and not self.convert_charrefs:
+ if self.cdata_elem == 'plaintext':
+ self.interesting = re.compile(r'\z')
+ elif escapable and not self.convert_charrefs:
self.interesting = re.compile(r'&|%s(?=[\t\n\r\f />])' % self.cdata_elem,
re.IGNORECASE|re.ASCII)
else:
@@ -444,8 +454,10 @@ class HTMLParser(_markupbase.ParserBase):
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
- if tag in self.CDATA_CONTENT_ELEMENTS:
- self.set_cdata_mode(tag)
+ if (tag in self.CDATA_CONTENT_ELEMENTS or
+ (self.scripting and tag == "noscript") or
+ tag == "plaintext"):
+ self.set_cdata_mode(tag, escapable=False)
elif tag in self.RCDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag, escapable=True)
return endpos
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 6a1d69335a0..19dde9362a4 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -8,6 +8,18 @@ from unittest.mock import patch
from test import support
+SAMPLE_RCDATA = (
+ ''
+ ""
+ ''
+ ''
+ ''
+ '\u2603'
+)
+
+SAMPLE_RAWTEXT = SAMPLE_RCDATA + '&☺'
+
+
class EventCollector(html.parser.HTMLParser):
def __init__(self, *args, autocdata=False, **kw):
@@ -293,30 +305,20 @@ text
'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
'\n\n',
'',
- 'foo = " script>"',
- 'foo = ""',
- 'foo = ""',
- 'foo = ""',
- 'foo = "ſcript>"',
- 'foo = ""',
])
def test_script_content(self, content):
s = f''
- self._run_check(s, [("starttag", "script", []),
- ("data", content),
- ("endtag", "script")])
+ self._run_check(s, [
+ ("starttag", "script", []),
+ ("data", content),
+ ("endtag", "script"),
+ ])
@support.subTests('content', [
'a::before { content: ""; }',
'a::before { content: "¬-an-entity-ref;"; }',
'a::before { content: ""; }',
'a::before { content: "\u2603"; }',
- 'a::before { content: "< /style>"; }',
- 'a::before { content: " style>"; }',
- 'a::before { content: ""; }',
- 'a::before { content: ""; }',
- 'a::before { content: ""; }',
- 'a::before { content: "ſtyle>"; }',
])
def test_style_content(self, content):
s = f''
@@ -324,47 +326,59 @@ text
("data", content),
("endtag", "style")])
- @support.subTests('content', [
- '',
- "",
- '',
- '',
- '',
- '\u2603',
- '< /title>',
- ' title>',
- '',
- '',
- '',
- '',
- ])
- def test_title_content(self, content):
- source = f"{content}"
+ @support.subTests('tag', ['title', 'textarea'])
+ def test_rcdata_content(self, tag):
+ source = f"<{tag}>{SAMPLE_RCDATA}{tag}>"
self._run_check(source, [
- ("starttag", "title", []),
- ("data", content),
- ("endtag", "title"),
+ ("starttag", tag, []),
+ ("data", SAMPLE_RCDATA),
+ ("endtag", tag),
+ ])
+ source = f"<{tag}>&{tag}>"
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ('entityref', 'amp'),
+ ("endtag", tag),
])
- @support.subTests('content', [
- '',
- "",
- '',
- '',
- '',
- '\u2603',
- '< /textarea>',
- ' textarea>',
- '',
- '',
- '',
- ])
- def test_textarea_content(self, content):
- source = f""
+ @support.subTests('tag',
+ ['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script'])
+ def test_rawtext_content(self, tag):
+ source = f"<{tag}>{SAMPLE_RAWTEXT}{tag}>"
self._run_check(source, [
- ("starttag", "textarea", []),
+ ("starttag", tag, []),
+ ("data", SAMPLE_RAWTEXT),
+ ("endtag", tag),
+ ])
+
+ def test_noscript_content(self):
+ source = f""
+ # scripting=False -- normal mode
+ self._run_check(source, [
+ ('starttag', 'noscript', []),
+ ('comment', ' not a comment '),
+ ('starttag', 'not', [('a', 'start tag')]),
+ ('unknown decl', 'CDATA[not a cdata'),
+ ('comment', 'not a bogus comment'),
+ ('endtag', 'not'),
+ ('data', '☃'),
+ ('entityref', 'amp'),
+ ('charref', '9786'),
+ ('endtag', 'noscript'),
+ ])
+ # scripting=True -- RAWTEXT mode
+ self._run_check(source, [
+ ("starttag", "noscript", []),
+ ("data", SAMPLE_RAWTEXT),
+ ("endtag", "noscript"),
+ ], collector=EventCollector(scripting=True))
+
+ def test_plaintext_content(self):
+ content = SAMPLE_RAWTEXT + '' # not closing
+ source = f"{content}"
+ self._run_check(source, [
+ ("starttag", "plaintext", []),
("data", content),
- ("endtag", "textarea"),
])
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
@@ -381,52 +395,65 @@ text
("endtag", "script")],
collector=EventCollectorNoNormalize(convert_charrefs=False))
- @support.subTests('endtag', ['style', 'STYLE', 'style ', 'style\n',
- 'style/', 'style foo=bar', 'style foo=">"'])
- def test_style_closing_tag(self, endtag):
- content = """
- b::before { content: ""; }
- p::before { content: "¬-an-entity-ref;"; }
- a::before { content: ""; }
- a::after { content: ""; }
- """
- s = f'