#15114: the strict mode of HTMLParser and the HTMLParseError exception are deprecated now that the parser is able to parse invalid markup.

2025-09-26 18:29:57 +00:00 · 2012-06-23 15:27:51 +02:00 · 2012-06-23 15:27:51 +02:00 · 3861d8b271
commit 3861d8b271
parent a4db02c7a3
4 changed files with 35 additions and 18 deletions
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@ -16,13 +16,14 @@
 This module defines a class :class:`HTMLParser` which serves as the basis for
 parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
-.. class:: HTMLParser(strict=True)
+.. class:: HTMLParser(strict=False)
-   Create a parser instance.  If *strict* is ``True`` (the default), invalid
+   Create a parser instance.  If *strict* is ``False`` (the default), the parser
-   HTML results in :exc:`~html.parser.HTMLParseError` exceptions [#]_.  If
+   will accept and parse invalid markup.  If *strict* is ``True`` the parser
-   *strict* is ``False``, the parser uses heuristics to make a best guess at
+   will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when
-   the intention of any invalid HTML it encounters, similar to the way most
+   it's not able to parse the markup.
-   browsers do.  Using ``strict=False`` is advised.
+   The use of ``strict=True`` is discouraged and the *strict* argument is
+   deprecated.
   An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
   when start tags, end tags, text, comments, and other markup elements are
@ -34,6 +35,10 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
   .. versionchanged:: 3.2 *strict* keyword added
+   .. deprecated-removed:: 3.3 3.5
+      The *strict* argument and the strict mode have been deprecated.
+      The parser is now able to accept and parse invalid markup too.
 An exception is defined as well:
@ -46,6 +51,10 @@ An exception is defined as well:
   detected, and :attr:`offset` is the number of characters into the line at
   which the construct starts.
+   .. deprecated-removed:: 3.3 3.5
+      This exception has been deprecated because it's never raised by the parser
+      (when the default non-strict mode is used).
 Example HTML Parser Application
 -------------------------------
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@ -10,6 +10,7 @@
 import _markupbase
 import re
+import warnings
 # Regular expressions used for parsing
@ -113,14 +114,16 @@ class HTMLParser(_markupbase.ParserBase):
    CDATA_CONTENT_ELEMENTS = ("script", "style")
-    def __init__(self, strict=True):
+    def __init__(self, strict=False):
        """Initialize and reset this instance.
-        If strict is set to True (the default), errors are raised when invalid
+        If strict is set to False (the default) the parser will parse invalid
-        HTML is encountered.  If set to False, an attempt is instead made to
+        markup, otherwise it will raise an error.  Note that the strict mode
-        continue parsing, making "best guesses" about the intended meaning, in
+        is deprecated.
-        a fashion similar to what browsers typically do.
        """
+        if strict:
+            warnings.warn("The strict mode is deprecated.",
+                          DeprecationWarning, stacklevel=2)
        self.strict = strict
        self.reset()
@ -271,8 +274,8 @@ class HTMLParser(_markupbase.ParserBase):
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
-        if rawdata[i:i+2] != '<!':
+        assert rawdata[i:i+2] == '<!', ('unexpected call to '
-            self.error('unexpected call to parse_html_declaration()')
+                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
@ -292,8 +295,8 @@ class HTMLParser(_markupbase.ParserBase):
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
-        if rawdata[i:i+2] not in ('<!', '</'):
+        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
-            self.error('unexpected call to parse_comment()')
+                                                'parse_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@ -102,7 +102,8 @@ class TestCaseBase(unittest.TestCase):
 class HTMLParserStrictTestCase(TestCaseBase):
    def get_collector(self):
-        return EventCollector(strict=True)
+        with support.check_warnings(("", DeprecationWarning), quiet=False):
+            return EventCollector(strict=True)
    def test_processing_instruction_only(self):
        self._run_check("<?processing instruction>", [
@ -594,7 +595,8 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
 class AttributesStrictTestCase(TestCaseBase):
    def get_collector(self):
-        return EventCollector(strict=True)
+        with support.check_warnings(("", DeprecationWarning), quiet=False):
+            return EventCollector(strict=True)
    def test_attr_syntax(self):
        output = [
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -43,6 +43,9 @@ Core and Builtins
 Library
 -------
+- Issue #15114: the strict mode of HTMLParser and the HTMLParseError exception
+  are deprecated now that the parser is able to parse invalid markup.
 - Issue #3665: \u and \U escapes are now supported in unicode regular
  expressions.  Patch by Serhiy Storchaka.