#15114: the strict mode and argument of HTMLParser, HTMLParser.error, and the HTMLParseError exception have been removed.

2025-09-26 10:19:53 +00:00 · 2014-08-02 14:10:30 +03:00 · 2014-08-02 14:10:30 +03:00 · 73a4359eb0
commit 73a4359eb0
parent ffff1440d1
4 changed files with 23 additions and 197 deletions
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@ -16,9 +16,9 @@
 This module defines a class :class:`HTMLParser` which serves as the basis for
 parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
-.. class:: HTMLParser(strict=False, *, convert_charrefs=False)
+.. class:: HTMLParser(*, convert_charrefs=False)
-   Create a parser instance.
+   Create a parser instance able to parse invalid markup.
   If *convert_charrefs* is ``True`` (default: ``False``), all character
   references (except the ones in ``script``/``style`` elements) are
@ -26,12 +26,6 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
   The use of ``convert_charrefs=True`` is encouraged and will become
   the default in Python 3.5.
   If *strict* is ``False`` (the default), the parser will accept and parse
   invalid markup.  If *strict* is ``True`` the parser will raise an
   :exc:`~html.parser.HTMLParseError` exception instead [#]_ when it's not
   able to parse the markup.  The use of ``strict=True`` is discouraged and
   the *strict* argument is deprecated.
   An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
   when start tags, end tags, text, comments, and other markup elements are
   encountered.  The user should subclass :class:`.HTMLParser` and override its
@ -40,32 +34,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
   This parser does not check that end tags match start tags or call the end-tag
   handler for elements which are closed implicitly by closing an outer element.
   .. versionchanged:: 3.2
      *strict* argument added.
   .. deprecated-removed:: 3.3 3.5
      The *strict* argument and the strict mode have been deprecated.
      The parser is now able to accept and parse invalid markup too.
   .. versionchanged:: 3.4
      *convert_charrefs* keyword argument added.
 An exception is defined as well:
 .. exception:: HTMLParseError
   Exception raised by the :class:`HTMLParser` class when it encounters an error
   while parsing and *strict* is ``True``.  This exception provides three
   attributes: :attr:`msg` is a brief message explaining the error,
   :attr:`lineno` is the number of the line on which the broken construct was
   detected, and :attr:`offset` is the number of characters into the line at
   which the construct starts.
   .. deprecated-removed:: 3.3 3.5
      This exception has been deprecated because it's never raised by the parser
      (when the default non-strict mode is used).
 Example HTML Parser Application
 -------------------------------
@ -246,8 +217,7 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
   The *data* parameter will be the entire contents of the declaration inside
   the ``<![...]>`` markup.  It is sometimes useful to be overridden by a
-   derived class.  The base class implementation raises an :exc:`HTMLParseError`
+   derived class.  The base class implementation does nothing.
   when *strict* is ``True``.
 .. _htmlparser-examples:
@ -358,9 +328,3 @@ Parsing invalid HTML (e.g. unquoted attributes) also works::
   Data     : tag soup
   End tag  : p
   End tag  : a
 .. rubric:: Footnotes
 .. [#] For backward compatibility reasons *strict* mode does not raise
       exceptions for all non-compliant HTML.  That is, some invalid HTML
       is tolerated even in *strict* mode.
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@ -29,35 +29,15 @@ starttagopen = re.compile('<[a-zA-Z]')
 piclose = re.compile('>')
 commentclose = re.compile(r'--\s*>')
 # Note:
-#  1) the strict attrfind isn't really strict, but we can't make it
+#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
-#     correctly strict without breaking backward compatibility;
+#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
 #  2) if you change tagfind/attrfind remember to update locatestarttagend too;
 #  3) if you change tagfind/attrfind and/or locatestarttagend the parser will
 #     explode, so don't do it.
 tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
 tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
 attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
 attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
 locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
 """, re.VERBOSE)
 locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
@ -79,24 +59,6 @@ endendtag = re.compile('>')
 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 class HTMLParseError(Exception):
    """Exception raised for all parse errors."""
    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]
    def __str__(self):
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result
 _default_sentinel = object()
 class HTMLParser(_markupbase.ParserBase):
@ -123,22 +85,12 @@ class HTMLParser(_markupbase.ParserBase):
    CDATA_CONTENT_ELEMENTS = ("script", "style")
-    def __init__(self, strict=_default_sentinel, *,
+    def __init__(self, *, convert_charrefs=_default_sentinel):
                 convert_charrefs=_default_sentinel):
        """Initialize and reset this instance.
        If convert_charrefs is True (default: False), all character references
        are automatically converted to the corresponding Unicode characters.
        If strict is set to False (the default) the parser will parse invalid
        markup, otherwise it will raise an error.  Note that the strict mode
        and argument are deprecated.
        """
        if strict is not _default_sentinel:
            warnings.warn("The strict argument and mode are deprecated.",
                          DeprecationWarning, stacklevel=2)
        else:
            strict = False  # default
        self.strict = strict
        if convert_charrefs is _default_sentinel:
            convert_charrefs = False  # default
            warnings.warn("The value of convert_charrefs will become True in "
@ -168,11 +120,6 @@ class HTMLParser(_markupbase.ParserBase):
        """Handle any buffered data."""
        self.goahead(1)
    def error(self, message):
        warnings.warn("The 'error' method is deprecated.",
                      DeprecationWarning, stacklevel=2)
        raise HTMLParseError(message, self.getpos())
    __starttag_text = None
    def get_starttag_text(self):
@ -227,9 +174,6 @@ class HTMLParser(_markupbase.ParserBase):
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    if self.strict:
                        k = self.parse_declaration(i)
                    else:
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
@ -239,8 +183,6 @@ class HTMLParser(_markupbase.ParserBase):
                if k < 0:
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
@ -282,9 +224,6 @@ class HTMLParser(_markupbase.ParserBase):
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                        k = match.end()
                        if k <= i:
                            k = n
@ -367,17 +306,11 @@ class HTMLParser(_markupbase.ParserBase):
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if self.strict:
            match = tagfind.match(rawdata, i+1)
        else:
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
@ -401,9 +334,6 @@ class HTMLParser(_markupbase.ParserBase):
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
@ -419,9 +349,6 @@ class HTMLParser(_markupbase.ParserBase):
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
        m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
@ -435,9 +362,6 @@ class HTMLParser(_markupbase.ParserBase):
                    # buffer boundary
                    return -1
                # else bogus input
                if self.strict:
                    self.updatepos(i, j + 1)
                    self.error("malformed empty start tag")
                if j > i:
                    return j
                else:
@ -450,9 +374,6 @@ class HTMLParser(_markupbase.ParserBase):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            if j > i:
                return j
            else:
@ -472,8 +393,6 @@ class HTMLParser(_markupbase.ParserBase):
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
@ -539,8 +458,7 @@ class HTMLParser(_markupbase.ParserBase):
        pass
    def unknown_decl(self, data):
-        if self.strict:
+        pass
            self.error("unknown declaration: %r" % (data,))
    # Internal -- helper to remove special character quoting
    def unescape(self, s):
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@ -85,7 +85,7 @@ class EventCollectorCharrefs(EventCollector):
 class TestCaseBase(unittest.TestCase):
    def get_collector(self):
-        raise NotImplementedError
+        return EventCollector(convert_charrefs=False)
    def _run_check(self, source, expected_events, collector=None):
        if collector is None:
@ -105,21 +105,8 @@ class TestCaseBase(unittest.TestCase):
        self._run_check(source, events,
                        EventCollectorExtra(convert_charrefs=False))
    def _parse_error(self, source):
        def parse(source=source):
            parser = self.get_collector()
            parser.feed(source)
            parser.close()
        with self.assertRaises(html.parser.HTMLParseError):
            with self.assertWarns(DeprecationWarning):
                parse()
-
+class HTMLParserTestCase(TestCaseBase):
 class HTMLParserStrictTestCase(TestCaseBase):
    def get_collector(self):
        with support.check_warnings(("", DeprecationWarning), quite=False):
            return EventCollector(strict=True, convert_charrefs=False)
    def test_processing_instruction_only(self):
        self._run_check("<?processing instruction>", [
@ -201,9 +188,6 @@ text
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])
    def test_illegal_declarations(self):
        self._parse_error('<!spacer type="block" height="25">')
    def test_starttag_end_boundary(self):
        self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])
@ -238,25 +222,6 @@ text
        self._run_check(["<!--abc--", ">"], output)
        self._run_check(["<!--abc-->", ""], output)
    def test_starttag_junk_chars(self):
        self._parse_error("</>")
        self._parse_error("</$>")
        self._parse_error("</")
        self._parse_error("</a")
        self._parse_error("<a<a>")
        self._parse_error("</a<a>")
        self._parse_error("<!")
        self._parse_error("<a")
        self._parse_error("<a foo='bar'")
        self._parse_error("<a foo='bar")
        self._parse_error("<a foo='>'")
        self._parse_error("<a foo='>")
        self._parse_error("<a$>")
        self._parse_error("<a$b>")
        self._parse_error("<a$b/>")
        self._parse_error("<a$b  >")
        self._parse_error("<a$b  />")
    def test_valid_doctypes(self):
        # from http://www.w3.org/QA/2002/04/valid-dtd-list.html
        dtds = ['HTML',  # HTML5 doctype
@ -281,9 +246,6 @@ text
            self._run_check("<!DOCTYPE %s>" % dtd,
                            [('decl', 'DOCTYPE ' + dtd)])
    def test_declaration_junk_chars(self):
        self._parse_error("<!DOCTYPE foo $ >")
    def test_startendtag(self):
        self._run_check("<p/>", [
            ("startendtag", "p", []),
@ -421,23 +383,12 @@ text
        self._run_check('no charrefs here', [('data', 'no charrefs here')],
                        collector=collector())
 class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
    def get_collector(self):
        return EventCollector(convert_charrefs=False)
    def test_deprecation_warnings(self):
        with self.assertWarns(DeprecationWarning):
            EventCollector()  # convert_charrefs not passed explicitly
        with self.assertWarns(DeprecationWarning):
            EventCollector(strict=True)
        with self.assertWarns(DeprecationWarning):
            EventCollector(strict=False)
        with self.assertRaises(html.parser.HTMLParseError):
            with self.assertWarns(DeprecationWarning):
                EventCollector().error('test')
    # the remaining tests were for the "tolerant" parser (which is now
    # the default), and check various kind of broken markup
    def test_tolerant_parsing(self):
        self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'
                        '<img src="URL><//img></html</html>', [
@ -686,11 +637,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
        self._run_check(html, expected)
-class AttributesStrictTestCase(TestCaseBase):
+class AttributesTestCase(TestCaseBase):
    def get_collector(self):
        with support.check_warnings(("", DeprecationWarning), quite=False):
            return EventCollector(strict=True, convert_charrefs=False)
    def test_attr_syntax(self):
        output = [
@ -747,12 +694,6 @@ class AttributesStrictTestCase(TestCaseBase):
            [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
 class AttributesTolerantTestCase(AttributesStrictTestCase):
    def get_collector(self):
        return EventCollector(convert_charrefs=False)
    def test_attr_funky_names2(self):
        self._run_check(
            "<a $><b $=%><c \=/>",
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -121,6 +121,9 @@ Core and Builtins
 Library
 -------
 - Issue #15114: the strict mode and argument of HTMLParser, HTMLParser.error,
  and the HTMLParseError exception have been removed.
 - Issue #22085: Dropped support of Tk 8.3 in Tkinter.
 - Issue #21580: Now Tkinter correctly handles bytes arguments passed to Tk.