mirror of
https://github.com/python/cpython.git
synced 2025-09-26 18:29:57 +00:00
#15114: the strict mode of HTMLParser and the HTMLParseError exception are deprecated now that the parser is able to parse invalid markup.
This commit is contained in:
parent
a4db02c7a3
commit
3861d8b271
4 changed files with 35 additions and 18 deletions
|
@ -16,13 +16,14 @@
|
||||||
This module defines a class :class:`HTMLParser` which serves as the basis for
|
This module defines a class :class:`HTMLParser` which serves as the basis for
|
||||||
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
|
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
|
||||||
|
|
||||||
.. class:: HTMLParser(strict=True)
|
.. class:: HTMLParser(strict=False)
|
||||||
|
|
||||||
Create a parser instance. If *strict* is ``True`` (the default), invalid
|
Create a parser instance. If *strict* is ``False`` (the default), the parser
|
||||||
HTML results in :exc:`~html.parser.HTMLParseError` exceptions [#]_. If
|
will accept and parse invalid markup. If *strict* is ``True`` the parser
|
||||||
*strict* is ``False``, the parser uses heuristics to make a best guess at
|
will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when
|
||||||
the intention of any invalid HTML it encounters, similar to the way most
|
it's not able to parse the markup.
|
||||||
browsers do. Using ``strict=False`` is advised.
|
The use of ``strict=True`` is discouraged and the *strict* argument is
|
||||||
|
deprecated.
|
||||||
|
|
||||||
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
|
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
|
||||||
when start tags, end tags, text, comments, and other markup elements are
|
when start tags, end tags, text, comments, and other markup elements are
|
||||||
|
@ -34,6 +35,10 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
|
||||||
|
|
||||||
.. versionchanged:: 3.2 *strict* keyword added
|
.. versionchanged:: 3.2 *strict* keyword added
|
||||||
|
|
||||||
|
.. deprecated-removed:: 3.3 3.5
|
||||||
|
The *strict* argument and the strict mode have been deprecated.
|
||||||
|
The parser is now able to accept and parse invalid markup too.
|
||||||
|
|
||||||
An exception is defined as well:
|
An exception is defined as well:
|
||||||
|
|
||||||
|
|
||||||
|
@ -46,6 +51,10 @@ An exception is defined as well:
|
||||||
detected, and :attr:`offset` is the number of characters into the line at
|
detected, and :attr:`offset` is the number of characters into the line at
|
||||||
which the construct starts.
|
which the construct starts.
|
||||||
|
|
||||||
|
.. deprecated-removed:: 3.3 3.5
|
||||||
|
This exception has been deprecated because it's never raised by the parser
|
||||||
|
(when the default non-strict mode is used).
|
||||||
|
|
||||||
|
|
||||||
Example HTML Parser Application
|
Example HTML Parser Application
|
||||||
-------------------------------
|
-------------------------------
|
||||||
|
|
|
@ -10,6 +10,7 @@
|
||||||
|
|
||||||
import _markupbase
|
import _markupbase
|
||||||
import re
|
import re
|
||||||
|
import warnings
|
||||||
|
|
||||||
# Regular expressions used for parsing
|
# Regular expressions used for parsing
|
||||||
|
|
||||||
|
@ -113,14 +114,16 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
|
|
||||||
CDATA_CONTENT_ELEMENTS = ("script", "style")
|
CDATA_CONTENT_ELEMENTS = ("script", "style")
|
||||||
|
|
||||||
def __init__(self, strict=True):
|
def __init__(self, strict=False):
|
||||||
"""Initialize and reset this instance.
|
"""Initialize and reset this instance.
|
||||||
|
|
||||||
If strict is set to True (the default), errors are raised when invalid
|
If strict is set to False (the default) the parser will parse invalid
|
||||||
HTML is encountered. If set to False, an attempt is instead made to
|
markup, otherwise it will raise an error. Note that the strict mode
|
||||||
continue parsing, making "best guesses" about the intended meaning, in
|
is deprecated.
|
||||||
a fashion similar to what browsers typically do.
|
|
||||||
"""
|
"""
|
||||||
|
if strict:
|
||||||
|
warnings.warn("The strict mode is deprecated.",
|
||||||
|
DeprecationWarning, stacklevel=2)
|
||||||
self.strict = strict
|
self.strict = strict
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
|
@ -271,8 +274,8 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
# See also parse_declaration in _markupbase
|
# See also parse_declaration in _markupbase
|
||||||
def parse_html_declaration(self, i):
|
def parse_html_declaration(self, i):
|
||||||
rawdata = self.rawdata
|
rawdata = self.rawdata
|
||||||
if rawdata[i:i+2] != '<!':
|
assert rawdata[i:i+2] == '<!', ('unexpected call to '
|
||||||
self.error('unexpected call to parse_html_declaration()')
|
'parse_html_declaration()')
|
||||||
if rawdata[i:i+4] == '<!--':
|
if rawdata[i:i+4] == '<!--':
|
||||||
# this case is actually already handled in goahead()
|
# this case is actually already handled in goahead()
|
||||||
return self.parse_comment(i)
|
return self.parse_comment(i)
|
||||||
|
@ -292,8 +295,8 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
|
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
|
||||||
def parse_bogus_comment(self, i, report=1):
|
def parse_bogus_comment(self, i, report=1):
|
||||||
rawdata = self.rawdata
|
rawdata = self.rawdata
|
||||||
if rawdata[i:i+2] not in ('<!', '</'):
|
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
|
||||||
self.error('unexpected call to parse_comment()')
|
'parse_bogus_comment()')
|
||||||
pos = rawdata.find('>', i+2)
|
pos = rawdata.find('>', i+2)
|
||||||
if pos == -1:
|
if pos == -1:
|
||||||
return -1
|
return -1
|
||||||
|
|
|
@ -102,7 +102,8 @@ class TestCaseBase(unittest.TestCase):
|
||||||
class HTMLParserStrictTestCase(TestCaseBase):
|
class HTMLParserStrictTestCase(TestCaseBase):
|
||||||
|
|
||||||
def get_collector(self):
|
def get_collector(self):
|
||||||
return EventCollector(strict=True)
|
with support.check_warnings(("", DeprecationWarning), quiet=False):
|
||||||
|
return EventCollector(strict=True)
|
||||||
|
|
||||||
def test_processing_instruction_only(self):
|
def test_processing_instruction_only(self):
|
||||||
self._run_check("<?processing instruction>", [
|
self._run_check("<?processing instruction>", [
|
||||||
|
@ -594,7 +595,8 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
|
||||||
class AttributesStrictTestCase(TestCaseBase):
|
class AttributesStrictTestCase(TestCaseBase):
|
||||||
|
|
||||||
def get_collector(self):
|
def get_collector(self):
|
||||||
return EventCollector(strict=True)
|
with support.check_warnings(("", DeprecationWarning), quiet=False):
|
||||||
|
return EventCollector(strict=True)
|
||||||
|
|
||||||
def test_attr_syntax(self):
|
def test_attr_syntax(self):
|
||||||
output = [
|
output = [
|
||||||
|
|
|
@ -43,6 +43,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #15114: the strict mode of HTMLParser and the HTMLParseError exception
|
||||||
|
are deprecated now that the parser is able to parse invalid markup.
|
||||||
|
|
||||||
- Issue #3665: \u and \U escapes are now supported in unicode regular
|
- Issue #3665: \u and \U escapes are now supported in unicode regular
|
||||||
expressions. Patch by Serhiy Storchaka.
|
expressions. Patch by Serhiy Storchaka.
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue