mirror of
https://github.com/python/cpython.git
synced 2025-10-02 21:25:24 +00:00
#21047: set the default value for the *convert_charrefs* argument of HTMLParser to True. Patch by Berker Peksag.
This commit is contained in:
parent
11bec7a1b8
commit
6fc16d81af
4 changed files with 12 additions and 17 deletions
|
@ -16,15 +16,13 @@
|
||||||
This module defines a class :class:`HTMLParser` which serves as the basis for
|
This module defines a class :class:`HTMLParser` which serves as the basis for
|
||||||
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
|
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
|
||||||
|
|
||||||
.. class:: HTMLParser(*, convert_charrefs=False)
|
.. class:: HTMLParser(*, convert_charrefs=True)
|
||||||
|
|
||||||
Create a parser instance able to parse invalid markup.
|
Create a parser instance able to parse invalid markup.
|
||||||
|
|
||||||
If *convert_charrefs* is ``True`` (default: ``False``), all character
|
If *convert_charrefs* is ``True`` (the default), all character
|
||||||
references (except the ones in ``script``/``style`` elements) are
|
references (except the ones in ``script``/``style`` elements) are
|
||||||
automatically converted to the corresponding Unicode characters.
|
automatically converted to the corresponding Unicode characters.
|
||||||
The use of ``convert_charrefs=True`` is encouraged and will become
|
|
||||||
the default in Python 3.5.
|
|
||||||
|
|
||||||
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
|
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
|
||||||
when start tags, end tags, text, comments, and other markup elements are
|
when start tags, end tags, text, comments, and other markup elements are
|
||||||
|
@ -37,6 +35,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
|
||||||
.. versionchanged:: 3.4
|
.. versionchanged:: 3.4
|
||||||
*convert_charrefs* keyword argument added.
|
*convert_charrefs* keyword argument added.
|
||||||
|
|
||||||
|
.. versionchanged:: 3.5
|
||||||
|
The default value for argument *convert_charrefs* is now ``True``.
|
||||||
|
|
||||||
|
|
||||||
Example HTML Parser Application
|
Example HTML Parser Application
|
||||||
-------------------------------
|
-------------------------------
|
||||||
|
|
|
@ -59,7 +59,6 @@ endendtag = re.compile('>')
|
||||||
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
|
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
|
||||||
|
|
||||||
|
|
||||||
_default_sentinel = object()
|
|
||||||
|
|
||||||
class HTMLParser(_markupbase.ParserBase):
|
class HTMLParser(_markupbase.ParserBase):
|
||||||
"""Find tags and other markup and call handler functions.
|
"""Find tags and other markup and call handler functions.
|
||||||
|
@ -85,17 +84,12 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
|
|
||||||
CDATA_CONTENT_ELEMENTS = ("script", "style")
|
CDATA_CONTENT_ELEMENTS = ("script", "style")
|
||||||
|
|
||||||
def __init__(self, *, convert_charrefs=_default_sentinel):
|
def __init__(self, *, convert_charrefs=True):
|
||||||
"""Initialize and reset this instance.
|
"""Initialize and reset this instance.
|
||||||
|
|
||||||
If convert_charrefs is True (default: False), all character references
|
If convert_charrefs is True (the default), all character references
|
||||||
are automatically converted to the corresponding Unicode characters.
|
are automatically converted to the corresponding Unicode characters.
|
||||||
"""
|
"""
|
||||||
if convert_charrefs is _default_sentinel:
|
|
||||||
convert_charrefs = False # default
|
|
||||||
warnings.warn("The value of convert_charrefs will become True in "
|
|
||||||
"3.5. You are encouraged to set the value explicitly.",
|
|
||||||
DeprecationWarning, stacklevel=2)
|
|
||||||
self.convert_charrefs = convert_charrefs
|
self.convert_charrefs = convert_charrefs
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
|
|
|
@ -346,7 +346,8 @@ text
|
||||||
self._run_check(html, expected)
|
self._run_check(html, expected)
|
||||||
|
|
||||||
def test_convert_charrefs(self):
|
def test_convert_charrefs(self):
|
||||||
collector = lambda: EventCollectorCharrefs(convert_charrefs=True)
|
# default value for convert_charrefs is now True
|
||||||
|
collector = lambda: EventCollectorCharrefs()
|
||||||
self.assertTrue(collector().convert_charrefs)
|
self.assertTrue(collector().convert_charrefs)
|
||||||
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
|
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
|
||||||
# check charrefs in the middle of the text/attributes
|
# check charrefs in the middle of the text/attributes
|
||||||
|
@ -383,10 +384,6 @@ text
|
||||||
self._run_check('no charrefs here', [('data', 'no charrefs here')],
|
self._run_check('no charrefs here', [('data', 'no charrefs here')],
|
||||||
collector=collector())
|
collector=collector())
|
||||||
|
|
||||||
def test_deprecation_warnings(self):
|
|
||||||
with self.assertWarns(DeprecationWarning):
|
|
||||||
EventCollector() # convert_charrefs not passed explicitly
|
|
||||||
|
|
||||||
# the remaining tests were for the "tolerant" parser (which is now
|
# the remaining tests were for the "tolerant" parser (which is now
|
||||||
# the default), and check various kind of broken markup
|
# the default), and check various kind of broken markup
|
||||||
def test_tolerant_parsing(self):
|
def test_tolerant_parsing(self):
|
||||||
|
|
|
@ -121,6 +121,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #21047: set the default value for the *convert_charrefs* argument
|
||||||
|
of HTMLParser to True. Patch by Berker Peksag.
|
||||||
|
|
||||||
- Add an __all__ to html.entities.
|
- Add an __all__ to html.entities.
|
||||||
|
|
||||||
- Issue #15114: the strict mode and argument of HTMLParser, HTMLParser.error,
|
- Issue #15114: the strict mode and argument of HTMLParser, HTMLParser.error,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue