mirror of
https://github.com/python/cpython.git
synced 2025-12-11 19:40:17 +00:00
Issue #17741: Add ElementTree.IncrementalParser, an event-driven parser for non-blocking applications.
This commit is contained in:
parent
323d2927f0
commit
5b235d0923
4 changed files with 272 additions and 109 deletions
|
|
@ -397,6 +397,9 @@ Functions
|
||||||
|
|
||||||
If you need a fully populated element, look for "end" events instead.
|
If you need a fully populated element, look for "end" events instead.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
For real event-driven parsing, see :class:`IncrementalParser`.
|
||||||
|
|
||||||
|
|
||||||
.. function:: parse(source, parser=None)
|
.. function:: parse(source, parser=None)
|
||||||
|
|
||||||
|
|
@ -833,6 +836,48 @@ QName Objects
|
||||||
:class:`QName` instances are opaque.
|
:class:`QName` instances are opaque.
|
||||||
|
|
||||||
|
|
||||||
|
IncrementalParser Objects
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
|
||||||
|
.. class:: IncrementalParser(events=None, parser=None)
|
||||||
|
|
||||||
|
An incremental, event-driven parser suitable for non-blocking applications.
|
||||||
|
*events* is a list of events to report back. The supported events are the
|
||||||
|
strings ``"start"``, ``"end"``, ``"start-ns"`` and ``"end-ns"`` (the "ns"
|
||||||
|
events are used to get detailed namespace information). If *events* is
|
||||||
|
omitted, only ``"end"`` events are reported. *parser* is an optional
|
||||||
|
parser instance. If not given, the standard :class:`XMLParser` parser is
|
||||||
|
used.
|
||||||
|
|
||||||
|
.. method:: data_received(data)
|
||||||
|
|
||||||
|
Feed the given bytes data to the incremental parser.
|
||||||
|
|
||||||
|
.. method:: eof_received()
|
||||||
|
|
||||||
|
Signal the incremental parser that the data stream is terminated.
|
||||||
|
|
||||||
|
.. method:: events()
|
||||||
|
|
||||||
|
Iterate over the events which have been encountered in the data fed
|
||||||
|
to the parser. This method yields ``(event, elem)`` pairs, where
|
||||||
|
*event* is a string representing the type of event (e.g. ``"end"``)
|
||||||
|
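and *elem* is the encountered :class:`Element` object (for ``"start-ns"`` events it is instead a ``(prefix, uri)`` tuple, and for ``"end-ns"`` events it is ``None``).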
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
:class:`IncrementalParser` only guarantees that it has seen the ">"
|
||||||
|
character of a starting tag when it emits a "start" event, so the
|
||||||
|
attributes are defined, but the contents of the text and tail attributes
|
||||||
|
are undefined at that point. The same applies to the element children;
|
||||||
|
they may or may not be present.
|
||||||
|
|
||||||
|
If you need a fully populated element, look for "end" events instead.
|
||||||
|
|
||||||
|
.. versionadded:: 3.4
|
||||||
|
|
||||||
|
|
||||||
.. _elementtree-treebuilder-objects:
|
.. _elementtree-treebuilder-objects:
|
||||||
|
|
||||||
TreeBuilder Objects
|
TreeBuilder Objects
|
||||||
|
|
|
||||||
|
|
@ -903,6 +903,134 @@ class ElementTreeTest(unittest.TestCase):
|
||||||
self.assertEqual(serialized, expected)
|
self.assertEqual(serialized, expected)
|
||||||
|
|
||||||
|
|
||||||
|
class IncrementalParserTest(unittest.TestCase):
|
||||||
|
|
||||||
|
def _feed(self, parser, data, chunk_size=None):
|
||||||
|
if chunk_size is None:
|
||||||
|
parser.data_received(data)
|
||||||
|
else:
|
||||||
|
for i in range(0, len(data), chunk_size):
|
||||||
|
parser.data_received(data[i:i+chunk_size])
|
||||||
|
|
||||||
|
def assert_event_tags(self, parser, expected):
|
||||||
|
events = parser.events()
|
||||||
|
self.assertEqual([(action, elem.tag) for action, elem in events],
|
||||||
|
expected)
|
||||||
|
|
||||||
|
def test_simple_xml(self):
|
||||||
|
for chunk_size in (None, 1, 5):
|
||||||
|
with self.subTest(chunk_size=chunk_size):
|
||||||
|
parser = ET.IncrementalParser()
|
||||||
|
self.assert_event_tags(parser, [])
|
||||||
|
self._feed(parser, "<!-- comment -->\n", chunk_size)
|
||||||
|
self.assert_event_tags(parser, [])
|
||||||
|
self._feed(parser,
|
||||||
|
"<root>\n <element key='value'>text</element",
|
||||||
|
chunk_size)
|
||||||
|
self.assert_event_tags(parser, [])
|
||||||
|
self._feed(parser, ">\n", chunk_size)
|
||||||
|
self.assert_event_tags(parser, [('end', 'element')])
|
||||||
|
self._feed(parser, "<element>text</element>tail\n", chunk_size)
|
||||||
|
self._feed(parser, "<empty-element/>\n", chunk_size)
|
||||||
|
self.assert_event_tags(parser, [
|
||||||
|
('end', 'element'),
|
||||||
|
('end', 'empty-element'),
|
||||||
|
])
|
||||||
|
self._feed(parser, "</root>\n", chunk_size)
|
||||||
|
self.assert_event_tags(parser, [('end', 'root')])
|
||||||
|
# Receiving EOF sets the `root` attribute
|
||||||
|
self.assertIs(parser.root, None)
|
||||||
|
parser.eof_received()
|
||||||
|
self.assertEqual(parser.root.tag, 'root')
|
||||||
|
|
||||||
|
def test_data_received_while_iterating(self):
|
||||||
|
parser = ET.IncrementalParser()
|
||||||
|
it = parser.events()
|
||||||
|
self._feed(parser, "<root>\n <element key='value'>text</element>\n")
|
||||||
|
action, elem = next(it)
|
||||||
|
self.assertEqual((action, elem.tag), ('end', 'element'))
|
||||||
|
self._feed(parser, "</root>\n")
|
||||||
|
action, elem = next(it)
|
||||||
|
self.assertEqual((action, elem.tag), ('end', 'root'))
|
||||||
|
with self.assertRaises(StopIteration):
|
||||||
|
next(it)
|
||||||
|
|
||||||
|
def test_simple_xml_with_ns(self):
|
||||||
|
parser = ET.IncrementalParser()
|
||||||
|
self.assert_event_tags(parser, [])
|
||||||
|
self._feed(parser, "<!-- comment -->\n")
|
||||||
|
self.assert_event_tags(parser, [])
|
||||||
|
self._feed(parser, "<root xmlns='namespace'>\n")
|
||||||
|
self.assert_event_tags(parser, [])
|
||||||
|
self._feed(parser, "<element key='value'>text</element")
|
||||||
|
self.assert_event_tags(parser, [])
|
||||||
|
self._feed(parser, ">\n")
|
||||||
|
self.assert_event_tags(parser, [('end', '{namespace}element')])
|
||||||
|
self._feed(parser, "<element>text</element>tail\n")
|
||||||
|
self._feed(parser, "<empty-element/>\n")
|
||||||
|
self.assert_event_tags(parser, [
|
||||||
|
('end', '{namespace}element'),
|
||||||
|
('end', '{namespace}empty-element'),
|
||||||
|
])
|
||||||
|
self._feed(parser, "</root>\n")
|
||||||
|
self.assert_event_tags(parser, [('end', '{namespace}root')])
|
||||||
|
# Receiving EOF sets the `root` attribute
|
||||||
|
self.assertIs(parser.root, None)
|
||||||
|
parser.eof_received()
|
||||||
|
self.assertEqual(parser.root.tag, '{namespace}root')
|
||||||
|
|
||||||
|
def test_events(self):
|
||||||
|
parser = ET.IncrementalParser(events=())
|
||||||
|
self._feed(parser, "<root/>\n")
|
||||||
|
self.assert_event_tags(parser, [])
|
||||||
|
|
||||||
|
parser = ET.IncrementalParser(events=('start', 'end'))
|
||||||
|
self._feed(parser, "<!-- comment -->\n")
|
||||||
|
self.assert_event_tags(parser, [])
|
||||||
|
self._feed(parser, "<root>\n")
|
||||||
|
self.assert_event_tags(parser, [('start', 'root')])
|
||||||
|
self._feed(parser, "<element key='value'>text</element")
|
||||||
|
self.assert_event_tags(parser, [('start', 'element')])
|
||||||
|
self._feed(parser, ">\n")
|
||||||
|
self.assert_event_tags(parser, [('end', 'element')])
|
||||||
|
self._feed(parser,
|
||||||
|
"<element xmlns='foo'>text<empty-element/></element>tail\n")
|
||||||
|
self.assert_event_tags(parser, [
|
||||||
|
('start', '{foo}element'),
|
||||||
|
('start', '{foo}empty-element'),
|
||||||
|
('end', '{foo}empty-element'),
|
||||||
|
('end', '{foo}element'),
|
||||||
|
])
|
||||||
|
self._feed(parser, "</root>")
|
||||||
|
parser.eof_received()
|
||||||
|
self.assertIs(parser.root, None)
|
||||||
|
self.assert_event_tags(parser, [('end', 'root')])
|
||||||
|
self.assertEqual(parser.root.tag, 'root')
|
||||||
|
|
||||||
|
parser = ET.IncrementalParser(events=('start',))
|
||||||
|
self._feed(parser, "<!-- comment -->\n")
|
||||||
|
self.assert_event_tags(parser, [])
|
||||||
|
self._feed(parser, "<root>\n")
|
||||||
|
self.assert_event_tags(parser, [('start', 'root')])
|
||||||
|
self._feed(parser, "<element key='value'>text</element")
|
||||||
|
self.assert_event_tags(parser, [('start', 'element')])
|
||||||
|
self._feed(parser, ">\n")
|
||||||
|
self.assert_event_tags(parser, [])
|
||||||
|
self._feed(parser,
|
||||||
|
"<element xmlns='foo'>text<empty-element/></element>tail\n")
|
||||||
|
self.assert_event_tags(parser, [
|
||||||
|
('start', '{foo}element'),
|
||||||
|
('start', '{foo}empty-element'),
|
||||||
|
])
|
||||||
|
self._feed(parser, "</root>")
|
||||||
|
parser.eof_received()
|
||||||
|
self.assertEqual(parser.root.tag, 'root')
|
||||||
|
|
||||||
|
def test_unknown_event(self):
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
ET.IncrementalParser(events=('start', 'end', 'bogus'))
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# xinclude tests (samples from appendix C of the xinclude specification)
|
# xinclude tests (samples from appendix C of the xinclude specification)
|
||||||
|
|
||||||
|
|
@ -1406,6 +1534,7 @@ class BugsTest(unittest.TestCase):
|
||||||
ET.register_namespace('test10777', 'http://myuri/')
|
ET.register_namespace('test10777', 'http://myuri/')
|
||||||
ET.register_namespace('test10777', 'http://myuri/')
|
ET.register_namespace('test10777', 'http://myuri/')
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------
|
# --------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -2301,6 +2430,7 @@ def test_main(module=None):
|
||||||
ElementSlicingTest,
|
ElementSlicingTest,
|
||||||
BasicElementTest,
|
BasicElementTest,
|
||||||
ElementTreeTest,
|
ElementTreeTest,
|
||||||
|
IncrementalParserTest,
|
||||||
IOTest,
|
IOTest,
|
||||||
ParseErrorTest,
|
ParseErrorTest,
|
||||||
XIncludeTest,
|
XIncludeTest,
|
||||||
|
|
|
||||||
|
|
@ -1216,84 +1216,85 @@ def iterparse(source, events=None, parser=None):
|
||||||
if not hasattr(source, "read"):
|
if not hasattr(source, "read"):
|
||||||
source = open(source, "rb")
|
source = open(source, "rb")
|
||||||
close_source = True
|
close_source = True
|
||||||
if not parser:
|
|
||||||
parser = XMLParser(target=TreeBuilder())
|
|
||||||
return _IterParseIterator(source, events, parser, close_source)
|
return _IterParseIterator(source, events, parser, close_source)
|
||||||
|
|
||||||
class _IterParseIterator:
|
|
||||||
|
|
||||||
def __init__(self, source, events, parser, close_source=False):
|
class IncrementalParser:
|
||||||
self._file = source
|
|
||||||
self._close_file = close_source
|
def __init__(self, events=None, parser=None):
|
||||||
self._events = []
|
# _elementtree.c expects a list, not a deque
|
||||||
|
self._events_queue = []
|
||||||
self._index = 0
|
self._index = 0
|
||||||
self._error = None
|
|
||||||
self.root = self._root = None
|
self.root = self._root = None
|
||||||
|
if not parser:
|
||||||
|
parser = XMLParser(target=TreeBuilder())
|
||||||
self._parser = parser
|
self._parser = parser
|
||||||
# wire up the parser for event reporting
|
# wire up the parser for event reporting
|
||||||
parser = self._parser._parser
|
|
||||||
append = self._events.append
|
|
||||||
if events is None:
|
if events is None:
|
||||||
events = ["end"]
|
events = ("end",)
|
||||||
for event in events:
|
self._parser._setevents(self._events_queue, events)
|
||||||
if event == "start":
|
|
||||||
try:
|
|
||||||
parser.ordered_attributes = 1
|
|
||||||
parser.specified_attributes = 1
|
|
||||||
def handler(tag, attrib_in, event=event, append=append,
|
|
||||||
start=self._parser._start_list):
|
|
||||||
append((event, start(tag, attrib_in)))
|
|
||||||
parser.StartElementHandler = handler
|
|
||||||
except AttributeError:
|
|
||||||
def handler(tag, attrib_in, event=event, append=append,
|
|
||||||
start=self._parser._start):
|
|
||||||
append((event, start(tag, attrib_in)))
|
|
||||||
parser.StartElementHandler = handler
|
|
||||||
elif event == "end":
|
|
||||||
def handler(tag, event=event, append=append,
|
|
||||||
end=self._parser._end):
|
|
||||||
append((event, end(tag)))
|
|
||||||
parser.EndElementHandler = handler
|
|
||||||
elif event == "start-ns":
|
|
||||||
def handler(prefix, uri, event=event, append=append):
|
|
||||||
append((event, (prefix or "", uri or "")))
|
|
||||||
parser.StartNamespaceDeclHandler = handler
|
|
||||||
elif event == "end-ns":
|
|
||||||
def handler(prefix, event=event, append=append):
|
|
||||||
append((event, None))
|
|
||||||
parser.EndNamespaceDeclHandler = handler
|
|
||||||
else:
|
|
||||||
raise ValueError("unknown event %r" % event)
|
|
||||||
|
|
||||||
def __next__(self):
|
def data_received(self, data):
|
||||||
while 1:
|
|
||||||
try:
|
|
||||||
item = self._events[self._index]
|
|
||||||
self._index += 1
|
|
||||||
return item
|
|
||||||
except IndexError:
|
|
||||||
pass
|
|
||||||
if self._error:
|
|
||||||
e = self._error
|
|
||||||
self._error = None
|
|
||||||
raise e
|
|
||||||
if self._parser is None:
|
if self._parser is None:
|
||||||
self.root = self._root
|
raise ValueError("data_received() called after end of stream")
|
||||||
if self._close_file:
|
|
||||||
self._file.close()
|
|
||||||
raise StopIteration
|
|
||||||
# load event buffer
|
|
||||||
del self._events[:]
|
|
||||||
self._index = 0
|
|
||||||
data = self._file.read(16384)
|
|
||||||
if data:
|
if data:
|
||||||
try:
|
try:
|
||||||
self._parser.feed(data)
|
self._parser.feed(data)
|
||||||
except SyntaxError as exc:
|
except SyntaxError as exc:
|
||||||
self._error = exc
|
self._events_queue.append(exc)
|
||||||
else:
|
|
||||||
|
def eof_received(self):
|
||||||
self._root = self._parser.close()
|
self._root = self._parser.close()
|
||||||
self._parser = None
|
self._parser = None
|
||||||
|
if self._index >= len(self._events_queue):
|
||||||
|
self.root = self._root
|
||||||
|
|
||||||
|
def events(self):
|
||||||
|
events = self._events_queue
|
||||||
|
while True:
|
||||||
|
index = self._index
|
||||||
|
try:
|
||||||
|
event = events[self._index]
|
||||||
|
# Avoid retaining references to past events
|
||||||
|
events[self._index] = None
|
||||||
|
except IndexError:
|
||||||
|
break
|
||||||
|
index += 1
|
||||||
|
# Compact the list in an O(1) amortized fashion
|
||||||
|
if index * 2 >= len(events):
|
||||||
|
events[:index] = []
|
||||||
|
self._index = 0
|
||||||
|
else:
|
||||||
|
self._index = index
|
||||||
|
if isinstance(event, Exception):
|
||||||
|
raise event
|
||||||
|
else:
|
||||||
|
yield event
|
||||||
|
if self._parser is None:
|
||||||
|
self.root = self._root
|
||||||
|
|
||||||
|
|
||||||
|
class _IterParseIterator(IncrementalParser):
|
||||||
|
|
||||||
|
def __init__(self, source, events, parser, close_source=False):
|
||||||
|
IncrementalParser.__init__(self, events, parser)
|
||||||
|
self._file = source
|
||||||
|
self._close_file = close_source
|
||||||
|
|
||||||
|
def __next__(self):
|
||||||
|
while 1:
|
||||||
|
for event in self.events():
|
||||||
|
return event
|
||||||
|
if self._parser is None:
|
||||||
|
if self._close_file:
|
||||||
|
self._file.close()
|
||||||
|
raise StopIteration
|
||||||
|
# load event buffer
|
||||||
|
data = self._file.read(16384)
|
||||||
|
if data:
|
||||||
|
self.data_received(data)
|
||||||
|
else:
|
||||||
|
self.eof_received()
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return self
|
return self
|
||||||
|
|
@ -1498,6 +1499,40 @@ class XMLParser:
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
pass # unknown
|
pass # unknown
|
||||||
|
|
||||||
|
def _setevents(self, event_list, events):
|
||||||
|
# Internal API for IncrementalParser
|
||||||
|
parser = self._parser
|
||||||
|
append = event_list.append
|
||||||
|
for event in events:
|
||||||
|
if event == "start":
|
||||||
|
try:
|
||||||
|
parser.ordered_attributes = 1
|
||||||
|
parser.specified_attributes = 1
|
||||||
|
def handler(tag, attrib_in, event=event, append=append,
|
||||||
|
start=self._start_list):
|
||||||
|
append((event, start(tag, attrib_in)))
|
||||||
|
parser.StartElementHandler = handler
|
||||||
|
except AttributeError:
|
||||||
|
def handler(tag, attrib_in, event=event, append=append,
|
||||||
|
start=self._start):
|
||||||
|
append((event, start(tag, attrib_in)))
|
||||||
|
parser.StartElementHandler = handler
|
||||||
|
elif event == "end":
|
||||||
|
def handler(tag, event=event, append=append,
|
||||||
|
end=self._end):
|
||||||
|
append((event, end(tag)))
|
||||||
|
parser.EndElementHandler = handler
|
||||||
|
elif event == "start-ns":
|
||||||
|
def handler(prefix, uri, event=event, append=append):
|
||||||
|
append((event, (prefix or "", uri or "")))
|
||||||
|
parser.StartNamespaceDeclHandler = handler
|
||||||
|
elif event == "end-ns":
|
||||||
|
def handler(prefix, event=event, append=append):
|
||||||
|
append((event, None))
|
||||||
|
parser.EndNamespaceDeclHandler = handler
|
||||||
|
else:
|
||||||
|
raise ValueError("unknown event %r" % event)
|
||||||
|
|
||||||
def _raiseerror(self, value):
|
def _raiseerror(self, value):
|
||||||
err = ParseError(value)
|
err = ParseError(value)
|
||||||
err.code = value.code
|
err.code = value.code
|
||||||
|
|
@ -1635,7 +1670,7 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
# Overwrite 'ElementTree.parse' and 'iterparse' to use the C XMLParser
|
# Overwrite 'ElementTree.parse' to use the C XMLParser
|
||||||
|
|
||||||
class ElementTree(ElementTree):
|
class ElementTree(ElementTree):
|
||||||
__doc__ = ElementTree.__doc__
|
__doc__ = ElementTree.__doc__
|
||||||
|
|
@ -1661,56 +1696,6 @@ else:
|
||||||
if close_source:
|
if close_source:
|
||||||
source.close()
|
source.close()
|
||||||
|
|
||||||
class iterparse:
|
|
||||||
__doc__ = iterparse.__doc__
|
|
||||||
root = None
|
|
||||||
def __init__(self, source, events=None, parser=None):
|
|
||||||
self._close_file = False
|
|
||||||
if not hasattr(source, 'read'):
|
|
||||||
source = open(source, 'rb')
|
|
||||||
self._close_file = True
|
|
||||||
self._file = source
|
|
||||||
self._events = []
|
|
||||||
self._index = 0
|
|
||||||
self._error = None
|
|
||||||
self.root = self._root = None
|
|
||||||
if parser is None:
|
|
||||||
parser = XMLParser(target=TreeBuilder())
|
|
||||||
self._parser = parser
|
|
||||||
self._parser._setevents(self._events, events)
|
|
||||||
|
|
||||||
def __next__(self):
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
item = self._events[self._index]
|
|
||||||
self._index += 1
|
|
||||||
return item
|
|
||||||
except IndexError:
|
|
||||||
pass
|
|
||||||
if self._error:
|
|
||||||
e = self._error
|
|
||||||
self._error = None
|
|
||||||
raise e
|
|
||||||
if self._parser is None:
|
|
||||||
self.root = self._root
|
|
||||||
if self._close_file:
|
|
||||||
self._file.close()
|
|
||||||
raise StopIteration
|
|
||||||
# load event buffer
|
|
||||||
del self._events[:]
|
|
||||||
self._index = 0
|
|
||||||
data = self._file.read(16384)
|
|
||||||
if data:
|
|
||||||
try:
|
|
||||||
self._parser.feed(data)
|
|
||||||
except SyntaxError as exc:
|
|
||||||
self._error = exc
|
|
||||||
else:
|
|
||||||
self._root = self._parser.close()
|
|
||||||
self._parser = None
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return self
|
|
||||||
|
|
||||||
# compatibility
|
# compatibility
|
||||||
XMLTreeBuilder = XMLParser
|
XMLTreeBuilder = XMLParser
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #17741: Add ElementTree.IncrementalParser, an event-driven parser
|
||||||
|
for non-blocking applications.
|
||||||
|
|
||||||
- Issue #17555: Fix ForkAwareThreadLock so that the size of the after-fork
|
- Issue #17555: Fix ForkAwareThreadLock so that the size of the after-fork
|
||||||
registry does not grow exponentially with generation of process.
|
registry does not grow exponentially with generation of process.
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue