mirror of
https://github.com/python/cpython.git
synced 2025-08-04 17:08:35 +00:00
Issue #25638: Optimized ElementTree.iterparse(); it is now 2x faster.
ElementTree.XMLParser._setevents now accepts any object with an append method, not just a list.
This commit is contained in:
parent
f2fdfe1f88
commit
9ec5e25f26
4 changed files with 56 additions and 80 deletions
|
@ -95,6 +95,7 @@ import sys
|
|||
import re
|
||||
import warnings
|
||||
import io
|
||||
import collections
|
||||
import contextlib
|
||||
|
||||
from . import ElementPath
|
||||
|
@ -1198,16 +1199,37 @@ def iterparse(source, events=None, parser=None):
|
|||
Returns an iterator providing (event, elem) pairs.
|
||||
|
||||
"""
|
||||
# Use the internal, undocumented _parser argument for now; when the
|
||||
# parser argument of iterparse is removed, this can be killed.
|
||||
pullparser = XMLPullParser(events=events, _parser=parser)
|
||||
def iterator():
|
||||
try:
|
||||
while True:
|
||||
yield from pullparser.read_events()
|
||||
# load event buffer
|
||||
data = source.read(16 * 1024)
|
||||
if not data:
|
||||
break
|
||||
pullparser.feed(data)
|
||||
root = pullparser._close_and_return_root()
|
||||
yield from pullparser.read_events()
|
||||
it.root = root
|
||||
finally:
|
||||
if close_source:
|
||||
source.close()
|
||||
|
||||
class IterParseIterator(collections.Iterator):
|
||||
__next__ = iterator().__next__
|
||||
it = IterParseIterator()
|
||||
it.root = None
|
||||
del iterator, IterParseIterator
|
||||
|
||||
close_source = False
|
||||
if not hasattr(source, "read"):
|
||||
source = open(source, "rb")
|
||||
close_source = True
|
||||
try:
|
||||
return _IterParseIterator(source, events, parser, close_source)
|
||||
except:
|
||||
if close_source:
|
||||
source.close()
|
||||
raise
|
||||
|
||||
return it
|
||||
|
||||
|
||||
class XMLPullParser:
|
||||
|
@ -1217,9 +1239,7 @@ class XMLPullParser:
|
|||
# upon in user code. It will be removed in a future release.
|
||||
# See http://bugs.python.org/issue17741 for more details.
|
||||
|
||||
# _elementtree.c expects a list, not a deque
|
||||
self._events_queue = []
|
||||
self._index = 0
|
||||
self._events_queue = collections.deque()
|
||||
self._parser = _parser or XMLParser(target=TreeBuilder())
|
||||
# wire up the parser for event reporting
|
||||
if events is None:
|
||||
|
@ -1257,64 +1277,14 @@ class XMLPullParser:
|
|||
retrieved from the iterator.
|
||||
"""
|
||||
events = self._events_queue
|
||||
while True:
|
||||
index = self._index
|
||||
try:
|
||||
event = events[self._index]
|
||||
# Avoid retaining references to past events
|
||||
events[self._index] = None
|
||||
except IndexError:
|
||||
break
|
||||
index += 1
|
||||
# Compact the list in a O(1) amortized fashion
|
||||
# As noted above, _elementtree.c needs a list, not a deque
|
||||
if index * 2 >= len(events):
|
||||
events[:index] = []
|
||||
self._index = 0
|
||||
else:
|
||||
self._index = index
|
||||
while events:
|
||||
event = events.popleft()
|
||||
if isinstance(event, Exception):
|
||||
raise event
|
||||
else:
|
||||
yield event
|
||||
|
||||
|
||||
class _IterParseIterator:
|
||||
|
||||
def __init__(self, source, events, parser, close_source=False):
|
||||
# Use the internal, undocumented _parser argument for now; when the
|
||||
# parser argument of iterparse is removed, this can be killed.
|
||||
self._parser = XMLPullParser(events=events, _parser=parser)
|
||||
self._file = source
|
||||
self._close_file = close_source
|
||||
self.root = self._root = None
|
||||
|
||||
def __next__(self):
|
||||
try:
|
||||
while 1:
|
||||
for event in self._parser.read_events():
|
||||
return event
|
||||
if self._parser._parser is None:
|
||||
break
|
||||
# load event buffer
|
||||
data = self._file.read(16 * 1024)
|
||||
if data:
|
||||
self._parser.feed(data)
|
||||
else:
|
||||
self._root = self._parser._close_and_return_root()
|
||||
self.root = self._root
|
||||
except:
|
||||
if self._close_file:
|
||||
self._file.close()
|
||||
raise
|
||||
if self._close_file:
|
||||
self._file.close()
|
||||
raise StopIteration
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
|
||||
def XML(text, parser=None):
|
||||
"""Parse XML document from string constant.
|
||||
|
||||
|
|
|
@ -109,6 +109,8 @@ Core and Builtins
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #25638: Optimized ElementTree.iterparse(); it is now 2x faster.
|
||||
|
||||
- Issue #25761: Improved detecting errors in broken pickle data.
|
||||
|
||||
- Issue #25717: Restore the previous behaviour of tolerating most fstat()
|
||||
|
|
|
@ -2289,7 +2289,7 @@ typedef struct {
|
|||
PyObject *element_factory;
|
||||
|
||||
/* element tracing */
|
||||
PyObject *events; /* list of events, or NULL if not collecting */
|
||||
PyObject *events_append; /* the append method of the list of events, or NULL */
|
||||
PyObject *start_event_obj; /* event objects (NULL to ignore) */
|
||||
PyObject *end_event_obj;
|
||||
PyObject *start_ns_event_obj;
|
||||
|
@ -2324,7 +2324,7 @@ treebuilder_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
|||
}
|
||||
t->index = 0;
|
||||
|
||||
t->events = NULL;
|
||||
t->events_append = NULL;
|
||||
t->start_event_obj = t->end_event_obj = NULL;
|
||||
t->start_ns_event_obj = t->end_ns_event_obj = NULL;
|
||||
}
|
||||
|
@ -2374,7 +2374,7 @@ treebuilder_gc_clear(TreeBuilderObject *self)
|
|||
Py_CLEAR(self->start_ns_event_obj);
|
||||
Py_CLEAR(self->end_event_obj);
|
||||
Py_CLEAR(self->start_event_obj);
|
||||
Py_CLEAR(self->events);
|
||||
Py_CLEAR(self->events_append);
|
||||
Py_CLEAR(self->stack);
|
||||
Py_CLEAR(self->data);
|
||||
Py_CLEAR(self->last);
|
||||
|
@ -2455,13 +2455,14 @@ treebuilder_append_event(TreeBuilderObject *self, PyObject *action,
|
|||
PyObject *node)
|
||||
{
|
||||
if (action != NULL) {
|
||||
PyObject *res = PyTuple_Pack(2, action, node);
|
||||
PyObject *res;
|
||||
PyObject *event = PyTuple_Pack(2, action, node);
|
||||
if (event == NULL)
|
||||
return -1;
|
||||
res = PyObject_CallFunctionObjArgs(self->events_append, event, NULL);
|
||||
Py_DECREF(event);
|
||||
if (res == NULL)
|
||||
return -1;
|
||||
if (PyList_Append(self->events, res) < 0) {
|
||||
Py_DECREF(res);
|
||||
return -1;
|
||||
}
|
||||
Py_DECREF(res);
|
||||
}
|
||||
return 0;
|
||||
|
@ -3039,7 +3040,7 @@ expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix,
|
|||
if (PyErr_Occurred())
|
||||
return;
|
||||
|
||||
if (!target->events || !target->start_ns_event_obj)
|
||||
if (!target->events_append || !target->start_ns_event_obj)
|
||||
return;
|
||||
|
||||
if (!uri)
|
||||
|
@ -3062,7 +3063,7 @@ expat_end_ns_handler(XMLParserObject* self, const XML_Char* prefix_in)
|
|||
if (PyErr_Occurred())
|
||||
return;
|
||||
|
||||
if (!target->events)
|
||||
if (!target->events_append)
|
||||
return;
|
||||
|
||||
treebuilder_append_event(target, target->end_ns_event_obj, Py_None);
|
||||
|
@ -3551,7 +3552,7 @@ _elementtree_XMLParser_doctype_impl(XMLParserObject *self, PyObject *name,
|
|||
/*[clinic input]
|
||||
_elementtree.XMLParser._setevents
|
||||
|
||||
events_queue: object(subclass_of='&PyList_Type')
|
||||
events_queue: object
|
||||
events_to_report: object = None
|
||||
/
|
||||
|
||||
|
@ -3561,12 +3562,12 @@ static PyObject *
|
|||
_elementtree_XMLParser__setevents_impl(XMLParserObject *self,
|
||||
PyObject *events_queue,
|
||||
PyObject *events_to_report)
|
||||
/*[clinic end generated code: output=1440092922b13ed1 input=59db9742910c6174]*/
|
||||
/*[clinic end generated code: output=1440092922b13ed1 input=abf90830a1c3b0fc]*/
|
||||
{
|
||||
/* activate element event reporting */
|
||||
Py_ssize_t i, seqlen;
|
||||
TreeBuilderObject *target;
|
||||
PyObject *events_seq;
|
||||
PyObject *events_append, *events_seq;
|
||||
|
||||
if (!TreeBuilder_CheckExact(self->target)) {
|
||||
PyErr_SetString(
|
||||
|
@ -3579,9 +3580,11 @@ _elementtree_XMLParser__setevents_impl(XMLParserObject *self,
|
|||
|
||||
target = (TreeBuilderObject*) self->target;
|
||||
|
||||
Py_INCREF(events_queue);
|
||||
Py_XDECREF(target->events);
|
||||
target->events = events_queue;
|
||||
events_append = PyObject_GetAttrString(events_queue, "append");
|
||||
if (events_append == NULL)
|
||||
return NULL;
|
||||
Py_XDECREF(target->events_append);
|
||||
target->events_append = events_append;
|
||||
|
||||
/* clear out existing events */
|
||||
Py_CLEAR(target->start_event_obj);
|
||||
|
|
|
@ -668,12 +668,13 @@ _elementtree_XMLParser__setevents(XMLParserObject *self, PyObject *args)
|
|||
PyObject *events_queue;
|
||||
PyObject *events_to_report = Py_None;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O!|O:_setevents",
|
||||
&PyList_Type, &events_queue, &events_to_report))
|
||||
if (!PyArg_UnpackTuple(args, "_setevents",
|
||||
1, 2,
|
||||
&events_queue, &events_to_report))
|
||||
goto exit;
|
||||
return_value = _elementtree_XMLParser__setevents_impl(self, events_queue, events_to_report);
|
||||
|
||||
exit:
|
||||
return return_value;
|
||||
}
|
||||
/*[clinic end generated code: output=25b8bf7e7f2151ca input=a9049054013a1b77]*/
|
||||
/*[clinic end generated code: output=19d94e2d2726d3aa input=a9049054013a1b77]*/
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue