mirror of
https://github.com/python/cpython.git
synced 2025-08-03 00:23:06 +00:00
Issue #25873: Optimized iterating ElementTree.
Iterating elements Element.iter() is now 40% faster, iterating text Element.itertext() is now up to 2.5 times faster.
This commit is contained in:
parent
47a9d59d51
commit
22adf2ac02
2 changed files with 107 additions and 160 deletions
|
@ -1986,22 +1986,22 @@ static PySequenceMethods element_as_sequence = {
|
|||
* pre-order traversal. To keep track of which sub-element should be returned
|
||||
* next, a stack of parents is maintained. This is a standard stack-based
|
||||
* iterative pre-order traversal of a tree.
|
||||
* The stack is managed using a single-linked list starting at parent_stack.
|
||||
* Each stack node contains the saved parent to which we should return after
|
||||
* The stack is managed using a continuous array.
|
||||
* Each stack item contains the saved parent to which we should return after
|
||||
* the current one is exhausted, and the next child to examine in that parent.
|
||||
*/
|
||||
typedef struct ParentLocator_t {
|
||||
ElementObject *parent;
|
||||
Py_ssize_t child_index;
|
||||
struct ParentLocator_t *next;
|
||||
} ParentLocator;
|
||||
|
||||
typedef struct {
|
||||
PyObject_HEAD
|
||||
ParentLocator *parent_stack;
|
||||
Py_ssize_t parent_stack_used;
|
||||
Py_ssize_t parent_stack_size;
|
||||
ElementObject *root_element;
|
||||
PyObject *sought_tag;
|
||||
int root_done;
|
||||
int gettext;
|
||||
} ElementIterObject;
|
||||
|
||||
|
@ -2009,13 +2009,11 @@ typedef struct {
|
|||
static void
|
||||
elementiter_dealloc(ElementIterObject *it)
|
||||
{
|
||||
ParentLocator *p = it->parent_stack;
|
||||
while (p) {
|
||||
ParentLocator *temp = p;
|
||||
Py_XDECREF(p->parent);
|
||||
p = p->next;
|
||||
PyObject_Free(temp);
|
||||
}
|
||||
Py_ssize_t i = it->parent_stack_used;
|
||||
it->parent_stack_used = 0;
|
||||
while (i--)
|
||||
Py_XDECREF(it->parent_stack[i].parent);
|
||||
PyMem_Free(it->parent_stack);
|
||||
|
||||
Py_XDECREF(it->sought_tag);
|
||||
Py_XDECREF(it->root_element);
|
||||
|
@ -2027,11 +2025,9 @@ elementiter_dealloc(ElementIterObject *it)
|
|||
static int
|
||||
elementiter_traverse(ElementIterObject *it, visitproc visit, void *arg)
|
||||
{
|
||||
ParentLocator *p = it->parent_stack;
|
||||
while (p) {
|
||||
Py_VISIT(p->parent);
|
||||
p = p->next;
|
||||
}
|
||||
Py_ssize_t i = it->parent_stack_used;
|
||||
while (i--)
|
||||
Py_VISIT(it->parent_stack[i].parent);
|
||||
|
||||
Py_VISIT(it->root_element);
|
||||
Py_VISIT(it->sought_tag);
|
||||
|
@ -2040,17 +2036,25 @@ elementiter_traverse(ElementIterObject *it, visitproc visit, void *arg)
|
|||
|
||||
/* Helper function for elementiter_next. Add a new parent to the parent stack.
|
||||
*/
|
||||
static ParentLocator *
|
||||
parent_stack_push_new(ParentLocator *stack, ElementObject *parent)
|
||||
static int
|
||||
parent_stack_push_new(ElementIterObject *it, ElementObject *parent)
|
||||
{
|
||||
ParentLocator *new_node = PyObject_Malloc(sizeof(ParentLocator));
|
||||
if (new_node) {
|
||||
new_node->parent = parent;
|
||||
Py_INCREF(parent);
|
||||
new_node->child_index = 0;
|
||||
new_node->next = stack;
|
||||
ParentLocator *item;
|
||||
|
||||
if (it->parent_stack_used >= it->parent_stack_size) {
|
||||
Py_ssize_t new_size = it->parent_stack_size * 2; /* never overflow */
|
||||
ParentLocator *parent_stack = it->parent_stack;
|
||||
PyMem_Resize(parent_stack, ParentLocator, new_size);
|
||||
if (parent_stack == NULL)
|
||||
return -1;
|
||||
it->parent_stack = parent_stack;
|
||||
it->parent_stack_size = new_size;
|
||||
}
|
||||
return new_node;
|
||||
item = it->parent_stack + it->parent_stack_used++;
|
||||
Py_INCREF(parent);
|
||||
item->parent = parent;
|
||||
item->child_index = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
|
@ -2067,151 +2071,91 @@ elementiter_next(ElementIterObject *it)
|
|||
* - itertext() also has to handle tail, after finishing with all the
|
||||
* children of a node.
|
||||
*/
|
||||
ElementObject *cur_parent;
|
||||
Py_ssize_t child_index;
|
||||
int rc;
|
||||
ElementObject *elem;
|
||||
PyObject *text;
|
||||
|
||||
while (1) {
|
||||
/* Handle the case reached in the beginning and end of iteration, where
|
||||
* the parent stack is empty. The root_done flag gives us indication
|
||||
* whether we've just started iterating (so root_done is 0), in which
|
||||
* case the root is returned. If root_done is 1 and we're here, the
|
||||
* the parent stack is empty. If root_element is NULL and we're here, the
|
||||
* iterator is exhausted.
|
||||
*/
|
||||
if (!it->parent_stack->parent) {
|
||||
if (it->root_done) {
|
||||
if (!it->parent_stack_used) {
|
||||
if (!it->root_element) {
|
||||
PyErr_SetNone(PyExc_StopIteration);
|
||||
return NULL;
|
||||
} else {
|
||||
elem = it->root_element;
|
||||
it->parent_stack = parent_stack_push_new(it->parent_stack,
|
||||
elem);
|
||||
if (!it->parent_stack) {
|
||||
PyErr_NoMemory();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Py_INCREF(elem);
|
||||
it->root_done = 1;
|
||||
rc = (it->sought_tag == Py_None);
|
||||
if (!rc) {
|
||||
rc = PyObject_RichCompareBool(elem->tag,
|
||||
it->sought_tag, Py_EQ);
|
||||
if (rc < 0) {
|
||||
Py_DECREF(elem);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
if (rc) {
|
||||
if (it->gettext) {
|
||||
PyObject *text = element_get_text(elem);
|
||||
if (!text) {
|
||||
Py_DECREF(elem);
|
||||
return NULL;
|
||||
}
|
||||
Py_INCREF(text);
|
||||
Py_DECREF(elem);
|
||||
rc = PyObject_IsTrue(text);
|
||||
if (rc > 0)
|
||||
return text;
|
||||
Py_DECREF(text);
|
||||
if (rc < 0)
|
||||
return NULL;
|
||||
} else {
|
||||
return (PyObject *)elem;
|
||||
}
|
||||
}
|
||||
else {
|
||||
Py_DECREF(elem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* See if there are children left to traverse in the current parent. If
|
||||
* yes, visit the next child. If not, pop the stack and try again.
|
||||
*/
|
||||
cur_parent = it->parent_stack->parent;
|
||||
child_index = it->parent_stack->child_index;
|
||||
if (cur_parent->extra && child_index < cur_parent->extra->length) {
|
||||
elem = (ElementObject *)cur_parent->extra->children[child_index];
|
||||
it->parent_stack->child_index++;
|
||||
it->parent_stack = parent_stack_push_new(it->parent_stack,
|
||||
elem);
|
||||
if (!it->parent_stack) {
|
||||
PyErr_NoMemory();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Py_INCREF(elem);
|
||||
if (it->gettext) {
|
||||
PyObject *text = element_get_text(elem);
|
||||
if (!text) {
|
||||
Py_DECREF(elem);
|
||||
return NULL;
|
||||
}
|
||||
Py_INCREF(text);
|
||||
Py_DECREF(elem);
|
||||
rc = PyObject_IsTrue(text);
|
||||
if (rc > 0)
|
||||
return text;
|
||||
Py_DECREF(text);
|
||||
if (rc < 0)
|
||||
return NULL;
|
||||
} else {
|
||||
rc = (it->sought_tag == Py_None);
|
||||
if (!rc) {
|
||||
rc = PyObject_RichCompareBool(elem->tag,
|
||||
it->sought_tag, Py_EQ);
|
||||
if (rc < 0) {
|
||||
Py_DECREF(elem);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
if (rc) {
|
||||
return (PyObject *)elem;
|
||||
}
|
||||
Py_DECREF(elem);
|
||||
}
|
||||
elem = it->root_element; /* steals a reference */
|
||||
it->root_element = NULL;
|
||||
}
|
||||
else {
|
||||
PyObject *tail;
|
||||
ParentLocator *next;
|
||||
if (it->gettext) {
|
||||
Py_INCREF(cur_parent);
|
||||
tail = element_get_tail(cur_parent);
|
||||
if (!tail) {
|
||||
Py_DECREF(cur_parent);
|
||||
return NULL;
|
||||
}
|
||||
Py_INCREF(tail);
|
||||
Py_DECREF(cur_parent);
|
||||
}
|
||||
else {
|
||||
tail = Py_None;
|
||||
Py_INCREF(tail);
|
||||
}
|
||||
next = it->parent_stack->next;
|
||||
cur_parent = it->parent_stack->parent;
|
||||
PyObject_Free(it->parent_stack);
|
||||
it->parent_stack = next;
|
||||
Py_XDECREF(cur_parent);
|
||||
|
||||
/* Note that extra condition on it->parent_stack->parent here;
|
||||
* this is because itertext() is supposed to only return *inner*
|
||||
* text, not text following the element it began iteration with.
|
||||
/* See if there are children left to traverse in the current parent. If
|
||||
* yes, visit the next child. If not, pop the stack and try again.
|
||||
*/
|
||||
if (it->parent_stack->parent) {
|
||||
rc = PyObject_IsTrue(tail);
|
||||
if (rc > 0)
|
||||
return tail;
|
||||
Py_DECREF(tail);
|
||||
if (rc < 0)
|
||||
return NULL;
|
||||
}
|
||||
else {
|
||||
Py_DECREF(tail);
|
||||
ParentLocator *item = &it->parent_stack[it->parent_stack_used - 1];
|
||||
Py_ssize_t child_index = item->child_index;
|
||||
ElementObjectExtra *extra;
|
||||
elem = item->parent;
|
||||
extra = elem->extra;
|
||||
if (!extra || child_index >= extra->length) {
|
||||
it->parent_stack_used--;
|
||||
/* Note that extra condition on it->parent_stack_used here;
|
||||
* this is because itertext() is supposed to only return *inner*
|
||||
* text, not text following the element it began iteration with.
|
||||
*/
|
||||
if (it->gettext && it->parent_stack_used) {
|
||||
text = element_get_tail(elem);
|
||||
goto gettext;
|
||||
}
|
||||
Py_DECREF(elem);
|
||||
continue;
|
||||
}
|
||||
|
||||
elem = (ElementObject *)extra->children[child_index];
|
||||
item->child_index++;
|
||||
Py_INCREF(elem);
|
||||
}
|
||||
|
||||
if (parent_stack_push_new(it, elem) < 0) {
|
||||
Py_DECREF(elem);
|
||||
PyErr_NoMemory();
|
||||
return NULL;
|
||||
}
|
||||
if (it->gettext) {
|
||||
text = element_get_text(elem);
|
||||
goto gettext;
|
||||
}
|
||||
|
||||
if (it->sought_tag == Py_None)
|
||||
return (PyObject *)elem;
|
||||
|
||||
rc = PyObject_RichCompareBool(elem->tag, it->sought_tag, Py_EQ);
|
||||
if (rc > 0)
|
||||
return (PyObject *)elem;
|
||||
|
||||
Py_DECREF(elem);
|
||||
if (rc < 0)
|
||||
return NULL;
|
||||
continue;
|
||||
|
||||
gettext:
|
||||
if (!text) {
|
||||
Py_DECREF(elem);
|
||||
return NULL;
|
||||
}
|
||||
if (text == Py_None) {
|
||||
Py_DECREF(elem);
|
||||
}
|
||||
else {
|
||||
Py_INCREF(text);
|
||||
Py_DECREF(elem);
|
||||
rc = PyObject_IsTrue(text);
|
||||
if (rc > 0)
|
||||
return text;
|
||||
Py_DECREF(text);
|
||||
if (rc < 0)
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2263,6 +2207,7 @@ static PyTypeObject ElementIter_Type = {
|
|||
0, /* tp_new */
|
||||
};
|
||||
|
||||
#define INIT_PARENT_STACK_SIZE 8
|
||||
|
||||
static PyObject *
|
||||
create_elementiter(ElementObject *self, PyObject *tag, int gettext)
|
||||
|
@ -2275,22 +2220,20 @@ create_elementiter(ElementObject *self, PyObject *tag, int gettext)
|
|||
|
||||
Py_INCREF(tag);
|
||||
it->sought_tag = tag;
|
||||
it->root_done = 0;
|
||||
it->gettext = gettext;
|
||||
Py_INCREF(self);
|
||||
it->root_element = self;
|
||||
|
||||
PyObject_GC_Track(it);
|
||||
|
||||
it->parent_stack = PyObject_Malloc(sizeof(ParentLocator));
|
||||
it->parent_stack = PyMem_New(ParentLocator, INIT_PARENT_STACK_SIZE);
|
||||
if (it->parent_stack == NULL) {
|
||||
Py_DECREF(it);
|
||||
PyErr_NoMemory();
|
||||
return NULL;
|
||||
}
|
||||
it->parent_stack->parent = NULL;
|
||||
it->parent_stack->child_index = 0;
|
||||
it->parent_stack->next = NULL;
|
||||
it->parent_stack_used = 0;
|
||||
it->parent_stack_size = INIT_PARENT_STACK_SIZE;
|
||||
|
||||
return (PyObject *)it;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue