mirror of
https://github.com/python/cpython.git
synced 2025-11-11 14:44:57 +00:00
rewrote the pattern.split method in C
also restored SRE Unicode support for 1.6/2.0/2.1
This commit is contained in:
parent
9dbc0bcf9d
commit
971e78b55b
1 changed file with 136 additions and 12 deletions
146
Modules/_sre.c
146
Modules/_sre.c
|
|
@ -33,6 +33,7 @@
|
||||||
* 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
|
* 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
|
||||||
* 2001-09-18 fl added _getliteral helper
|
* 2001-09-18 fl added _getliteral helper
|
||||||
* 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
|
* 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
|
||||||
|
* 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
|
||||||
*
|
*
|
||||||
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
|
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
|
||||||
*
|
*
|
||||||
|
|
@ -65,14 +66,19 @@ static char copyright[] =
|
||||||
/* defining this one enables tracing */
|
/* defining this one enables tracing */
|
||||||
#undef VERBOSE
|
#undef VERBOSE
|
||||||
|
|
||||||
#if PY_VERSION_HEX >= 0x01060000 && defined(Py_USING_UNICODE)
|
#if PY_VERSION_HEX >= 0x01060000
|
||||||
|
#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
|
||||||
/* defining this enables unicode support (default under 1.6a1 and later) */
|
/* defining this enables unicode support (default under 1.6a1 and later) */
|
||||||
#define HAVE_UNICODE
|
#define HAVE_UNICODE
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
/* -------------------------------------------------------------------- */
|
/* -------------------------------------------------------------------- */
|
||||||
/* optional features */
|
/* optional features */
|
||||||
|
|
||||||
|
/* test: define to use sre._split helper instead of C code */
|
||||||
|
#undef USE_PYTHON_SPLIT
|
||||||
|
|
||||||
/* prevent run-away recursion (bad patterns on long strings) */
|
/* prevent run-away recursion (bad patterns on long strings) */
|
||||||
|
|
||||||
#if !defined(USE_STACKCHECK)
|
#if !defined(USE_STACKCHECK)
|
||||||
|
|
@ -1488,14 +1494,20 @@ state_fini(SRE_STATE* state)
|
||||||
}
|
}
|
||||||
|
|
||||||
LOCAL(PyObject*)
|
LOCAL(PyObject*)
|
||||||
state_getslice(SRE_STATE* state, int index, PyObject* string)
|
state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
|
||||||
index = (index - 1) * 2;
|
index = (index - 1) * 2;
|
||||||
|
|
||||||
if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
|
if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
|
||||||
|
if (empty)
|
||||||
|
/* want empty string */
|
||||||
i = j = 0;
|
i = j = 0;
|
||||||
|
else {
|
||||||
|
Py_INCREF(Py_None);
|
||||||
|
return Py_None;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
i = ((char*)state->mark[index] - (char*)state->beginning) /
|
i = ((char*)state->mark[index] - (char*)state->beginning) /
|
||||||
state->charsize;
|
state->charsize;
|
||||||
|
|
@ -1782,6 +1794,7 @@ pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(USE_PYTHON_SPLIT)
|
||||||
static PyObject*
|
static PyObject*
|
||||||
pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
{
|
{
|
||||||
|
|
@ -1798,6 +1811,7 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
Py_BuildValue("OOO", self, string, maxsplit)
|
Py_BuildValue("OOO", self, string, maxsplit)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
|
pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
|
|
@ -1805,7 +1819,7 @@ pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
SRE_STATE state;
|
SRE_STATE state;
|
||||||
PyObject* list;
|
PyObject* list;
|
||||||
int status;
|
int status;
|
||||||
int i;
|
int i, b, e;
|
||||||
|
|
||||||
PyObject* string;
|
PyObject* string;
|
||||||
int start = 0;
|
int start = 0;
|
||||||
|
|
@ -1842,17 +1856,16 @@ pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
/* don't bother to build a match object */
|
/* don't bother to build a match object */
|
||||||
switch (self->groups) {
|
switch (self->groups) {
|
||||||
case 0:
|
case 0:
|
||||||
item = PySequence_GetSlice(
|
b = ((char*) state.start - (char*) state.beginning) /
|
||||||
string,
|
state.charsize;
|
||||||
((char*) state.start - (char*) state.beginning) /
|
e = ((char*) state.ptr - (char*) state.beginning) /
|
||||||
state.charsize,
|
state.charsize;
|
||||||
((char*) state.ptr - (char*) state.beginning) /
|
item = PySequence_GetSlice(string, b, e);
|
||||||
state.charsize);
|
|
||||||
if (!item)
|
if (!item)
|
||||||
goto error;
|
goto error;
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
item = state_getslice(&state, 1, string);
|
item = state_getslice(&state, 1, string, 1);
|
||||||
if (!item)
|
if (!item)
|
||||||
goto error;
|
goto error;
|
||||||
break;
|
break;
|
||||||
|
|
@ -1861,7 +1874,7 @@ pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
if (!item)
|
if (!item)
|
||||||
goto error;
|
goto error;
|
||||||
for (i = 0; i < self->groups; i++) {
|
for (i = 0; i < self->groups; i++) {
|
||||||
PyObject* o = state_getslice(&state, i+1, string);
|
PyObject* o = state_getslice(&state, i+1, string, 1);
|
||||||
if (!o) {
|
if (!o) {
|
||||||
Py_DECREF(item);
|
Py_DECREF(item);
|
||||||
goto error;
|
goto error;
|
||||||
|
|
@ -1903,6 +1916,117 @@ error:
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if !defined(USE_PYTHON_SPLIT)
|
||||||
|
static PyObject*
|
||||||
|
pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
|
{
|
||||||
|
SRE_STATE state;
|
||||||
|
PyObject* list;
|
||||||
|
PyObject* item;
|
||||||
|
int status;
|
||||||
|
int n;
|
||||||
|
int i, b, e;
|
||||||
|
int g;
|
||||||
|
|
||||||
|
PyObject* string;
|
||||||
|
int maxsplit = 0;
|
||||||
|
static char* kwlist[] = { "source", "maxsplit", NULL };
|
||||||
|
if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
|
||||||
|
&string, &maxsplit))
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
string = state_init(&state, self, string, 0, INT_MAX);
|
||||||
|
if (!string)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
list = PyList_New(0);
|
||||||
|
|
||||||
|
i = n = 0;
|
||||||
|
|
||||||
|
while (maxsplit == 0 || n < maxsplit) {
|
||||||
|
|
||||||
|
state_reset(&state);
|
||||||
|
|
||||||
|
state.ptr = state.start;
|
||||||
|
|
||||||
|
if (state.charsize == 1) {
|
||||||
|
status = sre_search(&state, PatternObject_GetCode(self));
|
||||||
|
} else {
|
||||||
|
#if defined(HAVE_UNICODE)
|
||||||
|
status = sre_usearch(&state, PatternObject_GetCode(self));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
if (status > 0) {
|
||||||
|
|
||||||
|
if (state.start == state.ptr) {
|
||||||
|
if (i >= state.endpos)
|
||||||
|
break;
|
||||||
|
/* skip one character */
|
||||||
|
state.start = (void*) ((char*) state.ptr + state.charsize);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
b = ((char*) state.start - (char*) state.beginning) /
|
||||||
|
state.charsize;
|
||||||
|
e = ((char*) state.ptr - (char*) state.beginning) /
|
||||||
|
state.charsize;
|
||||||
|
|
||||||
|
/* get segment before this match */
|
||||||
|
item = PySequence_GetSlice(string, i, b);
|
||||||
|
if (!item)
|
||||||
|
goto error;
|
||||||
|
status = PyList_Append(list, item);
|
||||||
|
Py_DECREF(item);
|
||||||
|
if (status < 0)
|
||||||
|
goto error;
|
||||||
|
|
||||||
|
for (g = 0; g < self->groups; g++) {
|
||||||
|
item = state_getslice(&state, g+1, string, 0);
|
||||||
|
if (!item)
|
||||||
|
goto error;
|
||||||
|
status = PyList_Append(list, item);
|
||||||
|
Py_DECREF(item);
|
||||||
|
if (status < 0)
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
|
i = e;
|
||||||
|
n = n + 1;
|
||||||
|
|
||||||
|
state.start = state.ptr;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
if (status == 0)
|
||||||
|
break;
|
||||||
|
|
||||||
|
pattern_error(status);
|
||||||
|
goto error;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* get segment following last match */
|
||||||
|
item = PySequence_GetSlice(string, i, state.endpos);
|
||||||
|
if (!item)
|
||||||
|
goto error;
|
||||||
|
status = PyList_Append(list, item);
|
||||||
|
Py_DECREF(item);
|
||||||
|
if (status < 0)
|
||||||
|
goto error;
|
||||||
|
|
||||||
|
state_fini(&state);
|
||||||
|
return list;
|
||||||
|
|
||||||
|
error:
|
||||||
|
Py_DECREF(list);
|
||||||
|
state_fini(&state);
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
pattern_copy(PatternObject* self, PyObject* args)
|
pattern_copy(PatternObject* self, PyObject* args)
|
||||||
{
|
{
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue