needforspeed: new replace implementation by Andrew Dalke. replace is

now about 3x faster on my machine, for the replace tests from string-
bench.
This commit is contained in:
Fredrik Lundh 2006-05-25 17:08:14 +00:00
parent 0c71f88fc9
commit e68955cf32

View file

@ -2379,174 +2379,622 @@ string_translate(PyStringObject *self, PyObject *args)
}
/* What follows is used for implementing replace(). Perry Stoll. */
#define FORWARD 1
#define REVERSE -1
/*
mymemfind
/* find and count characters and substrings */
strstr replacement for arbitrary blocks of memory.
/* Don't call if length < 2 */
#define Py_STRING_MATCH(target, offset, pattern, length) \
(target[offset] == pattern[0] && \
target[offset+length-1] == pattern[length-1] && \
!memcmp(target+offset+1, pattern+1, length-2) )
Locates the first occurrence in the memory pointed to by MEM of the
contents of memory pointed to by PAT. Returns the index into MEM if
found, or -1 if not found. If len of PAT is greater than length of
MEM, the function returns -1.
*/
static Py_ssize_t
mymemfind(const char *mem, Py_ssize_t len, const char *pat, Py_ssize_t pat_len)
#define findchar(target, target_len, c) \
((char *)memchr((const void *)(target), c, target_len))
/* String ops must return a string. */
/* If the object is subclass of string, create a copy */
static PyStringObject *
return_self(PyStringObject *self)
{
register Py_ssize_t ii;
if (PyString_CheckExact(self)) {
Py_INCREF(self);
return self;
}
return (PyStringObject *)PyString_FromStringAndSize(
PyString_AS_STRING(self),
PyString_GET_SIZE(self));
}
/* pattern can not occur in the last pat_len-1 chars */
len -= pat_len;
static Py_ssize_t
countchar(char *target, int target_len, char c)
{
Py_ssize_t count=0;
char *start=target;
char *end=target+target_len;
for (ii = 0; ii <= len; ii++) {
if (mem[ii] == pat[0] && memcmp(&mem[ii], pat, pat_len) == 0) {
return ii;
}
while ( (start=findchar(start, end-start, c)) != NULL ) {
count++;
start += 1;
}
return count;
}
static Py_ssize_t
findstring(char *target, Py_ssize_t target_len,
char *pattern, Py_ssize_t pattern_len,
Py_ssize_t start,
Py_ssize_t end,
int direction)
{
if (start < 0) {
start += target_len;
if (start < 0)
start = 0;
}
if (end > target_len) {
end = target_len;
} else if (end < 0) {
end += target_len;
if (end < 0)
end = 0;
}
/* zero-length substrings always match at the first attempt */
if (pattern_len == 0)
return (direction > 0) ? start : end;
end -= pattern_len;
if (direction < 0) {
for (; end >= start; end--)
if (Py_STRING_MATCH(target, end, pattern, pattern_len))
return end;
} else {
for (; start <= end; start++)
if (Py_STRING_MATCH(target, start, pattern, pattern_len))
return start;
}
return -1;
}
/*
mymemcnt
Return the number of distinct times PAT is found in MEM.
meaning mem=1111 and pat==11 returns 2.
mem=11111 and pat==11 also return 2.
*/
static Py_ssize_t
mymemcnt(const char *mem, Py_ssize_t len, const char *pat, Py_ssize_t pat_len)
Py_ssize_t
countstring(char *target, Py_ssize_t target_len,
char *pattern, Py_ssize_t pattern_len,
Py_ssize_t start,
Py_ssize_t end,
int direction)
{
register Py_ssize_t offset = 0;
Py_ssize_t nfound = 0;
Py_ssize_t count=0;
while (len >= 0) {
offset = mymemfind(mem, len, pat, pat_len);
if (start < 0) {
start += target_len;
if (start < 0)
start = 0;
}
if (end > target_len) {
end = target_len;
} else if (end < 0) {
end += target_len;
if (end < 0)
end = 0;
}
/* zero-length substrings match everywhere */
if (pattern_len == 0)
return target_len+1;
end -= pattern_len;
if (direction < 0) {
for (; end >= start; end--)
if (Py_STRING_MATCH(target, end, pattern, pattern_len)) {
count++;
end -= pattern_len-1;
}
} else {
for (; start <= end; start++)
if (Py_STRING_MATCH(target, start, pattern, pattern_len)) {
count++;
start += pattern_len-1;
}
}
return count;
}
/* Algorithms for difference cases of string replacement */
/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
static PyStringObject *
replace_interleave(PyStringObject *self,
PyStringObject *to,
Py_ssize_t maxcount)
{
char *self_s, *to_s, *result_s;
Py_ssize_t self_len, to_len, result_len;
Py_ssize_t count, i, product;
PyStringObject *result;
self_len = PyString_GET_SIZE(self);
to_len = PyString_GET_SIZE(to);
/* 1 at the end plus 1 after every character */
count = self_len+1;
if (maxcount < count)
count = maxcount;
/* Check for overflow */
/* result_len = count * to_len + self_len; */
product = count * to_len;
if (product / to_len != count) {
PyErr_SetString(PyExc_OverflowError,
"replace string is too long");
return NULL;
}
result_len = product + self_len;
if (result_len < 0) {
PyErr_SetString(PyExc_OverflowError,
"replace string is too long");
return NULL;
}
if (! (result = (PyStringObject *)
PyString_FromStringAndSize(NULL, result_len)) )
return NULL;
self_s = PyString_AS_STRING(self);
to_s = PyString_AS_STRING(to);
to_len = PyString_GET_SIZE(to);
result_s = PyString_AS_STRING(result);
/* TODO: special case single character, which doesn't need memcpy */
/* Lay the first one down (guaranteed this will occur) */
memcpy(result_s, to_s, to_len);
result_s += to_len;
count -= 1;
for (i=0; i<count; i++) {
*result_s++ = *self_s++;
memcpy(result_s, to_s, to_len);
result_s += to_len;
}
/* Copy the rest of the original string */
memcpy(result_s, self_s, self_len-i);
return result;
}
/* Special case for deleting a single character */
/* len(self)>=1, len(from)==1, to="", maxcount>=1 */
static PyStringObject *
replace_delete_single_character(PyStringObject *self,
char from_c, Py_ssize_t maxcount)
{
char *self_s, *result_s;
char *start, *next, *end;
Py_ssize_t self_len, result_len;
Py_ssize_t count;
PyStringObject *result;
self_len = PyString_GET_SIZE(self);
self_s = PyString_AS_STRING(self);
count = countchar(self_s, self_len, from_c);
if (count == 0) {
return return_self(self);
}
if (count > maxcount)
count = maxcount;
result_len = self_len - count; /* from_len == 1 */
assert(result_len>=0);
if ( (result = (PyStringObject *)
PyString_FromStringAndSize(NULL, result_len)) == NULL)
return NULL;
result_s = PyString_AS_STRING(result);
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
next = findchar(start, end-start, from_c);
if (next == NULL)
break;
memcpy(result_s, start, next-start);
result_s += (next-start);
start = next+1;
}
memcpy(result_s, start, end-start);
return result;
}
/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
static PyStringObject *
replace_delete_substring(PyStringObject *self, PyStringObject *from,
Py_ssize_t maxcount) {
char *self_s, *from_s, *result_s;
char *start, *next, *end;
Py_ssize_t self_len, from_len, result_len;
Py_ssize_t count, offset;
PyStringObject *result;
self_len = PyString_GET_SIZE(self);
self_s = PyString_AS_STRING(self);
from_len = PyString_GET_SIZE(from);
from_s = PyString_AS_STRING(from);
count = countstring(self_s, self_len,
from_s, from_len,
0, self_len, 1);
if (count > maxcount)
count = maxcount;
if (count == 0) {
/* no matches */
return return_self(self);
}
result_len = self_len - (count * from_len);
assert (result_len>=0);
if ( (result = (PyStringObject *)
PyString_FromStringAndSize(NULL, result_len)) == NULL )
return NULL;
result_s = PyString_AS_STRING(result);
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
offset = findstring(start, end-start,
from_s, from_len,
0, end-start, FORWARD);
if (offset == -1)
break;
mem += offset + pat_len;
len -= offset + pat_len;
nfound++;
next = start + offset;
memcpy(result_s, start, next-start);
result_s += (next-start);
start = next+from_len;
}
return nfound;
memcpy(result_s, start, end-start);
return result;
}
/*
mymemreplace
Return a string in which all occurrences of PAT in memory STR are
replaced with SUB.
If length of PAT is less than length of STR or there are no occurrences
of PAT in STR, then the original string is returned. Otherwise, a new
string is allocated here and returned.
on return, out_len is:
the length of output string, or
-1 if the input string is returned, or
unchanged if an error occurs (no memory).
return value is:
the new string allocated locally, or
NULL if an error occurred.
*/
static char *
mymemreplace(const char *str, Py_ssize_t len, /* input string */
const char *pat, Py_ssize_t pat_len, /* pattern string to find */
const char *sub, Py_ssize_t sub_len, /* substitution string */
Py_ssize_t count, /* number of replacements */
Py_ssize_t *out_len)
/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
static PyStringObject *
replace_single_character_in_place(PyStringObject *self,
char from_c, char to_c,
Py_ssize_t maxcount)
{
char *out_s;
char *new_s;
Py_ssize_t nfound, offset, new_len;
Py_ssize_t product, delta;
if (len == 0 || (pat_len == 0 && sub_len == 0) || pat_len > len)
goto return_same;
/* find length of output string */
nfound = (pat_len > 0) ? mymemcnt(str, len, pat, pat_len) : len + 1;
if (count < 0)
count = PY_SSIZE_T_MAX;
else if (nfound > count)
nfound = count;
if (nfound == 0)
goto return_same;
delta = (sub_len - pat_len);
if (delta == 0) {
new_len = len;
} else {
product = nfound * (sub_len - pat_len);
if ((product / (sub_len - pat_len)) != nfound) {
PyErr_SetString(PyExc_OverflowError,
"replace string is too long");
return NULL;
}
new_len = len + product;
if (new_len < 0) {
PyErr_SetString(PyExc_OverflowError,
"replace string is too long");
return NULL;
}
}
if (new_len == 0) {
/* Have to allocate something for the caller to free(). */
out_s = (char *)PyMem_MALLOC(1);
if (out_s == NULL)
return NULL;
out_s[0] = '\0';
char *self_s, *result_s, *start, *end, *next;
Py_ssize_t self_len;
PyStringObject *result;
/* The result string will be the same size */
self_s = PyString_AS_STRING(self);
self_len = PyString_GET_SIZE(self);
next = findchar(self_s, self_len, from_c);
if (next == NULL) {
/* No matches; return the original string */
return return_self(self);
}
else {
assert(new_len > 0);
new_s = (char *)PyMem_MALLOC(new_len);
if (new_s == NULL)
return NULL;
out_s = new_s;
if (pat_len > 0) {
for (; nfound > 0; --nfound) {
/* find index of next instance of pattern */
offset = mymemfind(str, len, pat, pat_len);
if (offset == -1)
break;
/* copy non matching part of input string */
memcpy(new_s, str, offset);
str += offset + pat_len;
len -= offset + pat_len;
/* copy substitute into the output string */
new_s += offset;
memcpy(new_s, sub, sub_len);
new_s += sub_len;
}
/* copy any remaining values into output string */
if (len > 0)
memcpy(new_s, str, len);
}
else {
for (;;++str, --len) {
memcpy(new_s, sub, sub_len);
new_s += sub_len;
if (--nfound <= 0) {
memcpy(new_s, str, len);
break;
}
*new_s++ = *str;
}
}
/* Need to make a new string */
result = (PyStringObject *) PyString_FromStringAndSize(self_s, self_len);
if (result == NULL)
return NULL;
result_s = PyString_AS_STRING(result);
/* change everything in-place, starting with this one */
start = result_s + (next-self_s);
*start = to_c;
start++;
end = result_s + self_len;
while (--maxcount > 0) {
next = findchar(start, end-start, from_c);
if (next == NULL)
break;
*next = to_c;
start = next+1;
}
*out_len = new_len;
return out_s;
return_same:
*out_len = -1;
return (char *)str; /* cast away const */
return result;
}
/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
static PyStringObject *
replace_substring_in_place(PyStringObject *self,
PyStringObject *from,
PyStringObject *to,
Py_ssize_t maxcount)
{
char *result_s, *start, *end;
char *self_s, *from_s, *to_s;
Py_ssize_t self_len, from_len, offset;
PyStringObject *result;
/* The result string will be the same size */
self_s = PyString_AS_STRING(self);
self_len = PyString_GET_SIZE(self);
from_s = PyString_AS_STRING(from);
from_len = PyString_GET_SIZE(from);
to_s = PyString_AS_STRING(to);
offset = findstring(self_s, self_len,
from_s, from_len,
0, self_len, FORWARD);
if (offset == -1) {
/* No matches; return the original string */
return return_self(self);
}
/* Need to make a new string */
result = (PyStringObject *) PyString_FromStringAndSize(self_s, self_len);
if (result == NULL)
return NULL;
result_s = PyString_AS_STRING(result);
/* change everything in-place, starting with this one */
start = result_s + offset;
memcpy(start, to_s, from_len);
start += from_len;
end = result_s + self_len;
while ( --maxcount > 0) {
offset = findstring(start, end-start,
from_s, from_len,
0, end-start, FORWARD);
if (offset==-1)
break;
memcpy(start+offset, to_s, from_len);
start += offset+from_len;
}
return result;
}
/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
static PyStringObject *
replace_single_character(PyStringObject *self,
char from_c,
PyStringObject *to,
Py_ssize_t maxcount)
{
char *self_s, *to_s, *result_s;
char *start, *next, *end;
Py_ssize_t self_len, to_len, result_len;
Py_ssize_t count, product;
PyStringObject *result;
self_s = PyString_AS_STRING(self);
self_len = PyString_GET_SIZE(self);
count = countchar(self_s, self_len, from_c);
if (count > maxcount)
count = maxcount;
if (count == 0) {
/* no matches, return unchanged */
return return_self(self);
}
to_s = PyString_AS_STRING(to);
to_len = PyString_GET_SIZE(to);
/* use the difference between current and new, hence the "-1" */
/* result_len = self_len + count * (to_len-1) */
product = count * (to_len-1);
if (product / (to_len-1) != count) {
PyErr_SetString(PyExc_OverflowError, "replace string is too long");
return NULL;
}
result_len = self_len + product;
if (result_len < 0) {
PyErr_SetString(PyExc_OverflowError, "replace string is too long");
return NULL;
}
if ( (result = (PyStringObject *)
PyString_FromStringAndSize(NULL, result_len)) == NULL)
return NULL;
result_s = PyString_AS_STRING(result);
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
next = findchar(start, end-start, from_c);
if (next == NULL)
break;
if (next == start) {
/* replace with the 'to' */
memcpy(result_s, to_s, to_len);
result_s += to_len;
start += 1;
} else {
/* copy the unchanged old then the 'to' */
memcpy(result_s, start, next-start);
result_s += (next-start);
memcpy(result_s, to_s, to_len);
result_s += to_len;
start = next+1;
}
}
/* Copy the remainder of the remaining string */
memcpy(result_s, start, end-start);
return result;
}
/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
static PyStringObject *
replace_substring(PyStringObject *self,
PyStringObject *from,
PyStringObject *to,
Py_ssize_t maxcount) {
char *self_s, *from_s, *to_s, *result_s;
char *start, *next, *end;
Py_ssize_t self_len, from_len, to_len, result_len;
Py_ssize_t count, offset, product;
PyStringObject *result;
self_s = PyString_AS_STRING(self);
self_len = PyString_GET_SIZE(self);
from_s = PyString_AS_STRING(from);
from_len = PyString_GET_SIZE(from);
count = countstring(self_s, self_len,
from_s, from_len,
0, self_len, FORWARD);
if (count > maxcount)
count = maxcount;
if (count == 0) {
/* no matches, return unchanged */
return return_self(self);
}
to_s = PyString_AS_STRING(to);
to_len = PyString_GET_SIZE(to);
/* Check for overflow */
/* result_len = self_len + count * (to_len-from_len) */
product = count * (to_len-from_len);
if (product / (to_len-from_len) != count) {
PyErr_SetString(PyExc_OverflowError, "replace string is too long");
return NULL;
}
result_len = self_len + product;
if (result_len < 0) {
PyErr_SetString(PyExc_OverflowError, "replace string is too long");
return NULL;
}
if ( (result = (PyStringObject *)
PyString_FromStringAndSize(NULL, result_len)) == NULL)
return NULL;
result_s = PyString_AS_STRING(result);
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
offset = findstring(start, end-start,
from_s, from_len,
0, end-start, FORWARD);
if (offset == -1)
break;
next = start+offset;
if (next == start) {
/* replace with the 'to' */
memcpy(result_s, to_s, to_len);
result_s += to_len;
start += from_len;
} else {
/* copy the unchanged old then the 'to' */
memcpy(result_s, start, next-start);
result_s += (next-start);
memcpy(result_s, to_s, to_len);
result_s += to_len;
start = next+from_len;
}
}
/* Copy the remainder of the remaining string */
memcpy(result_s, start, end-start);
return result;
}
static PyStringObject *
replace(PyStringObject *self,
PyStringObject *from,
PyStringObject *to,
Py_ssize_t maxcount)
{
Py_ssize_t from_len, to_len;
if (maxcount < 0) {
maxcount = PY_SSIZE_T_MAX;
} else if (maxcount == 0 || PyString_GET_SIZE(self) == 0) {
/* nothing to do; return the original string */
return return_self(self);
}
from_len = PyString_GET_SIZE(from);
to_len = PyString_GET_SIZE(to);
if (maxcount == 0 ||
(from_len == 0 && to_len == 0)) {
/* nothing to do; return the original string */
return return_self(self);
}
/* Handle zero-length special cases */
if (from_len == 0) {
/* insert the 'to' string everywhere. */
/* >>> "Python".replace("", ".") */
/* '.P.y.t.h.o.n.' */
return replace_interleave(self, to, maxcount);
}
/* Except for "".replace("", "A") == "A" there is no way beyond this */
/* point for an empty self string to generate a non-empty string */
/* Special case so the remaining code always gets a non-empty string */
if (PyString_GET_SIZE(self) == 0) {
return return_self(self);
}
if (to_len == 0) {
/* delete all occurances of 'from' string */
if (from_len == 1) {
return replace_delete_single_character(
self, PyString_AS_STRING(from)[0], maxcount);
} else {
return replace_delete_substring(self, from, maxcount);
}
}
/* Handle special case where both strings have the same length */
if (from_len == to_len) {
if (from_len == 1) {
return replace_single_character_in_place(
self,
PyString_AS_STRING(from)[0],
PyString_AS_STRING(to)[0],
maxcount);
} else {
return replace_substring_in_place(
self, from, to, maxcount);
}
}
/* Otherwise use the more generic algorithms */
if (from_len == 1) {
return replace_single_character(self, PyString_AS_STRING(from)[0],
to, maxcount);
} else {
/* len('from')>=2, len('to')>=1 */
return replace_substring(self, from, to, maxcount);
}
}
PyDoc_STRVAR(replace__doc__,
"S.replace (old, new[, count]) -> string\n\
@ -2558,67 +3006,42 @@ given, only the first count occurrences are replaced.");
static PyObject *
string_replace(PyStringObject *self, PyObject *args)
{
const char *str = PyString_AS_STRING(self), *sub, *repl;
char *new_s;
const Py_ssize_t len = PyString_GET_SIZE(self);
Py_ssize_t sub_len, repl_len, out_len;
Py_ssize_t count = -1;
PyObject *newobj;
PyObject *subobj, *replobj;
PyObject *from, *to;
char *tmp_s;
Py_ssize_t tmp_len;
if (!PyArg_ParseTuple(args, "OO|n:replace",
&subobj, &replobj, &count))
if (!PyArg_ParseTuple(args, "OO|n:replace", &from, &to, &count))
return NULL;
if (PyString_Check(subobj)) {
sub = PyString_AS_STRING(subobj);
sub_len = PyString_GET_SIZE(subobj);
if (PyString_Check(from)) {
/* Can this be made a '!check' after the Unicode check? */
}
#ifdef Py_USING_UNICODE
else if (PyUnicode_Check(subobj))
if (PyUnicode_Check(from))
return PyUnicode_Replace((PyObject *)self,
subobj, replobj, count);
from, to, count);
#endif
else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len))
else if (PyObject_AsCharBuffer(from, &tmp_s, &tmp_len))
return NULL;
if (PyString_Check(replobj)) {
repl = PyString_AS_STRING(replobj);
repl_len = PyString_GET_SIZE(replobj);
if (PyString_Check(to)) {
/* Can this be made a '!check' after the Unicode check? */
}
#ifdef Py_USING_UNICODE
else if (PyUnicode_Check(replobj))
else if (PyUnicode_Check(to))
return PyUnicode_Replace((PyObject *)self,
subobj, replobj, count);
from, to, count);
#endif
else if (PyObject_AsCharBuffer(replobj, &repl, &repl_len))
else if (PyObject_AsCharBuffer(to, &tmp_s, &tmp_len))
return NULL;
new_s = mymemreplace(str,len,sub,sub_len,repl,repl_len,count,&out_len);
if (new_s == NULL) {
if (!PyErr_Occurred())
PyErr_NoMemory();
return NULL;
}
if (out_len == -1) {
if (PyString_CheckExact(self)) {
/* we're returning another reference to self */
newobj = (PyObject*)self;
Py_INCREF(newobj);
}
else {
newobj = PyString_FromStringAndSize(str, len);
if (newobj == NULL)
return NULL;
}
}
else {
newobj = PyString_FromStringAndSize(new_s, out_len);
PyMem_FREE(new_s);
}
return newobj;
return (PyObject *)replace((PyStringObject *) self,
(PyStringObject *) from,
(PyStringObject *) to, count);
}
/** End DALKE **/
PyDoc_STRVAR(startswith__doc__,
"S.startswith(prefix[, start[, end]]) -> bool\n\