mirror of
https://github.com/python/cpython.git
synced 2025-09-26 10:19:53 +00:00
needforspeed: use append+reverse for rsplit, use "bloom filters" to
speed up splitlines and strip with charsets; etc. rsplit is now as fast as split in all our tests (reverse takes no time at all), and splitlines() is nearly as fast as a plain split("\n") in our tests. and we're not done yet... ;-)
This commit is contained in:
parent
7298f270a7
commit
b63588c188
1 changed files with 101 additions and 43 deletions
|
@ -46,6 +46,18 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* XXX - set via configure? */
#undef USE_INLINE

/* LOCAL(type) is the declaration prefix used for file-local helper
   functions returning 'type'.  The MSVC variant is taken from _sre.c. */
#ifdef _MSC_VER
#pragma warning(disable: 4710)
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
#ifdef USE_INLINE
#define LOCAL(type) static inline type
#else
/* plain static function; USE_INLINE is currently always undefined above */
#define LOCAL(type) static type
#endif
#endif
|
||||||
|
|
||||||
/* Limit for the Unicode object free list */
|
/* Limit for the Unicode object free list */
|
||||||
|
|
||||||
#define MAX_UNICODE_FREELIST_SIZE 1024
|
#define MAX_UNICODE_FREELIST_SIZE 1024
|
||||||
|
@ -121,6 +133,51 @@ PyUnicode_GetMax(void)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* --- Bloom Filters ----------------------------------------------------- */
|
||||||
|
|
||||||
|
/* stuff to implement simple "bloom filters" for Unicode characters.
|
||||||
|
to keep things simple, we use a single bitmask, using the least 5
|
||||||
|
bits of each Unicode character as the bit index. */
|
||||||
|
|
||||||
|
/* the linebreak mask is set up by _PyUnicode_Init below */
|
||||||
|
|
||||||
|
/* integer type used to hold a bloom mask; only bits 0..31 are ever used */
#define BLOOM_MASK unsigned long

/* bloom mask covering the line break characters */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of 'ch' is set in 'mask'.
   A zero result means 'ch' is definitely not in the set the mask was built
   from; a non-zero result only means it may be.

   The shifted constant has type BLOOM_MASK rather than int, because the bit
   index can be 31 and 1 << 31 overflows a 32-bit signed int.  Both
   arguments are parenthesized so that arbitrary expressions can be passed. */
#define BLOOM(mask, ch) ((mask) & ((BLOOM_MASK) 1 << ((ch) & 0x1F)))

/* Line break test with a cheap bloom pre-check: the full
   Py_UNICODE_ISLINEBREAK test only runs for characters whose bit is set in
   bloom_linebreak.  Note that 'ch' is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
|
||||||
|
|
||||||
|
/* Calculate a simple bloom-style bitmask for the 'len' characters
   starting at 'ptr'.

   Every character sets the bit selected by its low 5 bits (ch & 0x1F), so
   different characters may share a bit.  A clear bit therefore means "no
   such character in the string", while a set bit only means "maybe".
   An empty string (len == 0) gives a mask of 0. */
LOCAL(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
{
    BLOOM_MASK mask = 0;
    Py_ssize_t i;

    /* Shift a BLOOM_MASK-typed 1, not a plain int: the bit index can be
       31, and 1 << 31 overflows a 32-bit signed int. */
    for (i = 0; i < len; i++)
        mask |= ((BLOOM_MASK) 1 << (ptr[i] & 0x1F));

    return mask;
}
|
||||||
|
|
||||||
|
/* Linear search for 'chr' among the first 'setlen' characters of 'set'.

   Returns 1 if the character is present and 0 if it is not.  The result is
   used directly as a truth value (see BLOOM_MEMBER), so "not found" has to
   be 0: a -1 here would count as true and turn every bloom mask collision
   into a false match. */
LOCAL(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
{
    Py_ssize_t i;

    for (i = 0; i < setlen; i++)
        if (set[i] == chr)
            return 1;

    return 0;
}
|
||||||
|
|
||||||
|
/* True if 'chr' is one of the 'setlen' characters in 'set'.  'mask' is the
   bloom mask built from the same set; it is checked first so that the
   linear search in unicode_member only runs for characters that pass the
   mask test.

   The expansion is wrapped in parentheses so the macro behaves as a single
   operand next to !, || or ?:.  Note that 'chr' is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
|
||||||
|
|
||||||
/* --- Unicode Object ----------------------------------------------------- */
|
/* --- Unicode Object ----------------------------------------------------- */
|
||||||
|
|
||||||
static
|
static
|
||||||
|
@ -3791,8 +3848,7 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
|
||||||
|
|
||||||
/* --- Helpers ------------------------------------------------------------ */
|
/* --- Helpers ------------------------------------------------------------ */
|
||||||
|
|
||||||
static
|
static Py_ssize_t count(PyUnicodeObject *self,
|
||||||
Py_ssize_t count(PyUnicodeObject *self,
|
|
||||||
Py_ssize_t start,
|
Py_ssize_t start,
|
||||||
Py_ssize_t end,
|
Py_ssize_t end,
|
||||||
PyUnicodeObject *substring)
|
PyUnicodeObject *substring)
|
||||||
|
@ -3850,8 +3906,7 @@ Py_ssize_t PyUnicode_Count(PyObject *str,
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static
|
static Py_ssize_t findstring(PyUnicodeObject *self,
|
||||||
Py_ssize_t findstring(PyUnicodeObject *self,
|
|
||||||
PyUnicodeObject *substring,
|
PyUnicodeObject *substring,
|
||||||
Py_ssize_t start,
|
Py_ssize_t start,
|
||||||
Py_ssize_t end,
|
Py_ssize_t end,
|
||||||
|
@ -4332,17 +4387,6 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
|
||||||
else \
|
else \
|
||||||
Py_DECREF(str);
|
Py_DECREF(str);
|
||||||
|
|
||||||
#define SPLIT_INSERT(data, left, right) \
|
|
||||||
str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
|
|
||||||
if (!str) \
|
|
||||||
goto onError; \
|
|
||||||
if (PyList_Insert(list, 0, str)) { \
|
|
||||||
Py_DECREF(str); \
|
|
||||||
goto onError; \
|
|
||||||
} \
|
|
||||||
else \
|
|
||||||
Py_DECREF(str);
|
|
||||||
|
|
||||||
static
|
static
|
||||||
PyObject *split_whitespace(PyUnicodeObject *self,
|
PyObject *split_whitespace(PyUnicodeObject *self,
|
||||||
PyObject *list,
|
PyObject *list,
|
||||||
|
@ -4403,7 +4447,7 @@ PyObject *PyUnicode_Splitlines(PyObject *string,
|
||||||
Py_ssize_t eol;
|
Py_ssize_t eol;
|
||||||
|
|
||||||
/* Find a line and append it */
|
/* Find a line and append it */
|
||||||
while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
|
while (i < len && !BLOOM_LINEBREAK(data[i]))
|
||||||
i++;
|
i++;
|
||||||
|
|
||||||
/* Skip the line break reading CRLF as one line break */
|
/* Skip the line break reading CRLF as one line break */
|
||||||
|
@ -4514,15 +4558,17 @@ PyObject *rsplit_whitespace(PyUnicodeObject *self,
|
||||||
if (j > i) {
|
if (j > i) {
|
||||||
if (maxcount-- <= 0)
|
if (maxcount-- <= 0)
|
||||||
break;
|
break;
|
||||||
SPLIT_INSERT(self->str, i + 1, j + 1);
|
SPLIT_APPEND(self->str, i + 1, j + 1);
|
||||||
while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
|
while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
|
||||||
i--;
|
i--;
|
||||||
j = i;
|
j = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (j >= 0) {
|
if (j >= 0) {
|
||||||
SPLIT_INSERT(self->str, 0, j + 1);
|
SPLIT_APPEND(self->str, 0, j + 1);
|
||||||
}
|
}
|
||||||
|
if (PyList_Reverse(list) < 0)
|
||||||
|
goto onError;
|
||||||
return list;
|
return list;
|
||||||
|
|
||||||
onError:
|
onError:
|
||||||
|
@ -4545,14 +4591,16 @@ PyObject *rsplit_char(PyUnicodeObject *self,
|
||||||
if (self->str[i] == ch) {
|
if (self->str[i] == ch) {
|
||||||
if (maxcount-- <= 0)
|
if (maxcount-- <= 0)
|
||||||
break;
|
break;
|
||||||
SPLIT_INSERT(self->str, i + 1, j + 1);
|
SPLIT_APPEND(self->str, i + 1, j + 1);
|
||||||
j = i = i - 1;
|
j = i = i - 1;
|
||||||
} else
|
} else
|
||||||
i--;
|
i--;
|
||||||
}
|
}
|
||||||
if (j >= -1) {
|
if (j >= -1) {
|
||||||
SPLIT_INSERT(self->str, 0, j + 1);
|
SPLIT_APPEND(self->str, 0, j + 1);
|
||||||
}
|
}
|
||||||
|
if (PyList_Reverse(list) < 0)
|
||||||
|
goto onError;
|
||||||
return list;
|
return list;
|
||||||
|
|
||||||
onError:
|
onError:
|
||||||
|
@ -4576,15 +4624,17 @@ PyObject *rsplit_substring(PyUnicodeObject *self,
|
||||||
if (Py_UNICODE_MATCH(self, i, substring)) {
|
if (Py_UNICODE_MATCH(self, i, substring)) {
|
||||||
if (maxcount-- <= 0)
|
if (maxcount-- <= 0)
|
||||||
break;
|
break;
|
||||||
SPLIT_INSERT(self->str, i + sublen, j);
|
SPLIT_APPEND(self->str, i + sublen, j);
|
||||||
j = i;
|
j = i;
|
||||||
i -= sublen;
|
i -= sublen;
|
||||||
} else
|
} else
|
||||||
i--;
|
i--;
|
||||||
}
|
}
|
||||||
if (j >= 0) {
|
if (j >= 0) {
|
||||||
SPLIT_INSERT(self->str, 0, j);
|
SPLIT_APPEND(self->str, 0, j);
|
||||||
}
|
}
|
||||||
|
if (PyList_Reverse(list) < 0)
|
||||||
|
goto onError;
|
||||||
return list;
|
return list;
|
||||||
|
|
||||||
onError:
|
onError:
|
||||||
|
@ -4593,7 +4643,6 @@ PyObject *rsplit_substring(PyUnicodeObject *self,
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef SPLIT_APPEND
|
#undef SPLIT_APPEND
|
||||||
#undef SPLIT_INSERT
|
|
||||||
|
|
||||||
static
|
static
|
||||||
PyObject *split(PyUnicodeObject *self,
|
PyObject *split(PyUnicodeObject *self,
|
||||||
|
@ -5703,16 +5752,6 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
|
||||||
|
|
||||||
#define STRIPNAME(i) (stripformat[i]+3)
|
#define STRIPNAME(i) (stripformat[i]+3)
|
||||||
|
|
||||||
static const Py_UNICODE *
|
|
||||||
unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
|
|
||||||
{
|
|
||||||
size_t i;
|
|
||||||
for (i = 0; i < n; ++i)
|
|
||||||
if (s[i] == c)
|
|
||||||
return s+i;
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* externally visible for str.strip(unicode) */
|
/* externally visible for str.strip(unicode) */
|
||||||
PyObject *
|
PyObject *
|
||||||
_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
|
_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
|
||||||
|
@ -5723,9 +5762,11 @@ _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
|
||||||
Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
|
Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
|
||||||
Py_ssize_t i, j;
|
Py_ssize_t i, j;
|
||||||
|
|
||||||
|
BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
|
||||||
|
|
||||||
i = 0;
|
i = 0;
|
||||||
if (striptype != RIGHTSTRIP) {
|
if (striptype != RIGHTSTRIP) {
|
||||||
while (i < len && unicode_memchr(sep, s[i], seplen)) {
|
while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -5734,7 +5775,7 @@ _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
|
||||||
if (striptype != LEFTSTRIP) {
|
if (striptype != LEFTSTRIP) {
|
||||||
do {
|
do {
|
||||||
j--;
|
j--;
|
||||||
} while (j >= i && unicode_memchr(sep, s[j], seplen));
|
} while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7387,6 +7428,18 @@ void _PyUnicode_Init(void)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
/* XXX - move this array to unicodectype.c ? */
|
||||||
|
Py_UNICODE linebreak[] = {
|
||||||
|
0x000A, /* LINE FEED */
|
||||||
|
0x000D, /* CARRIAGE RETURN */
|
||||||
|
0x001C, /* FILE SEPARATOR */
|
||||||
|
0x001D, /* GROUP SEPARATOR */
|
||||||
|
0x001E, /* RECORD SEPARATOR */
|
||||||
|
0x0085, /* NEXT LINE */
|
||||||
|
0x2028, /* LINE SEPARATOR */
|
||||||
|
0x2029, /* PARAGRAPH SEPARATOR */
|
||||||
|
};
|
||||||
|
|
||||||
/* Init the implementation */
|
/* Init the implementation */
|
||||||
unicode_freelist = NULL;
|
unicode_freelist = NULL;
|
||||||
unicode_freelist_size = 0;
|
unicode_freelist_size = 0;
|
||||||
|
@ -7396,6 +7449,11 @@ void _PyUnicode_Init(void)
|
||||||
unicode_latin1[i] = NULL;
|
unicode_latin1[i] = NULL;
|
||||||
if (PyType_Ready(&PyUnicode_Type) < 0)
|
if (PyType_Ready(&PyUnicode_Type) < 0)
|
||||||
Py_FatalError("Can't initialize 'unicode'");
|
Py_FatalError("Can't initialize 'unicode'");
|
||||||
|
|
||||||
|
/* initialize the linebreak bloom filter */
|
||||||
|
bloom_linebreak = make_bloom_mask(
|
||||||
|
linebreak, sizeof(linebreak) / sizeof(linebreak[0])
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Finalize the Unicode implementation */
|
/* Finalize the Unicode implementation */
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue