mirror of
https://github.com/python/cpython.git
synced 2025-08-31 14:07:50 +00:00
- Changed the hash calculation for Unicode strings. The new
value is calculated from the character values, in a way that makes sure an 8-bit ASCII string and a Unicode string with the same contents get the same hash value. (As a side effect, this also works for ISO Latin-1 strings.) For more details, see the python-dev discussion.
This commit is contained in:
parent
417c489def
commit
dde6164402
1 changed file with 19 additions and 17 deletions
|
@ -3471,26 +3471,28 @@ unicode_getitem(PyUnicodeObject *self, int index)
|
||||||
static long
|
static long
|
||||||
unicode_hash(PyUnicodeObject *self)
|
unicode_hash(PyUnicodeObject *self)
|
||||||
{
|
{
|
||||||
long hash;
|
/* Since Unicode objects compare equal to their ASCII string
|
||||||
PyObject *utf8;
|
counterparts, they should use the individual character values
|
||||||
|
as basis for their hash value. This is needed to assure that
|
||||||
|
strings and Unicode objects behave in the same way as
|
||||||
|
dictionary keys. */
|
||||||
|
|
||||||
|
register int len;
|
||||||
|
register Py_UNICODE *p;
|
||||||
|
register long x;
|
||||||
|
|
||||||
/* Since Unicode objects compare equal to their UTF-8 string
|
|
||||||
counterparts, they should also use the UTF-8 strings as basis
|
|
||||||
for their hash value. This is needed to assure that strings and
|
|
||||||
Unicode objects behave in the same way as dictionary
|
|
||||||
keys. Unfortunately, this costs some performance and also some
|
|
||||||
memory if the cached UTF-8 representation is not used later
|
|
||||||
on. */
|
|
||||||
if (self->hash != -1)
|
if (self->hash != -1)
|
||||||
return self->hash;
|
return self->hash;
|
||||||
utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
|
len = PyUnicode_GET_SIZE(self);
|
||||||
if (utf8 == NULL)
|
p = PyUnicode_AS_UNICODE(self);
|
||||||
return -1;
|
x = *p << 7;
|
||||||
hash = PyObject_Hash(utf8);
|
while (--len >= 0)
|
||||||
if (hash == -1)
|
x = (1000003*x) ^ *p++;
|
||||||
return -1;
|
x ^= PyUnicode_GET_SIZE(self);
|
||||||
self->hash = hash;
|
if (x == -1)
|
||||||
return hash;
|
x = -2;
|
||||||
|
self->hash = x;
|
||||||
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
static char index__doc__[] =
|
static char index__doc__[] =
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue