Removing UTF-16 aware Unicode comparison code. This kind of compare

function (together with other locale aware ones) should go into a new collation
support module. See python-dev for a discussion of this removal.

Note: This patch should also be applied to the 1.6 branch.
This commit is contained in:
Marc-André Lemburg 2000-08-08 08:04:29 +00:00
parent 5660639f9f
commit e5034378cc
3 changed files with 83 additions and 48 deletions

View file

@ -1,6 +1,5 @@
test_unicode test_unicode
Testing Unicode comparisons... done. Testing Unicode comparisons... done.
Testing UTF-16 code point order comparisons... done.
Testing Unicode contains method... done. Testing Unicode contains method... done.
Testing Unicode formatting strings... done. Testing Unicode formatting strings... done.
Testing builtin codecs... done. Testing builtin codecs... done.

View file

@ -168,56 +168,59 @@ assert 'abc' < u'abcd'
assert u'abc' < u'abcd' assert u'abc' < u'abcd'
print 'done.' print 'done.'
print 'Testing UTF-16 code point order comparisons...', if 0:
#No surrogates, no fixup required. # Move these tests to a Unicode collation module test...
assert u'\u0061' < u'\u20ac'
# Non surrogate below surrogate value, no fixup required
assert u'\u0061' < u'\ud800\udc02'
# Non surrogate above surrogate value, fixup required print 'Testing UTF-16 code point order comparisons...',
def test_lecmp(s, s2): #No surrogates, no fixup required.
assert s < s2 , "comparison failed on %s < %s" % (s, s2) assert u'\u0061' < u'\u20ac'
# Non surrogate below surrogate value, no fixup required
def test_fixup(s): assert u'\u0061' < u'\ud800\udc02'
s2 = u'\ud800\udc01'
test_lecmp(s, s2)
s2 = u'\ud900\udc01'
test_lecmp(s, s2)
s2 = u'\uda00\udc01'
test_lecmp(s, s2)
s2 = u'\udb00\udc01'
test_lecmp(s, s2)
s2 = u'\ud800\udd01'
test_lecmp(s, s2)
s2 = u'\ud900\udd01'
test_lecmp(s, s2)
s2 = u'\uda00\udd01'
test_lecmp(s, s2)
s2 = u'\udb00\udd01'
test_lecmp(s, s2)
s2 = u'\ud800\ude01'
test_lecmp(s, s2)
s2 = u'\ud900\ude01'
test_lecmp(s, s2)
s2 = u'\uda00\ude01'
test_lecmp(s, s2)
s2 = u'\udb00\ude01'
test_lecmp(s, s2)
s2 = u'\ud800\udfff'
test_lecmp(s, s2)
s2 = u'\ud900\udfff'
test_lecmp(s, s2)
s2 = u'\uda00\udfff'
test_lecmp(s, s2)
s2 = u'\udb00\udfff'
test_lecmp(s, s2)
test_fixup(u'\ue000') # Non surrogate above surrogate value, fixup required
test_fixup(u'\uff61') def test_lecmp(s, s2):
assert s < s2 , "comparison failed on %s < %s" % (s, s2)
# Surrogates on both sides, no fixup required def test_fixup(s):
assert u'\ud800\udc02' < u'\ud84d\udc56' s2 = u'\ud800\udc01'
print 'done.' test_lecmp(s, s2)
s2 = u'\ud900\udc01'
test_lecmp(s, s2)
s2 = u'\uda00\udc01'
test_lecmp(s, s2)
s2 = u'\udb00\udc01'
test_lecmp(s, s2)
s2 = u'\ud800\udd01'
test_lecmp(s, s2)
s2 = u'\ud900\udd01'
test_lecmp(s, s2)
s2 = u'\uda00\udd01'
test_lecmp(s, s2)
s2 = u'\udb00\udd01'
test_lecmp(s, s2)
s2 = u'\ud800\ude01'
test_lecmp(s, s2)
s2 = u'\ud900\ude01'
test_lecmp(s, s2)
s2 = u'\uda00\ude01'
test_lecmp(s, s2)
s2 = u'\udb00\ude01'
test_lecmp(s, s2)
s2 = u'\ud800\udfff'
test_lecmp(s, s2)
s2 = u'\ud900\udfff'
test_lecmp(s, s2)
s2 = u'\uda00\udfff'
test_lecmp(s, s2)
s2 = u'\udb00\udfff'
test_lecmp(s, s2)
test_fixup(u'\ue000')
test_fixup(u'\uff61')
# Surrogates on both sides, no fixup required
assert u'\ud800\udc02' < u'\ud84d\udc56'
print 'done.'
test('ljust', u'abc', u'abc ', 10) test('ljust', u'abc', u'abc ', 10)
test('rjust', u'abc', u' abc', 10) test('rjust', u'abc', u' abc', 10)

View file

@ -3169,6 +3169,12 @@ unicode_center(PyUnicodeObject *self, PyObject *args)
return (PyObject*) pad(self, left, marg - left, ' '); return (PyObject*) pad(self, left, marg - left, ' ');
} }
#if 0
/* This code should go into some future Unicode collation support
module. The basic comparison should compare ordinals on a naive
basis (this is what Java does and thus JPython too). */
/* speedy UTF-16 code point order comparison */ /* speedy UTF-16 code point order comparison */
/* gleaned from: */ /* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
@ -3213,6 +3219,33 @@ unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
return (len1 < len2) ? -1 : (len1 != len2); return (len1 < len2) ? -1 : (len1 != len2);
} }
#else
/* Naive ordinal comparison of two Unicode objects.

   The Py_UNICODE elements of both strings are compared pairwise by
   their numeric value, without any surrogate fix-up.  The first pair
   that differs decides the result.  If every element of the shared
   prefix matches, the shorter string orders first.

   Returns -1 if str1 orders before str2, 1 if it orders after, and
   0 if both hold the same elements and have the same length. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    const Py_UNICODE *left = str1->str;
    const Py_UNICODE *right = str2->str;
    int left_len = str1->length;
    int right_len = str2->length;
    /* Only the part both strings have in common can be compared
       element by element. */
    int shared = (left_len < right_len) ? left_len : right_len;
    int i;

    for (i = 0; i < shared; i++) {
        /* Widen to long before subtracting so the sign of the
           difference reflects the ordering of the two ordinals. */
        long delta = (long)left[i] - (long)right[i];

        if (delta < 0)
            return -1;
        if (delta > 0)
            return 1;
    }

    /* Shared prefix is identical: the lengths break the tie. */
    if (left_len < right_len)
        return -1;
    return (left_len != right_len);
}
#endif
int PyUnicode_Compare(PyObject *left, int PyUnicode_Compare(PyObject *left,
PyObject *right) PyObject *right)
{ {