mirror of
https://github.com/python/cpython.git
synced 2025-08-30 13:38:43 +00:00
New surrogate support in the UTF-8 codec. By Bill Tutt.
This commit is contained in:
parent
d6d06ade26
commit
e12896ec98
1 changed files with 80 additions and 29 deletions
|
@ -657,10 +657,10 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
||||||
e = s + size;
|
e = s + size;
|
||||||
|
|
||||||
while (s < e) {
|
while (s < e) {
|
||||||
register Py_UNICODE ch = (unsigned char)*s;
|
Py_UCS4 ch = (unsigned char)*s;
|
||||||
|
|
||||||
if (ch < 0x80) {
|
if (ch < 0x80) {
|
||||||
*p++ = ch;
|
*p++ = (Py_UNICODE)ch;
|
||||||
s++;
|
s++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -687,7 +687,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
||||||
if (ch < 0x80)
|
if (ch < 0x80)
|
||||||
UTF8_ERROR("illegal encoding");
|
UTF8_ERROR("illegal encoding");
|
||||||
else
|
else
|
||||||
*p++ = ch;
|
*p++ = (Py_UNICODE)ch;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 3:
|
case 3:
|
||||||
|
@ -698,7 +698,30 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
||||||
if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
|
if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
|
||||||
UTF8_ERROR("illegal encoding");
|
UTF8_ERROR("illegal encoding");
|
||||||
else
|
else
|
||||||
*p++ = ch;
|
*p++ = (Py_UNICODE)ch;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 4:
|
||||||
|
if ((s[1] & 0xc0) != 0x80 ||
|
||||||
|
(s[2] & 0xc0) != 0x80 ||
|
||||||
|
(s[3] & 0xc0) != 0x80)
|
||||||
|
UTF8_ERROR("invalid data");
|
||||||
|
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
|
||||||
|
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
|
||||||
|
/* validate and convert to UTF-16 */
|
||||||
|
if ((ch < 0x10000) || /* minimum value allowed for 4 byte encoding */
|
||||||
|
(ch > 0x10ffff)) /* maximum value allowed for UTF-16 */
|
||||||
|
UTF8_ERROR("illegal encoding");
|
||||||
|
/* compute and append the two surrogates: */
|
||||||
|
|
||||||
|
/* translate from 10000..10FFFF to 0..FFFF */
|
||||||
|
ch -= 0x10000;
|
||||||
|
|
||||||
|
/* high surrogate = top 10 bits added to D800 */
|
||||||
|
*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
|
||||||
|
|
||||||
|
/* low surrogate = bottom 10 bits added to DC00 */
|
||||||
|
*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
|
@ -758,32 +781,60 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
PyObject *v;
|
PyObject *v;
|
||||||
char *p;
|
char *p;
|
||||||
char *q;
|
char *q;
|
||||||
|
Py_UCS4 ch2;
|
||||||
|
unsigned int cbAllocated = 3 * size;
|
||||||
|
unsigned int cbWritten = 0;
|
||||||
|
int i = 0;
|
||||||
|
|
||||||
v = PyString_FromStringAndSize(NULL, 3 * size);
|
v = PyString_FromStringAndSize(NULL, cbAllocated);
|
||||||
if (v == NULL)
|
if (v == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
if (size == 0)
|
if (size == 0)
|
||||||
goto done;
|
goto done;
|
||||||
|
|
||||||
p = q = PyString_AS_STRING(v);
|
p = q = PyString_AS_STRING(v);
|
||||||
while (size-- > 0) {
|
while (i < size) {
|
||||||
Py_UNICODE ch = *s++;
|
Py_UCS4 ch = s[i++];
|
||||||
if (ch < 0x80)
|
if (ch < 0x80) {
|
||||||
*p++ = (char) ch;
|
*p++ = (char) ch;
|
||||||
|
cbWritten++;
|
||||||
|
}
|
||||||
else if (ch < 0x0800) {
|
else if (ch < 0x0800) {
|
||||||
*p++ = 0xc0 | (ch >> 6);
|
*p++ = 0xc0 | (ch >> 6);
|
||||||
*p++ = 0x80 | (ch & 0x3f);
|
*p++ = 0x80 | (ch & 0x3f);
|
||||||
} else if (0xD800 <= ch && ch <= 0xDFFF) {
|
cbWritten += 2;
|
||||||
/* These byte ranges are reserved for UTF-16 surrogate
|
}
|
||||||
bytes which the Python implementation currently does
|
else {
|
||||||
not support. */
|
/* Check for high surrogate */
|
||||||
if (utf8_encoding_error(&s, &p, errors,
|
if (0xD800 <= ch && ch <= 0xDBFF) {
|
||||||
"unsupported code range"))
|
if (i != size) {
|
||||||
|
ch2 = s[i];
|
||||||
|
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
|
||||||
|
|
||||||
|
if (cbWritten >= (cbAllocated - 4)) {
|
||||||
|
/* Provide enough room for some more
|
||||||
|
surrogates */
|
||||||
|
cbAllocated += 4*10;
|
||||||
|
if (_PyString_Resize(&v, cbAllocated))
|
||||||
goto onError;
|
goto onError;
|
||||||
} else {
|
}
|
||||||
*p++ = 0xe0 | (ch >> 12);
|
|
||||||
*p++ = 0x80 | ((ch >> 6) & 0x3f);
|
/* combine the two values */
|
||||||
*p++ = 0x80 | (ch & 0x3f);
|
ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
|
||||||
|
|
||||||
|
*p++ = (char)((ch >> 18) | 0xf0);
|
||||||
|
*p++ = (char)(0x80 | (ch >> 12) & 0x3f);
|
||||||
|
i++;
|
||||||
|
cbWritten += 4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
*p++ = (char)(0xe0 | (ch >> 12));
|
||||||
|
cbWritten += 3;
|
||||||
|
}
|
||||||
|
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
||||||
|
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*p = '\0';
|
*p = '\0';
|
||||||
|
@ -1217,7 +1268,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
{
|
{
|
||||||
const char *start = s + 1;
|
const char *start = s + 1;
|
||||||
const char *endBrace = start;
|
const char *endBrace = start;
|
||||||
unsigned int uiValue;
|
Py_UCS4 value;
|
||||||
unsigned long j;
|
unsigned long j;
|
||||||
|
|
||||||
/* look for either the closing brace, or we
|
/* look for either the closing brace, or we
|
||||||
|
@ -1248,25 +1299,25 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
}
|
}
|
||||||
goto ucnFallthrough;
|
goto ucnFallthrough;
|
||||||
}
|
}
|
||||||
uiValue = ((_Py_UnicodeCharacterName *)
|
value = ((_Py_UnicodeCharacterName *)
|
||||||
(pucnHash->getValue(j)))->uiValue;
|
(pucnHash->getValue(j)))->value;
|
||||||
if (uiValue < 1<<16)
|
if (value < 1<<16)
|
||||||
{
|
{
|
||||||
/* In UCS-2 range, easy solution.. */
|
/* In UCS-2 range, easy solution.. */
|
||||||
*p++ = uiValue;
|
*p++ = value;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* Oops, its in UCS-4 space, */
|
/* Oops, its in UCS-4 space, */
|
||||||
/* compute and append the two surrogates: */
|
/* compute and append the two surrogates: */
|
||||||
/* translate from 10000..10FFFF to 0..FFFFF */
|
/* translate from 10000..10FFFF to 0..FFFFF */
|
||||||
uiValue -= 0x10000;
|
value -= 0x10000;
|
||||||
|
|
||||||
/* high surrogate = top 10 bits added to D800 */
|
/* high surrogate = top 10 bits added to D800 */
|
||||||
*p++ = 0xD800 + (uiValue >> 10);
|
*p++ = 0xD800 + (value >> 10);
|
||||||
|
|
||||||
/* low surrogate = bottom 10 bits added to DC00 */
|
/* low surrogate = bottom 10 bits added to DC00 */
|
||||||
*p++ = 0xDC00 + (uiValue & ~0xFC00);
|
*p++ = 0xDC00 + (value & ~0xFC00);
|
||||||
}
|
}
|
||||||
s = endBrace + 1;
|
s = endBrace + 1;
|
||||||
}
|
}
|
||||||
|
@ -3091,12 +3142,12 @@ unicode_center(PyUnicodeObject *self, PyObject *args)
|
||||||
/* gleaned from: */
|
/* gleaned from: */
|
||||||
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
|
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
|
||||||
|
|
||||||
static unsigned long utf16Fixup[32] =
|
static short utf16Fixup[32] =
|
||||||
{
|
{
|
||||||
0, 0, 0, 0, 0, 0, 0, 0,
|
0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
0, 0, 0, 0, 0, 0, 0, 0,
|
0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
0, 0, 0, 0, 0, 0, 0, 0,
|
0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
0, 0, 0, 0x2000, 0xf800, 0xf800, 0xf800, 0xf800
|
0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
|
||||||
};
|
};
|
||||||
|
|
||||||
static int
|
static int
|
||||||
|
@ -3111,7 +3162,7 @@ unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
|
||||||
len2 = str2->length;
|
len2 = str2->length;
|
||||||
|
|
||||||
while (len1 > 0 && len2 > 0) {
|
while (len1 > 0 && len2 > 0) {
|
||||||
unsigned long c1, c2;
|
Py_UNICODE c1, c2;
|
||||||
long diff;
|
long diff;
|
||||||
|
|
||||||
c1 = *s1++;
|
c1 = *s1++;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue