mirror of
https://github.com/python/cpython.git
synced 2025-10-17 12:18:23 +00:00
Encode surrogates in UTF-8 even for a wide Py_UNICODE.
Implement sys.maxunicode. Explicitly wrap around upper/lower computations for wide Py_UNICODE. When decoding large characters with UTF-8, represent expected test results using the \U notation.
This commit is contained in:
parent
236d8b7974
commit
ce9b5a55e1
5 changed files with 47 additions and 16 deletions
|
@@ -274,6 +274,9 @@ extern DL_IMPORT(int) PyUnicode_GetSize(
|
||||||
PyObject *unicode /* Unicode object */
|
PyObject *unicode /* Unicode object */
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/* Get the maximum ordinal for a Unicode character. */
|
||||||
|
extern DL_IMPORT(Py_UNICODE) PyUnicode_GetMax(void);
|
||||||
|
|
||||||
/* Resize an already allocated Unicode object to the new size length.
|
/* Resize an already allocated Unicode object to the new size length.
|
||||||
|
|
||||||
*unicode is modified to point to the new (resized) object and 0
|
*unicode is modified to point to the new (resized) object and 0
|
||||||
|
|
|
@@ -386,9 +386,9 @@ verify(u'\ud84d\udc56'.encode('utf-8') == \
|
||||||
''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
|
''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
|
||||||
# UTF-8 specific decoding tests
|
# UTF-8 specific decoding tests
|
||||||
verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
|
verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
|
||||||
'utf-8') == u'\ud84d\udc56' )
|
'utf-8') == u'\U00023456' )
|
||||||
verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
|
verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
|
||||||
'utf-8') == u'\ud800\udc02' )
|
'utf-8') == u'\U00010002' )
|
||||||
verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
|
verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
|
||||||
'utf-8') == u'\u20ac' )
|
'utf-8') == u'\u20ac' )
|
||||||
|
|
||||||
|
|
|
@@ -59,14 +59,21 @@ int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
|
||||||
/* Returns the titlecase Unicode characters corresponding to ch or just
|
/* Returns the titlecase Unicode characters corresponding to ch or just
|
||||||
ch if no titlecase mapping is known. */
|
ch if no titlecase mapping is known. */
|
||||||
|
|
||||||
Py_UNICODE _PyUnicode_ToTitlecase(register const Py_UNICODE ch)
|
Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
|
||||||
{
|
{
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
if (ctype->title)
|
if (ctype->title)
|
||||||
return ch + ctype->title;
|
ch += ctype->title;
|
||||||
|
else
|
||||||
|
ch += ctype->upper;
|
||||||
|
|
||||||
return ch + ctype->upper;
|
#ifdef USE_UCS4_STORAGE
|
||||||
|
/* The database assumes that the values wrap around at 0x10000. */
|
||||||
|
if (ch > 0x10000)
|
||||||
|
ch -= 0x10000;
|
||||||
|
#endif
|
||||||
|
return ch;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Returns 1 for Unicode characters having the category 'Lt', 0
|
/* Returns 1 for Unicode characters having the category 'Lt', 0
|
||||||
|
@@ -348,21 +355,33 @@ int _PyUnicode_IsUppercase(register const Py_UNICODE ch)
|
||||||
/* Returns the uppercase Unicode characters corresponding to ch or just
|
/* Returns the uppercase Unicode characters corresponding to ch or just
|
||||||
ch if no uppercase mapping is known. */
|
ch if no uppercase mapping is known. */
|
||||||
|
|
||||||
Py_UNICODE _PyUnicode_ToUppercase(register const Py_UNICODE ch)
|
Py_UNICODE _PyUnicode_ToUppercase(register Py_UNICODE ch)
|
||||||
{
|
{
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
return ch + ctype->upper;
|
ch += ctype->upper;
|
||||||
|
#ifdef USE_UCS4_STORAGE
|
||||||
|
/* The database assumes that the values wrap around at 0x10000. */
|
||||||
|
if (ch > 0x10000)
|
||||||
|
ch -= 0x10000;
|
||||||
|
#endif
|
||||||
|
return ch;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Returns the lowercase Unicode characters corresponding to ch or just
|
/* Returns the lowercase Unicode characters corresponding to ch or just
|
||||||
ch if no lowercase mapping is known. */
|
ch if no lowercase mapping is known. */
|
||||||
|
|
||||||
Py_UNICODE _PyUnicode_ToLowercase(register const Py_UNICODE ch)
|
Py_UNICODE _PyUnicode_ToLowercase(register Py_UNICODE ch)
|
||||||
{
|
{
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
return ch + ctype->lower;
|
ch += ctype->lower;
|
||||||
|
#ifdef USE_UCS4_STORAGE
|
||||||
|
/* The database assumes that the values wrap around at 0x10000. */
|
||||||
|
if (ch > 0x10000)
|
||||||
|
ch -= 0x10000;
|
||||||
|
#endif
|
||||||
|
return ch;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
|
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
|
||||||
|
|
|
@@ -103,6 +103,18 @@ static PyUnicodeObject *unicode_latin1[256];
|
||||||
*/
|
*/
|
||||||
static char unicode_default_encoding[100];
|
static char unicode_default_encoding[100];
|
||||||
|
|
||||||
|
Py_UNICODE
|
||||||
|
PyUnicode_GetMax()
|
||||||
|
{
|
||||||
|
#ifdef USE_UCS4_STORAGE
|
||||||
|
return 0x10FFFF;
|
||||||
|
#else
|
||||||
|
/* This is actually an illegal character, so it should
|
||||||
|
not be passed to unichr. */
|
||||||
|
return 0xFFFF;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
/* --- Unicode Object ----------------------------------------------------- */
|
/* --- Unicode Object ----------------------------------------------------- */
|
||||||
|
|
||||||
static
|
static
|
||||||
|
@@ -884,12 +896,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
cbWritten += 2;
|
cbWritten += 2;
|
||||||
}
|
}
|
||||||
else if (ch < 0x10000) {
|
else if (ch < 0x10000) {
|
||||||
#if Py_UNICODE_SIZE == 4
|
|
||||||
*p++ = 0xe0 | (ch>>12);
|
|
||||||
*p++ = 0x80 | ((ch>>6) & 0x3f);
|
|
||||||
*p++ = 0x80 | (ch & 0x3f);
|
|
||||||
cbWritten += 3;
|
|
||||||
#else
|
|
||||||
/* Check for high surrogate */
|
/* Check for high surrogate */
|
||||||
if (0xD800 <= ch && ch <= 0xDBFF) {
|
if (0xD800 <= ch && ch <= 0xDBFF) {
|
||||||
if (i != size) {
|
if (i != size) {
|
||||||
|
@@ -920,7 +926,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
}
|
}
|
||||||
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
||||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||||
#endif
|
|
||||||
} else {
|
} else {
|
||||||
*p++ = 0xf0 | (ch>>18);
|
*p++ = 0xf0 | (ch>>18);
|
||||||
*p++ = 0x80 | ((ch>>12) & 0x3f);
|
*p++ = 0x80 | ((ch>>12) & 0x3f);
|
||||||
|
|
|
@@ -533,6 +533,7 @@ exc_traceback -- traceback of exception currently being handled\n\
|
||||||
Static objects:\n\
|
Static objects:\n\
|
||||||
\n\
|
\n\
|
||||||
maxint -- the largest supported integer (the smallest is -maxint-1)\n\
|
maxint -- the largest supported integer (the smallest is -maxint-1)\n\
|
||||||
|
maxunicode -- the largest supported character\n\
|
||||||
builtin_module_names -- tuple of module names built into this intepreter\n\
|
builtin_module_names -- tuple of module names built into this intepreter\n\
|
||||||
version -- the version of this interpreter as a string\n\
|
version -- the version of this interpreter as a string\n\
|
||||||
version_info -- version information as a tuple\n\
|
version_info -- version information as a tuple\n\
|
||||||
|
@@ -643,6 +644,9 @@ _PySys_Init(void)
|
||||||
PyDict_SetItemString(sysdict, "maxint",
|
PyDict_SetItemString(sysdict, "maxint",
|
||||||
v = PyInt_FromLong(PyInt_GetMax()));
|
v = PyInt_FromLong(PyInt_GetMax()));
|
||||||
Py_XDECREF(v);
|
Py_XDECREF(v);
|
||||||
|
PyDict_SetItemString(sysdict, "maxunicode",
|
||||||
|
v = PyInt_FromLong(PyUnicode_GetMax()));
|
||||||
|
Py_XDECREF(v);
|
||||||
PyDict_SetItemString(sysdict, "builtin_module_names",
|
PyDict_SetItemString(sysdict, "builtin_module_names",
|
||||||
v = list_builtin_module_names());
|
v = list_builtin_module_names());
|
||||||
Py_XDECREF(v);
|
Py_XDECREF(v);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue