mirror of
https://github.com/python/cpython.git
synced 2025-10-22 22:53:06 +00:00
Port code page codec to Unicode API.
This commit is contained in:
parent
8ba79306d1
commit
3d325191bf
2 changed files with 73 additions and 62 deletions
|
@ -577,22 +577,18 @@ class CodecCallbackTest(unittest.TestCase):
|
||||||
UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
|
UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
|
||||||
("\\uffff", 1)
|
("\\uffff", 1)
|
||||||
)
|
)
|
||||||
if SIZEOF_WCHAR_T == 2:
|
|
||||||
len_wide = 2
|
|
||||||
else:
|
|
||||||
len_wide = 1
|
|
||||||
if SIZEOF_WCHAR_T > 0:
|
if SIZEOF_WCHAR_T > 0:
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
codecs.backslashreplace_errors(
|
codecs.backslashreplace_errors(
|
||||||
UnicodeEncodeError("ascii", "\U00010000",
|
UnicodeEncodeError("ascii", "\U00010000",
|
||||||
0, len_wide, "ouch")),
|
0, 1, "ouch")),
|
||||||
("\\U00010000", len_wide)
|
("\\U00010000", 1)
|
||||||
)
|
)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
codecs.backslashreplace_errors(
|
codecs.backslashreplace_errors(
|
||||||
UnicodeEncodeError("ascii", "\U0010ffff",
|
UnicodeEncodeError("ascii", "\U0010ffff",
|
||||||
0, len_wide, "ouch")),
|
0, 1, "ouch")),
|
||||||
("\\U0010ffff", len_wide)
|
("\\U0010ffff", 1)
|
||||||
)
|
)
|
||||||
# Lone surrogates (regardless of unicode width)
|
# Lone surrogates (regardless of unicode width)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
|
|
|
@ -4680,9 +4680,6 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
|
||||||
int kind;
|
int kind;
|
||||||
void *data;
|
void *data;
|
||||||
Py_ssize_t size;
|
Py_ssize_t size;
|
||||||
#if SIZEOF_WCHAR_T == 2
|
|
||||||
Py_ssize_t wchar_offset = 0;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (!PyUnicode_Check(unicode)) {
|
if (!PyUnicode_Check(unicode)) {
|
||||||
PyErr_BadArgument();
|
PyErr_BadArgument();
|
||||||
|
@ -4738,9 +4735,6 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
|
||||||
PyObject *rep;
|
PyObject *rep;
|
||||||
Py_ssize_t repsize, k, startpos;
|
Py_ssize_t repsize, k, startpos;
|
||||||
startpos = i-1;
|
startpos = i-1;
|
||||||
#if SIZEOF_WCHAR_T == 2
|
|
||||||
startpos += wchar_offset;
|
|
||||||
#endif
|
|
||||||
rep = unicode_encode_call_errorhandler(
|
rep = unicode_encode_call_errorhandler(
|
||||||
errors, &errorHandler, "utf-8", "surrogates not allowed",
|
errors, &errorHandler, "utf-8", "surrogates not allowed",
|
||||||
unicode, &exc, startpos, startpos+1, &newpos);
|
unicode, &exc, startpos, startpos+1, &newpos);
|
||||||
|
@ -4809,9 +4803,6 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
|
||||||
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
|
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
|
||||||
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
||||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||||
#if SIZEOF_WCHAR_T == 2
|
|
||||||
wchar_offset++;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7315,23 +7306,37 @@ encode_code_page_flags(UINT code_page, const char *errors)
|
||||||
*/
|
*/
|
||||||
static int
|
static int
|
||||||
encode_code_page_strict(UINT code_page, PyObject **outbytes,
|
encode_code_page_strict(UINT code_page, PyObject **outbytes,
|
||||||
const Py_UNICODE *p, const int size,
|
PyObject *unicode, Py_ssize_t offset, int len,
|
||||||
const char* errors)
|
const char* errors)
|
||||||
{
|
{
|
||||||
BOOL usedDefaultChar = FALSE;
|
BOOL usedDefaultChar = FALSE;
|
||||||
BOOL *pusedDefaultChar = &usedDefaultChar;
|
BOOL *pusedDefaultChar = &usedDefaultChar;
|
||||||
int outsize;
|
int outsize;
|
||||||
PyObject *exc = NULL;
|
PyObject *exc = NULL;
|
||||||
|
Py_UNICODE *p;
|
||||||
|
Py_ssize_t size;
|
||||||
const DWORD flags = encode_code_page_flags(code_page, NULL);
|
const DWORD flags = encode_code_page_flags(code_page, NULL);
|
||||||
char *out;
|
char *out;
|
||||||
|
/* Create a substring so that we can get the UTF-16 representation
|
||||||
|
of just the slice under consideration. */
|
||||||
|
PyObject *substring;
|
||||||
|
|
||||||
assert(size > 0);
|
assert(len > 0);
|
||||||
|
|
||||||
if (code_page != CP_UTF8 && code_page != CP_UTF7)
|
if (code_page != CP_UTF8 && code_page != CP_UTF7)
|
||||||
pusedDefaultChar = &usedDefaultChar;
|
pusedDefaultChar = &usedDefaultChar;
|
||||||
else
|
else
|
||||||
pusedDefaultChar = NULL;
|
pusedDefaultChar = NULL;
|
||||||
|
|
||||||
|
substring = PyUnicode_Substring(unicode, offset, offset+len);
|
||||||
|
if (substring == NULL)
|
||||||
|
return -1;
|
||||||
|
p = PyUnicode_AsUnicodeAndSize(substring, &size);
|
||||||
|
if (p == NULL) {
|
||||||
|
Py_DECREF(substring);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
/* First get the size of the result */
|
/* First get the size of the result */
|
||||||
outsize = WideCharToMultiByte(code_page, flags,
|
outsize = WideCharToMultiByte(code_page, flags,
|
||||||
p, size,
|
p, size,
|
||||||
|
@ -7340,14 +7345,18 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
|
||||||
if (outsize <= 0)
|
if (outsize <= 0)
|
||||||
goto error;
|
goto error;
|
||||||
/* If we used a default char, then we failed! */
|
/* If we used a default char, then we failed! */
|
||||||
if (pusedDefaultChar && *pusedDefaultChar)
|
if (pusedDefaultChar && *pusedDefaultChar) {
|
||||||
|
Py_DECREF(substring);
|
||||||
return -2;
|
return -2;
|
||||||
|
}
|
||||||
|
|
||||||
if (*outbytes == NULL) {
|
if (*outbytes == NULL) {
|
||||||
/* Create string object */
|
/* Create string object */
|
||||||
*outbytes = PyBytes_FromStringAndSize(NULL, outsize);
|
*outbytes = PyBytes_FromStringAndSize(NULL, outsize);
|
||||||
if (*outbytes == NULL)
|
if (*outbytes == NULL) {
|
||||||
|
Py_DECREF(substring);
|
||||||
return -1;
|
return -1;
|
||||||
|
}
|
||||||
out = PyBytes_AS_STRING(*outbytes);
|
out = PyBytes_AS_STRING(*outbytes);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
@ -7355,10 +7364,13 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
|
||||||
const Py_ssize_t n = PyBytes_Size(*outbytes);
|
const Py_ssize_t n = PyBytes_Size(*outbytes);
|
||||||
if (outsize > PY_SSIZE_T_MAX - n) {
|
if (outsize > PY_SSIZE_T_MAX - n) {
|
||||||
PyErr_NoMemory();
|
PyErr_NoMemory();
|
||||||
|
Py_DECREF(substring);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
if (_PyBytes_Resize(outbytes, n + outsize) < 0)
|
if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
|
||||||
|
Py_DECREF(substring);
|
||||||
return -1;
|
return -1;
|
||||||
|
}
|
||||||
out = PyBytes_AS_STRING(*outbytes) + n;
|
out = PyBytes_AS_STRING(*outbytes) + n;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7367,6 +7379,7 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
|
||||||
p, size,
|
p, size,
|
||||||
out, outsize,
|
out, outsize,
|
||||||
NULL, pusedDefaultChar);
|
NULL, pusedDefaultChar);
|
||||||
|
Py_CLEAR(substring);
|
||||||
if (outsize <= 0)
|
if (outsize <= 0)
|
||||||
goto error;
|
goto error;
|
||||||
if (pusedDefaultChar && *pusedDefaultChar)
|
if (pusedDefaultChar && *pusedDefaultChar)
|
||||||
|
@ -7374,6 +7387,7 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
error:
|
error:
|
||||||
|
Py_XDECREF(substring);
|
||||||
if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
|
if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
|
||||||
return -2;
|
return -2;
|
||||||
PyErr_SetFromWindowsErr(0);
|
PyErr_SetFromWindowsErr(0);
|
||||||
|
@ -7390,12 +7404,11 @@ error:
|
||||||
static int
|
static int
|
||||||
encode_code_page_errors(UINT code_page, PyObject **outbytes,
|
encode_code_page_errors(UINT code_page, PyObject **outbytes,
|
||||||
PyObject *unicode, Py_ssize_t unicode_offset,
|
PyObject *unicode, Py_ssize_t unicode_offset,
|
||||||
const Py_UNICODE *in, const int insize,
|
Py_ssize_t insize, const char* errors)
|
||||||
const char* errors)
|
|
||||||
{
|
{
|
||||||
const DWORD flags = encode_code_page_flags(code_page, errors);
|
const DWORD flags = encode_code_page_flags(code_page, errors);
|
||||||
const Py_UNICODE *startin = in;
|
Py_ssize_t pos = unicode_offset;
|
||||||
const Py_UNICODE *endin = in + insize;
|
Py_ssize_t endin = unicode_offset + insize;
|
||||||
/* Ideally, we should get reason from FormatMessage. This is the Windows
|
/* Ideally, we should get reason from FormatMessage. This is the Windows
|
||||||
2000 English version of the message. */
|
2000 English version of the message. */
|
||||||
const char *reason = "invalid character";
|
const char *reason = "invalid character";
|
||||||
|
@ -7404,12 +7417,11 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
|
||||||
BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
|
BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
|
||||||
Py_ssize_t outsize;
|
Py_ssize_t outsize;
|
||||||
char *out;
|
char *out;
|
||||||
int charsize;
|
|
||||||
PyObject *errorHandler = NULL;
|
PyObject *errorHandler = NULL;
|
||||||
PyObject *exc = NULL;
|
PyObject *exc = NULL;
|
||||||
PyObject *encoding_obj = NULL;
|
PyObject *encoding_obj = NULL;
|
||||||
char *encoding;
|
char *encoding;
|
||||||
Py_ssize_t startpos, newpos, newoutsize;
|
Py_ssize_t newpos, newoutsize;
|
||||||
PyObject *rep;
|
PyObject *rep;
|
||||||
int ret = -1;
|
int ret = -1;
|
||||||
|
|
||||||
|
@ -7422,7 +7434,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
|
||||||
if (errors == NULL || strcmp(errors, "strict") == 0) {
|
if (errors == NULL || strcmp(errors, "strict") == 0) {
|
||||||
/* The last error was ERROR_NO_UNICODE_TRANSLATION,
|
/* The last error was ERROR_NO_UNICODE_TRANSLATION,
|
||||||
then we raise a UnicodeEncodeError. */
|
then we raise a UnicodeEncodeError. */
|
||||||
make_encode_exception(&exc, encoding, in, insize, 0, 0, reason);
|
make_encode_exception_obj(&exc, encoding, unicode, 0, 0, reason);
|
||||||
if (exc != NULL) {
|
if (exc != NULL) {
|
||||||
PyCodec_StrictErrors(exc);
|
PyCodec_StrictErrors(exc);
|
||||||
Py_DECREF(exc);
|
Py_DECREF(exc);
|
||||||
|
@ -7462,23 +7474,30 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Encode the string character per character */
|
/* Encode the string character per character */
|
||||||
while (in < endin)
|
while (pos < endin)
|
||||||
{
|
{
|
||||||
if ((in + 2) <= endin
|
Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
|
||||||
&& 0xD800 <= in[0] && in[0] <= 0xDBFF
|
wchar_t chars[2];
|
||||||
&& 0xDC00 <= in[1] && in[1] <= 0xDFFF)
|
int charsize;
|
||||||
charsize = 2;
|
if (ch < 0x10000) {
|
||||||
else
|
chars[0] = (wchar_t)ch;
|
||||||
charsize = 1;
|
charsize = 1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
ch -= 0x10000;
|
||||||
|
chars[0] = 0xd800 + (ch >> 10);
|
||||||
|
chars[1] = 0xdc00 + (ch & 0x3ff);
|
||||||
|
charsize = 2;
|
||||||
|
}
|
||||||
|
|
||||||
outsize = WideCharToMultiByte(code_page, flags,
|
outsize = WideCharToMultiByte(code_page, flags,
|
||||||
in, charsize,
|
chars, charsize,
|
||||||
buffer, Py_ARRAY_LENGTH(buffer),
|
buffer, Py_ARRAY_LENGTH(buffer),
|
||||||
NULL, pusedDefaultChar);
|
NULL, pusedDefaultChar);
|
||||||
if (outsize > 0) {
|
if (outsize > 0) {
|
||||||
if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
|
if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
|
||||||
{
|
{
|
||||||
in += charsize;
|
pos++;
|
||||||
memcpy(out, buffer, outsize);
|
memcpy(out, buffer, outsize);
|
||||||
out += outsize;
|
out += outsize;
|
||||||
continue;
|
continue;
|
||||||
|
@ -7489,15 +7508,13 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
charsize = Py_MAX(charsize - 1, 1);
|
|
||||||
startpos = unicode_offset + in - startin;
|
|
||||||
rep = unicode_encode_call_errorhandler(
|
rep = unicode_encode_call_errorhandler(
|
||||||
errors, &errorHandler, encoding, reason,
|
errors, &errorHandler, encoding, reason,
|
||||||
unicode, &exc,
|
unicode, &exc,
|
||||||
startpos, startpos + charsize, &newpos);
|
pos, pos + 1, &newpos);
|
||||||
if (rep == NULL)
|
if (rep == NULL)
|
||||||
goto error;
|
goto error;
|
||||||
in += (newpos - startpos);
|
pos = newpos;
|
||||||
|
|
||||||
if (PyBytes_Check(rep)) {
|
if (PyBytes_Check(rep)) {
|
||||||
outsize = PyBytes_GET_SIZE(rep);
|
outsize = PyBytes_GET_SIZE(rep);
|
||||||
|
@ -7538,10 +7555,9 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
|
||||||
for (i=0; i < outsize; i++) {
|
for (i=0; i < outsize; i++) {
|
||||||
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
||||||
if (ch > 127) {
|
if (ch > 127) {
|
||||||
raise_encode_exception(&exc,
|
raise_encode_exception_obj(&exc,
|
||||||
encoding,
|
encoding, unicode,
|
||||||
startin, insize,
|
pos, pos + 1,
|
||||||
startpos, startpos + charsize,
|
|
||||||
"unable to encode error handler result to ASCII");
|
"unable to encode error handler result to ASCII");
|
||||||
Py_DECREF(rep);
|
Py_DECREF(rep);
|
||||||
goto error;
|
goto error;
|
||||||
|
@ -7572,55 +7588,54 @@ encode_code_page(int code_page,
|
||||||
PyObject *unicode,
|
PyObject *unicode,
|
||||||
const char *errors)
|
const char *errors)
|
||||||
{
|
{
|
||||||
const Py_UNICODE *p;
|
Py_ssize_t len;
|
||||||
Py_ssize_t size;
|
|
||||||
PyObject *outbytes = NULL;
|
PyObject *outbytes = NULL;
|
||||||
Py_ssize_t offset;
|
Py_ssize_t offset;
|
||||||
int chunk_len, ret, done;
|
int chunk_len, ret, done;
|
||||||
|
|
||||||
p = PyUnicode_AsUnicodeAndSize(unicode, &size);
|
if (PyUnicode_READY(unicode) < 0)
|
||||||
if (p == NULL)
|
return NULL;
|
||||||
return NULL;
|
len = PyUnicode_GET_LENGTH(unicode);
|
||||||
|
|
||||||
if (code_page < 0) {
|
if (code_page < 0) {
|
||||||
PyErr_SetString(PyExc_ValueError, "invalid code page number");
|
PyErr_SetString(PyExc_ValueError, "invalid code page number");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (size == 0)
|
if (len == 0)
|
||||||
return PyBytes_FromStringAndSize(NULL, 0);
|
return PyBytes_FromStringAndSize(NULL, 0);
|
||||||
|
|
||||||
offset = 0;
|
offset = 0;
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
#ifdef NEED_RETRY
|
#ifdef NEED_RETRY
|
||||||
if (size > INT_MAX) {
|
/* UTF-16 encoding may double the size, so use only INT_MAX/2
|
||||||
chunk_len = INT_MAX;
|
chunks. */
|
||||||
|
if (len > INT_MAX/2) {
|
||||||
|
chunk_len = INT_MAX/2;
|
||||||
done = 0;
|
done = 0;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
chunk_len = (int)size;
|
chunk_len = (int)len;
|
||||||
done = 1;
|
done = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = encode_code_page_strict(code_page, &outbytes,
|
ret = encode_code_page_strict(code_page, &outbytes,
|
||||||
p, chunk_len,
|
unicode, offset, chunk_len,
|
||||||
errors);
|
errors);
|
||||||
if (ret == -2)
|
if (ret == -2)
|
||||||
ret = encode_code_page_errors(code_page, &outbytes,
|
ret = encode_code_page_errors(code_page, &outbytes,
|
||||||
unicode, offset,
|
unicode, offset,
|
||||||
p, chunk_len,
|
chunk_len, errors);
|
||||||
errors);
|
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
Py_XDECREF(outbytes);
|
Py_XDECREF(outbytes);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
p += chunk_len;
|
|
||||||
offset += chunk_len;
|
offset += chunk_len;
|
||||||
size -= chunk_len;
|
len -= chunk_len;
|
||||||
} while (!done);
|
} while (!done);
|
||||||
|
|
||||||
return outbytes;
|
return outbytes;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue