Issue #5915: Implement PEP 383, Non-decodable Bytes in

System Character Interfaces.
This commit is contained in:
Martin v. Löwis 2009-05-05 04:43:17 +00:00
parent 93f65a177b
commit 011e842033
15 changed files with 726 additions and 289 deletions

View file

@ -829,6 +829,82 @@ PyCodec_SurrogateErrors(PyObject *exc)
}
}
static PyObject *
PyCodec_UTF8bErrors(PyObject *exc)
{
PyObject *restuple;
PyObject *object;
Py_ssize_t start;
Py_ssize_t end;
PyObject *res;
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Py_UNICODE *p;
Py_UNICODE *startp;
char *outp;
if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL;
startp = PyUnicode_AS_UNICODE(object);
res = PyBytes_FromStringAndSize(NULL, end-start);
if (!res) {
Py_DECREF(object);
return NULL;
}
outp = PyBytes_AsString(res);
for (p = startp+start; p < startp+end; p++) {
Py_UNICODE ch = *p;
if (ch < 0xdc80 || ch > 0xdcff) {
/* Not a UTF-8b surrogate, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Py_DECREF(res);
Py_DECREF(object);
return NULL;
}
*outp++ = ch - 0xdc00;
}
restuple = Py_BuildValue("(On)", res, end);
Py_DECREF(res);
Py_DECREF(object);
return restuple;
}
else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
unsigned char *p;
Py_UNICODE ch[4]; /* decode up to 4 bad bytes. */
int consumed = 0;
if (PyUnicodeDecodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeDecodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
return NULL;
if (!(p = (unsigned char*)PyBytes_AsString(object))) {
Py_DECREF(object);
return NULL;
}
while (consumed < 4 && consumed < end-start) {
/* Refuse to escape ASCII bytes. */
if (p[start+consumed] < 128)
break;
ch[consumed] = 0xdc00 + p[start+consumed];
consumed++;
}
Py_DECREF(object);
if (!consumed) {
/* codec complained about ASCII byte. */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL;
}
return Py_BuildValue("(u#n)", ch, consumed, start+consumed);
}
else {
wrong_exception_type(exc);
return NULL;
}
}
static PyObject *strict_errors(PyObject *self, PyObject *exc)
{
@ -864,6 +940,11 @@ static PyObject *surrogates_errors(PyObject *self, PyObject *exc)
return PyCodec_SurrogateErrors(exc);
}
static PyObject *utf8b_errors(PyObject *self, PyObject *exc)
{
return PyCodec_UTF8bErrors(exc);
}
static int _PyCodecRegistry_Init(void)
{
static struct {
@ -918,6 +999,14 @@ static int _PyCodecRegistry_Init(void)
surrogates_errors,
METH_O
}
},
{
"utf8b",
{
"utf8b",
utf8b_errors,
METH_O
}
}
};

View file

@ -262,6 +262,22 @@ Py_InitializeEx(int install_sigs)
_PyImportHooks_Init();
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
/* On Unix, set the file system encoding according to the
user's preference, if the CODESET names a well-known
Python codec, and Py_FileSystemDefaultEncoding isn't
initialized by other means. Also set the encoding of
stdin and stdout if these are terminals. */
codeset = get_codeset();
if (codeset) {
if (!Py_FileSystemDefaultEncoding)
Py_FileSystemDefaultEncoding = codeset;
else
free(codeset);
}
#endif
if (install_sigs)
initsigs(); /* Signal handling stuff, including initintr() */
@ -285,22 +301,6 @@ Py_InitializeEx(int install_sigs)
#ifdef WITH_THREAD
_PyGILState_Init(interp, tstate);
#endif /* WITH_THREAD */
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
/* On Unix, set the file system encoding according to the
user's preference, if the CODESET names a well-known
Python codec, and Py_FileSystemDefaultEncoding isn't
initialized by other means. Also set the encoding of
stdin and stdout if these are terminals. */
codeset = get_codeset();
if (codeset) {
if (!Py_FileSystemDefaultEncoding)
Py_FileSystemDefaultEncoding = codeset;
else
free(codeset);
}
#endif
}
void