mirror of
https://github.com/python/cpython.git
synced 2025-07-12 13:55:34 +00:00
SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support
decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument, chars, which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument, keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful.
This commit is contained in:
parent
a708d6e3b0
commit
69652035bc
12 changed files with 419 additions and 173 deletions
|
@ -1135,6 +1135,14 @@ char utf8_code_length[256] = {
|
|||
/* Decode `size` bytes of UTF-8 data starting at `s`.

   Thin wrapper around PyUnicode_DecodeUTF8Stateful() that passes NULL
   for `consumed`. With `consumed` NULL, the stateful decoder treats a
   multi-byte sequence that runs past the end of the input as an
   "unexpected end of data" error rather than stopping before it.

   `errors` is forwarded unchanged. Returns whatever the stateful
   decoder returns. */
PyObject *PyUnicode_DecodeUTF8(const char *s,
|
||||
int size,
|
||||
const char *errors)
|
||||
{
|
||||
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
|
||||
}
|
||||
|
||||
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
|
||||
int size,
|
||||
const char *errors,
|
||||
int *consumed)
|
||||
{
|
||||
const char *starts = s;
|
||||
int n;
|
||||
|
@ -1153,8 +1161,11 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
|||
unicode = _PyUnicode_New(size);
|
||||
if (!unicode)
|
||||
return NULL;
|
||||
if (size == 0)
|
||||
if (size == 0) {
|
||||
if (consumed)
|
||||
*consumed = 0;
|
||||
return (PyObject *)unicode;
|
||||
}
|
||||
|
||||
/* Unpack UTF-8 encoded data */
|
||||
p = unicode->str;
|
||||
|
@ -1172,10 +1183,14 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
|||
n = utf8_code_length[ch];
|
||||
|
||||
if (s + n > e) {
|
||||
errmsg = "unexpected end of data";
|
||||
startinpos = s-starts;
|
||||
endinpos = size;
|
||||
goto utf8Error;
|
||||
if (consumed)
|
||||
break;
|
||||
else {
|
||||
errmsg = "unexpected end of data";
|
||||
startinpos = s-starts;
|
||||
endinpos = size;
|
||||
goto utf8Error;
|
||||
}
|
||||
}
|
||||
|
||||
switch (n) {
|
||||
|
@ -1293,6 +1308,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
|||
(PyObject **)&unicode, &outpos, &p))
|
||||
goto onError;
|
||||
}
|
||||
if (consumed)
|
||||
*consumed = s-starts;
|
||||
|
||||
/* Adjust length */
|
||||
if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
|
||||
|
@ -1427,6 +1444,16 @@ PyUnicode_DecodeUTF16(const char *s,
|
|||
int size,
|
||||
const char *errors,
|
||||
int *byteorder)
|
||||
{
|
||||
return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
|
||||
}
|
||||
|
||||
PyObject *
|
||||
PyUnicode_DecodeUTF16Stateful(const char *s,
|
||||
int size,
|
||||
const char *errors,
|
||||
int *byteorder,
|
||||
int *consumed)
|
||||
{
|
||||
const char *starts = s;
|
||||
int startinpos;
|
||||
|
@ -1467,26 +1494,28 @@ PyUnicode_DecodeUTF16(const char *s,
|
|||
mark is skipped, in all other modes, it is copied to the output
|
||||
stream as-is (giving a ZWNBSP character). */
|
||||
if (bo == 0) {
|
||||
const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
|
||||
if (size >= 2) {
|
||||
const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
|
||||
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
||||
if (bom == 0xFEFF) {
|
||||
q += 2;
|
||||
bo = -1;
|
||||
}
|
||||
else if (bom == 0xFFFE) {
|
||||
q += 2;
|
||||
bo = 1;
|
||||
}
|
||||
if (bom == 0xFEFF) {
|
||||
q += 2;
|
||||
bo = -1;
|
||||
}
|
||||
else if (bom == 0xFFFE) {
|
||||
q += 2;
|
||||
bo = 1;
|
||||
}
|
||||
#else
|
||||
if (bom == 0xFEFF) {
|
||||
q += 2;
|
||||
bo = 1;
|
||||
}
|
||||
else if (bom == 0xFFFE) {
|
||||
q += 2;
|
||||
bo = -1;
|
||||
}
|
||||
if (bom == 0xFEFF) {
|
||||
q += 2;
|
||||
bo = 1;
|
||||
}
|
||||
else if (bom == 0xFFFE) {
|
||||
q += 2;
|
||||
bo = -1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (bo == -1) {
|
||||
|
@ -1502,8 +1531,10 @@ PyUnicode_DecodeUTF16(const char *s,
|
|||
|
||||
while (q < e) {
|
||||
Py_UNICODE ch;
|
||||
/* remaing bytes at the end? (size should be even) */
|
||||
/* remaining bytes at the end? (size should be even) */
|
||||
if (e-q<2) {
|
||||
if (consumed)
|
||||
break;
|
||||
errmsg = "truncated data";
|
||||
startinpos = ((const char *)q)-starts;
|
||||
endinpos = ((const char *)e)-starts;
|
||||
|
@ -1565,6 +1596,9 @@ PyUnicode_DecodeUTF16(const char *s,
|
|||
if (byteorder)
|
||||
*byteorder = bo;
|
||||
|
||||
if (consumed)
|
||||
*consumed = (const char *)q-starts;
|
||||
|
||||
/* Adjust length */
|
||||
if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
|
||||
goto onError;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue