mirror of
https://github.com/python/cpython.git
synced 2025-07-30 22:54:16 +00:00
Patch #1455898: Incremental mode for "mbcs" codec.
This commit is contained in:
parent
6ce9fe880b
commit
d825143be1
6 changed files with 214 additions and 50 deletions
|
@ -1431,6 +1431,18 @@ machine running the codec.
|
|||
raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeMBCSStateful}{const char *s,
|
||||
Py_ssize_t size,
|
||||
const char *errors,
|
||||
Py_ssize_t *consumed}
|
||||
If \var{consumed} is \NULL{}, behave like
|
||||
\cfunction{PyUnicode_DecodeMBCS()}. If \var{consumed} is not \NULL{},
|
||||
\cfunction{PyUnicode_DecodeMBCSStateful()} will not decode a trailing lead
|
||||
byte and the number of bytes that have been decoded will be stored in
|
||||
\var{consumed}.
|
||||
\versionadded{2.5}
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeMBCS}{const Py_UNICODE *s,
|
||||
Py_ssize_t size,
|
||||
const char *errors}
|
||||
|
|
|
@ -938,6 +938,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
|
|||
const char *errors /* error handling */
|
||||
);
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
|
||||
const char *string, /* MBCS encoded string */
|
||||
Py_ssize_t length, /* size of string */
|
||||
const char *errors, /* error handling */
|
||||
Py_ssize_t *consumed /* bytes consumed */
|
||||
);
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
|
||||
PyObject *unicode /* Unicode object */
|
||||
);
|
||||
|
|
|
@ -22,9 +22,10 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
|
|||
def encode(self, input, final=False):
|
||||
return codecs.mbcs_encode(input,self.errors)[0]
|
||||
|
||||
class IncrementalDecoder(codecs.IncrementalDecoder):
|
||||
def decode(self, input, final=False):
|
||||
return codecs.mbcs_decode(input,self.errors)[0]
|
||||
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||
def _buffer_decode(self, input, errors, final):
|
||||
return codecs.mbcs_decode(input,self.errors,final)
|
||||
|
||||
class StreamWriter(Codec,codecs.StreamWriter):
|
||||
pass
|
||||
|
||||
|
|
|
@ -156,6 +156,9 @@ Extension Modules
|
|||
Library
|
||||
-------
|
||||
|
||||
- Patch #1455898: The MBCS codec now supports the incremental mode for
|
||||
double-byte encodings.
|
||||
|
||||
- ``difflib``'s ``SequenceMatcher.get_matching_blocks()`` was changed to
|
||||
guarantee that adjacent triples in the return list always describe
|
||||
non-adjacent blocks. Previously, a pair of matching blocks could end
|
||||
|
|
|
@ -479,15 +479,20 @@ mbcs_decode(PyObject *self,
|
|||
PyObject *args)
|
||||
{
|
||||
const char *data;
|
||||
Py_ssize_t size;
|
||||
Py_ssize_t size, consumed;
|
||||
const char *errors = NULL;
|
||||
int final = 1;
|
||||
PyObject *decoded;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode",
|
||||
&data, &size, &errors))
|
||||
if (!PyArg_ParseTuple(args, "t#|zi:mbcs_decode",
|
||||
&data, &size, &errors, &final))
|
||||
return NULL;
|
||||
|
||||
return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors),
|
||||
size);
|
||||
decoded = PyUnicode_DecodeMBCSStateful(
|
||||
data, size, errors, final ? NULL : &consumed);
|
||||
if (!decoded)
|
||||
return NULL;
|
||||
return codec_tuple(decoded, final ? size : consumed);
|
||||
}
|
||||
|
||||
#endif /* MS_WINDOWS */
|
||||
|
|
|
@ -2820,65 +2820,199 @@ PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
|
|||
|
||||
/* --- MBCS codecs for Windows -------------------------------------------- */
|
||||
|
||||
#if SIZEOF_INT < SIZEOF_SSIZE_T
|
||||
#define NEED_RETRY
|
||||
#endif
|
||||
|
||||
/* XXX This code is limited to "true" double-byte encodings, as
|
||||
a) it assumes an incomplete character consists of a single byte, and
|
||||
b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
|
||||
encodings, see IsDBCSLeadByteEx documentation. */
|
||||
|
||||
static int is_dbcs_lead_byte(const char *s, int offset)
|
||||
{
|
||||
const char *curr = s + offset;
|
||||
|
||||
if (IsDBCSLeadByte(*curr)) {
|
||||
const char *prev = CharPrev(s, curr);
|
||||
return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Decode MBCS string into unicode object. If 'final' is set, converts
|
||||
* trailing lead-byte too. Returns the number of bytes consumed on success, -1 on failure.
|
||||
*/
|
||||
static int decode_mbcs(PyUnicodeObject **v,
|
||||
const char *s, /* MBCS string */
|
||||
int size, /* sizeof MBCS string */
|
||||
int final)
|
||||
{
|
||||
Py_UNICODE *p;
|
||||
Py_ssize_t n = 0;
|
||||
int usize = 0;
|
||||
|
||||
assert(size >= 0);
|
||||
|
||||
/* Skip trailing lead-byte unless 'final' is set */
|
||||
if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
|
||||
--size;
|
||||
|
||||
/* First get the size of the result */
|
||||
if (size > 0) {
|
||||
usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
|
||||
if (usize == 0) {
|
||||
PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (*v == NULL) {
|
||||
/* Create unicode object */
|
||||
*v = _PyUnicode_New(usize);
|
||||
if (*v == NULL)
|
||||
return -1;
|
||||
}
|
||||
else {
|
||||
/* Extend unicode object */
|
||||
n = PyUnicode_GET_SIZE(*v);
|
||||
if (_PyUnicode_Resize(v, n + usize) < 0)
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Do the conversion */
|
||||
if (size > 0) {
|
||||
p = PyUnicode_AS_UNICODE(*v) + n;
|
||||
if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
|
||||
PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
|
||||
Py_ssize_t size,
|
||||
const char *errors,
|
||||
Py_ssize_t *consumed)
|
||||
{
|
||||
PyUnicodeObject *v = NULL;
|
||||
int done;
|
||||
|
||||
if (consumed)
|
||||
*consumed = 0;
|
||||
|
||||
#ifdef NEED_RETRY
|
||||
retry:
|
||||
if (size > INT_MAX)
|
||||
done = decode_mbcs(&v, s, INT_MAX, 0);
|
||||
else
|
||||
#endif
|
||||
done = decode_mbcs(&v, s, (int)size, !consumed);
|
||||
|
||||
if (done < 0) {
|
||||
Py_XDECREF(v);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (consumed)
|
||||
*consumed += done;
|
||||
|
||||
#ifdef NEED_RETRY
|
||||
if (size > INT_MAX) {
|
||||
s += done;
|
||||
size -= done;
|
||||
goto retry;
|
||||
}
|
||||
#endif
|
||||
|
||||
return (PyObject *)v;
|
||||
}
|
||||
|
||||
PyObject *PyUnicode_DecodeMBCS(const char *s,
|
||||
Py_ssize_t size,
|
||||
const char *errors)
|
||||
{
|
||||
PyUnicodeObject *v;
|
||||
Py_UNICODE *p;
|
||||
DWORD usize;
|
||||
|
||||
/* First get the size of the result */
|
||||
assert(size < INT_MAX);
|
||||
usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
|
||||
if (size > 0 && usize==0)
|
||||
return PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
||||
|
||||
v = _PyUnicode_New(usize);
|
||||
if (v == NULL)
|
||||
return NULL;
|
||||
if (usize == 0)
|
||||
return (PyObject *)v;
|
||||
p = PyUnicode_AS_UNICODE(v);
|
||||
if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
|
||||
Py_DECREF(v);
|
||||
return PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
||||
return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
|
||||
}
|
||||
|
||||
return (PyObject *)v;
|
||||
/*
|
||||
* Convert unicode into string object (MBCS).
|
||||
* Returns 0 on success, -1 on failure.
|
||||
*/
|
||||
static int encode_mbcs(PyObject **repr,
|
||||
const Py_UNICODE *p, /* unicode */
|
||||
int size) /* size of unicode */
|
||||
{
|
||||
int mbcssize = 0;
|
||||
Py_ssize_t n = 0;
|
||||
|
||||
assert(size >= 0);
|
||||
|
||||
/* First get the size of the result */
|
||||
if (size > 0) {
|
||||
mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
|
||||
if (mbcssize == 0) {
|
||||
PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (*repr == NULL) {
|
||||
/* Create string object */
|
||||
*repr = PyString_FromStringAndSize(NULL, mbcssize);
|
||||
if (*repr == NULL)
|
||||
return -1;
|
||||
}
|
||||
else {
|
||||
/* Extend string object */
|
||||
n = PyString_Size(*repr);
|
||||
if (_PyString_Resize(repr, n + mbcssize) < 0)
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Do the conversion */
|
||||
if (size > 0) {
|
||||
char *s = PyString_AS_STRING(*repr) + n;
|
||||
if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
|
||||
PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
|
||||
Py_ssize_t size,
|
||||
const char *errors)
|
||||
{
|
||||
PyObject *repr;
|
||||
char *s;
|
||||
DWORD mbcssize;
|
||||
PyObject *repr = NULL;
|
||||
int ret;
|
||||
|
||||
/* If there are no characters, bail now! */
|
||||
if (size==0)
|
||||
return PyString_FromString("");
|
||||
#ifdef NEED_RETRY
|
||||
retry:
|
||||
if (size > INT_MAX)
|
||||
ret = encode_mbcs(&repr, p, INT_MAX);
|
||||
else
|
||||
#endif
|
||||
ret = encode_mbcs(&repr, p, (int)size);
|
||||
|
||||
/* First get the size of the result */
|
||||
assert(size<INT_MAX);
|
||||
mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
|
||||
if (mbcssize==0)
|
||||
return PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
||||
|
||||
repr = PyString_FromStringAndSize(NULL, mbcssize);
|
||||
if (repr == NULL)
|
||||
if (ret < 0) {
|
||||
Py_XDECREF(repr);
|
||||
return NULL;
|
||||
if (mbcssize == 0)
|
||||
return repr;
|
||||
|
||||
/* Do the conversion */
|
||||
s = PyString_AS_STRING(repr);
|
||||
assert(size < INT_MAX);
|
||||
if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
|
||||
Py_DECREF(repr);
|
||||
return PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
||||
}
|
||||
|
||||
#ifdef NEED_RETRY
|
||||
if (size > INT_MAX) {
|
||||
p += INT_MAX;
|
||||
size -= INT_MAX;
|
||||
goto retry;
|
||||
}
|
||||
#endif
|
||||
|
||||
return repr;
|
||||
}
|
||||
|
||||
|
@ -2893,6 +3027,8 @@ PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
|
|||
NULL);
|
||||
}
|
||||
|
||||
#undef NEED_RETRY
|
||||
|
||||
#endif /* MS_WINDOWS */
|
||||
|
||||
/* --- Character Mapping Codec -------------------------------------------- */
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue