mirror of
https://github.com/python/cpython.git
synced 2025-08-04 00:48:58 +00:00
Part of SF patch #1313939: Speedup charmap decoding by extending
PyUnicode_DecodeCharmap() to accept a unicode string as the mapping argument, which is used as a mapping table. This code isn't used by any of the codecs yet.
This commit is contained in:
parent
331649acc7
commit
d1c1e10f70
4 changed files with 157 additions and 80 deletions
|
@ -1322,7 +1322,12 @@ points.
|
||||||
const char *errors}
|
const char *errors}
|
||||||
Create a Unicode object by decoding \var{size} bytes of the encoded
|
Create a Unicode object by decoding \var{size} bytes of the encoded
|
||||||
string \var{s} using the given \var{mapping} object. Return
|
string \var{s} using the given \var{mapping} object. Return
|
||||||
\NULL{} if an exception was raised by the codec.
|
\NULL{} if an exception was raised by the codec. If \var{mapping} is \NULL{}
|
||||||
|
latin-1 decoding will be done. Otherwise it can be either a dictionary keyed by byte value or a
|
||||||
|
unicode string, which is treated as a lookup table. Byte values greater
|
||||||
|
than or equal to the length of the string and U+FFFE "characters" are treated as
|
||||||
|
"undefined mapping".
|
||||||
|
\versionchanged[Allowed unicode string as mapping argument]{2.4}
|
||||||
\end{cfuncdesc}
|
\end{cfuncdesc}
|
||||||
|
|
||||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeCharmap}{const Py_UNICODE *s,
|
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeCharmap}{const Py_UNICODE *s,
|
||||||
|
|
|
@ -924,6 +924,40 @@ class BasicStrTest(unittest.TestCase):
|
||||||
(chars, size) = codecs.getdecoder(encoding)(bytes)
|
(chars, size) = codecs.getdecoder(encoding)(bytes)
|
||||||
self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
|
self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
|
||||||
|
|
||||||
|
class CharmapTest(unittest.TestCase):
|
||||||
|
def test_decode_with_string_map(self):
|
||||||
|
self.assertEquals(
|
||||||
|
codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
|
||||||
|
(u"abc", 3)
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEquals(
|
||||||
|
codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
|
||||||
|
(u"ab\ufffd", 3)
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEquals(
|
||||||
|
codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
|
||||||
|
(u"ab\ufffd", 3)
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEquals(
|
||||||
|
codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
|
||||||
|
(u"ab", 3)
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEquals(
|
||||||
|
codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
|
||||||
|
(u"ab", 3)
|
||||||
|
)
|
||||||
|
|
||||||
|
allbytes = "".join(chr(i) for i in xrange(256))
|
||||||
|
self.assertEquals(
|
||||||
|
codecs.charmap_decode(allbytes, "ignore", u""),
|
||||||
|
(u"", len(allbytes))
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_main():
|
def test_main():
|
||||||
test_support.run_unittest(
|
test_support.run_unittest(
|
||||||
UTF16Test,
|
UTF16Test,
|
||||||
|
@ -940,7 +974,8 @@ def test_main():
|
||||||
StreamReaderTest,
|
StreamReaderTest,
|
||||||
Str2StrTest,
|
Str2StrTest,
|
||||||
BasicUnicodeTest,
|
BasicUnicodeTest,
|
||||||
BasicStrTest
|
BasicStrTest,
|
||||||
|
CharmapTest
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -563,6 +563,11 @@ C API
|
||||||
|
|
||||||
- Removed PyRange_New().
|
- Removed PyRange_New().
|
||||||
|
|
||||||
|
- Patch #1313939: PyUnicode_DecodeCharmap() accepts a unicode string as the
|
||||||
|
mapping argument now. This string is used as a mapping table. Byte values
|
||||||
|
greater than or equal to the length of the string and 0xFFFE are treated as undefined
|
||||||
|
mappings.
|
||||||
|
|
||||||
|
|
||||||
Tests
|
Tests
|
||||||
-----
|
-----
|
||||||
|
|
|
@ -2833,6 +2833,8 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
|
||||||
int extrachars = 0;
|
int extrachars = 0;
|
||||||
PyObject *errorHandler = NULL;
|
PyObject *errorHandler = NULL;
|
||||||
PyObject *exc = NULL;
|
PyObject *exc = NULL;
|
||||||
|
Py_UNICODE *mapstring = NULL;
|
||||||
|
int maplen = 0;
|
||||||
|
|
||||||
/* Default to Latin-1 */
|
/* Default to Latin-1 */
|
||||||
if (mapping == NULL)
|
if (mapping == NULL)
|
||||||
|
@ -2845,91 +2847,121 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
|
||||||
return (PyObject *)v;
|
return (PyObject *)v;
|
||||||
p = PyUnicode_AS_UNICODE(v);
|
p = PyUnicode_AS_UNICODE(v);
|
||||||
e = s + size;
|
e = s + size;
|
||||||
while (s < e) {
|
if (PyUnicode_CheckExact(mapping)) {
|
||||||
unsigned char ch = *s;
|
mapstring = PyUnicode_AS_UNICODE(mapping);
|
||||||
PyObject *w, *x;
|
maplen = PyUnicode_GET_SIZE(mapping);
|
||||||
|
while (s < e) {
|
||||||
|
unsigned char ch = *s;
|
||||||
|
Py_UNICODE x = 0xfffe; /* illegal value */
|
||||||
|
|
||||||
/* Get mapping (char ordinal -> integer, Unicode char or None) */
|
if (ch < maplen)
|
||||||
w = PyInt_FromLong((long)ch);
|
x = mapstring[ch];
|
||||||
if (w == NULL)
|
|
||||||
goto onError;
|
|
||||||
x = PyObject_GetItem(mapping, w);
|
|
||||||
Py_DECREF(w);
|
|
||||||
if (x == NULL) {
|
|
||||||
if (PyErr_ExceptionMatches(PyExc_LookupError)) {
|
|
||||||
/* No mapping found means: mapping is undefined. */
|
|
||||||
PyErr_Clear();
|
|
||||||
x = Py_None;
|
|
||||||
Py_INCREF(x);
|
|
||||||
} else
|
|
||||||
goto onError;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Apply mapping */
|
if (x == 0xfffe) {
|
||||||
if (PyInt_Check(x)) {
|
/* undefined mapping */
|
||||||
long value = PyInt_AS_LONG(x);
|
outpos = p-PyUnicode_AS_UNICODE(v);
|
||||||
if (value < 0 || value > 65535) {
|
startinpos = s-starts;
|
||||||
PyErr_SetString(PyExc_TypeError,
|
endinpos = startinpos+1;
|
||||||
"character mapping must be in range(65536)");
|
if (unicode_decode_call_errorhandler(
|
||||||
Py_DECREF(x);
|
errors, &errorHandler,
|
||||||
goto onError;
|
"charmap", "character maps to <undefined>",
|
||||||
}
|
starts, size, &startinpos, &endinpos, &exc, &s,
|
||||||
*p++ = (Py_UNICODE)value;
|
(PyObject **)&v, &outpos, &p)) {
|
||||||
}
|
goto onError;
|
||||||
else if (x == Py_None) {
|
|
||||||
/* undefined mapping */
|
|
||||||
outpos = p-PyUnicode_AS_UNICODE(v);
|
|
||||||
startinpos = s-starts;
|
|
||||||
endinpos = startinpos+1;
|
|
||||||
if (unicode_decode_call_errorhandler(
|
|
||||||
errors, &errorHandler,
|
|
||||||
"charmap", "character maps to <undefined>",
|
|
||||||
starts, size, &startinpos, &endinpos, &exc, &s,
|
|
||||||
(PyObject **)&v, &outpos, &p)) {
|
|
||||||
Py_DECREF(x);
|
|
||||||
goto onError;
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
else if (PyUnicode_Check(x)) {
|
|
||||||
int targetsize = PyUnicode_GET_SIZE(x);
|
|
||||||
|
|
||||||
if (targetsize == 1)
|
|
||||||
/* 1-1 mapping */
|
|
||||||
*p++ = *PyUnicode_AS_UNICODE(x);
|
|
||||||
|
|
||||||
else if (targetsize > 1) {
|
|
||||||
/* 1-n mapping */
|
|
||||||
if (targetsize > extrachars) {
|
|
||||||
/* resize first */
|
|
||||||
int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
|
|
||||||
int needed = (targetsize - extrachars) + \
|
|
||||||
(targetsize << 2);
|
|
||||||
extrachars += needed;
|
|
||||||
if (_PyUnicode_Resize(&v,
|
|
||||||
PyUnicode_GET_SIZE(v) + needed) < 0) {
|
|
||||||
Py_DECREF(x);
|
|
||||||
goto onError;
|
|
||||||
}
|
|
||||||
p = PyUnicode_AS_UNICODE(v) + oldpos;
|
|
||||||
}
|
}
|
||||||
Py_UNICODE_COPY(p,
|
continue;
|
||||||
PyUnicode_AS_UNICODE(x),
|
|
||||||
targetsize);
|
|
||||||
p += targetsize;
|
|
||||||
extrachars -= targetsize;
|
|
||||||
}
|
}
|
||||||
/* 1-0 mapping: skip the character */
|
*p++ = x;
|
||||||
|
++s;
|
||||||
}
|
}
|
||||||
else {
|
}
|
||||||
/* wrong return value */
|
else {
|
||||||
PyErr_SetString(PyExc_TypeError,
|
while (s < e) {
|
||||||
"character mapping must return integer, None or unicode");
|
unsigned char ch = *s;
|
||||||
|
PyObject *w, *x;
|
||||||
|
|
||||||
|
/* Get mapping (char ordinal -> integer, Unicode char or None) */
|
||||||
|
w = PyInt_FromLong((long)ch);
|
||||||
|
if (w == NULL)
|
||||||
|
goto onError;
|
||||||
|
x = PyObject_GetItem(mapping, w);
|
||||||
|
Py_DECREF(w);
|
||||||
|
if (x == NULL) {
|
||||||
|
if (PyErr_ExceptionMatches(PyExc_LookupError)) {
|
||||||
|
/* No mapping found means: mapping is undefined. */
|
||||||
|
PyErr_Clear();
|
||||||
|
x = Py_None;
|
||||||
|
Py_INCREF(x);
|
||||||
|
} else
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Apply mapping */
|
||||||
|
if (PyInt_Check(x)) {
|
||||||
|
long value = PyInt_AS_LONG(x);
|
||||||
|
if (value < 0 || value > 65535) {
|
||||||
|
PyErr_SetString(PyExc_TypeError,
|
||||||
|
"character mapping must be in range(65536)");
|
||||||
|
Py_DECREF(x);
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
|
*p++ = (Py_UNICODE)value;
|
||||||
|
}
|
||||||
|
else if (x == Py_None) {
|
||||||
|
/* undefined mapping */
|
||||||
|
outpos = p-PyUnicode_AS_UNICODE(v);
|
||||||
|
startinpos = s-starts;
|
||||||
|
endinpos = startinpos+1;
|
||||||
|
if (unicode_decode_call_errorhandler(
|
||||||
|
errors, &errorHandler,
|
||||||
|
"charmap", "character maps to <undefined>",
|
||||||
|
starts, size, &startinpos, &endinpos, &exc, &s,
|
||||||
|
(PyObject **)&v, &outpos, &p)) {
|
||||||
|
Py_DECREF(x);
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else if (PyUnicode_Check(x)) {
|
||||||
|
int targetsize = PyUnicode_GET_SIZE(x);
|
||||||
|
|
||||||
|
if (targetsize == 1)
|
||||||
|
/* 1-1 mapping */
|
||||||
|
*p++ = *PyUnicode_AS_UNICODE(x);
|
||||||
|
|
||||||
|
else if (targetsize > 1) {
|
||||||
|
/* 1-n mapping */
|
||||||
|
if (targetsize > extrachars) {
|
||||||
|
/* resize first */
|
||||||
|
int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
|
||||||
|
int needed = (targetsize - extrachars) + \
|
||||||
|
(targetsize << 2);
|
||||||
|
extrachars += needed;
|
||||||
|
if (_PyUnicode_Resize(&v,
|
||||||
|
PyUnicode_GET_SIZE(v) + needed) < 0) {
|
||||||
|
Py_DECREF(x);
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
|
p = PyUnicode_AS_UNICODE(v) + oldpos;
|
||||||
|
}
|
||||||
|
Py_UNICODE_COPY(p,
|
||||||
|
PyUnicode_AS_UNICODE(x),
|
||||||
|
targetsize);
|
||||||
|
p += targetsize;
|
||||||
|
extrachars -= targetsize;
|
||||||
|
}
|
||||||
|
/* 1-0 mapping: skip the character */
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/* wrong return value */
|
||||||
|
PyErr_SetString(PyExc_TypeError,
|
||||||
|
"character mapping must return integer, None or unicode");
|
||||||
|
Py_DECREF(x);
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
Py_DECREF(x);
|
Py_DECREF(x);
|
||||||
goto onError;
|
++s;
|
||||||
}
|
}
|
||||||
Py_DECREF(x);
|
|
||||||
++s;
|
|
||||||
}
|
}
|
||||||
if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
|
if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
|
||||||
if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
|
if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue