mirror of
https://github.com/python/cpython.git
synced 2025-08-04 00:48:58 +00:00
Marc-Andre Lemburg:
Attached you find the latest update of the Unicode implementation. The patch is against the current CVS version. It includes the fix I posted yesterday for the core dump problem in codecs.c (was introduced by my previous patch set -- sorry), adds more tests for the codecs and two new parser markers "es" and "es#".
This commit is contained in:
parent
27fc3c05e1
commit
d8855fde88
5 changed files with 259 additions and 6 deletions
|
@ -46,7 +46,7 @@ class Codec:
|
||||||
handling schemes by providing the errors argument. These
|
handling schemes by providing the errors argument. These
|
||||||
string values are defined:
|
string values are defined:
|
||||||
|
|
||||||
'strict' - raise an error (or a subclass)
|
'strict' - raise a ValueError (or a subclass)
|
||||||
'ignore' - ignore the character and continue with the next
|
'ignore' - ignore the character and continue with the next
|
||||||
'replace' - replace with a suitable replacement character;
|
'replace' - replace with a suitable replacement character;
|
||||||
Python will use the official U+FFFD REPLACEMENT
|
Python will use the official U+FFFD REPLACEMENT
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
test_unicode
|
test_unicode
|
||||||
Testing Unicode comparisons... done.
|
Testing Unicode comparisons... done.
|
||||||
Testing Unicode contains method... done.
|
|
||||||
Testing Unicode formatting strings... done.
|
Testing Unicode formatting strings... done.
|
||||||
Testing unicodedata module... done.
|
Testing unicodedata module... done.
|
||||||
|
|
|
@ -293,3 +293,33 @@ else:
|
||||||
assert unicodedata.combining(u'\u20e1') == 230
|
assert unicodedata.combining(u'\u20e1') == 230
|
||||||
|
|
||||||
print 'done.'
|
print 'done.'
|
||||||
|
|
||||||
|
# Test builtin codecs
|
||||||
|
print 'Testing builtin codecs...',
|
||||||
|
|
||||||
|
assert unicode('hello','ascii') == u'hello'
|
||||||
|
assert unicode('hello','utf-8') == u'hello'
|
||||||
|
assert unicode('hello','utf8') == u'hello'
|
||||||
|
assert unicode('hello','latin-1') == u'hello'
|
||||||
|
|
||||||
|
assert u'hello'.encode('ascii') == 'hello'
|
||||||
|
assert u'hello'.encode('utf-8') == 'hello'
|
||||||
|
assert u'hello'.encode('utf8') == 'hello'
|
||||||
|
assert u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000'
|
||||||
|
assert u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o'
|
||||||
|
assert u'hello'.encode('latin-1') == 'hello'
|
||||||
|
|
||||||
|
u = u''.join(map(unichr, range(1024)))
|
||||||
|
for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
|
||||||
|
'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
|
||||||
|
assert unicode(u.encode(encoding),encoding) == u
|
||||||
|
|
||||||
|
u = u''.join(map(unichr, range(256)))
|
||||||
|
for encoding in ('latin-1',):
|
||||||
|
assert unicode(u.encode(encoding),encoding) == u
|
||||||
|
|
||||||
|
u = u''.join(map(unichr, range(128)))
|
||||||
|
for encoding in ('ascii',):
|
||||||
|
assert unicode(u.encode(encoding),encoding) == u
|
||||||
|
|
||||||
|
print 'done.'
|
||||||
|
|
114
Misc/unicode.txt
114
Misc/unicode.txt
|
@ -715,21 +715,126 @@ Internal Argument Parsing:
|
||||||
|
|
||||||
These markers are used by the PyArg_ParseTuple() APIs:
|
These markers are used by the PyArg_ParseTuple() APIs:
|
||||||
|
|
||||||
'U': Check for Unicode object and return a pointer to it
|
"U": Check for Unicode object and return a pointer to it
|
||||||
|
|
||||||
's': For Unicode objects: auto convert them to the <default encoding>
|
"s": For Unicode objects: auto convert them to the <default encoding>
|
||||||
and return a pointer to the object's <defencstr> buffer.
|
and return a pointer to the object's <defencstr> buffer.
|
||||||
|
|
||||||
's#': Access to the Unicode object via the bf_getreadbuf buffer interface
|
"s#": Access to the Unicode object via the bf_getreadbuf buffer interface
|
||||||
(see Buffer Interface); note that the length relates to the buffer
|
(see Buffer Interface); note that the length relates to the buffer
|
||||||
length, not the Unicode string length (this may be different
|
length, not the Unicode string length (this may be different
|
||||||
depending on the Internal Format).
|
depending on the Internal Format).
|
||||||
|
|
||||||
't#': Access to the Unicode object via the bf_getcharbuf buffer interface
|
"t#": Access to the Unicode object via the bf_getcharbuf buffer interface
|
||||||
(see Buffer Interface); note that the length relates to the buffer
|
(see Buffer Interface); note that the length relates to the buffer
|
||||||
length, not necessarily to the Unicode string length (this may
|
length, not necessarily to the Unicode string length (this may
|
||||||
be different depending on the <default encoding>).
|
be different depending on the <default encoding>).
|
||||||
|
|
||||||
|
"es":
|
||||||
|
Takes two parameters: encoding (const char *) and
|
||||||
|
buffer (char **).
|
||||||
|
|
||||||
|
The input object is first coerced to Unicode in the usual way
|
||||||
|
and then encoded into a string using the given encoding.
|
||||||
|
|
||||||
|
On output, a buffer of the needed size is allocated and
|
||||||
|
returned through *buffer as a NULL-terminated string.
|
||||||
|
The encoded string may not contain embedded NULL characters.
|
||||||
|
The caller is responsible for free()ing the allocated *buffer
|
||||||
|
after usage.
|
||||||
|
|
||||||
|
"es#":
|
||||||
|
Takes three parameters: encoding (const char *),
|
||||||
|
buffer (char **) and buffer_len (int *).
|
||||||
|
|
||||||
|
The input object is first coerced to Unicode in the usual way
|
||||||
|
and then encoded into a string using the given encoding.
|
||||||
|
|
||||||
|
If *buffer is non-NULL, *buffer_len must be set to the size of the buffer
|
||||||
|
on input. Output is then copied to *buffer.
|
||||||
|
|
||||||
|
If *buffer is NULL, a buffer of the needed size is
|
||||||
|
allocated and output copied into it. *buffer is then
|
||||||
|
updated to point to the allocated memory area. The caller
|
||||||
|
is responsible for free()ing *buffer after usage.
|
||||||
|
|
||||||
|
In both cases *buffer_len is updated to the number of
|
||||||
|
characters written (excluding the trailing NULL-byte).
|
||||||
|
The output buffer is assured to be NULL-terminated.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
Using "es#" with auto-allocation:
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
test_parser(PyObject *self,
|
||||||
|
PyObject *args)
|
||||||
|
{
|
||||||
|
PyObject *str;
|
||||||
|
const char *encoding = "latin-1";
|
||||||
|
char *buffer = NULL;
|
||||||
|
int buffer_len = 0;
|
||||||
|
|
||||||
|
if (!PyArg_ParseTuple(args, "es#:test_parser",
|
||||||
|
encoding, &buffer, &buffer_len))
|
||||||
|
return NULL;
|
||||||
|
if (!buffer) {
|
||||||
|
PyErr_SetString(PyExc_SystemError,
|
||||||
|
"buffer is NULL");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
str = PyString_FromStringAndSize(buffer, buffer_len);
|
||||||
|
free(buffer);
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
Using "es" with auto-allocation returning a NULL-terminated string:
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
test_parser(PyObject *self,
|
||||||
|
PyObject *args)
|
||||||
|
{
|
||||||
|
PyObject *str;
|
||||||
|
const char *encoding = "latin-1";
|
||||||
|
char *buffer = NULL;
|
||||||
|
|
||||||
|
if (!PyArg_ParseTuple(args, "es:test_parser",
|
||||||
|
encoding, &buffer))
|
||||||
|
return NULL;
|
||||||
|
if (!buffer) {
|
||||||
|
PyErr_SetString(PyExc_SystemError,
|
||||||
|
"buffer is NULL");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
str = PyString_FromString(buffer);
|
||||||
|
free(buffer);
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
Using "es#" with a pre-allocated buffer:
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
test_parser(PyObject *self,
|
||||||
|
PyObject *args)
|
||||||
|
{
|
||||||
|
PyObject *str;
|
||||||
|
const char *encoding = "latin-1";
|
||||||
|
char _buffer[10];
|
||||||
|
char *buffer = _buffer;
|
||||||
|
int buffer_len = sizeof(_buffer);
|
||||||
|
|
||||||
|
if (!PyArg_ParseTuple(args, "es#:test_parser",
|
||||||
|
encoding, &buffer, &buffer_len))
|
||||||
|
return NULL;
|
||||||
|
if (!buffer) {
|
||||||
|
PyErr_SetString(PyExc_SystemError,
|
||||||
|
"buffer is NULL");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
str = PyString_FromStringAndSize(buffer, buffer_len);
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
File/Stream Output:
|
File/Stream Output:
|
||||||
-------------------
|
-------------------
|
||||||
|
@ -837,6 +942,7 @@ Encodings:
|
||||||
|
|
||||||
History of this Proposal:
|
History of this Proposal:
|
||||||
-------------------------
|
-------------------------
|
||||||
|
1.3: Added new "es" and "es#" parser markers
|
||||||
1.2: Removed POD about codecs.open()
|
1.2: Removed POD about codecs.open()
|
||||||
1.1: Added note about comparisons and hash values. Added note about
|
1.1: Added note about comparisons and hash values. Added note about
|
||||||
case mapping algorithms. Changed stream codecs .read() and
|
case mapping algorithms. Changed stream codecs .read() and
|
||||||
|
|
118
Python/getargs.c
118
Python/getargs.c
|
@ -178,6 +178,8 @@ vgetargs1(args, format, p_va, compat)
|
||||||
}
|
}
|
||||||
else if (level != 0)
|
else if (level != 0)
|
||||||
; /* Pass */
|
; /* Pass */
|
||||||
|
else if (c == 'e')
|
||||||
|
; /* Pass */
|
||||||
else if (isalpha(c))
|
else if (isalpha(c))
|
||||||
max++;
|
max++;
|
||||||
else if (c == '|')
|
else if (c == '|')
|
||||||
|
@ -654,6 +656,122 @@ convertsimple1(arg, p_format, p_va)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case 'e': /* encoded string */
|
||||||
|
{
|
||||||
|
char **buffer;
|
||||||
|
const char *encoding;
|
||||||
|
PyObject *u, *s;
|
||||||
|
int size;
|
||||||
|
|
||||||
|
/* Get 'e' parameter: the encoding name */
|
||||||
|
encoding = (const char *)va_arg(*p_va, const char *);
|
||||||
|
if (encoding == NULL)
|
||||||
|
return "(encoding is NULL)";
|
||||||
|
|
||||||
|
/* Get 's' parameter: the output buffer to use */
|
||||||
|
if (*format != 's')
|
||||||
|
return "(unkown parser marker combination)";
|
||||||
|
buffer = (char **)va_arg(*p_va, char **);
|
||||||
|
format++;
|
||||||
|
if (buffer == NULL)
|
||||||
|
return "(buffer is NULL)";
|
||||||
|
|
||||||
|
/* Convert object to Unicode */
|
||||||
|
u = PyUnicode_FromObject(arg);
|
||||||
|
if (u == NULL)
|
||||||
|
return "string, unicode or text buffer";
|
||||||
|
|
||||||
|
/* Encode object; use default error handling */
|
||||||
|
s = PyUnicode_AsEncodedString(u,
|
||||||
|
encoding,
|
||||||
|
NULL);
|
||||||
|
Py_DECREF(u);
|
||||||
|
if (s == NULL)
|
||||||
|
return "(encoding failed)";
|
||||||
|
if (!PyString_Check(s)) {
|
||||||
|
Py_DECREF(s);
|
||||||
|
return "(encoder failed to return a string)";
|
||||||
|
}
|
||||||
|
size = PyString_GET_SIZE(s);
|
||||||
|
|
||||||
|
/* Write output; output is guaranteed to be
|
||||||
|
0-terminated */
|
||||||
|
if (*format == '#') {
|
||||||
|
/* Using buffer length parameter '#':
|
||||||
|
|
||||||
|
- if *buffer is NULL, a new buffer
|
||||||
|
of the needed size is allocated and
|
||||||
|
the data copied into it; *buffer is
|
||||||
|
updated to point to the new buffer;
|
||||||
|
the caller is responsible for
|
||||||
|
free()ing it after usage
|
||||||
|
|
||||||
|
- if *buffer is not NULL, the data
|
||||||
|
is copied to *buffer; *buffer_len
|
||||||
|
has to be set to the size of the
|
||||||
|
buffer on input; buffer overflow is
|
||||||
|
signalled with an error; buffer has
|
||||||
|
to provide enough room for the
|
||||||
|
encoded string plus the trailing
|
||||||
|
0-byte
|
||||||
|
|
||||||
|
- in both cases, *buffer_len is
|
||||||
|
updated to the size of the buffer
|
||||||
|
/excluding/ the trailing 0-byte
|
||||||
|
|
||||||
|
*/
|
||||||
|
int *buffer_len = va_arg(*p_va, int *);
|
||||||
|
|
||||||
|
format++;
|
||||||
|
if (buffer_len == NULL)
|
||||||
|
return "(buffer_len is NULL)";
|
||||||
|
if (*buffer == NULL) {
|
||||||
|
*buffer = PyMem_NEW(char, size + 1);
|
||||||
|
if (*buffer == NULL) {
|
||||||
|
Py_DECREF(s);
|
||||||
|
return "(memory error)";
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (size + 1 > *buffer_len) {
|
||||||
|
Py_DECREF(s);
|
||||||
|
return "(buffer overflow)";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
memcpy(*buffer,
|
||||||
|
PyString_AS_STRING(s),
|
||||||
|
size + 1);
|
||||||
|
*buffer_len = size;
|
||||||
|
} else {
|
||||||
|
/* Using a 0-terminated buffer:
|
||||||
|
|
||||||
|
- the encoded string has to be
|
||||||
|
0-terminated for this variant to
|
||||||
|
work; if it is not, an error is raised
|
||||||
|
|
||||||
|
- a new buffer of the needed size
|
||||||
|
is allocated and the data copied
|
||||||
|
into it; *buffer is updated to
|
||||||
|
point to the new buffer; the caller
|
||||||
|
is responsible for free()ing it
|
||||||
|
after usage
|
||||||
|
|
||||||
|
*/
|
||||||
|
if (strlen(PyString_AS_STRING(s)) != size)
|
||||||
|
return "(encoded string without "\
|
||||||
|
"NULL bytes)";
|
||||||
|
*buffer = PyMem_NEW(char, size + 1);
|
||||||
|
if (*buffer == NULL) {
|
||||||
|
Py_DECREF(s);
|
||||||
|
return "(memory error)";
|
||||||
|
}
|
||||||
|
memcpy(*buffer,
|
||||||
|
PyString_AS_STRING(s),
|
||||||
|
size + 1);
|
||||||
|
}
|
||||||
|
Py_DECREF(s);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case 'S': /* string object */
|
case 'S': /* string object */
|
||||||
{
|
{
|
||||||
PyObject **p = va_arg(*p_va, PyObject **);
|
PyObject **p = va_arg(*p_va, PyObject **);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue