Marc-Andre Lemburg:

Attached you find the latest update of the Unicode implementation. The patch is against the current CVS version. It includes the fix I posted yesterday for the core dump problem in codecs.c (was introduced by my previous patch set -- sorry), adds more tests for the codecs and two new parser markers "es" and "es#".
2025-09-26 18:29:57 +00:00 · 2000-03-24 22:14:19 +00:00 · 2000-03-24 22:14:19 +00:00 · d8855fde88
commit d8855fde88
parent 27fc3c05e1
5 changed files with 259 additions and 6 deletions
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@ -46,7 +46,7 @@ class Codec:
        handling schemes by providing the errors argument. These
        string values are defined:
-         'strict' - raise an error (or a subclass)
+         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
--- a/Lib/test/output/test_unicode
+++ b/Lib/test/output/test_unicode
@ -1,5 +1,4 @@
 test_unicode
 Testing Unicode comparisons... done.
 Testing Unicode contains method... done.
 Testing Unicode formatting strings... done.
 Testing unicodedata module... done.
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -293,3 +293,33 @@ else:
    assert unicodedata.combining(u'\u20e1') == 230
    print 'done.'
 # Test builtin codecs
 print 'Testing builtin codecs...',
 # Decoding the 8-bit string 'hello' must give u'hello' for each of these
 # codec names ('utf-8' and 'utf8' are both exercised).
 assert unicode('hello','ascii') == u'hello'
 assert unicode('hello','utf-8') == u'hello'
 assert unicode('hello','utf8') == u'hello'
 assert unicode('hello','latin-1') == u'hello'
 # Encoding u'hello' must give the expected byte strings; the two UTF-16
 # variants differ only in the position of the zero byte of each pair.
 assert u'hello'.encode('ascii') == 'hello'
 assert u'hello'.encode('utf-8') == 'hello'
 assert u'hello'.encode('utf8') == 'hello'
 assert u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000'
 assert u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o'
 assert u'hello'.encode('latin-1') == 'hello'
 # Round-trip check (encode, then decode with the same codec) over the
 # code points 0..1023 for the codecs listed below.
 u = u''.join(map(unichr, range(1024)))
 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
    assert unicode(u.encode(encoding),encoding) == u
 # latin-1 is only round-tripped over the code points 0..255.
 u = u''.join(map(unichr, range(256)))
 for encoding in ('latin-1',):
    assert unicode(u.encode(encoding),encoding) == u
 # ascii is only round-tripped over the code points 0..127.
 u = u''.join(map(unichr, range(128)))
 for encoding in ('ascii',):
    assert unicode(u.encode(encoding),encoding) == u
 print 'done.'
--- a/Misc/unicode.txt
+++ b/Misc/unicode.txt
@ -715,21 +715,126 @@ Internal Argument Parsing:
 These markers are used by the PyArg_ParseTuple() APIs:
-  'U':  Check for Unicode object and return a pointer to it
+  "U":  Check for Unicode object and return a pointer to it
-  's':  For Unicode objects: auto convert them to the <default encoding>
+  "s":  For Unicode objects: auto convert them to the <default encoding>
        and return a pointer to the object's <defencstr> buffer.
-  's#': Access to the Unicode object via the bf_getreadbuf buffer interface 
+  "s#": Access to the Unicode object via the bf_getreadbuf buffer interface 
        (see Buffer Interface); note that the length relates to the buffer
        length, not the Unicode string length (this may be different
        depending on the Internal Format).
-  't#': Access to the Unicode object via the bf_getcharbuf buffer interface
+  "t#": Access to the Unicode object via the bf_getcharbuf buffer interface
        (see Buffer Interface); note that the length relates to the buffer
        length, not necessarily to the Unicode string length (this may
        be different depending on the <default encoding>).
  "es": 
 	Takes two parameters: encoding (const char *) and
 	buffer (char **). 
 	The input object is first coerced to Unicode in the usual way
 	and then encoded into a string using the given encoding.
 	On output, a buffer of the needed size is allocated and
 	returned through *buffer as NULL-terminated string.
 	The encoded string may not contain embedded NULL characters.
 	The caller is responsible for free()ing the allocated *buffer
 	after usage.
  "es#":
 	Takes three parameters: encoding (const char *),
 	buffer (char **) and buffer_len (int *).
 	The input object is first coerced to Unicode in the usual way
 	and then encoded into a string using the given encoding.
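 	If *buffer is non-NULL, *buffer_len must be set to the size of
 	that buffer in bytes on input; the buffer has to provide enough
 	room for the encoded string plus the trailing NULL-byte. Output
 	is then copied to *buffer.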
 	If *buffer is NULL, a buffer of the needed size is
 	allocated and output copied into it. *buffer is then
 	updated to point to the allocated memory area. The caller
 	is responsible for free()ing *buffer after usage.
 	In both cases *buffer_len is updated to the number of
 	characters written (excluding the trailing NULL-byte).
 	The output buffer is assured to be NULL-terminated.
 Examples:
 Using "es#" with auto-allocation:
    static PyObject *
    test_parser(PyObject *self,
 		PyObject *args)
    {
 	PyObject *str;
 	const char *encoding = "latin-1";
 	char *buffer = NULL;
 	int buffer_len = 0;
 	if (!PyArg_ParseTuple(args, "es#:test_parser",
 			      encoding, &buffer, &buffer_len))
 	    return NULL;
 	if (!buffer) {
 	    PyErr_SetString(PyExc_SystemError,
 			    "buffer is NULL");
 	    return NULL;
 	}
 	str = PyString_FromStringAndSize(buffer, buffer_len);
 	free(buffer);
 	return str;
    }
 Using "es" with auto-allocation returning a NULL-terminated string:    
    static PyObject *
    test_parser(PyObject *self,
 		PyObject *args)
    {
 	PyObject *str;
 	const char *encoding = "latin-1";
 	char *buffer = NULL;
 	if (!PyArg_ParseTuple(args, "es:test_parser",
 			      encoding, &buffer))
 	    return NULL;
 	if (!buffer) {
 	    PyErr_SetString(PyExc_SystemError,
 			    "buffer is NULL");
 	    return NULL;
 	}
 	str = PyString_FromString(buffer);
 	free(buffer);
 	return str;
    }
 Using "es#" with a pre-allocated buffer:
    static PyObject *
    test_parser(PyObject *self,
 		PyObject *args)
    {
 	PyObject *str;
 	const char *encoding = "latin-1";
 	char _buffer[10];
 	char *buffer = _buffer;
 	int buffer_len = sizeof(_buffer);
 	if (!PyArg_ParseTuple(args, "es#:test_parser",
 			      encoding, &buffer, &buffer_len))
 	    return NULL;
 	if (!buffer) {
 	    PyErr_SetString(PyExc_SystemError,
 			    "buffer is NULL");
 	    return NULL;
 	}
 	str = PyString_FromStringAndSize(buffer, buffer_len);
 	return str;
    }
 File/Stream Output:
 -------------------
@ -837,6 +942,7 @@ Encodings:
 History of this Proposal:
 -------------------------
 1.3: Added new "es" and "es#" parser markers
 1.2: Removed POD about codecs.open()
 1.1: Added note about comparisons and hash values. Added note about
     case mapping algorithms. Changed stream codecs .read() and
--- a/Python/getargs.c
+++ b/Python/getargs.c
@ -178,6 +178,8 @@ vgetargs1(args, format, p_va, compat)
 		}
 		else if (level != 0)
 			; /* Pass */
 		else if (c == 'e')
 			; /* Pass */
 		else if (isalpha(c))
 			max++;
 		else if (c == '|')
@ -654,6 +656,122 @@ convertsimple1(arg, p_format, p_va)
 			break;
 		}
 	case 'e': /* encoded string */
 		{
 			/* Handles the "es" and "es#" markers: the argument
 			   is coerced to Unicode, encoded with the encoding
 			   named by the caller and copied into a C buffer.

 			   Consumed from the va_list, in this order:
 			   - encoding (const char *)
 			   - buffer (char **)
 			   - buffer_len (int *), for "es#" only

 			   On failure a short description string is
 			   returned; the temporary encoded string object s
 			   is released on every exit path taken after it
 			   has been created. */
 			char **buffer;
 			const char *encoding;
 			PyObject *u, *s;
 			int size;

 			/* Get 'e' parameter: the encoding name */
 			encoding = (const char *)va_arg(*p_va, const char *);
 			if (encoding == NULL)
 				return "(encoding is NULL)";

 			/* Get 's' parameter: the output buffer to use */
 			if (*format != 's')
 				return "(unknown parser marker combination)";
 			buffer = (char **)va_arg(*p_va, char **);
 			format++;
 			if (buffer == NULL)
 				return "(buffer is NULL)";

 			/* Convert object to Unicode */
 			u = PyUnicode_FromObject(arg);
 			if (u == NULL)
 				return "string, unicode or text buffer";

 			/* Encode object; use default error handling */
 			s = PyUnicode_AsEncodedString(u,
 						      encoding,
 						      NULL);
 			Py_DECREF(u);
 			if (s == NULL)
 				return "(encoding failed)";
 			if (!PyString_Check(s)) {
 				Py_DECREF(s);
 				return "(encoder failed to return a string)";
 			}
 			size = PyString_GET_SIZE(s);

 			/* Write output; output is guaranteed to be
 			   0-terminated */
 			if (*format == '#') { 
 				/* Using buffer length parameter '#':

 				   - if *buffer is NULL, a new buffer
 				   of the needed size is allocated and
 				   the data copied into it; *buffer is
 				   updated to point to the new buffer;
 				   the caller is responsible for
 				   free()ing it after usage

 				   - if *buffer is not NULL, the data
 				   is copied to *buffer; *buffer_len
 				   has to be set to the size of the
 				   buffer on input; buffer overflow is
 				   signalled with an error; buffer has
 				   to provide enough room for the
 				   encoded string plus the trailing
 				   0-byte

 				   - in both cases, *buffer_len is
 				   updated to the size of the buffer
 				   /excluding/ the trailing 0-byte

 				*/
 				int *buffer_len = va_arg(*p_va, int *);

 				format++;
 				if (buffer_len == NULL) {
 					/* s is still owned here; release
 					   it before bailing out */
 					Py_DECREF(s);
 					return "(buffer_len is NULL)";
 				}
 				if (*buffer == NULL) {
 					*buffer = PyMem_NEW(char, size + 1);
 					if (*buffer == NULL) {
 						Py_DECREF(s);
 						return "(memory error)";
 					}
 				} else {
 					if (size + 1 > *buffer_len) {
 						Py_DECREF(s);
 						return "(buffer overflow)";
 					}
 				}
 				/* size + 1 also copies the trailing 0-byte */
 				memcpy(*buffer,
 				       PyString_AS_STRING(s),
 				       size + 1);
 				*buffer_len = size;
 			} else {
 				/* Using a 0-terminated buffer:

 				   - the encoded string must not
 				   contain embedded 0-bytes for this
 				   variant to work (the caller gets no
 				   length information); if it does, an
 				   error is raised

 				   - a new buffer of the needed size
 				   is allocated and the data copied
 				   into it; *buffer is updated to
 				   point to the new buffer; the caller
 				   is responsible for free()ing it
 				   after usage

 				 */
 				if ((int)strlen(PyString_AS_STRING(s)) != size) {
 					/* s is still owned here; release
 					   it before bailing out */
 					Py_DECREF(s);
 					return "(encoded string without "
 					       "NULL bytes)";
 				}
 				*buffer = PyMem_NEW(char, size + 1);
 				if (*buffer == NULL) {
 					Py_DECREF(s);
 					return "(memory error)";
 				}
 				/* size + 1 also copies the trailing 0-byte */
 				memcpy(*buffer,
 				       PyString_AS_STRING(s),
 				       size + 1);
 			}
 			Py_DECREF(s);
 			break;
 		}
 	case 'S': /* string object */
 		{
 			PyObject **p = va_arg(*p_va, PyObject **);