Issue #10783: struct.pack() no longer implicitly encodes unicode to UTF-8

* Replace "bytes" with "bytes object" in struct error messages
 * Document the API change in What's New in Python 3.2
 * Fix test_wave
 * Also remove ugly implicit conversions in test_struct
This commit is contained in:
Victor Stinner 2010-12-28 13:26:42 +00:00
parent e398da9ad0
commit da9ec995f6
6 changed files with 83 additions and 107 deletions

View file

@ -164,58 +164,53 @@ platform-dependent.
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``c`` | :c:type:`char` | bytes of length 1 | 1 | | | ``c`` | :c:type:`char` | bytes of length 1 | 1 | |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``b`` | :c:type:`signed char` | integer | 1 | \(1),\(4) | | ``b`` | :c:type:`signed char` | integer | 1 | \(1),\(3) |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``B`` | :c:type:`unsigned char` | integer | 1 | \(4) | | ``B`` | :c:type:`unsigned char` | integer | 1 | \(3) |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``?`` | :c:type:`_Bool` | bool | 1 | \(2) | | ``?`` | :c:type:`_Bool` | bool | 1 | \(1) |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``h`` | :c:type:`short` | integer | 2 | \(4) | | ``h`` | :c:type:`short` | integer | 2 | \(3) |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``H`` | :c:type:`unsigned short` | integer | 2 | \(4) | | ``H`` | :c:type:`unsigned short` | integer | 2 | \(3) |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``i`` | :c:type:`int` | integer | 4 | \(4) | | ``i`` | :c:type:`int` | integer | 4 | \(3) |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``I`` | :c:type:`unsigned int` | integer | 4 | \(4) | | ``I`` | :c:type:`unsigned int` | integer | 4 | \(3) |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``l`` | :c:type:`long` | integer | 4 | \(4) | | ``l`` | :c:type:`long` | integer | 4 | \(3) |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``L`` | :c:type:`unsigned long` | integer | 4 | \(4) | | ``L`` | :c:type:`unsigned long` | integer | 4 | \(3) |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``q`` | :c:type:`long long` | integer | 8 | \(3), \(4) | | ``q`` | :c:type:`long long` | integer | 8 | \(2), \(3) |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``Q`` | :c:type:`unsigned long | integer | 8 | \(3), \(4) | | ``Q`` | :c:type:`unsigned long | integer | 8 | \(2), \(3) |
| | long` | | | | | | long` | | | |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``f`` | :c:type:`float` | float | 4 | \(5) | | ``f`` | :c:type:`float` | float | 4 | \(4) |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``d`` | :c:type:`double` | float | 8 | \(5) | | ``d`` | :c:type:`double` | float | 8 | \(4) |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``s`` | :c:type:`char[]` | bytes | | \(1) | | ``s`` | :c:type:`char[]` | bytes | | |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``p`` | :c:type:`char[]` | bytes | | \(1) | | ``p`` | :c:type:`char[]` | bytes | | |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
| ``P`` | :c:type:`void \*` | integer | | \(6) | | ``P`` | :c:type:`void \*` | integer | | \(5) |
+--------+--------------------------+--------------------+----------------+------------+ +--------+--------------------------+--------------------+----------------+------------+
Notes: Notes:
(1) (1)
The ``c``, ``s`` and ``p`` conversion codes operate on :class:`bytes`
objects, but packing with such codes also supports :class:`str` objects,
which are encoded using UTF-8.
(2)
The ``'?'`` conversion code corresponds to the :c:type:`_Bool` type defined by The ``'?'`` conversion code corresponds to the :c:type:`_Bool` type defined by
C99. If this type is not available, it is simulated using a :c:type:`char`. In C99. If this type is not available, it is simulated using a :c:type:`char`. In
standard mode, it is always represented by one byte. standard mode, it is always represented by one byte.
(3) (2)
The ``'q'`` and ``'Q'`` conversion codes are available in native mode only if The ``'q'`` and ``'Q'`` conversion codes are available in native mode only if
the platform C compiler supports C :c:type:`long long`, or, on Windows, the platform C compiler supports C :c:type:`long long`, or, on Windows,
:c:type:`__int64`. They are always available in standard modes. :c:type:`__int64`. They are always available in standard modes.
(4) (3)
When attempting to pack a non-integer using any of the integer conversion When attempting to pack a non-integer using any of the integer conversion
codes, if the non-integer has a :meth:`__index__` method then that method is codes, if the non-integer has a :meth:`__index__` method then that method is
called to convert the argument to an integer before packing. called to convert the argument to an integer before packing.
@ -223,12 +218,12 @@ Notes:
.. versionchanged:: 3.2 .. versionchanged:: 3.2
Use of the :meth:`__index__` method for non-integers is new in 3.2. Use of the :meth:`__index__` method for non-integers is new in 3.2.
(5) (4)
For the ``'f'`` and ``'d'`` conversion codes, the packed representation uses For the ``'f'`` and ``'d'`` conversion codes, the packed representation uses
the IEEE 754 binary32 (for ``'f'``) or binary64 (for ``'d'``) format, the IEEE 754 binary32 (for ``'f'``) or binary64 (for ``'d'``) format,
regardless of the floating-point format used by the platform. regardless of the floating-point format used by the platform.
(6) (5)
The ``'P'`` format character is only available for the native byte ordering The ``'P'`` format character is only available for the native byte ordering
(selected as the default or with the ``'@'`` byte order character). The byte (selected as the default or with the ``'@'`` byte order character). The byte
order character ``'='`` chooses to use little- or big-endian ordering based order character ``'='`` chooses to use little- or big-endian ordering based
@ -310,9 +305,9 @@ the result in a named tuple::
The ordering of format characters may have an impact on size since the padding The ordering of format characters may have an impact on size since the padding
needed to satisfy alignment requirements is different:: needed to satisfy alignment requirements is different::
>>> pack('ci', '*', 0x12131415) >>> pack('ci', b'*', 0x12131415)
b'*\x00\x00\x00\x12\x13\x14\x15' b'*\x00\x00\x00\x12\x13\x14\x15'
>>> pack('ic', 0x12131415, '*') >>> pack('ic', 0x12131415, b'*')
b'\x12\x13\x14\x15*' b'\x12\x13\x14\x15*'
>>> calcsize('ci') >>> calcsize('ci')
8 8

View file

@ -1705,3 +1705,7 @@ require changes to your code:
(Contributed by Georg Brandl and Mattias Brändström; (Contributed by Georg Brandl and Mattias Brändström;
`appspot issue 53094 <http://codereview.appspot.com/53094>`_.) `appspot issue 53094 <http://codereview.appspot.com/53094>`_.)
* :func:`struct.pack` no longer implicitly encodes unicode strings to UTF-8:
use explicit conversion instead, and replace unicode literals with bytes literals.

View file

@ -82,58 +82,52 @@ class StructTest(unittest.TestCase):
# Test some of the new features in detail # Test some of the new features in detail
# (format, argument, big-endian result, little-endian result, asymmetric) # (format, argument, big-endian result, little-endian result, asymmetric)
tests = [ tests = [
('c', 'a', 'a', 'a', 0), ('c', b'a', b'a', b'a', 0),
('xc', 'a', '\0a', '\0a', 0), ('xc', b'a', b'\0a', b'\0a', 0),
('cx', 'a', 'a\0', 'a\0', 0), ('cx', b'a', b'a\0', b'a\0', 0),
('s', 'a', 'a', 'a', 0), ('s', b'a', b'a', b'a', 0),
('0s', 'helloworld', '', '', 1), ('0s', b'helloworld', b'', b'', 1),
('1s', 'helloworld', 'h', 'h', 1), ('1s', b'helloworld', b'h', b'h', 1),
('9s', 'helloworld', 'helloworl', 'helloworl', 1), ('9s', b'helloworld', b'helloworl', b'helloworl', 1),
('10s', 'helloworld', 'helloworld', 'helloworld', 0), ('10s', b'helloworld', b'helloworld', b'helloworld', 0),
('11s', 'helloworld', 'helloworld\0', 'helloworld\0', 1), ('11s', b'helloworld', b'helloworld\0', b'helloworld\0', 1),
('20s', 'helloworld', 'helloworld'+10*'\0', 'helloworld'+10*'\0', 1), ('20s', b'helloworld', b'helloworld'+10*b'\0', b'helloworld'+10*b'\0', 1),
('b', 7, '\7', '\7', 0), ('b', 7, b'\7', b'\7', 0),
('b', -7, '\371', '\371', 0), ('b', -7, b'\371', b'\371', 0),
('B', 7, '\7', '\7', 0), ('B', 7, b'\7', b'\7', 0),
('B', 249, '\371', '\371', 0), ('B', 249, b'\371', b'\371', 0),
('h', 700, '\002\274', '\274\002', 0), ('h', 700, b'\002\274', b'\274\002', 0),
('h', -700, '\375D', 'D\375', 0), ('h', -700, b'\375D', b'D\375', 0),
('H', 700, '\002\274', '\274\002', 0), ('H', 700, b'\002\274', b'\274\002', 0),
('H', 0x10000-700, '\375D', 'D\375', 0), ('H', 0x10000-700, b'\375D', b'D\375', 0),
('i', 70000000, '\004,\035\200', '\200\035,\004', 0), ('i', 70000000, b'\004,\035\200', b'\200\035,\004', 0),
('i', -70000000, '\373\323\342\200', '\200\342\323\373', 0), ('i', -70000000, b'\373\323\342\200', b'\200\342\323\373', 0),
('I', 70000000, '\004,\035\200', '\200\035,\004', 0), ('I', 70000000, b'\004,\035\200', b'\200\035,\004', 0),
('I', 0x100000000-70000000, '\373\323\342\200', '\200\342\323\373', 0), ('I', 0x100000000-70000000, b'\373\323\342\200', b'\200\342\323\373', 0),
('l', 70000000, '\004,\035\200', '\200\035,\004', 0), ('l', 70000000, b'\004,\035\200', b'\200\035,\004', 0),
('l', -70000000, '\373\323\342\200', '\200\342\323\373', 0), ('l', -70000000, b'\373\323\342\200', b'\200\342\323\373', 0),
('L', 70000000, '\004,\035\200', '\200\035,\004', 0), ('L', 70000000, b'\004,\035\200', b'\200\035,\004', 0),
('L', 0x100000000-70000000, '\373\323\342\200', '\200\342\323\373', 0), ('L', 0x100000000-70000000, b'\373\323\342\200', b'\200\342\323\373', 0),
('f', 2.0, '@\000\000\000', '\000\000\000@', 0), ('f', 2.0, b'@\000\000\000', b'\000\000\000@', 0),
('d', 2.0, '@\000\000\000\000\000\000\000', ('d', 2.0, b'@\000\000\000\000\000\000\000',
'\000\000\000\000\000\000\000@', 0), b'\000\000\000\000\000\000\000@', 0),
('f', -2.0, '\300\000\000\000', '\000\000\000\300', 0), ('f', -2.0, b'\300\000\000\000', b'\000\000\000\300', 0),
('d', -2.0, '\300\000\000\000\000\000\000\000', ('d', -2.0, b'\300\000\000\000\000\000\000\000',
'\000\000\000\000\000\000\000\300', 0), b'\000\000\000\000\000\000\000\300', 0),
('?', 0, '\0', '\0', 0), ('?', 0, b'\0', b'\0', 0),
('?', 3, '\1', '\1', 1), ('?', 3, b'\1', b'\1', 1),
('?', True, '\1', '\1', 0), ('?', True, b'\1', b'\1', 0),
('?', [], '\0', '\0', 1), ('?', [], b'\0', b'\0', 1),
('?', (1,), '\1', '\1', 1), ('?', (1,), b'\1', b'\1', 1),
] ]
for fmt, arg, big, lil, asy in tests: for fmt, arg, big, lil, asy in tests:
big = bytes(big, "latin-1")
lil = bytes(lil, "latin-1")
for (xfmt, exp) in [('>'+fmt, big), ('!'+fmt, big), ('<'+fmt, lil), for (xfmt, exp) in [('>'+fmt, big), ('!'+fmt, big), ('<'+fmt, lil),
('='+fmt, ISBIGENDIAN and big or lil)]: ('='+fmt, ISBIGENDIAN and big or lil)]:
res = struct.pack(xfmt, arg) res = struct.pack(xfmt, arg)
self.assertEqual(res, exp) self.assertEqual(res, exp)
self.assertEqual(struct.calcsize(xfmt), len(res)) self.assertEqual(struct.calcsize(xfmt), len(res))
rev = struct.unpack(xfmt, res)[0] rev = struct.unpack(xfmt, res)[0]
if isinstance(arg, str):
# Strings are returned as bytes since you can't know the
# encoding of the string when packed.
arg = bytes(arg, 'latin1')
if rev != arg: if rev != arg:
self.assertTrue(asy) self.assertTrue(asy)
@ -334,15 +328,14 @@ class StructTest(unittest.TestCase):
def test_p_code(self): def test_p_code(self):
# Test p ("Pascal string") code. # Test p ("Pascal string") code.
for code, input, expected, expectedback in [ for code, input, expected, expectedback in [
('p','abc', '\x00', b''), ('p', b'abc', b'\x00', b''),
('1p', 'abc', '\x00', b''), ('1p', b'abc', b'\x00', b''),
('2p', 'abc', '\x01a', b'a'), ('2p', b'abc', b'\x01a', b'a'),
('3p', 'abc', '\x02ab', b'ab'), ('3p', b'abc', b'\x02ab', b'ab'),
('4p', 'abc', '\x03abc', b'abc'), ('4p', b'abc', b'\x03abc', b'abc'),
('5p', 'abc', '\x03abc\x00', b'abc'), ('5p', b'abc', b'\x03abc\x00', b'abc'),
('6p', 'abc', '\x03abc\x00\x00', b'abc'), ('6p', b'abc', b'\x03abc\x00\x00', b'abc'),
('1000p', 'x'*1000, '\xff' + 'x'*999, b'x'*255)]: ('1000p', b'x'*1000, b'\xff' + b'x'*999, b'x'*255)]:
expected = bytes(expected, "latin-1")
got = struct.pack(code, input) got = struct.pack(code, input)
self.assertEqual(got, expected) self.assertEqual(got, expected)
(got,) = struct.unpack(code, got) (got,) = struct.unpack(code, got)
@ -401,15 +394,11 @@ class StructTest(unittest.TestCase):
s = struct.Struct(fmt) s = struct.Struct(fmt)
for cls in (bytes, bytearray): for cls in (bytes, bytearray):
data = cls(test_string) data = cls(test_string)
if not isinstance(data, (bytes, bytearray)):
bytes_data = bytes(data, 'latin1')
else:
bytes_data = data
self.assertEqual(s.unpack_from(data), (b'abcd',)) self.assertEqual(s.unpack_from(data), (b'abcd',))
self.assertEqual(s.unpack_from(data, 2), (b'cd01',)) self.assertEqual(s.unpack_from(data, 2), (b'cd01',))
self.assertEqual(s.unpack_from(data, 4), (b'0123',)) self.assertEqual(s.unpack_from(data, 4), (b'0123',))
for i in range(6): for i in range(6):
self.assertEqual(s.unpack_from(data, i), (bytes_data[i:i+4],)) self.assertEqual(s.unpack_from(data, i), (data[i:i+4],))
for i in range(6, len(test_string) + 1): for i in range(6, len(test_string) + 1):
self.assertRaises(struct.error, s.unpack_from, data, i) self.assertRaises(struct.error, s.unpack_from, data, i)
for cls in (bytes, bytearray): for cls in (bytes, bytearray):

View file

@ -467,11 +467,11 @@ class Wave_write:
self._datalength = self._nframes * self._nchannels * self._sampwidth self._datalength = self._nframes * self._nchannels * self._sampwidth
self._form_length_pos = self._file.tell() self._form_length_pos = self._file.tell()
self._file.write(struct.pack('<l4s4slhhllhh4s', self._file.write(struct.pack('<l4s4slhhllhh4s',
36 + self._datalength, 'WAVE', 'fmt ', 16, 36 + self._datalength, b'WAVE', b'fmt ', 16,
WAVE_FORMAT_PCM, self._nchannels, self._framerate, WAVE_FORMAT_PCM, self._nchannels, self._framerate,
self._nchannels * self._framerate * self._sampwidth, self._nchannels * self._framerate * self._sampwidth,
self._nchannels * self._sampwidth, self._nchannels * self._sampwidth,
self._sampwidth * 8, 'data')) self._sampwidth * 8, b'data'))
self._data_length_pos = self._file.tell() self._data_length_pos = self._file.tell()
self._file.write(struct.pack('<l', self._datalength)) self._file.write(struct.pack('<l', self._datalength))
self._headerwritten = True self._headerwritten = True

View file

@ -18,6 +18,9 @@ Core and Builtins
Library Library
------- -------
- Issue #10783: struct.pack() no longer implicitly encodes unicode to UTF-8.
- Issue #10730: Add SVG mime types to mimetypes module. - Issue #10730: Add SVG mime types to mimetypes module.
- Issue #10768: Make the Tkinter ScrolledText widget work again. - Issue #10768: Make the Tkinter ScrolledText widget work again.

View file

@ -462,14 +462,9 @@ np_ubyte(char *p, PyObject *v, const formatdef *f)
static int static int
np_char(char *p, PyObject *v, const formatdef *f) np_char(char *p, PyObject *v, const formatdef *f)
{ {
if (PyUnicode_Check(v)) {
v = _PyUnicode_AsDefaultEncodedString(v, NULL);
if (v == NULL)
return -1;
}
if (!PyBytes_Check(v) || PyBytes_Size(v) != 1) { if (!PyBytes_Check(v) || PyBytes_Size(v) != 1) {
PyErr_SetString(StructError, PyErr_SetString(StructError,
"char format requires bytes or string of length 1"); "char format requires a bytes object of length 1");
return -1; return -1;
} }
*p = *PyBytes_AsString(v); *p = *PyBytes_AsString(v);
@ -1345,7 +1340,7 @@ s_init(PyObject *self, PyObject *args, PyObject *kwds)
if (!PyBytes_Check(o_format)) { if (!PyBytes_Check(o_format)) {
Py_DECREF(o_format); Py_DECREF(o_format);
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
"Struct() argument 1 must be bytes, not %.200s", "Struct() argument 1 must be a bytes object, not %.200s",
Py_TYPE(o_format)->tp_name); Py_TYPE(o_format)->tp_name);
return -1; return -1;
} }
@ -1423,7 +1418,7 @@ s_unpack(PyObject *self, PyObject *input)
return NULL; return NULL;
if (vbuf.len != soself->s_size) { if (vbuf.len != soself->s_size) {
PyErr_Format(StructError, PyErr_Format(StructError,
"unpack requires a bytes argument of length %zd", "unpack requires a bytes object of length %zd",
soself->s_size); soself->s_size);
PyBuffer_Release(&vbuf); PyBuffer_Release(&vbuf);
return NULL; return NULL;
@ -1503,15 +1498,10 @@ s_pack_internal(PyStructObject *soself, PyObject *args, int offset, char* buf)
if (e->format == 's') { if (e->format == 's') {
int isstring; int isstring;
void *p; void *p;
if (PyUnicode_Check(v)) {
v = _PyUnicode_AsDefaultEncodedString(v, NULL);
if (v == NULL)
return -1;
}
isstring = PyBytes_Check(v); isstring = PyBytes_Check(v);
if (!isstring && !PyByteArray_Check(v)) { if (!isstring && !PyByteArray_Check(v)) {
PyErr_SetString(StructError, PyErr_SetString(StructError,
"argument for 's' must be a bytes or string"); "argument for 's' must be a bytes object");
return -1; return -1;
} }
if (isstring) { if (isstring) {
@ -1529,15 +1519,10 @@ s_pack_internal(PyStructObject *soself, PyObject *args, int offset, char* buf)
} else if (e->format == 'p') { } else if (e->format == 'p') {
int isstring; int isstring;
void *p; void *p;
if (PyUnicode_Check(v)) {
v = _PyUnicode_AsDefaultEncodedString(v, NULL);
if (v == NULL)
return -1;
}
isstring = PyBytes_Check(v); isstring = PyBytes_Check(v);
if (!isstring && !PyByteArray_Check(v)) { if (!isstring && !PyByteArray_Check(v)) {
PyErr_SetString(StructError, PyErr_SetString(StructError,
"argument for 'p' must be a bytes or string"); "argument for 'p' must be a bytes object");
return -1; return -1;
} }
if (isstring) { if (isstring) {