mirror of
https://github.com/python/cpython.git
synced 2025-09-26 18:29:57 +00:00
Issue #5006: Better handling of unicode byte-order marks (BOM) in the io library.
This means, for example, that opening a UTF-16 text file in append mode doesn't add a BOM at the end of the file if the file isn't empty.
This commit is contained in:
parent
b565577aa7
commit
e450185b4a
6 changed files with 168 additions and 22 deletions
20
Lib/_pyio.py
20
Lib/_pyio.py
|
@ -1436,6 +1436,15 @@ class TextIOWrapper(TextIOBase):
|
||||||
self._snapshot = None # info for reconstructing decoder state
|
self._snapshot = None # info for reconstructing decoder state
|
||||||
self._seekable = self._telling = self.buffer.seekable()
|
self._seekable = self._telling = self.buffer.seekable()
|
||||||
|
|
||||||
|
if self._seekable and self.writable():
|
||||||
|
position = self.buffer.tell()
|
||||||
|
if position != 0:
|
||||||
|
try:
|
||||||
|
self._get_encoder().setstate(0)
|
||||||
|
except LookupError:
|
||||||
|
# Sometimes the encoder doesn't exist
|
||||||
|
pass
|
||||||
|
|
||||||
# self._snapshot is either None, or a tuple (dec_flags, next_input)
|
# self._snapshot is either None, or a tuple (dec_flags, next_input)
|
||||||
# where dec_flags is the second (integer) item of the decoder state
|
# where dec_flags is the second (integer) item of the decoder state
|
||||||
# and next_input is the chunk of input bytes that comes next after the
|
# and next_input is the chunk of input bytes that comes next after the
|
||||||
|
@ -1741,6 +1750,17 @@ class TextIOWrapper(TextIOBase):
|
||||||
raise IOError("can't restore logical file position")
|
raise IOError("can't restore logical file position")
|
||||||
self._decoded_chars_used = chars_to_skip
|
self._decoded_chars_used = chars_to_skip
|
||||||
|
|
||||||
|
# Finally, reset the encoder (merely useful for proper BOM handling)
|
||||||
|
try:
|
||||||
|
encoder = self._encoder or self._get_encoder()
|
||||||
|
except LookupError:
|
||||||
|
# Sometimes the encoder doesn't exist
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
if cookie != 0:
|
||||||
|
encoder.setstate(0)
|
||||||
|
else:
|
||||||
|
encoder.reset()
|
||||||
return cookie
|
return cookie
|
||||||
|
|
||||||
def read(self, n=None):
|
def read(self, n=None):
|
||||||
|
|
|
@ -1963,6 +1963,37 @@ class TextIOWrapperTest(unittest.TestCase):
|
||||||
|
|
||||||
self.assertEqual(buffer.seekable(), txt.seekable())
|
self.assertEqual(buffer.seekable(), txt.seekable())
|
||||||
|
|
||||||
|
def test_append_bom(self):
|
||||||
|
# The BOM is not written again when appending to a non-empty file
|
||||||
|
filename = support.TESTFN
|
||||||
|
for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
|
||||||
|
with self.open(filename, 'w', encoding=charset) as f:
|
||||||
|
f.write('aaa')
|
||||||
|
pos = f.tell()
|
||||||
|
with self.open(filename, 'rb') as f:
|
||||||
|
self.assertEquals(f.read(), 'aaa'.encode(charset))
|
||||||
|
|
||||||
|
with self.open(filename, 'a', encoding=charset) as f:
|
||||||
|
f.write('xxx')
|
||||||
|
with self.open(filename, 'rb') as f:
|
||||||
|
self.assertEquals(f.read(), 'aaaxxx'.encode(charset))
|
||||||
|
|
||||||
|
def test_seek_bom(self):
|
||||||
|
# Same test, but when seeking manually
|
||||||
|
filename = support.TESTFN
|
||||||
|
for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
|
||||||
|
with self.open(filename, 'w', encoding=charset) as f:
|
||||||
|
f.write('aaa')
|
||||||
|
pos = f.tell()
|
||||||
|
with self.open(filename, 'r+', encoding=charset) as f:
|
||||||
|
f.seek(pos)
|
||||||
|
f.write('zzz')
|
||||||
|
f.seek(0)
|
||||||
|
f.write('bbb')
|
||||||
|
with self.open(filename, 'rb') as f:
|
||||||
|
self.assertEquals(f.read(), 'bbbzzz'.encode(charset))
|
||||||
|
|
||||||
|
|
||||||
class CTextIOWrapperTest(TextIOWrapperTest):
|
class CTextIOWrapperTest(TextIOWrapperTest):
|
||||||
|
|
||||||
def test_initialization(self):
|
def test_initialization(self):
|
||||||
|
|
|
@ -23,6 +23,11 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #5006: Better handling of unicode byte-order marks (BOM) in the io
|
||||||
|
library. This means, for example, that opening a UTF-16 text file in
|
||||||
|
append mode doesn't add a BOM at the end of the file if the file isn't
|
||||||
|
empty.
|
||||||
|
|
||||||
- Issue #4050: inspect.findsource/getsource now raise an IOError if the 'source'
|
- Issue #4050: inspect.findsource/getsource now raise an IOError if the 'source'
|
||||||
file is a binary. Patch by Brodie Rao, tests by Daniel Diniz. This fix
|
file is a binary. Patch by Brodie Rao, tests by Daniel Diniz. This fix
|
||||||
corrects a pydoc regression.
|
corrects a pydoc regression.
|
||||||
|
|
|
@ -41,6 +41,7 @@ PyObject *_PyIO_str_readline;
|
||||||
PyObject *_PyIO_str_reset;
|
PyObject *_PyIO_str_reset;
|
||||||
PyObject *_PyIO_str_seek;
|
PyObject *_PyIO_str_seek;
|
||||||
PyObject *_PyIO_str_seekable;
|
PyObject *_PyIO_str_seekable;
|
||||||
|
PyObject *_PyIO_str_setstate;
|
||||||
PyObject *_PyIO_str_tell;
|
PyObject *_PyIO_str_tell;
|
||||||
PyObject *_PyIO_str_truncate;
|
PyObject *_PyIO_str_truncate;
|
||||||
PyObject *_PyIO_str_writable;
|
PyObject *_PyIO_str_writable;
|
||||||
|
@ -48,6 +49,7 @@ PyObject *_PyIO_str_write;
|
||||||
|
|
||||||
PyObject *_PyIO_empty_str;
|
PyObject *_PyIO_empty_str;
|
||||||
PyObject *_PyIO_empty_bytes;
|
PyObject *_PyIO_empty_bytes;
|
||||||
|
PyObject *_PyIO_zero;
|
||||||
|
|
||||||
|
|
||||||
PyDoc_STRVAR(module_doc,
|
PyDoc_STRVAR(module_doc,
|
||||||
|
@ -734,6 +736,8 @@ PyInit__io(void)
|
||||||
goto fail;
|
goto fail;
|
||||||
if (!(_PyIO_str_seekable = PyUnicode_InternFromString("seekable")))
|
if (!(_PyIO_str_seekable = PyUnicode_InternFromString("seekable")))
|
||||||
goto fail;
|
goto fail;
|
||||||
|
if (!(_PyIO_str_setstate = PyUnicode_InternFromString("setstate")))
|
||||||
|
goto fail;
|
||||||
if (!(_PyIO_str_tell = PyUnicode_InternFromString("tell")))
|
if (!(_PyIO_str_tell = PyUnicode_InternFromString("tell")))
|
||||||
goto fail;
|
goto fail;
|
||||||
if (!(_PyIO_str_truncate = PyUnicode_InternFromString("truncate")))
|
if (!(_PyIO_str_truncate = PyUnicode_InternFromString("truncate")))
|
||||||
|
@ -747,6 +751,8 @@ PyInit__io(void)
|
||||||
goto fail;
|
goto fail;
|
||||||
if (!(_PyIO_empty_bytes = PyBytes_FromStringAndSize(NULL, 0)))
|
if (!(_PyIO_empty_bytes = PyBytes_FromStringAndSize(NULL, 0)))
|
||||||
goto fail;
|
goto fail;
|
||||||
|
if (!(_PyIO_zero = PyLong_FromLong(0L)))
|
||||||
|
goto fail;
|
||||||
|
|
||||||
state->initialized = 1;
|
state->initialized = 1;
|
||||||
|
|
||||||
|
|
|
@ -141,6 +141,7 @@ extern PyObject *_PyIO_str_readline;
|
||||||
extern PyObject *_PyIO_str_reset;
|
extern PyObject *_PyIO_str_reset;
|
||||||
extern PyObject *_PyIO_str_seek;
|
extern PyObject *_PyIO_str_seek;
|
||||||
extern PyObject *_PyIO_str_seekable;
|
extern PyObject *_PyIO_str_seekable;
|
||||||
|
extern PyObject *_PyIO_str_setstate;
|
||||||
extern PyObject *_PyIO_str_tell;
|
extern PyObject *_PyIO_str_tell;
|
||||||
extern PyObject *_PyIO_str_truncate;
|
extern PyObject *_PyIO_str_truncate;
|
||||||
extern PyObject *_PyIO_str_writable;
|
extern PyObject *_PyIO_str_writable;
|
||||||
|
@ -148,3 +149,4 @@ extern PyObject *_PyIO_str_write;
|
||||||
|
|
||||||
extern PyObject *_PyIO_empty_str;
|
extern PyObject *_PyIO_empty_str;
|
||||||
extern PyObject *_PyIO_empty_bytes;
|
extern PyObject *_PyIO_empty_bytes;
|
||||||
|
extern PyObject *_PyIO_zero;
|
||||||
|
|
|
@ -647,6 +647,8 @@ typedef struct
|
||||||
char telling;
|
char telling;
|
||||||
/* Specialized encoding func (see below) */
|
/* Specialized encoding func (see below) */
|
||||||
encodefunc_t encodefunc;
|
encodefunc_t encodefunc;
|
||||||
|
/* Whether or not it's the start of the stream */
|
||||||
|
char encoding_start_of_stream;
|
||||||
|
|
||||||
/* Reads and writes are internally buffered in order to speed things up.
|
/* Reads and writes are internally buffered in order to speed things up.
|
||||||
However, any read will first flush the write buffer if it isn't empty.
|
However, any read will first flush the write buffer if it isn't empty.
|
||||||
|
@ -707,21 +709,50 @@ utf16le_encode(PyTextIOWrapperObject *self, PyObject *text)
|
||||||
static PyObject *
|
static PyObject *
|
||||||
utf16_encode(PyTextIOWrapperObject *self, PyObject *text)
|
utf16_encode(PyTextIOWrapperObject *self, PyObject *text)
|
||||||
{
|
{
|
||||||
PyObject *res;
|
if (!self->encoding_start_of_stream) {
|
||||||
res = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
|
/* Skip the BOM and use native byte ordering */
|
||||||
|
#if defined(WORDS_BIGENDIAN)
|
||||||
|
return utf16be_encode(self, text);
|
||||||
|
#else
|
||||||
|
return utf16le_encode(self, text);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
|
||||||
PyUnicode_GET_SIZE(text),
|
PyUnicode_GET_SIZE(text),
|
||||||
PyBytes_AS_STRING(self->errors), 0);
|
PyBytes_AS_STRING(self->errors), 0);
|
||||||
if (res == NULL)
|
|
||||||
return NULL;
|
|
||||||
/* Next writes will skip the BOM and use native byte ordering */
|
|
||||||
#if defined(WORDS_BIGENDIAN)
|
|
||||||
self->encodefunc = (encodefunc_t) utf16be_encode;
|
|
||||||
#else
|
|
||||||
self->encodefunc = (encodefunc_t) utf16le_encode;
|
|
||||||
#endif
|
|
||||||
return res;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
utf32be_encode(PyTextIOWrapperObject *self, PyObject *text)
|
||||||
|
{
|
||||||
|
return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
|
||||||
|
PyUnicode_GET_SIZE(text),
|
||||||
|
PyBytes_AS_STRING(self->errors), 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
utf32le_encode(PyTextIOWrapperObject *self, PyObject *text)
|
||||||
|
{
|
||||||
|
return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
|
||||||
|
PyUnicode_GET_SIZE(text),
|
||||||
|
PyBytes_AS_STRING(self->errors), -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
utf32_encode(PyTextIOWrapperObject *self, PyObject *text)
|
||||||
|
{
|
||||||
|
if (!self->encoding_start_of_stream) {
|
||||||
|
/* Skip the BOM and use native byte ordering */
|
||||||
|
#if defined(WORDS_BIGENDIAN)
|
||||||
|
return utf32be_encode(self, text);
|
||||||
|
#else
|
||||||
|
return utf32le_encode(self, text);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
|
||||||
|
PyUnicode_GET_SIZE(text),
|
||||||
|
PyBytes_AS_STRING(self->errors), 0);
|
||||||
|
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
utf8_encode(PyTextIOWrapperObject *self, PyObject *text)
|
utf8_encode(PyTextIOWrapperObject *self, PyObject *text)
|
||||||
|
@ -749,10 +780,13 @@ typedef struct {
|
||||||
static encodefuncentry encodefuncs[] = {
|
static encodefuncentry encodefuncs[] = {
|
||||||
{"ascii", (encodefunc_t) ascii_encode},
|
{"ascii", (encodefunc_t) ascii_encode},
|
||||||
{"iso8859-1", (encodefunc_t) latin1_encode},
|
{"iso8859-1", (encodefunc_t) latin1_encode},
|
||||||
|
{"utf-8", (encodefunc_t) utf8_encode},
|
||||||
{"utf-16-be", (encodefunc_t) utf16be_encode},
|
{"utf-16-be", (encodefunc_t) utf16be_encode},
|
||||||
{"utf-16-le", (encodefunc_t) utf16le_encode},
|
{"utf-16-le", (encodefunc_t) utf16le_encode},
|
||||||
{"utf-16", (encodefunc_t) utf16_encode},
|
{"utf-16", (encodefunc_t) utf16_encode},
|
||||||
{"utf-8", (encodefunc_t) utf8_encode},
|
{"utf-32-be", (encodefunc_t) utf32be_encode},
|
||||||
|
{"utf-32-le", (encodefunc_t) utf32le_encode},
|
||||||
|
{"utf-32", (encodefunc_t) utf32_encode},
|
||||||
{NULL, NULL}
|
{NULL, NULL}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -978,6 +1012,33 @@ TextIOWrapper_init(PyTextIOWrapperObject *self, PyObject *args, PyObject *kwds)
|
||||||
self->seekable = self->telling = PyObject_IsTrue(res);
|
self->seekable = self->telling = PyObject_IsTrue(res);
|
||||||
Py_DECREF(res);
|
Py_DECREF(res);
|
||||||
|
|
||||||
|
self->encoding_start_of_stream = 0;
|
||||||
|
if (self->seekable && self->encoder) {
|
||||||
|
PyObject *cookieObj;
|
||||||
|
int cmp;
|
||||||
|
|
||||||
|
self->encoding_start_of_stream = 1;
|
||||||
|
|
||||||
|
cookieObj = PyObject_CallMethodObjArgs(buffer, _PyIO_str_tell, NULL);
|
||||||
|
if (cookieObj == NULL)
|
||||||
|
goto error;
|
||||||
|
|
||||||
|
cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
|
||||||
|
Py_DECREF(cookieObj);
|
||||||
|
if (cmp < 0) {
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cmp == 0) {
|
||||||
|
self->encoding_start_of_stream = 0;
|
||||||
|
res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate,
|
||||||
|
_PyIO_zero, NULL);
|
||||||
|
if (res == NULL)
|
||||||
|
goto error;
|
||||||
|
Py_DECREF(res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
self->ok = 1;
|
self->ok = 1;
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
@ -1192,8 +1253,10 @@ TextIOWrapper_write(PyTextIOWrapperObject *self, PyObject *args)
|
||||||
needflush = 1;
|
needflush = 1;
|
||||||
|
|
||||||
/* XXX What if we were just reading? */
|
/* XXX What if we were just reading? */
|
||||||
if (self->encodefunc != NULL)
|
if (self->encodefunc != NULL) {
|
||||||
b = (*self->encodefunc)((PyObject *) self, text);
|
b = (*self->encodefunc)((PyObject *) self, text);
|
||||||
|
self->encoding_start_of_stream = 0;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
b = PyObject_CallMethodObjArgs(self->encoder,
|
b = PyObject_CallMethodObjArgs(self->encoder,
|
||||||
_PyIO_str_encode, text, NULL);
|
_PyIO_str_encode, text, NULL);
|
||||||
|
@ -1847,24 +1910,38 @@ _TextIOWrapper_decoder_setstate(PyTextIOWrapperObject *self,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
_TextIOWrapper_encoder_setstate(PyTextIOWrapperObject *self,
|
||||||
|
CookieStruct *cookie)
|
||||||
|
{
|
||||||
|
PyObject *res;
|
||||||
|
/* Same as _TextIOWrapper_decoder_setstate() above. */
|
||||||
|
if (cookie->start_pos == 0 && cookie->dec_flags == 0) {
|
||||||
|
res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_reset, NULL);
|
||||||
|
self->encoding_start_of_stream = 1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate,
|
||||||
|
_PyIO_zero, NULL);
|
||||||
|
self->encoding_start_of_stream = 0;
|
||||||
|
}
|
||||||
|
if (res == NULL)
|
||||||
|
return -1;
|
||||||
|
Py_DECREF(res);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
|
TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
|
||||||
{
|
{
|
||||||
PyObject *cookieObj, *posobj;
|
PyObject *cookieObj, *posobj;
|
||||||
CookieStruct cookie;
|
CookieStruct cookie;
|
||||||
int whence = 0;
|
int whence = 0;
|
||||||
static PyObject *zero = NULL;
|
|
||||||
PyObject *res;
|
PyObject *res;
|
||||||
int cmp;
|
int cmp;
|
||||||
|
|
||||||
CHECK_INITIALIZED(self);
|
CHECK_INITIALIZED(self);
|
||||||
|
|
||||||
if (zero == NULL) {
|
|
||||||
zero = PyLong_FromLong(0L);
|
|
||||||
if (zero == NULL)
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "O|i:seek", &cookieObj, &whence))
|
if (!PyArg_ParseTuple(args, "O|i:seek", &cookieObj, &whence))
|
||||||
return NULL;
|
return NULL;
|
||||||
CHECK_CLOSED(self);
|
CHECK_CLOSED(self);
|
||||||
|
@ -1879,7 +1956,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
|
||||||
|
|
||||||
if (whence == 1) {
|
if (whence == 1) {
|
||||||
/* seek relative to current position */
|
/* seek relative to current position */
|
||||||
cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ);
|
cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
|
||||||
if (cmp < 0)
|
if (cmp < 0)
|
||||||
goto fail;
|
goto fail;
|
||||||
|
|
||||||
|
@ -1900,7 +1977,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
|
||||||
else if (whence == 2) {
|
else if (whence == 2) {
|
||||||
/* seek relative to end of file */
|
/* seek relative to end of file */
|
||||||
|
|
||||||
cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ);
|
cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
|
||||||
if (cmp < 0)
|
if (cmp < 0)
|
||||||
goto fail;
|
goto fail;
|
||||||
|
|
||||||
|
@ -1934,7 +2011,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
|
|
||||||
cmp = PyObject_RichCompareBool(cookieObj, zero, Py_LT);
|
cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_LT);
|
||||||
if (cmp < 0)
|
if (cmp < 0)
|
||||||
goto fail;
|
goto fail;
|
||||||
|
|
||||||
|
@ -2013,6 +2090,11 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Finally, reset the encoder (merely useful for proper BOM handling) */
|
||||||
|
if (self->encoder) {
|
||||||
|
if (_TextIOWrapper_encoder_setstate(self, &cookie) < 0)
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
return cookieObj;
|
return cookieObj;
|
||||||
fail:
|
fail:
|
||||||
Py_XDECREF(cookieObj);
|
Py_XDECREF(cookieObj);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue