This patch changes the behaviour of the UTF-16 codec family. Only the
UTF-16 codec will now interpret and remove a *leading* BOM mark.
Subsequent BOM characters are no longer interpreted or removed.
UTF-16-LE and UTF-16-BE pass through all BOM mark characters.

These changes should bring the UTF-16 codec more in line with what
the Unicode FAQ recommends with respect to BOM marks.
This commit is contained in:
Marc-André Lemburg 2001-05-21 20:30:15 +00:00
parent f52d27e52d
commit 489b56e044
2 changed files with 31 additions and 22 deletions

View file

@@ -1001,31 +1001,39 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
if (byteorder)
bo = *byteorder;
/* Check for BOM marks (U+FEFF) in the input and adjust current
byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */
if (bo == 0) {
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (*q == 0xFEFF) {
q++;
bo = -1;
} else if (*q == 0xFFFE) {
q++;
bo = 1;
}
#else
if (*q == 0xFEFF) {
q++;
bo = 1;
} else if (*q == 0xFFFE) {
q++;
bo = -1;
}
#endif
}
while (q < e) {
register Py_UNICODE ch = *q++;
/* Check for BOM marks (U+FEFF) in the input and adjust
current byte order setting accordingly. Swap input
bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
!) */
/* Swap input bytes if needed. (This assumes
sizeof(Py_UNICODE) == 2 !) */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (ch == 0xFEFF) {
bo = -1;
continue;
} else if (ch == 0xFFFE) {
bo = 1;
continue;
}
if (bo == 1)
ch = (ch >> 8) | (ch << 8);
#else
if (ch == 0xFEFF) {
bo = 1;
continue;
} else if (ch == 0xFFFE) {
bo = -1;
continue;
}
if (bo == -1)
ch = (ch >> 8) | (ch << 8);
#endif