mirror of
https://github.com/python/cpython.git
synced 2025-08-04 00:48:58 +00:00
Issue #16330: Use surrogate-related macros
Patch written by Serhiy Storchaka.
This commit is contained in:
parent
a5e7cd06bb
commit
76df43de30
6 changed files with 25 additions and 28 deletions
|
@ -180,9 +180,9 @@ typedef unsigned char Py_UCS1;
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/* macros to work with surrogates */
|
/* macros to work with surrogates */
|
||||||
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDFFF)
|
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
|
||||||
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF)
|
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
|
||||||
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF)
|
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
|
||||||
/* Join two surrogate characters and return a single Py_UCS4 value. */
|
/* Join two surrogate characters and return a single Py_UCS4 value. */
|
||||||
#define Py_UNICODE_JOIN_SURROGATES(high, low) \
|
#define Py_UNICODE_JOIN_SURROGATES(high, low) \
|
||||||
(((((Py_UCS4)(high) & 0x03FF) << 10) | \
|
(((((Py_UCS4)(high) & 0x03FF) << 10) | \
|
||||||
|
|
|
@ -174,14 +174,13 @@ ascii_escape_unichar(Py_UCS4 c, unsigned char *output, Py_ssize_t chars)
|
||||||
default:
|
default:
|
||||||
if (c >= 0x10000) {
|
if (c >= 0x10000) {
|
||||||
/* UTF-16 surrogate pair */
|
/* UTF-16 surrogate pair */
|
||||||
Py_UCS4 v = c - 0x10000;
|
Py_UCS4 v = Py_UNICODE_HIGH_SURROGATE(c);
|
||||||
c = 0xd800 | ((v >> 10) & 0x3ff);
|
|
||||||
output[chars++] = 'u';
|
output[chars++] = 'u';
|
||||||
output[chars++] = Py_hexdigits[(c >> 12) & 0xf];
|
output[chars++] = Py_hexdigits[(v >> 12) & 0xf];
|
||||||
output[chars++] = Py_hexdigits[(c >> 8) & 0xf];
|
output[chars++] = Py_hexdigits[(v >> 8) & 0xf];
|
||||||
output[chars++] = Py_hexdigits[(c >> 4) & 0xf];
|
output[chars++] = Py_hexdigits[(v >> 4) & 0xf];
|
||||||
output[chars++] = Py_hexdigits[(c ) & 0xf];
|
output[chars++] = Py_hexdigits[(v ) & 0xf];
|
||||||
c = 0xdc00 | (v & 0x3ff);
|
c = Py_UNICODE_LOW_SURROGATE(c);
|
||||||
output[chars++] = '\\';
|
output[chars++] = '\\';
|
||||||
}
|
}
|
||||||
output[chars++] = 'u';
|
output[chars++] = 'u';
|
||||||
|
@ -431,7 +430,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* Surrogate pair */
|
/* Surrogate pair */
|
||||||
if ((c & 0xfc00) == 0xd800) {
|
if (Py_UNICODE_IS_HIGH_SURROGATE(c)) {
|
||||||
Py_UCS4 c2 = 0;
|
Py_UCS4 c2 = 0;
|
||||||
if (end + 6 >= len) {
|
if (end + 6 >= len) {
|
||||||
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
|
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
|
||||||
|
@ -462,13 +461,13 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
|
||||||
goto bail;
|
goto bail;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ((c2 & 0xfc00) != 0xdc00) {
|
if (!Py_UNICODE_IS_LOW_SURROGATE(c2)) {
|
||||||
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
|
raise_errmsg("Unpaired high surrogate", pystr, end - 5);
|
||||||
goto bail;
|
goto bail;
|
||||||
}
|
}
|
||||||
c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
|
c = Py_UNICODE_JOIN_SURROGATES(c, c2);
|
||||||
}
|
}
|
||||||
else if ((c & 0xfc00) == 0xdc00) {
|
else if (Py_UNICODE_IS_LOW_SURROGATE(c)) {
|
||||||
raise_errmsg("Unpaired low surrogate", pystr, end - 5);
|
raise_errmsg("Unpaired low surrogate", pystr, end - 5);
|
||||||
goto bail;
|
goto bail;
|
||||||
}
|
}
|
||||||
|
|
|
@ -148,8 +148,8 @@ static const struct dbcs_map *mapping_list;
|
||||||
#if Py_UNICODE_SIZE == 2
|
#if Py_UNICODE_SIZE == 2
|
||||||
# define WRITEUCS4(c) \
|
# define WRITEUCS4(c) \
|
||||||
REQUIRE_OUTBUF(2) \
|
REQUIRE_OUTBUF(2) \
|
||||||
(*outbuf)[0] = 0xd800 + (((c) - 0x10000) >> 10); \
|
(*outbuf)[0] = Py_UNICODE_HIGH_SURROGATE(c); \
|
||||||
(*outbuf)[1] = 0xdc00 + (((c) - 0x10000) & 0x3ff); \
|
(*outbuf)[1] = Py_UNICODE_LOW_SURROGATE(c); \
|
||||||
NEXT_OUT(2)
|
NEXT_OUT(2)
|
||||||
#else
|
#else
|
||||||
# define WRITEUCS4(c) \
|
# define WRITEUCS4(c) \
|
||||||
|
@ -188,11 +188,10 @@ static const struct dbcs_map *mapping_list;
|
||||||
|
|
||||||
#if Py_UNICODE_SIZE == 2
|
#if Py_UNICODE_SIZE == 2
|
||||||
#define DECODE_SURROGATE(c) \
|
#define DECODE_SURROGATE(c) \
|
||||||
if (c >> 10 == 0xd800 >> 10) { /* high surrogate */ \
|
if (Py_UNICODE_IS_HIGH_SURROGATE(c)) { \
|
||||||
REQUIRE_INBUF(2) \
|
REQUIRE_INBUF(2) \
|
||||||
if (IN2 >> 10 == 0xdc00 >> 10) { /* low surrogate */ \
|
if (Py_UNICODE_IS_LOW_SURROGATE(IN2)) { \
|
||||||
c = 0x10000 + ((ucs4_t)(c - 0xd800) << 10) + \
|
c = Py_UNICODE_JOIN_SURROGATES(c, IN2); \
|
||||||
((ucs4_t)(IN2) - 0xdc00); \
|
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
#define GET_INSIZE(c) ((c) > 0xffff ? 2 : 1)
|
#define GET_INSIZE(c) ((c) > 0xffff ? 2 : 1)
|
||||||
|
|
|
@ -4412,7 +4412,7 @@ encode_char:
|
||||||
|
|
||||||
/* code first surrogate */
|
/* code first surrogate */
|
||||||
base64bits += 16;
|
base64bits += 16;
|
||||||
base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
|
base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
|
||||||
while (base64bits >= 6) {
|
while (base64bits >= 6) {
|
||||||
*out++ = TO_BASE64(base64buffer >> (base64bits-6));
|
*out++ = TO_BASE64(base64buffer >> (base64bits-6));
|
||||||
base64bits -= 6;
|
base64bits -= 6;
|
||||||
|
@ -7052,9 +7052,8 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
|
||||||
charsize = 1;
|
charsize = 1;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
ch -= 0x10000;
|
chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
|
||||||
chars[0] = 0xd800 + (ch >> 10);
|
chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
|
||||||
chars[1] = 0xdc00 + (ch & 0x3ff);
|
|
||||||
charsize = 2;
|
charsize = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -761,7 +761,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
|
||||||
for (i = start; i < end; i++) {
|
for (i = start; i < end; i++) {
|
||||||
/* object is guaranteed to be "ready" */
|
/* object is guaranteed to be "ready" */
|
||||||
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
|
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
|
||||||
if (ch < 0xd800 || ch > 0xdfff) {
|
if (!Py_UNICODE_IS_SURROGATE(ch)) {
|
||||||
/* Not a surrogate, fail with original exception */
|
/* Not a surrogate, fail with original exception */
|
||||||
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
||||||
Py_DECREF(res);
|
Py_DECREF(res);
|
||||||
|
@ -797,7 +797,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
|
||||||
(p[2] & 0xc0) == 0x80)) {
|
(p[2] & 0xc0) == 0x80)) {
|
||||||
/* it's a three-byte code */
|
/* it's a three-byte code */
|
||||||
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
|
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
|
||||||
if (ch < 0xd800 || ch > 0xdfff)
|
if (!Py_UNICODE_IS_SURROGATE(ch))
|
||||||
/* it's not a surrogate - fail */
|
/* it's not a surrogate - fail */
|
||||||
ch = 0;
|
ch = 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -85,7 +85,7 @@ _Py_char2wchar(const char* arg, size_t *size)
|
||||||
/* Only use the result if it contains no
|
/* Only use the result if it contains no
|
||||||
surrogate characters. */
|
surrogate characters. */
|
||||||
for (tmp = res; *tmp != 0 &&
|
for (tmp = res; *tmp != 0 &&
|
||||||
(*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
|
!Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
|
||||||
;
|
;
|
||||||
if (*tmp == 0) {
|
if (*tmp == 0) {
|
||||||
if (size != NULL)
|
if (size != NULL)
|
||||||
|
@ -131,7 +131,7 @@ _Py_char2wchar(const char* arg, size_t *size)
|
||||||
memset(&mbs, 0, sizeof mbs);
|
memset(&mbs, 0, sizeof mbs);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (*out >= 0xd800 && *out <= 0xdfff) {
|
if (Py_UNICODE_IS_SURROGATE(*out)) {
|
||||||
/* Surrogate character. Escape the original
|
/* Surrogate character. Escape the original
|
||||||
byte sequence with surrogateescape. */
|
byte sequence with surrogateescape. */
|
||||||
argsize -= converted;
|
argsize -= converted;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue