mirror of
https://github.com/python/cpython.git
synced 2025-07-07 19:35:27 +00:00
bpo-35883: Py_DecodeLocale() escapes invalid Unicode characters (GH-24843)
Python no longer fails at startup with a fatal error if a command line argument contains an invalid Unicode character. The Py_DecodeLocale() function now escapes byte sequences which would be decoded as Unicode characters outside the [U+0000; U+10ffff] range. Use MAX_UNICODE constant in unicodeobject.c.
This commit is contained in:
parent
6086ae7fd4
commit
9976834f80
4 changed files with 149 additions and 70 deletions
|
@ -34,6 +34,13 @@ extern int winerror_to_errno(int);
|
|||
int _Py_open_cloexec_works = -1;
|
||||
#endif
|
||||
|
||||
/* Largest valid Unicode code point.
   The value must be the same in unicodeobject.c. */
#define MAX_UNICODE 0x10ffff

/* Status codes shared by mbstowcs() and mbrtowc():
   (size_t)-1 reports a decoding error,
   (size_t)-2 reports an incomplete multibyte character. */
static const size_t DECODE_ERROR = (size_t)-1;
static const size_t INCOMPLETE_CHARACTER = (size_t)-2;
|
||||
|
||||
|
||||
static int
|
||||
get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
|
||||
|
@ -82,6 +89,57 @@ _Py_device_encoding(int fd)
|
|||
#endif
|
||||
}
|
||||
|
||||
|
||||
static size_t
|
||||
is_valid_wide_char(wchar_t ch)
|
||||
{
|
||||
if (Py_UNICODE_IS_SURROGATE(ch)) {
|
||||
// Reject lone surrogate characters
|
||||
return 0;
|
||||
}
|
||||
if (ch > MAX_UNICODE) {
|
||||
// bpo-35883: Reject characters outside [U+0000; U+10ffff] range.
|
||||
// The glibc mbstowcs() UTF-8 decoder does not respect the RFC 3629,
|
||||
// it creates characters outside the [U+0000; U+10ffff] range:
|
||||
// https://sourceware.org/bugzilla/show_bug.cgi?id=2373
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
static size_t
|
||||
_Py_mbstowcs(wchar_t *dest, const char *src, size_t n)
|
||||
{
|
||||
size_t count = mbstowcs(dest, src, n);
|
||||
if (dest != NULL && count != DECODE_ERROR) {
|
||||
for (size_t i=0; i < count; i++) {
|
||||
wchar_t ch = dest[i];
|
||||
if (!is_valid_wide_char(ch)) {
|
||||
return DECODE_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
#ifdef HAVE_MBRTOWC
|
||||
static size_t
|
||||
_Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
|
||||
{
|
||||
assert(pwc != NULL);
|
||||
size_t count = mbrtowc(pwc, str, len, pmbs);
|
||||
if (count != 0 && count != DECODE_ERROR && count != INCOMPLETE_CHARACTER) {
|
||||
if (!is_valid_wide_char(*pwc)) {
|
||||
return DECODE_ERROR;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(_Py_FORCE_UTF8_FS_ENCODING) && !defined(MS_WINDOWS)
|
||||
|
||||
#define USE_FORCE_ASCII
|
||||
|
@ -148,8 +206,8 @@ check_force_ascii(void)
|
|||
size_t res;
|
||||
|
||||
ch = (unsigned char)0xA7;
|
||||
res = mbstowcs(&wch, (char*)&ch, 1);
|
||||
if (res != (size_t)-1 && wch == L'\xA7') {
|
||||
res = _Py_mbstowcs(&wch, (char*)&ch, 1);
|
||||
if (res != DECODE_ERROR && wch == L'\xA7') {
|
||||
/* On HP-UX with the C locale or the POSIX locale,
|
||||
nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
|
||||
Latin1 encoding in practice. Force ASCII in this case.
|
||||
|
@ -196,8 +254,8 @@ check_force_ascii(void)
|
|||
|
||||
unsigned uch = (unsigned char)i;
|
||||
ch[0] = (char)uch;
|
||||
res = mbstowcs(wch, ch, 1);
|
||||
if (res != (size_t)-1) {
|
||||
res = _Py_mbstowcs(wch, ch, 1);
|
||||
if (res != DECODE_ERROR) {
|
||||
/* decoding a non-ASCII character from the locale encoding succeeded:
|
||||
the locale encoding is not ASCII, force ASCII */
|
||||
return 1;
|
||||
|
@ -387,9 +445,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
|
|||
*/
|
||||
argsize = strlen(arg);
|
||||
#else
|
||||
argsize = mbstowcs(NULL, arg, 0);
|
||||
argsize = _Py_mbstowcs(NULL, arg, 0);
|
||||
#endif
|
||||
if (argsize != (size_t)-1) {
|
||||
if (argsize != DECODE_ERROR) {
|
||||
if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
|
||||
return -1;
|
||||
}
|
||||
|
@ -398,21 +456,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
|
|||
return -1;
|
||||
}
|
||||
|
||||
count = mbstowcs(res, arg, argsize + 1);
|
||||
if (count != (size_t)-1) {
|
||||
wchar_t *tmp;
|
||||
/* Only use the result if it contains no
|
||||
surrogate characters. */
|
||||
for (tmp = res; *tmp != 0 &&
|
||||
!Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
|
||||
;
|
||||
if (*tmp == 0) {
|
||||
if (wlen != NULL) {
|
||||
*wlen = count;
|
||||
}
|
||||
*wstr = res;
|
||||
return 0;
|
||||
count = _Py_mbstowcs(res, arg, argsize + 1);
|
||||
if (count != DECODE_ERROR) {
|
||||
*wstr = res;
|
||||
if (wlen != NULL) {
|
||||
*wlen = count;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
PyMem_RawFree(res);
|
||||
}
|
||||
|
@ -436,13 +486,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
|
|||
out = res;
|
||||
memset(&mbs, 0, sizeof mbs);
|
||||
while (argsize) {
|
||||
size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
|
||||
size_t converted = _Py_mbrtowc(out, (char*)in, argsize, &mbs);
|
||||
if (converted == 0) {
|
||||
/* Reached end of string; null char stored. */
|
||||
break;
|
||||
}
|
||||
|
||||
if (converted == (size_t)-2) {
|
||||
if (converted == INCOMPLETE_CHARACTER) {
|
||||
/* Incomplete character. This should never happen,
|
||||
since we provide everything that we have -
|
||||
unless there is a bug in the C library, or I
|
||||
|
@ -450,32 +500,22 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
|
|||
goto decode_error;
|
||||
}
|
||||
|
||||
if (converted == (size_t)-1) {
|
||||
if (converted == DECODE_ERROR) {
|
||||
if (!surrogateescape) {
|
||||
goto decode_error;
|
||||
}
|
||||
|
||||
/* Conversion error. Escape as UTF-8b, and start over
|
||||
in the initial shift state. */
|
||||
/* Decoding error. Escape as UTF-8b, and start over in the initial
|
||||
shift state. */
|
||||
*out++ = 0xdc00 + *in++;
|
||||
argsize--;
|
||||
memset(&mbs, 0, sizeof mbs);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (Py_UNICODE_IS_SURROGATE(*out)) {
|
||||
if (!surrogateescape) {
|
||||
goto decode_error;
|
||||
}
|
||||
// _Py_mbrtowc() rejects lone surrogate characters
|
||||
assert(!Py_UNICODE_IS_SURROGATE(*out));
|
||||
|
||||
/* Surrogate character. Escape the original
|
||||
byte sequence with surrogateescape. */
|
||||
argsize -= converted;
|
||||
while (converted--) {
|
||||
*out++ = 0xdc00 + *in++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
/* successfully converted some bytes */
|
||||
in += converted;
|
||||
argsize -= converted;
|
||||
|
@ -652,7 +692,7 @@ encode_current_locale(const wchar_t *text, char **str,
|
|||
else {
|
||||
converted = wcstombs(NULL, buf, 0);
|
||||
}
|
||||
if (converted == (size_t)-1) {
|
||||
if (converted == DECODE_ERROR) {
|
||||
goto encode_error;
|
||||
}
|
||||
if (bytes != NULL) {
|
||||
|
@ -1440,7 +1480,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode)
|
|||
char cmode[10];
|
||||
size_t r;
|
||||
r = wcstombs(cmode, mode, 10);
|
||||
if (r == (size_t)-1 || r >= 10) {
|
||||
if (r == DECODE_ERROR || r >= 10) {
|
||||
errno = EINVAL;
|
||||
return NULL;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue