mirror of
https://github.com/python/cpython.git
synced 2025-07-15 23:35:23 +00:00
Issue #5915: Implement PEP 383, Non-decodable Bytes in
System Character Interfaces.
This commit is contained in:
parent
93f65a177b
commit
011e842033
15 changed files with 726 additions and 289 deletions
113
Modules/python.c
113
Modules/python.c
|
@ -14,6 +14,93 @@ wmain(int argc, wchar_t **argv)
|
|||
return Py_Main(argc, argv);
|
||||
}
|
||||
#else
|
||||
/* Decode the multibyte string `arg` into a wide-character string.

   The conversion is first attempted with mbstowcs().  If that fails,
   the string is decoded again with non-decodable bytes escaped as
   0xdc00 + byte (the "utf8b" escaping of PEP 383): character by
   character through mbrtowc() when HAVE_MBRTOWC is defined, otherwise
   by treating every byte below 128 as itself and escaping the rest.

   Returns a buffer obtained from PyMem_Malloc(), or NULL after printing
   a message to stderr when memory runs out or mbrtowc() reports an
   unexpected incomplete character. */
static wchar_t*
char2wchar(char* arg)
{
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    size_t count;
    unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif
    if (argsize != (size_t)-1) {
        res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1)
            return res;
        /* The buffer was sized for the plain conversion only; drop it
           before taking the escaping fallback below. */
        PyMem_Free(res);
    }
    /* Conversion failed. Fall back to escaping with utf8b. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    res = PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            fprintf(stderr, "unexpected mbrtowc result -2\n");
            /* Release the partially filled buffer; it is not handed
               to the caller on this path. */
            PyMem_Free(res);
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    while (*in) {
        if (*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    }
    *out = 0;
#endif
    return res;
oom:
    fprintf(stderr, "out of memory\n");
    return NULL;
}
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
|
@ -40,31 +127,9 @@ main(int argc, char **argv)
|
|||
oldloc = strdup(setlocale(LC_ALL, NULL));
|
||||
setlocale(LC_ALL, "");
|
||||
for (i = 0; i < argc; i++) {
|
||||
#ifdef HAVE_BROKEN_MBSTOWCS
|
||||
/* Some platforms have a broken implementation of
|
||||
* mbstowcs which does not count the characters that
|
||||
* would result from conversion. Use an upper bound.
|
||||
*/
|
||||
size_t argsize = strlen(argv[i]);
|
||||
#else
|
||||
size_t argsize = mbstowcs(NULL, argv[i], 0);
|
||||
#endif
|
||||
size_t count;
|
||||
if (argsize == (size_t)-1) {
|
||||
fprintf(stderr, "Could not convert argument %d to string\n", i);
|
||||
argv_copy2[i] = argv_copy[i] = char2wchar(argv[i]);
|
||||
if (!argv_copy[i])
|
||||
return 1;
|
||||
}
|
||||
argv_copy[i] = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
|
||||
argv_copy2[i] = argv_copy[i];
|
||||
if (!argv_copy[i]) {
|
||||
fprintf(stderr, "out of memory\n");
|
||||
return 1;
|
||||
}
|
||||
count = mbstowcs(argv_copy[i], argv[i], argsize+1);
|
||||
if (count == (size_t)-1) {
|
||||
fprintf(stderr, "Could not convert argument %d to string\n", i);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
setlocale(LC_ALL, oldloc);
|
||||
free(oldloc);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue