mirror of
https://github.com/python/cpython.git
synced 2025-09-26 10:19:53 +00:00
bpo-34403: On HP-UX, force ASCII for C locale (GH-8969)
On HP-UX with C or POSIX locale, sys.getfilesystemencoding() now returns "ascii" instead of "roman8" (when the UTF-8 Mode is disabled and the C locale is not coerced). nl_langinfo(CODESET) announces "roman8" whereas it uses the Latin1 encoding in practice.
This commit is contained in:
parent
5cb258950c
commit
d500e5307a
5 changed files with 105 additions and 52 deletions
|
@ -170,6 +170,11 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
|
||||||
|
|
||||||
#endif /* Py_LIMITED_API */
|
#endif /* Py_LIMITED_API */
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef Py_BUILD_CORE
|
||||||
|
PyAPI_FUNC(int) _Py_GetForceASCII(void);
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
On HP-UX with C or POSIX locale, sys.getfilesystemencoding() now returns
|
||||||
|
"ascii" instead of "roman8" (when the UTF-8 Mode is disabled and the C locale
|
||||||
|
is not coerced).
|
|
@ -828,18 +828,21 @@ config_read_complex_options(_PyCoreConfig *config)
|
||||||
static void
|
static void
|
||||||
config_init_locale(_PyCoreConfig *config)
|
config_init_locale(_PyCoreConfig *config)
|
||||||
{
|
{
|
||||||
if (_Py_LegacyLocaleDetected()) {
|
|
||||||
/* The C locale enables the C locale coercion (PEP 538) */
|
|
||||||
if (config->coerce_c_locale < 0) {
|
if (config->coerce_c_locale < 0) {
|
||||||
|
/* The C locale enables the C locale coercion (PEP 538) */
|
||||||
|
if (_Py_LegacyLocaleDetected()) {
|
||||||
config->coerce_c_locale = 1;
|
config->coerce_c_locale = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef MS_WINDOWS
|
#ifndef MS_WINDOWS
|
||||||
|
if (config->utf8_mode < 0) {
|
||||||
|
/* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */
|
||||||
const char *ctype_loc = setlocale(LC_CTYPE, NULL);
|
const char *ctype_loc = setlocale(LC_CTYPE, NULL);
|
||||||
if (ctype_loc != NULL
|
if (ctype_loc != NULL
|
||||||
&& (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "POSIX") == 0)) {
|
&& (strcmp(ctype_loc, "C") == 0
|
||||||
/* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */
|
|| strcmp(ctype_loc, "POSIX") == 0))
|
||||||
if (config->utf8_mode < 0) {
|
{
|
||||||
config->utf8_mode = 1;
|
config->utf8_mode = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -72,8 +72,8 @@ _Py_device_encoding(int fd)
|
||||||
|
|
||||||
extern int _Py_normalize_encoding(const char *, char *, size_t);
|
extern int _Py_normalize_encoding(const char *, char *, size_t);
|
||||||
|
|
||||||
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
|
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
|
||||||
On these operating systems, nl_langinfo(CODESET) announces an alias of the
|
and POSIX locale. nl_langinfo(CODESET) announces an alias of the
|
||||||
ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
|
ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
|
||||||
ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
|
ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
|
||||||
locale.getpreferredencoding() codec. For example, if command line arguments
|
locale.getpreferredencoding() codec. For example, if command line arguments
|
||||||
|
@ -86,6 +86,10 @@ extern int _Py_normalize_encoding(const char *, char *, size_t);
|
||||||
workaround is also enabled on error, for example if getting the locale
|
workaround is also enabled on error, for example if getting the locale
|
||||||
failed.
|
failed.
|
||||||
|
|
||||||
|
On HP-UX with the C locale or the POSIX locale, nl_langinfo(CODESET)
|
||||||
|
announces "roman8" but mbstowcs() uses Latin1 in practice. Also force the
|
||||||
|
ASCII encoding in this case.
|
||||||
|
|
||||||
Values of force_ascii:
|
Values of force_ascii:
|
||||||
|
|
||||||
1: the workaround is used: Py_EncodeLocale() uses
|
1: the workaround is used: Py_EncodeLocale() uses
|
||||||
|
@ -100,13 +104,46 @@ static int force_ascii = -1;
|
||||||
static int
|
static int
|
||||||
check_force_ascii(void)
|
check_force_ascii(void)
|
||||||
{
|
{
|
||||||
char *loc;
|
char *loc = setlocale(LC_CTYPE, NULL);
|
||||||
|
if (loc == NULL) {
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) {
|
||||||
|
/* the LC_CTYPE locale is different than C and POSIX */
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
|
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
|
||||||
char *codeset, **alias;
|
const char *codeset = nl_langinfo(CODESET);
|
||||||
|
if (!codeset || codeset[0] == '\0') {
|
||||||
|
/* CODESET is not set or empty */
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
char encoding[20]; /* longest name: "iso_646.irv_1991\0" */
|
char encoding[20]; /* longest name: "iso_646.irv_1991\0" */
|
||||||
int is_ascii;
|
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
|
||||||
unsigned int i;
|
goto error;
|
||||||
char* ascii_aliases[] = {
|
}
|
||||||
|
|
||||||
|
#ifdef __hpux
|
||||||
|
if (strcmp(encoding, "roman8") == 0) {
|
||||||
|
unsigned char ch;
|
||||||
|
wchar_t wch;
|
||||||
|
size_t res;
|
||||||
|
|
||||||
|
ch = (unsigned char)0xA7;
|
||||||
|
res = mbstowcs(&wch, (char*)&ch, 1);
|
||||||
|
if (res != (size_t)-1 && wch == L'\xA7') {
|
||||||
|
/* On HP-UX with the C locale or the POSIX locale,
|
||||||
|
nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
|
||||||
|
Latin1 encoding in practice. Force ASCII in this case.
|
||||||
|
|
||||||
|
Roman8 decodes 0xA7 to U+00CF. Latin1 decodes 0xA7 to U+00A7. */
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
const char* ascii_aliases[] = {
|
||||||
"ascii",
|
"ascii",
|
||||||
/* Aliases from Lib/encodings/aliases.py */
|
/* Aliases from Lib/encodings/aliases.py */
|
||||||
"646",
|
"646",
|
||||||
|
@ -123,27 +160,9 @@ check_force_ascii(void)
|
||||||
"us_ascii",
|
"us_ascii",
|
||||||
NULL
|
NULL
|
||||||
};
|
};
|
||||||
#endif
|
|
||||||
|
|
||||||
loc = setlocale(LC_CTYPE, NULL);
|
int is_ascii = 0;
|
||||||
if (loc == NULL)
|
for (const char **alias=ascii_aliases; *alias != NULL; alias++) {
|
||||||
goto error;
|
|
||||||
if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) {
|
|
||||||
/* the LC_CTYPE locale is different than C */
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
|
|
||||||
codeset = nl_langinfo(CODESET);
|
|
||||||
if (!codeset || codeset[0] == '\0') {
|
|
||||||
/* CODESET is not set or empty */
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding)))
|
|
||||||
goto error;
|
|
||||||
|
|
||||||
is_ascii = 0;
|
|
||||||
for (alias=ascii_aliases; *alias != NULL; alias++) {
|
|
||||||
if (strcmp(encoding, *alias) == 0) {
|
if (strcmp(encoding, *alias) == 0) {
|
||||||
is_ascii = 1;
|
is_ascii = 1;
|
||||||
break;
|
break;
|
||||||
|
@ -154,13 +173,14 @@ check_force_ascii(void)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i=0x80; i<0xff; i++) {
|
for (unsigned int i=0x80; i<=0xff; i++) {
|
||||||
unsigned char ch;
|
char ch[1];
|
||||||
wchar_t wch;
|
wchar_t wch[1];
|
||||||
size_t res;
|
size_t res;
|
||||||
|
|
||||||
ch = (unsigned char)i;
|
unsigned uch = (unsigned char)i;
|
||||||
res = mbstowcs(&wch, (char*)&ch, 1);
|
ch[0] = (char)uch;
|
||||||
|
res = mbstowcs(wch, ch, 1);
|
||||||
if (res != (size_t)-1) {
|
if (res != (size_t)-1) {
|
||||||
/* decoding a non-ASCII character from the locale encoding succeeded:
|
/* decoding a non-ASCII character from the locale encoding succeeded:
|
||||||
the locale encoding is not ASCII, force ASCII */
|
the locale encoding is not ASCII, force ASCII */
|
||||||
|
@ -169,17 +189,29 @@ check_force_ascii(void)
|
||||||
}
|
}
|
||||||
/* None of the bytes in the range 0x80-0xff can be decoded from the locale
|
/* None of the bytes in the range 0x80-0xff can be decoded from the locale
|
||||||
encoding: the locale encoding is really ASCII */
|
encoding: the locale encoding is really ASCII */
|
||||||
|
#endif /* !defined(__hpux) */
|
||||||
return 0;
|
return 0;
|
||||||
#else
|
#else
|
||||||
/* nl_langinfo(CODESET) is not available: always force ASCII */
|
/* nl_langinfo(CODESET) is not available: always force ASCII */
|
||||||
return 1;
|
return 1;
|
||||||
#endif
|
#endif /* defined(HAVE_LANGINFO_H) && defined(CODESET) */
|
||||||
|
|
||||||
error:
|
error:
|
||||||
/* if an error occurred, force the ASCII encoding */
|
/* if an error occurred, force the ASCII encoding */
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
_Py_GetForceASCII(void)
|
||||||
|
{
|
||||||
|
if (force_ascii == -1) {
|
||||||
|
force_ascii = check_force_ascii();
|
||||||
|
}
|
||||||
|
return force_ascii;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
encode_ascii(const wchar_t *text, char **str,
|
encode_ascii(const wchar_t *text, char **str,
|
||||||
size_t *error_pos, const char **reason,
|
size_t *error_pos, const char **reason,
|
||||||
|
@ -234,6 +266,12 @@ encode_ascii(const wchar_t *text, char **str,
|
||||||
*str = result;
|
*str = result;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
int
|
||||||
|
_Py_GetForceASCII(void)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
#endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */
|
#endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1576,13 +1576,16 @@ initfsencoding(PyInterpreterState *interp)
|
||||||
Py_FileSystemDefaultEncodeErrors = "surrogatepass";
|
Py_FileSystemDefaultEncodeErrors = "surrogatepass";
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
if (Py_FileSystemDefaultEncoding == NULL &&
|
if (Py_FileSystemDefaultEncoding == NULL) {
|
||||||
interp->core_config.utf8_mode)
|
if (interp->core_config.utf8_mode) {
|
||||||
{
|
|
||||||
Py_FileSystemDefaultEncoding = "utf-8";
|
Py_FileSystemDefaultEncoding = "utf-8";
|
||||||
Py_HasFileSystemDefaultEncoding = 1;
|
Py_HasFileSystemDefaultEncoding = 1;
|
||||||
}
|
}
|
||||||
else if (Py_FileSystemDefaultEncoding == NULL) {
|
else if (_Py_GetForceASCII()) {
|
||||||
|
Py_FileSystemDefaultEncoding = "ascii";
|
||||||
|
Py_HasFileSystemDefaultEncoding = 1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
Py_FileSystemDefaultEncoding = get_locale_encoding();
|
Py_FileSystemDefaultEncoding = get_locale_encoding();
|
||||||
if (Py_FileSystemDefaultEncoding == NULL) {
|
if (Py_FileSystemDefaultEncoding == NULL) {
|
||||||
return _Py_INIT_ERR("Unable to get the locale encoding");
|
return _Py_INIT_ERR("Unable to get the locale encoding");
|
||||||
|
@ -1592,6 +1595,7 @@ initfsencoding(PyInterpreterState *interp)
|
||||||
interp->fscodec_initialized = 1;
|
interp->fscodec_initialized = 1;
|
||||||
return _Py_INIT_OK();
|
return _Py_INIT_OK();
|
||||||
}
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* the encoding is mbcs, utf-8 or ascii */
|
/* the encoding is mbcs, utf-8 or ascii */
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue