mirror of
				https://github.com/python/cpython.git
				synced 2025-11-03 19:34:08 +00:00 
			
		
		
		
	Issue #850997: the mbcs encoding (Windows only) now honors the errors argument: strict
mode raises Unicode errors. The encoder supports only the "strict" and "replace" error handlers; the decoder supports only the "strict" and "ignore" error handlers.
This commit is contained in:
		
							parent
							
								
									79ee19f3db
								
							
						
					
					
						commit
						554f3f0081
					
				
					 5 changed files with 149 additions and 45 deletions
				
			
		| 
						 | 
				
			
			@ -1223,6 +1223,23 @@ functions can be used directly if desired.
 | 
			
		|||
   Convert a label to Unicode, as specified in :rfc:`3490`.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
:mod:`encodings.mbcs` --- Windows ANSI codepage
 | 
			
		||||
-----------------------------------------------
 | 
			
		||||
 | 
			
		||||
.. module:: encodings.mbcs
 | 
			
		||||
   :synopsis: Windows ANSI codepage
 | 
			
		||||
 | 
			
		||||
Encode operand according to the ANSI codepage (CP_ACP). This codec only
 | 
			
		||||
supports ``'strict'`` and ``'replace'`` error handlers to encode, and
 | 
			
		||||
``'strict'`` and ``'ignore'`` error handlers to decode.
 | 
			
		||||
 | 
			
		||||
Availability: Windows only.
 | 
			
		||||
 | 
			
		||||
.. versionchanged:: 3.2
 | 
			
		||||
   Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
 | 
			
		||||
   to encode, and ``'ignore'`` to decode.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
:mod:`encodings.utf_8_sig` --- UTF-8 codec with BOM signature
 | 
			
		||||
-------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -265,7 +265,7 @@ except ImportError:
 | 
			
		|||
    pass
 | 
			
		||||
else:
 | 
			
		||||
    if _os.name in ("nt", "ce"):
 | 
			
		||||
        set_conversion_mode("mbcs", "ignore")
 | 
			
		||||
        set_conversion_mode("mbcs", "strict")
 | 
			
		||||
    else:
 | 
			
		||||
        set_conversion_mode("ascii", "strict")
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1358,11 +1358,6 @@ broken_incremental_coders = broken_unicode_with_streams + [
 | 
			
		|||
    "idna",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
# The following encodings only support "strict" mode
 | 
			
		||||
only_strict_mode = [
 | 
			
		||||
    "idna",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
 | 
			
		||||
    def test_basics(self):
 | 
			
		||||
        s = "abc123" # all codecs should be able to encode these
 | 
			
		||||
| 
						 | 
				
			
			@ -1437,7 +1432,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
 | 
			
		|||
                    result = "".join(codecs.iterdecode(codecs.iterencode("", encoding), encoding))
 | 
			
		||||
                    self.assertEqual(result, "")
 | 
			
		||||
 | 
			
		||||
                if encoding not in only_strict_mode:
 | 
			
		||||
                if encoding not in ("idna", "mbcs"):
 | 
			
		||||
                    # check incremental decoder/encoder with errors argument
 | 
			
		||||
                    try:
 | 
			
		||||
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -12,6 +12,11 @@ What's New in Python 3.2 Alpha 1?
 | 
			
		|||
Core and Builtins
 | 
			
		||||
-----------------
 | 
			
		||||
 | 
			
		||||
- Issue #850997: mbcs encoding (Windows only) now honors the errors argument: strict
 | 
			
		||||
  mode raises unicode errors. The encoder only supports "strict" and "replace"
 | 
			
		||||
  error handlers; the decoder only supports "strict" and "ignore" error
 | 
			
		||||
  handlers.
 | 
			
		||||
 | 
			
		||||
- Issue #8592: PyArg_Parse*() functions raise a TypeError for "y", "u" and "Z"
 | 
			
		||||
  formats if the string contains a null byte/character. Write unit tests for
 | 
			
		||||
  string formats.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1767,6 +1767,33 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
 | 
			
		|||
    return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* create or adjust a UnicodeDecodeError */
 | 
			
		||||
static void
 | 
			
		||||
make_decode_exception(PyObject **exceptionObject,
 | 
			
		||||
                      const char *encoding,
 | 
			
		||||
                      const char *input, Py_ssize_t length,
 | 
			
		||||
                      Py_ssize_t startpos, Py_ssize_t endpos,
 | 
			
		||||
                      const char *reason)
 | 
			
		||||
{
 | 
			
		||||
    if (*exceptionObject == NULL) {
 | 
			
		||||
        *exceptionObject = PyUnicodeDecodeError_Create(
 | 
			
		||||
            encoding, input, length, startpos, endpos, reason);
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
 | 
			
		||||
            goto onError;
 | 
			
		||||
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
 | 
			
		||||
            goto onError;
 | 
			
		||||
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
 | 
			
		||||
            goto onError;
 | 
			
		||||
    }
 | 
			
		||||
    return;
 | 
			
		||||
 | 
			
		||||
onError:
 | 
			
		||||
    Py_DECREF(*exceptionObject);
 | 
			
		||||
    *exceptionObject = NULL;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* error handling callback helper:
 | 
			
		||||
   build arguments, call the callback and check the arguments,
 | 
			
		||||
   if no exception occurred, copy the replacement to the output
 | 
			
		||||
| 
						 | 
				
			
			@ -1800,20 +1827,13 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
 | 
			
		|||
            goto onError;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (*exceptionObject == NULL) {
 | 
			
		||||
        *exceptionObject = PyUnicodeDecodeError_Create(
 | 
			
		||||
            encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
 | 
			
		||||
    make_decode_exception(exceptionObject,
 | 
			
		||||
        encoding,
 | 
			
		||||
        *input, *inend - *input,
 | 
			
		||||
        *startinpos, *endinpos,
 | 
			
		||||
        reason);
 | 
			
		||||
    if (*exceptionObject == NULL)
 | 
			
		||||
        goto onError;
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
 | 
			
		||||
            goto onError;
 | 
			
		||||
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
 | 
			
		||||
            goto onError;
 | 
			
		||||
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
 | 
			
		||||
            goto onError;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
 | 
			
		||||
    if (restuple == NULL)
 | 
			
		||||
| 
						 | 
				
			
			@ -4552,32 +4572,46 @@ static int is_dbcs_lead_byte(const char *s, int offset)
 | 
			
		|||
static int decode_mbcs(PyUnicodeObject **v,
 | 
			
		||||
                       const char *s, /* MBCS string */
 | 
			
		||||
                       int size, /* sizeof MBCS string */
 | 
			
		||||
                       int final)
 | 
			
		||||
                       int final,
 | 
			
		||||
                       const char *errors)
 | 
			
		||||
{
 | 
			
		||||
    Py_UNICODE *p;
 | 
			
		||||
    Py_ssize_t n = 0;
 | 
			
		||||
    int usize = 0;
 | 
			
		||||
    Py_ssize_t n;
 | 
			
		||||
    DWORD usize;
 | 
			
		||||
    DWORD flags;
 | 
			
		||||
 | 
			
		||||
    assert(size >= 0);
 | 
			
		||||
 | 
			
		||||
    /* check and handle 'errors' arg */
 | 
			
		||||
    if (errors==NULL || strcmp(errors, "strict")==0)
 | 
			
		||||
        flags = MB_ERR_INVALID_CHARS;
 | 
			
		||||
    else if (strcmp(errors, "ignore")==0)
 | 
			
		||||
        flags = 0;
 | 
			
		||||
    else {
 | 
			
		||||
        PyErr_Format(PyExc_ValueError,
 | 
			
		||||
                     "mbcs encoding does not support errors='%s'",
 | 
			
		||||
                     errors);
 | 
			
		||||
        return -1;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /* Skip trailing lead-byte unless 'final' is set */
 | 
			
		||||
    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
 | 
			
		||||
        --size;
 | 
			
		||||
 | 
			
		||||
    /* First get the size of the result */
 | 
			
		||||
    if (size > 0) {
 | 
			
		||||
        usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
 | 
			
		||||
        if (usize == 0) {
 | 
			
		||||
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
 | 
			
		||||
            return -1;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
 | 
			
		||||
        if (usize==0)
 | 
			
		||||
            goto mbcs_decode_error;
 | 
			
		||||
    } else
 | 
			
		||||
        usize = 0;
 | 
			
		||||
 | 
			
		||||
    if (*v == NULL) {
 | 
			
		||||
        /* Create unicode object */
 | 
			
		||||
        *v = _PyUnicode_New(usize);
 | 
			
		||||
        if (*v == NULL)
 | 
			
		||||
            return -1;
 | 
			
		||||
        n = 0;
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
        /* Extend unicode object */
 | 
			
		||||
| 
						 | 
				
			
			@ -4587,15 +4621,35 @@ static int decode_mbcs(PyUnicodeObject **v,
 | 
			
		|||
    }
 | 
			
		||||
 | 
			
		||||
    /* Do the conversion */
 | 
			
		||||
    if (size > 0) {
 | 
			
		||||
    if (usize > 0) {
 | 
			
		||||
        p = PyUnicode_AS_UNICODE(*v) + n;
 | 
			
		||||
        if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
 | 
			
		||||
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
 | 
			
		||||
            return -1;
 | 
			
		||||
        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
 | 
			
		||||
            goto mbcs_decode_error;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return size;
 | 
			
		||||
 | 
			
		||||
mbcs_decode_error:
 | 
			
		||||
    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
 | 
			
		||||
       we raise a UnicodeDecodeError - else it is a 'generic'
 | 
			
		||||
       windows error
 | 
			
		||||
     */
 | 
			
		||||
    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
 | 
			
		||||
        /* Ideally, we should get reason from FormatMessage - this
 | 
			
		||||
           is the Windows 2000 English version of the message
 | 
			
		||||
        */
 | 
			
		||||
        PyObject *exc = NULL;
 | 
			
		||||
        const char *reason = "No mapping for the Unicode character exists "
 | 
			
		||||
                             "in the target multi-byte code page.";
 | 
			
		||||
        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
 | 
			
		||||
        if (exc != NULL) {
 | 
			
		||||
            PyCodec_StrictErrors(exc);
 | 
			
		||||
            Py_DECREF(exc);
 | 
			
		||||
        }
 | 
			
		||||
    } else {
 | 
			
		||||
        PyErr_SetFromWindowsErrWithFilename(0, NULL);
 | 
			
		||||
    }
 | 
			
		||||
    return -1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
 | 
			
		||||
| 
						 | 
				
			
			@ -4612,10 +4666,10 @@ PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
 | 
			
		|||
#ifdef NEED_RETRY
 | 
			
		||||
  retry:
 | 
			
		||||
    if (size > INT_MAX)
 | 
			
		||||
        done = decode_mbcs(&v, s, INT_MAX, 0);
 | 
			
		||||
        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
 | 
			
		||||
    else
 | 
			
		||||
#endif
 | 
			
		||||
        done = decode_mbcs(&v, s, (int)size, !consumed);
 | 
			
		||||
        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
 | 
			
		||||
 | 
			
		||||
    if (done < 0) {
 | 
			
		||||
        Py_XDECREF(v);
 | 
			
		||||
| 
						 | 
				
			
			@ -4649,20 +4703,45 @@ PyObject *PyUnicode_DecodeMBCS(const char *s,
 | 
			
		|||
 */
 | 
			
		||||
static int encode_mbcs(PyObject **repr,
 | 
			
		||||
                       const Py_UNICODE *p, /* unicode */
 | 
			
		||||
                       int size) /* size of unicode */
 | 
			
		||||
                       int size, /* size of unicode */
 | 
			
		||||
                       const char* errors)
 | 
			
		||||
{
 | 
			
		||||
    int mbcssize = 0;
 | 
			
		||||
    Py_ssize_t n = 0;
 | 
			
		||||
    BOOL usedDefaultChar = FALSE;
 | 
			
		||||
    BOOL *pusedDefaultChar;
 | 
			
		||||
    int mbcssize;
 | 
			
		||||
    Py_ssize_t n;
 | 
			
		||||
    PyObject *exc = NULL;
 | 
			
		||||
    DWORD flags;
 | 
			
		||||
 | 
			
		||||
    assert(size >= 0);
 | 
			
		||||
 | 
			
		||||
    /* check and handle 'errors' arg */
 | 
			
		||||
    if (errors==NULL || strcmp(errors, "strict")==0) {
 | 
			
		||||
        flags = WC_NO_BEST_FIT_CHARS;
 | 
			
		||||
        pusedDefaultChar = &usedDefaultChar;
 | 
			
		||||
    } else if (strcmp(errors, "replace")==0) {
 | 
			
		||||
        flags = 0;
 | 
			
		||||
        pusedDefaultChar = NULL;
 | 
			
		||||
    } else {
 | 
			
		||||
         PyErr_Format(PyExc_ValueError,
 | 
			
		||||
                      "mbcs encoding does not support errors='%s'",
 | 
			
		||||
                      errors);
 | 
			
		||||
         return -1;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /* First get the size of the result */
 | 
			
		||||
    if (size > 0) {
 | 
			
		||||
        mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
 | 
			
		||||
        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
 | 
			
		||||
                                       NULL, pusedDefaultChar);
 | 
			
		||||
        if (mbcssize == 0) {
 | 
			
		||||
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
 | 
			
		||||
            return -1;
 | 
			
		||||
        }
 | 
			
		||||
        /* If we used a default char, then we failed! */
 | 
			
		||||
        if (pusedDefaultChar && *pusedDefaultChar)
 | 
			
		||||
            goto mbcs_encode_error;
 | 
			
		||||
    } else {
 | 
			
		||||
        mbcssize = 0;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (*repr == NULL) {
 | 
			
		||||
| 
						 | 
				
			
			@ -4670,6 +4749,7 @@ static int encode_mbcs(PyObject **repr,
 | 
			
		|||
        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
 | 
			
		||||
        if (*repr == NULL)
 | 
			
		||||
            return -1;
 | 
			
		||||
        n = 0;
 | 
			
		||||
    }
 | 
			
		||||
    else {
 | 
			
		||||
        /* Extend string object */
 | 
			
		||||
| 
						 | 
				
			
			@ -4681,13 +4761,20 @@ static int encode_mbcs(PyObject **repr,
 | 
			
		|||
    /* Do the conversion */
 | 
			
		||||
    if (size > 0) {
 | 
			
		||||
        char *s = PyBytes_AS_STRING(*repr) + n;
 | 
			
		||||
        if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
 | 
			
		||||
        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
 | 
			
		||||
                                     NULL, pusedDefaultChar)) {
 | 
			
		||||
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
 | 
			
		||||
            return -1;
 | 
			
		||||
        }
 | 
			
		||||
        if (pusedDefaultChar && *pusedDefaultChar)
 | 
			
		||||
            goto mbcs_encode_error;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return 0;
 | 
			
		||||
 | 
			
		||||
mbcs_encode_error:
 | 
			
		||||
    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
 | 
			
		||||
    Py_XDECREF(exc);
 | 
			
		||||
    return -1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
 | 
			
		||||
| 
						 | 
				
			
			@ -4700,10 +4787,10 @@ PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
 | 
			
		|||
#ifdef NEED_RETRY
 | 
			
		||||
  retry:
 | 
			
		||||
    if (size > INT_MAX)
 | 
			
		||||
        ret = encode_mbcs(&repr, p, INT_MAX);
 | 
			
		||||
        ret = encode_mbcs(&repr, p, INT_MAX, errors);
 | 
			
		||||
    else
 | 
			
		||||
#endif
 | 
			
		||||
        ret = encode_mbcs(&repr, p, (int)size);
 | 
			
		||||
        ret = encode_mbcs(&repr, p, (int)size, errors);
 | 
			
		||||
 | 
			
		||||
    if (ret < 0) {
 | 
			
		||||
        Py_XDECREF(repr);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue