gh-111495: improve test coverage of codecs C API (GH-126030)

For now, skip some crashers (tracked in gh-123378).
2025-09-26 18:29:57 +00:00 · 2024-11-01 14:28:18 +01:00 · 2024-11-01 14:28:18 +01:00 · 32e07fd377
commit 32e07fd377
parent 6c67446a6e
1 changed file with 113 additions and 21 deletions
--- a/Lib/test/test_capi/test_codecs.py
+++ b/Lib/test/test_capi/test_codecs.py
@@ -747,6 +747,49 @@ class CAPICodecs(unittest.TestCase):

 class CAPICodecErrors(unittest.TestCase):

+    @classmethod
+    def _generate_exception_args(cls):
+        for objlen in range(5):
+            maxind = 2 * max(2, objlen)
+            for start in range(-maxind, maxind + 1):
+                for end in range(-maxind, maxind + 1):
+                    yield objlen, start, end
+
+    @classmethod
+    def generate_encode_errors(cls):
+        return tuple(
+            UnicodeEncodeError('utf-8', '0' * objlen, start, end, 'why')
+            for objlen, start, end in cls._generate_exception_args()
+        )
+
+    @classmethod
+    def generate_decode_errors(cls):
+        return tuple(
+            UnicodeDecodeError('utf-8', b'0' * objlen, start, end, 'why')
+            for objlen, start, end in cls._generate_exception_args()
+        )
+
+    @classmethod
+    def generate_translate_errors(cls):
+        return tuple(
+            UnicodeTranslateError('0' * objlen, start, end, 'why')
+            for objlen, start, end in cls._generate_exception_args()
+        )
+
+    @classmethod
+    def setUpClass(cls):
+        cls.unicode_encode_errors = cls.generate_encode_errors()
+        cls.unicode_decode_errors = cls.generate_decode_errors()
+        cls.unicode_translate_errors = cls.generate_translate_errors()
+        cls.all_unicode_errors = (
+            cls.unicode_encode_errors
+            + cls.unicode_decode_errors
+            + cls.unicode_translate_errors
+        )
+        cls.bad_unicode_errors = (
+            ValueError(),
+        )
+
    def test_codec_register_error(self):
        # for cleaning up between tests
        from _codecs import _unregister_error as _codecs_unregister_error
@@ -780,33 +823,82 @@ class CAPICodecErrors(unittest.TestCase):
        self.assertIs(codec_lookup_error('ignore'), codecs.ignore_errors)
        self.assertIs(codec_lookup_error('replace'), codecs.replace_errors)
        self.assertIs(codec_lookup_error('xmlcharrefreplace'), codecs.xmlcharrefreplace_errors)
+        self.assertIs(codec_lookup_error('backslashreplace'), codecs.backslashreplace_errors)
        self.assertIs(codec_lookup_error('namereplace'), codecs.namereplace_errors)
        self.assertRaises(LookupError, codec_lookup_error, 'unknown')

-    def test_codec_error_handlers(self):
-        exceptions = [
-            # A UnicodeError with an empty message currently crashes:
-            # See: https://github.com/python/cpython/issues/123378
-            # UnicodeEncodeError('bad', '', 0, 1, 'reason'),
-            UnicodeEncodeError('bad', 'x', 0, 1, 'reason'),
-            UnicodeEncodeError('bad', 'xyz123', 0, 1, 'reason'),
-            UnicodeEncodeError('bad', 'xyz123', 1, 4, 'reason'),
-        ]
+    def test_codec_strict_errors_handler(self):
+        handler = _testcapi.codec_strict_errors
+        for exc in self.all_unicode_errors + self.bad_unicode_errors:
+            with self.subTest(handler=handler, exc=exc):
+                self.assertRaises(type(exc), handler, exc)

-        strict_handler = _testcapi.codec_strict_errors
+    def test_codec_ignore_errors_handler(self):
+        handler = _testcapi.codec_ignore_errors
+        self.do_test_codec_errors_handler(handler, self.all_unicode_errors)
+
+    def test_codec_replace_errors_handler(self):
+        handler = _testcapi.codec_replace_errors
+        self.do_test_codec_errors_handler(handler, self.all_unicode_errors)
+
+    def test_codec_xmlcharrefreplace_errors_handler(self):
+        handler = _testcapi.codec_xmlcharrefreplace_errors
+        self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)
+
+    def test_codec_backslashreplace_errors_handler(self):
+        handler = _testcapi.codec_backslashreplace_errors
+        self.do_test_codec_errors_handler(handler, self.all_unicode_errors)
+
+    def test_codec_namereplace_errors_handler(self):
+        handler = _testlimitedcapi.codec_namereplace_errors
+        self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)
+
+    def do_test_codec_errors_handler(self, handler, exceptions):
+        at_least_one = False
        for exc in exceptions:
-            with self.subTest(handler=strict_handler, exc=exc):
-                self.assertRaises(UnicodeEncodeError, strict_handler, exc)
+            # See https://github.com/python/cpython/issues/123378 and related
+            # discussion and issues for details.
+            if self._exception_may_crash(exc):
+                continue

-        for handler in [
-            _testcapi.codec_ignore_errors,
-            _testcapi.codec_replace_errors,
-            _testcapi.codec_xmlcharrefreplace_errors,
-            _testlimitedcapi.codec_namereplace_errors,
-        ]:
-            for exc in exceptions:
-                with self.subTest(handler=handler, exc=exc):
-                    self.assertIsInstance(handler(exc), tuple)
+            at_least_one = True
+            with self.subTest(handler=handler, exc=exc):
+                # test that the handler does not crash
+                self.assertIsInstance(handler(exc), tuple)
+
+        if exceptions:
+            self.assertTrue(at_least_one, "all exceptions are crashing")
+
+        for bad_exc in (
+            self.bad_unicode_errors
+            + tuple(e for e in self.all_unicode_errors if e not in exceptions)
+        ):
+            with self.subTest('bad type', handler=handler, exc=bad_exc):
+                self.assertRaises(TypeError, handler, bad_exc)
+
+    @classmethod
+    def _exception_may_crash(cls, exc):
+        """Indicate whether a Unicode exception might currently crash
+        the interpreter when used by a built-in codecs error handler.
+
+        Until gh-123378 is fixed, we skip the tests for these exceptions.
+
+        This should only be used by "do_test_codec_errors_handler".
+        """
+        message, start, end = exc.object, exc.start, exc.end
+        match exc:
+            case UnicodeEncodeError():
+                return end < start or (end - start) >= len(message)
+            case UnicodeDecodeError():
+                # The case "end - start >= len(message)" does not crash.
+                return end < start
+            case UnicodeTranslateError():
+                # Test "end <= start" because PyCodec_ReplaceErrors checks
+                # the Unicode kind of a 0-length string which by convention
+                # is PyUnicode_1BYTE_KIND and not PyUnicode_2BYTE_KIND as
+                # the handler currently expects.
+                return end <= start or (end - start) >= len(message)
+        return False


 if __name__ == "__main__":