[3.12] bpo-43950: handle wide unicode characters in tracebacks (GH-28150) (#111346)

This commit is contained in:
Pablo Galindo Salgado 2023-10-27 06:43:38 +09:00 committed by GitHub
parent e25d8b40cd
commit c81ebf5b3d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 187 additions and 16 deletions

View file

@@ -922,8 +922,63 @@ class TracebackErrorLocationCaretTestBase:
f" File \"{__file__}\", line {self.callable_line}, in get_exception", f" File \"{__file__}\", line {self.callable_line}, in get_exception",
" callable()", " callable()",
f" File \"{__file__}\", line {f.__code__.co_firstlineno + 4}, in f", f" File \"{__file__}\", line {f.__code__.co_firstlineno + 4}, in f",
" print(1, (", f" print(1, (",
" ^^^^", f" ^^^^^^^",
]
self.assertEqual(actual, expected)
def test_byte_offset_with_wide_characters_term_highlight(self):
    """Caret/tilde anchor lines must be computed in *display* columns:
    CJK characters render two columns wide, so the anchors under the
    division must line up with the rendered source, not the char count.
    """
    def f():
        说明说明 = 1
        şçöğıĤellö = 0  # not wide but still non-ascii
        return 说明说明 / şçöğıĤellö

    actual = self.get_exception(f)
    expected = [
        f"Traceback (most recent call last):",
        f'  File "{__file__}", line {self.callable_line}, in get_exception',
        f"    callable()",
        f'  File "{__file__}", line {f.__code__.co_firstlineno + 3}, in f',
        f"    return 说明说明 / şçöğıĤellö",
        f"           ~~~~~~~~~^~~~~~~~~~~~",
    ]
    self.assertEqual(actual, expected)
def test_byte_offset_with_emojis_term_highlight(self):
    """Emoji are East-Asian-Wide ("W") and occupy two terminal columns;
    the caret run under the failing call must account for that.
    """
    def f():
        return "✨🐍" + func_说明说明("📗🚛",
            "📗🚛") + "🐍"

    actual = self.get_exception(f)
    expected = [
        f"Traceback (most recent call last):",
        f'  File "{__file__}", line {self.callable_line}, in get_exception',
        f"    callable()",
        f'  File "{__file__}", line {f.__code__.co_firstlineno + 1}, in f',
        f'    return "✨🐍" + func_说明说明("📗🚛",',
        f"                    ^^^^^^^^^^^^^",
    ]
    self.assertEqual(actual, expected)
def test_byte_offset_wide_chars_subscript(self):
    """Chained subscripts on wide-character keys: the tilde run covers
    the successfully-evaluated prefix and the carets the failing lookup,
    both measured in display columns.
    """
    def f():
        my_dct = {
            "✨🚛✨": {
                "说明": {
                    "🐍🐍🐍": None
                }
            }
        }
        return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]

    actual = self.get_exception(f)
    expected = [
        f"Traceback (most recent call last):",
        f'  File "{__file__}", line {self.callable_line}, in get_exception',
        f"    callable()",
        f'  File "{__file__}", line {f.__code__.co_firstlineno + 8}, in f',
        f'    return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]',
        f"           ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^",
    ]
    self.assertEqual(actual, expected)

View file

@ -470,7 +470,8 @@ class StackSummary(list):
stripped_line = frame_summary.line.strip() stripped_line = frame_summary.line.strip()
row.append(' {}\n'.format(stripped_line)) row.append(' {}\n'.format(stripped_line))
orig_line_len = len(frame_summary._original_line) line = frame_summary._original_line
orig_line_len = len(line)
frame_line_len = len(frame_summary.line.lstrip()) frame_line_len = len(frame_summary.line.lstrip())
stripped_characters = orig_line_len - frame_line_len stripped_characters = orig_line_len - frame_line_len
if ( if (
@ -478,31 +479,40 @@ class StackSummary(list):
and frame_summary.end_colno is not None and frame_summary.end_colno is not None
): ):
start_offset = _byte_offset_to_character_offset( start_offset = _byte_offset_to_character_offset(
frame_summary._original_line, frame_summary.colno) + 1 line, frame_summary.colno)
end_offset = _byte_offset_to_character_offset( end_offset = _byte_offset_to_character_offset(
frame_summary._original_line, frame_summary.end_colno) + 1 line, frame_summary.end_colno)
code_segment = line[start_offset:end_offset]
anchors = None anchors = None
if frame_summary.lineno == frame_summary.end_lineno: if frame_summary.lineno == frame_summary.end_lineno:
with suppress(Exception): with suppress(Exception):
anchors = _extract_caret_anchors_from_line_segment( anchors = _extract_caret_anchors_from_line_segment(code_segment)
frame_summary._original_line[start_offset - 1:end_offset - 1]
)
else: else:
end_offset = stripped_characters + len(stripped_line) # Don't count the newline since the anchors only need to
# go up until the last character of the line.
end_offset = len(line.rstrip())
# show indicators if primary char doesn't span the frame line # show indicators if primary char doesn't span the frame line
if end_offset - start_offset < len(stripped_line) or ( if end_offset - start_offset < len(stripped_line) or (
anchors and anchors.right_start_offset - anchors.left_end_offset > 0): anchors and anchors.right_start_offset - anchors.left_end_offset > 0):
# When showing this on a terminal, some of the non-ASCII characters
# might be rendered as double-width characters, so we need to take
# that into account when calculating the length of the line.
dp_start_offset = _display_width(line, start_offset) + 1
dp_end_offset = _display_width(line, end_offset) + 1
row.append(' ') row.append(' ')
row.append(' ' * (start_offset - stripped_characters)) row.append(' ' * (dp_start_offset - stripped_characters))
if anchors: if anchors:
row.append(anchors.primary_char * (anchors.left_end_offset)) dp_left_end_offset = _display_width(code_segment, anchors.left_end_offset)
row.append(anchors.secondary_char * (anchors.right_start_offset - anchors.left_end_offset)) dp_right_start_offset = _display_width(code_segment, anchors.right_start_offset)
row.append(anchors.primary_char * (end_offset - start_offset - anchors.right_start_offset)) row.append(anchors.primary_char * dp_left_end_offset)
row.append(anchors.secondary_char * (dp_right_start_offset - dp_left_end_offset))
row.append(anchors.primary_char * (dp_end_offset - dp_start_offset - dp_right_start_offset))
else: else:
row.append('^' * (end_offset - start_offset)) row.append('^' * (dp_end_offset - dp_start_offset))
row.append('\n') row.append('\n')
@@ -623,6 +633,25 @@ def _extract_caret_anchors_from_line_segment(segment):
return None return None
_WIDE_CHAR_SPECIFIERS = "WF"
def _display_width(line, offset):
"""Calculate the extra amount of width space the given source
code segment might take if it were to be displayed on a fixed
width output device. Supports wide unicode characters and emojis."""
# Fast track for ASCII-only strings
if line.isascii():
return offset
import unicodedata
return sum(
2 if unicodedata.east_asian_width(char) in _WIDE_CHAR_SPECIFIERS else 1
for char in line[:offset]
)
class _ExceptionPrintContext: class _ExceptionPrintContext:
def __init__(self): def __init__(self):

View file

@ -38,6 +38,61 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
return size; return size;
} }
// Calculate the extra amount of width space the given source
// code segment might take if it were to be displayed on a fixed
// width output device. Supports wide unicode characters and emojis.
//
// Returns the display width (in terminal columns) of the first
// `character_offset` characters of `line`, or -1 with an exception
// set on failure.
Py_ssize_t
_PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
{
    PyObject *segment = PyUnicode_Substring(line, 0, character_offset);
    if (!segment) {
        return -1;
    }

    // Fast track for ascii strings: every character is one column wide.
    if (PyUnicode_IS_ASCII(segment)) {
        Py_DECREF(segment);
        return character_offset;
    }

    PyObject *width_fn = _PyImport_GetModuleAttrString("unicodedata", "east_asian_width");
    if (!width_fn) {
        // Bug fix: `segment` was previously leaked on this error path.
        Py_DECREF(segment);
        return -1;
    }

    Py_ssize_t width = 0;
    Py_ssize_t len = PyUnicode_GET_LENGTH(segment);
    for (Py_ssize_t i = 0; i < len; i++) {
        PyObject *chr = PyUnicode_Substring(segment, i, i + 1);
        if (!chr) {
            goto error;
        }

        PyObject *width_specifier = PyObject_CallOneArg(width_fn, chr);
        Py_DECREF(chr);
        if (!width_specifier) {
            goto error;
        }

        // East-Asian Wide ("W") and Fullwidth ("F") characters occupy
        // two columns; everything else occupies one.
        if (_PyUnicode_EqualToASCIIString(width_specifier, "W") ||
            _PyUnicode_EqualToASCIIString(width_specifier, "F")) {
            width += 2;
        }
        else {
            width += 1;
        }
        Py_DECREF(width_specifier);
    }

    Py_DECREF(segment);
    Py_DECREF(width_fn);
    return width;

error:
    Py_DECREF(segment);
    Py_DECREF(width_fn);
    return -1;
}
// Here, mark is the start of the node, while p->mark is the end. // Here, mark is the start of the node, while p->mark is the end.
// If node==NULL, they should be the same. // If node==NULL, they should be the same.
int int

View file

@ -151,6 +151,7 @@ expr_ty _PyPegen_name_token(Parser *p);
expr_ty _PyPegen_number_token(Parser *p); expr_ty _PyPegen_number_token(Parser *p);
void *_PyPegen_string_token(Parser *p); void *_PyPegen_string_token(Parser *p);
Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset); Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
Py_ssize_t _PyPegen_calculate_display_width(PyObject *segment, Py_ssize_t character_offset);
// Error handling functions and APIs // Error handling functions and APIs
typedef enum { typedef enum {

View file

@@ -900,8 +900,39 @@ tb_displayline(PyTracebackObject* tb, PyObject *f, PyObject *filename, int lineno
goto done; goto done;
} }
if (print_error_location_carets(f, truncation, start_offset, end_offset, // Convert all offsets to display offsets (e.g. the space they would take up if printed
right_start_offset, left_end_offset, // on the screen).
Py_ssize_t dp_start = _PyPegen_calculate_display_width(source_line, start_offset);
if (dp_start < 0) {
err = ignore_source_errors() < 0;
goto done;
}
Py_ssize_t dp_end = _PyPegen_calculate_display_width(source_line, end_offset);
if (dp_end < 0) {
err = ignore_source_errors() < 0;
goto done;
}
Py_ssize_t dp_left_end = -1;
Py_ssize_t dp_right_start = -1;
if (has_secondary_ranges) {
dp_left_end = _PyPegen_calculate_display_width(source_line, left_end_offset);
if (dp_left_end < 0) {
err = ignore_source_errors() < 0;
goto done;
}
dp_right_start = _PyPegen_calculate_display_width(source_line, right_start_offset);
if (dp_right_start < 0) {
err = ignore_source_errors() < 0;
goto done;
}
}
if (print_error_location_carets(f, truncation, dp_start, dp_end,
dp_right_start, dp_left_end,
primary_error_char, secondary_error_char) < 0) { primary_error_char, secondary_error_char) < 0) {
err = -1; err = -1;
goto done; goto done;