diff --git a/.gitattributes b/.gitattributes index 767ec620fba..e88d6ea13e2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -83,8 +83,10 @@ Include/opcode.h generated Include/opcode_ids.h generated Include/token.h generated Lib/_opcode_metadata.py generated -Lib/keyword.py generated Lib/idlelib/help.html generated +Lib/keyword.py generated +Lib/pydoc_data/topics.py generated +Lib/pydoc_data/module_docs.py generated Lib/test/certdata/*.pem generated Lib/test/certdata/*.0 generated Lib/test/levenshtein_examples.json generated diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index fac0fa8aba3..8810730e193 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -26,6 +26,7 @@ on: - "Tools/build/update_file.py" - "Tools/build/verify_ensurepip_wheels.py" - "Tools/cases_generator/**" + - "Tools/check-c-api-docs/**" - "Tools/clinic/**" - "Tools/jit/**" - "Tools/peg_generator/**" @@ -58,6 +59,7 @@ jobs: "Lib/tomllib", "Tools/build", "Tools/cases_generator", + "Tools/check-c-api-docs", "Tools/clinic", "Tools/jit", "Tools/peg_generator", diff --git a/.github/workflows/tail-call.yml b/.github/workflows/tail-call.yml index e99e317182e..76a8c05aa52 100644 --- a/.github/workflows/tail-call.yml +++ b/.github/workflows/tail-call.yml @@ -79,19 +79,17 @@ jobs: with: python-version: '3.11' - - name: Native Windows (debug) + - name: Native Windows MSVC (release) if: runner.os == 'Windows' && matrix.architecture != 'ARM64' shell: cmd run: | - choco install llvm --allow-downgrade --no-progress --version ${{ matrix.llvm }}.1.0 - set PlatformToolset=clangcl - set LLVMToolsVersion=${{ matrix.llvm }}.1.0 - set LLVMInstallDir=C:\Program Files\LLVM - call ./PCbuild/build.bat --tail-call-interp -d -p ${{ matrix.architecture }} - call ./PCbuild/rt.bat -d -p ${{ matrix.architecture }} -q --multiprocess 0 --timeout 4500 --verbose2 --verbose3 + choco install visualstudio2026buildtools --no-progress -y --force --params "--add 
Microsoft.VisualStudio.Component.VC.Tools.x86.x64 --locale en-US --passive" + $env:PATH = "C:\Program Files (x86)\Microsoft Visual Studio\18\BuildTools\MSBuild\Current\bin;$env:PATH" + ./PCbuild/build.bat --tail-call-interp -c Release -p ${{ matrix.architecture }} "/p:PlatformToolset=v145" + ./PCbuild/rt.bat -p ${{ matrix.architecture }} -q --multiprocess 0 --timeout 4500 --verbose2 --verbose3 # No tests (yet): - - name: Emulated Windows (release) + - name: Emulated Windows Clang (release) if: runner.os == 'Windows' && matrix.architecture == 'ARM64' shell: cmd run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c5767ee841e..ee89e18db35 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,15 +40,15 @@ repos: files: ^Apple - id: ruff-format name: Run Ruff (format) on Doc/ - args: [--check] + args: [--exit-non-zero-on-fix] files: ^Doc/ - id: ruff-format name: Run Ruff (format) on Tools/build/check_warnings.py - args: [--check, --config=Tools/build/.ruff.toml] + args: [--exit-non-zero-on-fix, --config=Tools/build/.ruff.toml] files: ^Tools/build/check_warnings.py - id: ruff-format name: Run Ruff (format) on Tools/wasm/ - args: [--check, --config=Tools/wasm/.ruff.toml] + args: [--exit-non-zero-on-fix, --config=Tools/wasm/.ruff.toml] files: ^Tools/wasm/ - repo: https://github.com/psf/black-pre-commit-mirror diff --git a/Doc/Makefile b/Doc/Makefile index f16d9cacb1b..4d605980a62 100644 --- a/Doc/Makefile +++ b/Doc/Makefile @@ -140,7 +140,8 @@ doctest: pydoc-topics: BUILDER = pydoc-topics pydoc-topics: build @echo "Building finished; now run this:" \ - "cp build/pydoc-topics/topics.py ../Lib/pydoc_data/topics.py" + "cp build/pydoc-topics/topics.py ../Lib/pydoc_data/topics.py" \ + "&& cp build/pydoc-topics/module_docs.py ../Lib/pydoc_data/module_docs.py" .PHONY: gettext gettext: BUILDER = gettext diff --git a/Doc/c-api/intro.rst b/Doc/c-api/intro.rst index bb94bcb86a7..5e90d9b7bc9 100644 --- a/Doc/c-api/intro.rst +++ 
b/Doc/c-api/intro.rst @@ -107,6 +107,46 @@ header files properly declare the entry points to be ``extern "C"``. As a result there is no need to do anything special to use the API from C++. +.. _capi-system-includes: + +System includes +--------------- + + :file:`Python.h` includes several standard header files. + C extensions should include the standard headers that they use, + and should not rely on these implicit includes. + The implicit includes are: + + * ```` + * ```` (on Windows) + * ```` + * ```` + * ```` + * ```` + * ```` + * ```` (if present) + + The following are included for backwards compatibility, unless using + :ref:`Limited API ` 3.13 or newer: + + * ```` + * ```` (on POSIX) + + The following are included for backwards compatibility, unless using + :ref:`Limited API ` 3.11 or newer: + + * ```` + * ```` + * ```` + * ```` + +.. note:: + + Since Python may define some pre-processor definitions which affect the standard + headers on some systems, you *must* include :file:`Python.h` before any standard + headers are included. + + Useful macros ============= diff --git a/Doc/deprecations/pending-removal-in-3.20.rst b/Doc/deprecations/pending-removal-in-3.20.rst index 185f20fbc6d..4e4b2e1d5f8 100644 --- a/Doc/deprecations/pending-removal-in-3.20.rst +++ b/Doc/deprecations/pending-removal-in-3.20.rst @@ -1,9 +1,9 @@ Pending removal in Python 3.20 ------------------------------ -* The ``__version__`` attribute has been deprecated in these standard library - modules and will be removed in Python 3.20. - Use :py:data:`sys.version_info` instead. +* The ``__version__``, ``version`` and ``VERSION`` attributes have been + deprecated in these standard library modules and will be removed in + Python 3.20. Use :py:data:`sys.version_info` instead. 
- :mod:`argparse` - :mod:`csv` @@ -24,6 +24,9 @@ Pending removal in Python 3.20 - :mod:`tkinter.font` - :mod:`tkinter.ttk` - :mod:`wsgiref.simple_server` + - :mod:`xml.etree.ElementTree` + - :mod:`!xml.sax.expatreader` + - :mod:`xml.sax.handler` - :mod:`zlib` (Contributed by Hugo van Kemenade and Stan Ulbrych in :gh:`76007`.) diff --git a/Doc/extending/extending.rst b/Doc/extending/extending.rst index f9b65643dfe..c0066d315d0 100644 --- a/Doc/extending/extending.rst +++ b/Doc/extending/extending.rst @@ -3,154 +3,20 @@ .. _extending-intro: -****************************** -Extending Python with C or C++ -****************************** +******************************** +Using the C API: Assorted topics +******************************** -It is quite easy to add new built-in modules to Python, if you know how to -program in C. Such :dfn:`extension modules` can do two things that can't be -done directly in Python: they can implement new built-in object types, and they -can call C library functions and system calls. - -To support extensions, the Python API (Application Programmers Interface) -defines a set of functions, macros and variables that provide access to most -aspects of the Python run-time system. The Python API is incorporated in a C -source file by including the header ``"Python.h"``. - -The compilation of an extension module depends on its intended use as well as on -your system setup; details are given in later chapters. - -.. note:: - - The C extension interface is specific to CPython, and extension modules do - not work on other Python implementations. In many cases, it is possible to - avoid writing C extensions and preserve portability to other implementations. - For example, if your use case is calling C library functions or system calls, - you should consider using the :mod:`ctypes` module or the `cffi - `_ library rather than writing - custom C code. 
- These modules let you write Python code to interface with C code and are more - portable between implementations of Python than writing and compiling a C - extension module. - - -.. _extending-simpleexample: - -A Simple Example -================ - -Let's create an extension module called ``spam`` (the favorite food of Monty -Python fans...) and let's say we want to create a Python interface to the C -library function :c:func:`system` [#]_. This function takes a null-terminated -character string as argument and returns an integer. We want this function to -be callable from Python as follows: - -.. code-block:: pycon - - >>> import spam - >>> status = spam.system("ls -l") - -Begin by creating a file :file:`spammodule.c`. (Historically, if a module is -called ``spam``, the C file containing its implementation is called -:file:`spammodule.c`; if the module name is very long, like ``spammify``, the -module name can be just :file:`spammify.c`.) - -The first two lines of our file can be:: - - #define PY_SSIZE_T_CLEAN - #include - -which pulls in the Python API (you can add a comment describing the purpose of -the module and a copyright notice if you like). - -.. note:: - - Since Python may define some pre-processor definitions which affect the standard - headers on some systems, you *must* include :file:`Python.h` before any standard - headers are included. - - ``#define PY_SSIZE_T_CLEAN`` was used to indicate that ``Py_ssize_t`` should be - used in some APIs instead of ``int``. - It is not necessary since Python 3.13, but we keep it here for backward compatibility. - See :ref:`arg-parsing-string-and-buffers` for a description of this macro. - -All user-visible symbols defined by :file:`Python.h` have a prefix of ``Py`` or -``PY``, except those defined in standard header files. - -.. tip:: - - For backward compatibility, :file:`Python.h` includes several standard header files. 
- C extensions should include the standard headers that they use, - and should not rely on these implicit includes. - If using the limited C API version 3.13 or newer, the implicit includes are: - - * ```` - * ```` (on Windows) - * ```` - * ```` - * ```` - * ```` - * ```` - * ```` (if present) - - If :c:macro:`Py_LIMITED_API` is not defined, or is set to version 3.12 or older, - the headers below are also included: - - * ```` - * ```` (on POSIX) - - If :c:macro:`Py_LIMITED_API` is not defined, or is set to version 3.10 or older, - the headers below are also included: - - * ```` - * ```` - * ```` - * ```` - -The next thing we add to our module file is the C function that will be called -when the Python expression ``spam.system(string)`` is evaluated (we'll see -shortly how it ends up being called):: - - static PyObject * - spam_system(PyObject *self, PyObject *args) - { - const char *command; - int sts; - - if (!PyArg_ParseTuple(args, "s", &command)) - return NULL; - sts = system(command); - return PyLong_FromLong(sts); - } - -There is a straightforward translation from the argument list in Python (for -example, the single expression ``"ls -l"``) to the arguments passed to the C -function. The C function always has two arguments, conventionally named *self* -and *args*. - -The *self* argument points to the module object for module-level functions; -for a method it would point to the object instance. - -The *args* argument will be a pointer to a Python tuple object containing the -arguments. Each item of the tuple corresponds to an argument in the call's -argument list. The arguments are Python objects --- in order to do anything -with them in our C function we have to convert them to C values. The function -:c:func:`PyArg_ParseTuple` in the Python API checks the argument types and -converts them to C values. It uses a template string to determine the required -types of the arguments as well as the types of the C variables into which to -store the converted values. 
More about this later. - -:c:func:`PyArg_ParseTuple` returns true (nonzero) if all arguments have the right -type and its components have been stored in the variables whose addresses are -passed. It returns false (zero) if an invalid argument list was passed. In the -latter case it also raises an appropriate exception so the calling function can -return ``NULL`` immediately (as we saw in the example). +The :ref:`tutorial ` walked you through +creating a C API extension module, but left many areas unexplained. +This document looks at several concepts that you'll need to learn +in order to write more complex extensions. .. _extending-errors: -Intermezzo: Errors and Exceptions -================================= +Errors and Exceptions +===================== An important convention throughout the Python interpreter is the following: when a function fails, it should set an exception condition and return an error value @@ -321,194 +187,14 @@ call to :c:func:`PyErr_SetString` as shown below:: } -.. _backtoexample: - -Back to the Example -=================== - -Going back to our example function, you should now be able to understand this -statement:: - - if (!PyArg_ParseTuple(args, "s", &command)) - return NULL; - -It returns ``NULL`` (the error indicator for functions returning object pointers) -if an error is detected in the argument list, relying on the exception set by -:c:func:`PyArg_ParseTuple`. Otherwise the string value of the argument has been -copied to the local variable :c:data:`!command`. This is a pointer assignment and -you are not supposed to modify the string to which it points (so in Standard C, -the variable :c:data:`!command` should properly be declared as ``const char -*command``). - -The next statement is a call to the Unix function :c:func:`system`, passing it -the string we just got from :c:func:`PyArg_ParseTuple`:: - - sts = system(command); - -Our :func:`!spam.system` function must return the value of :c:data:`!sts` as a -Python object. 
This is done using the function :c:func:`PyLong_FromLong`. :: - - return PyLong_FromLong(sts); - -In this case, it will return an integer object. (Yes, even integers are objects -on the heap in Python!) - -If you have a C function that returns no useful argument (a function returning -:c:expr:`void`), the corresponding Python function must return ``None``. You -need this idiom to do so (which is implemented by the :c:macro:`Py_RETURN_NONE` -macro):: - - Py_INCREF(Py_None); - return Py_None; - -:c:data:`Py_None` is the C name for the special Python object ``None``. It is a -genuine Python object rather than a ``NULL`` pointer, which means "error" in most -contexts, as we have seen. - - -.. _methodtable: - -The Module's Method Table and Initialization Function -===================================================== - -I promised to show how :c:func:`!spam_system` is called from Python programs. -First, we need to list its name and address in a "method table":: - - static PyMethodDef spam_methods[] = { - ... - {"system", spam_system, METH_VARARGS, - "Execute a shell command."}, - ... - {NULL, NULL, 0, NULL} /* Sentinel */ - }; - -Note the third entry (``METH_VARARGS``). This is a flag telling the interpreter -the calling convention to be used for the C function. It should normally always -be ``METH_VARARGS`` or ``METH_VARARGS | METH_KEYWORDS``; a value of ``0`` means -that an obsolete variant of :c:func:`PyArg_ParseTuple` is used. - -When using only ``METH_VARARGS``, the function should expect the Python-level -parameters to be passed in as a tuple acceptable for parsing via -:c:func:`PyArg_ParseTuple`; more information on this function is provided below. - -The :c:macro:`METH_KEYWORDS` bit may be set in the third field if keyword -arguments should be passed to the function. In this case, the C function should -accept a third ``PyObject *`` parameter which will be a dictionary of keywords. 
-Use :c:func:`PyArg_ParseTupleAndKeywords` to parse the arguments to such a -function. - -The method table must be referenced in the module definition structure:: - - static struct PyModuleDef spam_module = { - ... - .m_methods = spam_methods, - ... - }; - -This structure, in turn, must be passed to the interpreter in the module's -initialization function. The initialization function must be named -:c:func:`!PyInit_name`, where *name* is the name of the module, and should be the -only non-\ ``static`` item defined in the module file:: - - PyMODINIT_FUNC - PyInit_spam(void) - { - return PyModuleDef_Init(&spam_module); - } - -Note that :c:macro:`PyMODINIT_FUNC` declares the function as ``PyObject *`` return type, -declares any special linkage declarations required by the platform, and for C++ -declares the function as ``extern "C"``. - -:c:func:`!PyInit_spam` is called when each interpreter imports its module -:mod:`!spam` for the first time. (See below for comments about embedding Python.) -A pointer to the module definition must be returned via :c:func:`PyModuleDef_Init`, -so that the import machinery can create the module and store it in ``sys.modules``. - -When embedding Python, the :c:func:`!PyInit_spam` function is not called -automatically unless there's an entry in the :c:data:`PyImport_Inittab` table. 
-To add the module to the initialization table, use :c:func:`PyImport_AppendInittab`, -optionally followed by an import of the module:: - - #define PY_SSIZE_T_CLEAN - #include - - int - main(int argc, char *argv[]) - { - PyStatus status; - PyConfig config; - PyConfig_InitPythonConfig(&config); - - /* Add a built-in module, before Py_Initialize */ - if (PyImport_AppendInittab("spam", PyInit_spam) == -1) { - fprintf(stderr, "Error: could not extend in-built modules table\n"); - exit(1); - } - - /* Pass argv[0] to the Python interpreter */ - status = PyConfig_SetBytesString(&config, &config.program_name, argv[0]); - if (PyStatus_Exception(status)) { - goto exception; - } - - /* Initialize the Python interpreter. Required. - If this step fails, it will be a fatal error. */ - status = Py_InitializeFromConfig(&config); - if (PyStatus_Exception(status)) { - goto exception; - } - PyConfig_Clear(&config); - - /* Optionally import the module; alternatively, - import can be deferred until the embedded script - imports it. */ - PyObject *pmodule = PyImport_ImportModule("spam"); - if (!pmodule) { - PyErr_Print(); - fprintf(stderr, "Error: could not import module 'spam'\n"); - } - - // ... use Python C API here ... - - return 0; - - exception: - PyConfig_Clear(&config); - Py_ExitStatusException(status); - } - -.. note:: - - If you declare a global variable or a local static one, the module may - experience unintended side-effects on re-initialisation, for example when - removing entries from ``sys.modules`` or importing compiled modules into - multiple interpreters within a process - (or following a :c:func:`fork` without an intervening :c:func:`exec`). - If module state is not yet fully :ref:`isolated `, - authors should consider marking the module as having no support for subinterpreters - (via :c:macro:`Py_MOD_MULTIPLE_INTERPRETERS_NOT_SUPPORTED`). - -A more substantial example module is included in the Python source distribution -as :file:`Modules/xxlimited.c`. 
This file may be used as a template or simply -read as an example. - - .. _compilation: -Compilation and Linkage -======================= +Embedding an extension +====================== -There are two more things to do before you can use your new extension: compiling -and linking it with the Python system. If you use dynamic loading, the details -may depend on the style of dynamic loading your system uses; see the chapters -about building extension modules (chapter :ref:`building`) and additional -information that pertains only to building on Windows (chapter -:ref:`building-on-windows`) for more information about this. - -If you can't use dynamic loading, or if you want to make your module a permanent +If you want to make your module a permanent part of the Python interpreter, you will have to change the configuration setup -and rebuild the interpreter. Luckily, this is very simple on Unix: just place +and rebuild the interpreter. On Unix, place your file (:file:`spammodule.c` for example) in the :file:`Modules/` directory of an unpacked source distribution, add a line to the file :file:`Modules/Setup.local` describing your file: @@ -536,7 +222,7 @@ on the line in the configuration file as well, for instance: Calling Python Functions from C =============================== -So far we have concentrated on making C functions callable from Python. The +The tutorial concentrated on making C functions callable from Python. The reverse is also useful: calling Python functions from C. This is especially the case for libraries that support so-called "callback" functions. If a C interface makes use of callbacks, the equivalent Python often needs to provide a @@ -581,7 +267,7 @@ be part of a module definition:: } This function must be registered with the interpreter using the -:c:macro:`METH_VARARGS` flag; this is described in section :ref:`methodtable`. The +:c:macro:`METH_VARARGS` flag in :c:type:`PyMethodDef.ml_flags`. 
The :c:func:`PyArg_ParseTuple` function and its arguments are documented in section :ref:`parsetuple`. @@ -676,14 +362,21 @@ the above example, we use :c:func:`Py_BuildValue` to construct the dictionary. : Py_DECREF(result); +.. index:: single: PyArg_ParseTuple (C function) + .. _parsetuple: Extracting Parameters in Extension Functions ============================================ -.. index:: single: PyArg_ParseTuple (C function) +The :ref:`tutorial ` uses a ":c:data:`METH_O`" +function, which is limited to a single Python argument. +If you want more, you can use :c:data:`METH_VARARGS` instead. +With this flag, the C function will receive a :py:class:`tuple` of arguments +instead of a single object. -The :c:func:`PyArg_ParseTuple` function is declared as follows:: +For unpacking the tuple, CPython provides the :c:func:`PyArg_ParseTuple` +function, declared as follows:: int PyArg_ParseTuple(PyObject *arg, const char *format, ...); @@ -693,6 +386,19 @@ whose syntax is explained in :ref:`arg-parsing` in the Python/C API Reference Manual. The remaining arguments must be addresses of variables whose type is determined by the format string. +For example, to receive a single Python :py:class:`str` object and turn it +into a C buffer, you would use ``"s"`` as the format string:: + + const char *command; + if (!PyArg_ParseTuple(args, "s", &command)) { + return NULL; + } + +If an error is detected in the argument list, :c:func:`!PyArg_ParseTuple` +returns ``NULL`` (the error indicator for functions returning object pointers); +your function may return ``NULL``, relying on the exception set by +:c:func:`PyArg_ParseTuple`. 
+ Note that while :c:func:`PyArg_ParseTuple` checks that the Python arguments have the required types, it cannot check the validity of the addresses of C variables passed to the call: if you make mistakes there, your code will probably crash or @@ -703,7 +409,6 @@ Note that any Python object references which are provided to the caller are Some example calls:: - #define PY_SSIZE_T_CLEAN #include :: @@ -773,6 +478,17 @@ Some example calls:: Keyword Parameters for Extension Functions ========================================== +If you also want your function to accept +:term:`keyword arguments `, use the :c:data:`METH_KEYWORDS` +flag in combination with :c:data:`METH_VARARGS`. +(:c:data:`!METH_KEYWORDS` can also be used with other flags; see its +documentation for the allowed combinations.) + +In this case, the C function should accept a third ``PyObject *`` parameter +which will be a dictionary of keywords. +Use :c:func:`PyArg_ParseTupleAndKeywords` to parse the arguments to such a +function. + .. index:: single: PyArg_ParseTupleAndKeywords (C function) The :c:func:`PyArg_ParseTupleAndKeywords` function is declared as follows:: @@ -833,19 +549,6 @@ Philbrick (philbrick@hks.com):: {NULL, NULL, 0, NULL} /* sentinel */ }; - static struct PyModuleDef keywdarg_module = { - .m_base = PyModuleDef_HEAD_INIT, - .m_name = "keywdarg", - .m_size = 0, - .m_methods = keywdarg_methods, - }; - - PyMODINIT_FUNC - PyInit_keywdarg(void) - { - return PyModuleDef_Init(&keywdarg_module); - } - .. _buildvalue: @@ -986,11 +689,11 @@ needed. Ownership of a reference can be transferred. There are three ways to dispose of an owned reference: pass it on, store it, or call :c:func:`Py_DECREF`. Forgetting to dispose of an owned reference creates a memory leak. -It is also possible to :dfn:`borrow` [#]_ a reference to an object. The +It is also possible to :dfn:`borrow` [#borrow]_ a reference to an object. The borrower of a reference should not call :c:func:`Py_DECREF`. 
The borrower must not hold on to the object longer than the owner from which it was borrowed. Using a borrowed reference after the owner has disposed of it risks using freed -memory and should be avoided completely [#]_. +memory and should be avoided completely [#dont-check-refcount]_. The advantage of borrowing over owning a reference is that you don't need to take care of disposing of the reference on all possible paths through the code @@ -1169,7 +872,7 @@ checking. The C function calling mechanism guarantees that the argument list passed to C functions (``args`` in the examples) is never ``NULL`` --- in fact it guarantees -that it is always a tuple [#]_. +that it is always a tuple [#old-calling-convention]_. It is a severe error to ever let a ``NULL`` pointer "escape" to the Python user. @@ -1226,8 +929,8 @@ the module whose functions one wishes to call might not have been loaded yet! Portability therefore requires not to make any assumptions about symbol visibility. This means that all symbols in extension modules should be declared ``static``, except for the module's initialization function, in order to -avoid name clashes with other extension modules (as discussed in section -:ref:`methodtable`). And it means that symbols that *should* be accessible from +avoid name clashes with other extension modules. And it means that symbols +that *should* be accessible from other extension modules must be exported in a different way. Python provides a special mechanism to pass C-level information (pointers) from @@ -1269,8 +972,9 @@ file corresponding to the module provides a macro that takes care of importing the module and retrieving its C API pointers; client modules only have to call this macro before accessing the C API. -The exporting module is a modification of the :mod:`!spam` module from section -:ref:`extending-simpleexample`. 
The function :func:`!spam.system` does not call +The exporting module is a modification of the :mod:`!spam` module from the +:ref:`tutorial `. +The function :func:`!spam.system` does not call the C library function :c:func:`system` directly, but a function :c:func:`!PySpam_System`, which would of course do something more complicated in reality (such as adding "spam" to every command). This function @@ -1412,15 +1116,14 @@ code distribution). .. rubric:: Footnotes -.. [#] An interface for this function already exists in the standard module :mod:`os` - --- it was chosen as a simple and straightforward example. +.. [#borrow] The metaphor of "borrowing" a reference is not completely correct: + the owner still has a copy of the reference. -.. [#] The metaphor of "borrowing" a reference is not completely correct: the owner - still has a copy of the reference. - -.. [#] Checking that the reference count is at least 1 **does not work** --- the +.. [#dont-check-refcount] Checking that the reference count is at least 1 + **does not work** --- the reference count itself could be in freed memory and may thus be reused for another object! -.. [#] These guarantees don't hold when you use the "old" style calling convention --- +.. [#old-calling-convention] These guarantees don't hold when you use the + "old" style calling convention --- this is still found in much existing code. diff --git a/Doc/extending/first-extension-module.rst b/Doc/extending/first-extension-module.rst new file mode 100644 index 00000000000..5bde785c49e --- /dev/null +++ b/Doc/extending/first-extension-module.rst @@ -0,0 +1,667 @@ +.. highlight:: c + + +.. _extending-simpleexample: +.. _first-extension-module: + +********************************* +Your first C API extension module +********************************* + +This tutorial will take you through creating a simple +Python extension module written in C or C++. + +We will use the low-level Python C API directly. 
+For easier ways to create extension modules, see +the :ref:`recommended third party tools `. + +The tutorial assumes basic knowledge about Python: you should be able to +define functions in Python code before starting to write them in C. +See :ref:`tutorial-index` for an introduction to Python itself. + +The tutorial should be approachable for anyone who can write a basic C library. +While we will mention several concepts that a C beginner would not be expected +to know, like ``static`` functions or linkage declarations, understanding these +is not necessary for success. + +We will focus on giving you a "feel" of what Python's C API is like. +It will not teach you important concepts, like error handling +and reference counting, which are covered in later chapters. + +We will assume that you use a Unix-like system (including macOS and +Linux), or Windows. +On other systems, you might need to adjust some details -- for example, +a system command name. + +You need to have a suitable C compiler and Python development headers installed. +On Linux, headers are often in a package like ``python3-dev`` +or ``python3-devel``. + +You need to be able to install Python packages. +This tutorial uses `pip `__ (``pip install``), but you +can substitute any tool that can build and install ``pyproject.toml``-based +projects, like `uv `_ (``uv pip install``). +Preferably, have a :ref:`virtual environment ` activated. + + +.. note:: + + This tutorial uses APIs that were added in CPython 3.15. + To create an extension that's compatible with earlier versions of CPython, + please follow an earlier version of this documentation. + + This tutorial uses C syntax added in C11 and C++20. + If your extension needs to be compatible with earlier standards, + please follow tutorials in documentation for Python 3.14 or below. 
+ + +What we'll do +============= + +Let's create an extension module called ``spam`` [#why-spam]_, +which will include a Python interface to the C +standard library function :c:func:`system`. +This function is defined in ``stdlib.h``. +It takes a C string as argument, runs the argument as a system +command, and returns a result value as an integer. +A manual page for :c:func:`system` might summarize it this way:: + + #include + int system(const char *command); + +Note that like many functions in the C standard library, +this function is already exposed in Python. +In production, use :py:func:`os.system` or :py:func:`subprocess.run` +rather than the module you'll write here. + +We want this function to be callable from Python as follows: + +.. code-block:: pycon + + >>> import spam + >>> status = spam.system("whoami") + User Name + >>> status + 0 + +.. note:: + + The system command ``whoami`` prints out your username. + It's useful in tutorials like this one because it has the same name on + both Unix and Windows. + + +Start with the headers +====================== + +Begin by creating a directory for this tutorial, and switching to it +on the command line. +Then, create a file named :file:`spammodule.c` in your directory. +[#why-spammodule]_ + +In this file, we'll include two headers: :file:`Python.h` to pull in +all declarations of the Python C API, and :file:`stdlib.h` for the +:c:func:`system` function. [#stdlib-h]_ + +Add the following lines to :file:`spammodule.c`: + +.. literalinclude:: ../includes/capi-extension/spammodule-01.c + :start-at: + :end-at: + +Be sure to put :file:`stdlib.h`, and any other standard library includes, +*after* :file:`Python.h`. +On some systems, Python may define some pre-processor definitions +that affect the standard headers. + + +Running your build tool +======================= + +With only the includes in place, your extension won't do anything. +Still, it's a good time to compile it and try to import it. 
+This will ensure that your build tool works, so that you can make +and test incremental changes as you follow the rest of the text. + +CPython itself does not come with a tool to build extension modules; +it is recommended to use a third-party project for this. +In this tutorial, we'll use `meson-python`_. +(If you want to use another one, see :ref:`first-extension-other-tools`.) + +.. at the time of writing, meson-python has the least overhead for a + simple extension using PyModExport. + Change this if another tool makes things easier. + +``meson-python`` requires defining a "project" using two extra files. + +First, add ``pyproject.toml`` with these contents: + +.. code-block:: toml + + [build-system] + build-backend = 'mesonpy' + requires = ['meson-python'] + + [project] + # Placeholder project information + # (change this before distributing the module) + name = 'sampleproject' + version = '0' + +Then, create ``meson.build`` containing the following: + +.. code-block:: meson + + project('sampleproject', 'c') + + py = import('python').find_installation(pure: false) + + py.extension_module( + 'spam', # name of the importable Python module + 'spammodule.c', # the C source file + install: true, + ) + +.. note:: + + See `meson-python documentation `_ for details on + configuration. + +Now, build install the *project in the current directory* (``.``) via ``pip``: + +.. code-block:: sh + + python -m pip install . + +.. tip:: + + If you don't have ``pip`` installed, run ``python -m ensurepip``, + preferably in a :ref:`virtual environment `. + (Or, if you prefer another tool that can build and install + ``pyproject.toml``-based projects, use that.) + +.. _meson-python: https://mesonbuild.com/meson-python/ +.. _virtual environment: https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/#create-and-use-virtual-environments + +Note that you will need to run this command again every time you change your +extension. 
+Unlike Python, C has an explicit compilation step. + +When your extension is compiled and installed, start Python and try to +import it. +This should fail with the following exception: + +.. code-block:: pycon + + >>> import spam + Traceback (most recent call last): + ... + ImportError: dynamic module does not define module export function (PyModExport_spam or PyInit_spam) + + +Module export hook +================== + +The exception you got when you tried to import the module told you that Python +is looking for a "module export function", also known as a +:ref:`module export hook `. +Let's define one. + +First, add a prototype below the ``#include`` lines: + +.. literalinclude:: ../includes/capi-extension/spammodule-01.c + :start-after: /// Export hook prototype + :end-before: /// + +.. tip:: + The prototype is not strictly necessary, but some modern compilers emit + warnings without it. + It's generally better to add the prototype than to disable the warning. + +The :c:macro:`PyMODEXPORT_FUNC` macro declares the function's +return type, and adds any special linkage declarations needed +to make the function visible and usable when CPython loads it. + +After the prototype, add the function itself. +For now, make it return ``NULL``: + +.. code-block:: c + + PyMODEXPORT_FUNC + PyModExport_spam(void) + { + return NULL; + } + +Compile and load the module again. +You should get a different error this time. + +.. code-block:: pycon + + >>> import spam + Traceback (most recent call last): + ... + SystemError: module export hook for module 'spam' failed without setting an exception + +Simply returning ``NULL`` is *not* correct behavior for an export hook, +and CPython complains about it. +That's good -- it means that CPython found the function! +Let's now make it do something useful. + + +The slot table +============== + +Rather than ``NULL``, the export hook should return the information needed to +create a module. +Let's start with the basics: the name and docstring. 
+ +The information should be defined in a ``static`` array of +:c:type:`PyModuleDef_Slot` entries, which are essentially key-value pairs. +Define this array just before your export hook: + +.. code-block:: c + + static PyModuleDef_Slot spam_slots[] = { + {Py_mod_name, "spam"}, + {Py_mod_doc, "A wonderful module with an example function"}, + {0, NULL} + }; + +For both :c:data:`Py_mod_name` and :c:data:`Py_mod_doc`, the values are C +strings -- that is, NUL-terminated, UTF-8 encoded byte arrays. + +Note the zero-filled sentinel entry at the end. +If you forget it, you'll trigger undefined behavior. + +The array is defined as ``static`` -- that is, not visible outside this ``.c`` file. +This will be a common theme. +CPython only needs to access the export hook; all global variables +and all other functions should generally be ``static``, so that they don't +clash with other extensions. + +Return this array from your export hook instead of ``NULL``: + +.. code-block:: c + :emphasize-lines: 4 + + PyMODEXPORT_FUNC + PyModExport_spam(void) + { + return spam_slots; + } + +Now, recompile and try it out: + +.. code-block:: pycon + + >>> import spam + >>> print(spam) + + +You have an extension module! +Try ``help(spam)`` to see the docstring. + +The next step will be adding a function. + + +.. _backtoexample: + +Exposing a function +=================== + +To expose the :c:func:`system` C function directly to Python, +we'll need to write a layer of glue code to convert arguments from Python +objects to C values, and the C return value back to Python. + +One of the simplest ways to write glue code is a ":c:data:`METH_O`" function, +which takes two Python objects and returns one. +All Python objects -- regardless of the Python type -- are represented in C +as pointers to the :c:type:`PyObject` structure. 
+ +Add such a function above the slots array:: + + static PyObject * + spam_system(PyObject *self, PyObject *arg) + { + Py_RETURN_NONE; + } + +For now, we ignore the arguments, and use the :c:macro:`Py_RETURN_NONE` +macro, which expands to a ``return`` statement that properly returns +a Python :py:data:`None` object. + +Recompile your extension to make sure you don't have syntax errors. +We haven't yet added ``spam_system`` to the module, so you might get a +warning that ``spam_system`` is unused. + +.. _methodtable: + +Method definitions +------------------ + +To expose the C function to Python, you will need to provide several pieces of +information in a structure called +:c:type:`PyMethodDef` [#why-pymethoddef]_: + +* ``ml_name``: the name of the Python function; +* ``ml_doc``: a docstring; +* ``ml_meth``: the C function to be called; and +* ``ml_flags``: a set of flags describing details like how Python arguments are + passed to the C function. + We'll use :c:data:`METH_O` here -- the flag that matches our + ``spam_system`` function's signature. + +Because modules typically create several functions, these definitions +need to be collected in an array, with a zero-filled sentinel at the end. +Add this array just below the ``spam_system`` function: + +.. literalinclude:: ../includes/capi-extension/spammodule-01.c + :start-after: /// Module method table + :end-before: /// + +As with module slots, a zero-filled sentinel marks the end of the array. + +Next, we'll add the method to the module. +Add a :c:data:`Py_mod_methods` slot to your :c:type:`PyMethodDef` array: + +.. literalinclude:: ../includes/capi-extension/spammodule-01.c + :start-after: /// Module slot table + :end-before: /// + :emphasize-lines: 5 + +Recompile your extension again, and test it. +Be sure to restart the Python interpreter, so that ``import spam`` picks +up the new version of the module. + +You should now be able to call the function: + +.. 
code-block:: pycon + + >>> import spam + >>> print(spam.system) + + >>> print(spam.system('whoami')) + None + +Note that our ``spam.system`` does not yet run the ``whoami`` command; +it only returns ``None``. + +Check that the function accepts exactly one argument, as specified by +the :c:data:`METH_O` flag: + +.. code-block:: pycon + + >>> print(spam.system('too', 'many', 'arguments')) + Traceback (most recent call last): + ... + TypeError: spam.system() takes exactly one argument (3 given) + + +Returning an integer +==================== + +Now, let's take a look at the return value. +Instead of ``None``, we'll want ``spam.system`` to return a number -- that is, +a Python :py:type:`int` object. +Eventually this will be the exit code of a system command, +but let's start with a fixed value, say, ``3``. + +The Python C API provides a function to create a Python :py:type:`int` object +from a C ``int`` value: :c:func:`PyLong_FromLong`. [#why-pylongfromlong]_ + +To call it, replace the ``Py_RETURN_NONE`` with the following 3 lines: + +.. this could be a one-liner, but we want to show the data types here + +.. code-block:: c + :emphasize-lines: 4-6 + + static PyObject * + spam_system(PyObject *self, PyObject *arg) + { + int status = 3; + PyObject *result = PyLong_FromLong(status); + return result; + } + + +Recompile, restart the Python interpreter again, +and check that the function now returns 3: + +.. code-block:: pycon + + >>> import spam + >>> spam.system('whoami') + 3 + + +Accepting a string +================== + +Finally, let's handle the function argument. + +Our C function, :c:func:`!spam_system`, takes two arguments. +The first one, ``PyObject *self``, will be set to the ``spam`` module +object. +This isn't useful in our case, so we'll ignore it. + +The other one, ``PyObject *arg``, will be set to the object that the user +passed from Python. +We expect that it should be a Python string. 
+In order to use the information in it, we will need +to convert it to a C value -- in this case, a C string (``const char *``). + +There's a slight type mismatch here: Python's :py:class:`str` objects store +Unicode text, but C strings are arrays of bytes. +So, we'll need to *encode* the data, and we'll use the UTF-8 encoding for it. +(UTF-8 might not always be correct for system commands, but it's what +:py:meth:`str.encode` uses by default, +and the C API has special support for it.) + +The function to encode a Python string into a UTF-8 buffer is named +:c:func:`PyUnicode_AsUTF8` [#why-pyunicodeasutf8]_. +Call it like this: + +.. code-block:: c + :emphasize-lines: 4 + + static PyObject * + spam_system(PyObject *self, PyObject *arg) + { + const char *command = PyUnicode_AsUTF8(arg); + int status = 3; + PyObject *result = PyLong_FromLong(status); + return result; + } + +If :c:func:`PyUnicode_AsUTF8` is successful, *command* will point to the +resulting array of bytes. +This buffer is managed by the *arg* object, which means we don't need to free +it, but we must follow some rules: + +* We should only use the buffer inside the ``spam_system`` function. + When ``spam_system`` returns, *arg* and the buffer it manages might be + garbage-collected. +* We must not modify it. This is why we use ``const``. + +If :c:func:`PyUnicode_AsUTF8` was *not* successful, it returns a ``NULL`` +pointer. +When calling *any* Python C API, we always need to handle such error cases. +The way to do this in general is left for later chapters of this documentation. +For now, be assured that we are already handling errors from +:c:func:`PyLong_FromLong` correctly. + +For the :c:func:`PyUnicode_AsUTF8` call, the correct way to handle errors is +returning ``NULL`` from ``spam_system``. +Add an ``if`` block for this: + + +.. 
code-block:: c + :emphasize-lines: 5-7 + + static PyObject * + spam_system(PyObject *self, PyObject *arg) + { + const char *command = PyUnicode_AsUTF8(arg); + if (command == NULL) { + return NULL; + } + int status = 3; + PyObject *result = PyLong_FromLong(status); + return result; + } + +That's it for the setup. +Now, all that is left is calling the C library function :c:func:`system` with +the ``char *`` buffer, and using its result instead of the ``3``: + +.. code-block:: c + :emphasize-lines: 8 + + static PyObject * + spam_system(PyObject *self, PyObject *arg) + { + const char *command = PyUnicode_AsUTF8(arg); + if (command == NULL) { + return NULL; + } + int status = system(command); + PyObject *result = PyLong_FromLong(status); + return result; + } + +Compile your module, restart Python, and test. +This time, you should see your username -- the output of the ``whoami`` +system command: + +.. code-block:: pycon + + >>> import spam + >>> result = spam.system('whoami') + User Name + >>> result + 0 + +You might also want to test error cases: + +.. code-block:: pycon + + >>> import spam + >>> result = spam.system('nonexistent-command') + sh: line 1: nonexistent-command: command not found + >>> result + 32512 + + >>> spam.system(3) + Traceback (most recent call last): + ... + TypeError: bad argument type for built-in operation + + +The result +========== + + +Congratulations! +You have written a complete Python C API extension module, +and completed this tutorial! + +Here is the entire source file, for your convenience: + +.. _extending-spammodule-source: + +.. literalinclude:: ../includes/capi-extension/spammodule-01.c + :start-at: /// + + +.. _first-extension-other-tools: + +Appendix: Other build tools +=========================== + +You should be able to follow this tutorial -- except the +*Running your build tool* section itself -- with a build tool other +than ``meson-python``. 
+ +The Python Packaging User Guide has a `list of recommended tools `_; +be sure to choose one for the C language. + + +Workaround for missing PyInit function +-------------------------------------- + +If your build tool output complains about missing ``PyInit_spam``, +add the following function to your module for now: + +.. code-block:: c + + // A workaround + void *PyInit_spam(void) { return NULL; } + +This is a shim for an old-style :ref:`initialization function `, +which was required in extension modules for CPython 3.14 and below. +Current CPython does not need it, but some build tools may still assume that +all extension modules need to define it. + +If you use this workaround, you will get the exception +``SystemError: initialization of spam failed without raising an exception`` +instead of +``ImportError: dynamic module does not define module export function``. + + +Compiling directly +------------------ + +Using a third-party build tool is heavily recommended, +as it will take care of various details of your platform and Python +installation, of naming the resulting extension, and, later, of distributing +your work. + +If you are building an extension for as *specific* system, or for yourself +only, you might instead want to run your compiler directly. +The way to do this is system-specific; be prepared for issues you will need +to solve yourself. + +Linux +^^^^^ + +On Linux, the Python development package may include a ``python3-config`` +command that prints out the required compiler flags. +If you use it, check that it corresponds to the CPython interpreter you'll use +to load the module. +Then, start with the following command: + +.. code-block:: sh + + gcc --shared $(python3-config --cflags --ldflags) spammodule.c -o spam.so + +This should generate a ``spam.so`` file that you need to put in a directory +on :py:attr:`sys.path`. + + +.. rubric:: Footnotes + +.. [#why-spam] ``spam`` is the favorite food of Monty Python fans... +.. 
[#why-spammodule] The source file name is entirely up to you, + though some tools can be picky about the ``.c`` extension. + This tutorial uses the traditional ``*module.c`` suffix. + Some people would just use :file:`spam.c` to implement a module + named ``spam``, + projects where Python isn't the primary language might use ``py_spam.c``, + and so on. +.. [#stdlib-h] Including :file:`stdlib.h` is technically not necessary, + since :file:`Python.h` includes it and + :ref:`several other standard headers ` for its own use + or for backwards compatibility. + However, it is good practice to explicitly include what you need. +.. [#why-pymethoddef] The :c:type:`!PyMethodDef` structure is also used + to create methods of classes, so there's no separate + ":c:type:`!PyFunctionDef`". +.. [#why-pylongfromlong] The name :c:func:`PyLong_FromLong` + might not seem obvious. + ``PyLong`` refers to a the Python :py:class:`int`, which was originally + called ``long``; the ``FromLong`` refers to the C ``long`` (or ``long int``) + type. +.. [#why-pyunicodeasutf8] Here, ``PyUnicode`` refers to the original name of + the Python :py:class:`str` class: ``unicode``. diff --git a/Doc/extending/index.rst b/Doc/extending/index.rst index 4cc2c96d8d5..c0c494c3059 100644 --- a/Doc/extending/index.rst +++ b/Doc/extending/index.rst @@ -5,15 +5,17 @@ ################################################## This document describes how to write modules in C or C++ to extend the Python -interpreter with new modules. Those modules can not only define new functions -but also new object types and their methods. The document also describes how +interpreter with new modules. Those modules can do what Python code does -- +define functions, object types and methods -- but also interact with native +libraries or achieve better performance by avoiding the overhead of an +interpreter. The document also describes how to embed the Python interpreter in another application, for use as an extension language. 
Finally, it shows how to compile and link extension modules so that they can be loaded dynamically (at run time) into the interpreter, if the underlying operating system supports this feature. -This document assumes basic knowledge about Python. For an informal -introduction to the language, see :ref:`tutorial-index`. :ref:`reference-index` +This document assumes basic knowledge about C and Python. For an informal +introduction to Python, see :ref:`tutorial-index`. :ref:`reference-index` gives a more formal definition of the language. :ref:`library-index` documents the existing object types, functions and modules (both built-in and written in Python) that give the language its wide application range. @@ -21,37 +23,75 @@ Python) that give the language its wide application range. For a detailed description of the whole Python/C API, see the separate :ref:`c-api-index`. +To support extensions, Python's C API (Application Programmers Interface) +defines a set of functions, macros and variables that provide access to most +aspects of the Python run-time system. The Python API is incorporated in a C +source file by including the header ``"Python.h"``. + +.. note:: + + The C extension interface is specific to CPython, and extension modules do + not work on other Python implementations. In many cases, it is possible to + avoid writing C extensions and preserve portability to other implementations. + For example, if your use case is calling C library functions or system calls, + you should consider using the :mod:`ctypes` module or the `cffi + `_ library rather than writing + custom C code. + These modules let you write Python code to interface with C code and are more + portable between implementations of Python than writing and compiling a C + extension module. + + +.. 
toctree:: + :hidden: + + first-extension-module.rst + extending.rst + newtypes_tutorial.rst + newtypes.rst + building.rst + windows.rst + embedding.rst + Recommended third party tools ============================= -This guide only covers the basic tools for creating extensions provided +This document only covers the basic tools for creating extensions provided as part of this version of CPython. Some :ref:`third party tools ` offer both simpler and more sophisticated approaches to creating C and C++ extensions for Python. +While this document is aimed at extension authors, it should also be helpful to +the authors of such tools. +For example, the tutorial module can serve as a simple test case for a build +tool or sample expected output of a code generator. -Creating extensions without third party tools -============================================= + +C API Tutorial +============== + +This tutorial describes how to write a simple module in C or C++, +using the Python C API -- that is, using the basic tools provided +as part of this version of CPython. + + +#. :ref:`first-extension-module` + + +Guides for intermediate topics +============================== This section of the guide covers creating C and C++ extensions without assistance from third party tools. It is intended primarily for creators of those tools, rather than being a recommended way to create your own C extensions. -.. seealso:: - - :pep:`489` -- Multi-phase extension module initialization - -.. toctree:: - :maxdepth: 2 - :numbered: - - extending.rst - newtypes_tutorial.rst - newtypes.rst - building.rst - windows.rst +* :ref:`extending-intro` +* :ref:`defining-new-types` +* :ref:`new-types-topics` +* :ref:`building` +* :ref:`building-on-windows` Embedding the CPython runtime in a larger application ===================================================== @@ -61,8 +101,4 @@ interpreter as the main application, it is desirable to instead embed the CPython runtime inside a larger application. 
This section covers some of the details involved in doing that successfully. -.. toctree:: - :maxdepth: 2 - :numbered: - - embedding.rst +* :ref:`embedding` diff --git a/Doc/glossary.rst b/Doc/glossary.rst index 3a01df99c38..68035c2dfb5 100644 --- a/Doc/glossary.rst +++ b/Doc/glossary.rst @@ -134,6 +134,14 @@ Glossary iterator's :meth:`~object.__anext__` method until it raises a :exc:`StopAsyncIteration` exception. Introduced by :pep:`492`. + atomic operation + An operation that appears to execute as a single, indivisible step: no + other thread can observe it half-done, and its effects become visible all + at once. Python does not guarantee that high-level statements are atomic + (for example, ``x += 1`` performs multiple bytecode operations and is not + atomic). Atomicity is only guaranteed where explicitly documented. See + also :term:`race condition` and :term:`data race`. + attached thread state A :term:`thread state` that is active for the current OS thread. @@ -289,6 +297,22 @@ Glossary advanced mathematical feature. If you're not aware of a need for them, it's almost certain you can safely ignore them. + concurrency + The ability of a computer program to perform multiple tasks at the same + time. Python provides libraries for writing programs that make use of + different forms of concurrency. :mod:`asyncio` is a library for dealing + with asynchronous tasks and coroutines. :mod:`threading` provides + access to operating system threads and :mod:`multiprocessing` to + operating system processes. Multi-core processors can execute threads and + processes on different CPU cores at the same time (see + :term:`parallelism`). + + concurrent modification + When multiple threads modify shared data at the same time. Concurrent + modification without proper synchronization can cause + :term:`race conditions `, and might also trigger a + :term:`data race `, data corruption, or both. + context This term has different meanings depending on where and how it is used. 
Some common meanings: @@ -363,6 +387,28 @@ Glossary the :term:`cyclic garbage collector ` is to identify these groups and break the reference cycles so that the memory can be reclaimed. + data race + A situation where multiple threads access the same memory location + concurrently, at least one of the accesses is a write, and the threads + do not use any synchronization to control their access. Data races + lead to :term:`non-deterministic` behavior and can cause data corruption. + Proper use of :term:`locks ` and other :term:`synchronization primitives + ` prevents data races. Note that data races + can only happen in native code, but that :term:`native code` might be + exposed in a Python API. See also :term:`race condition` and + :term:`thread-safe`. + + deadlock + A situation in which two or more tasks (threads, processes, or coroutines) + wait indefinitely for each other to release resources or complete actions, + preventing any from making progress. For example, if thread A holds lock + 1 and waits for lock 2, while thread B holds lock 2 and waits for lock 1, + both threads will wait indefinitely. In Python this often arises from + acquiring multiple locks in conflicting orders or from circular + join/await dependencies. Deadlocks can be avoided by always acquiring + multiple :term:`locks ` in a consistent order. See also + :term:`lock` and :term:`reentrant`. + decorator A function returning another function, usually applied as a function transformation using the ``@wrapper`` syntax. Common examples for @@ -662,6 +708,14 @@ Glossary requires the GIL to be held in order to use it. This refers to having an :term:`attached thread state`. + global state + Data that is accessible throughout a program, such as module-level + variables, class variables, or C static variables in :term:`extension modules + `. 
In multi-threaded programs, global state shared + between threads typically requires synchronization to avoid + :term:`race conditions ` and + :term:`data races `. + hash-based pyc A bytecode cache file that uses the hash rather than the last-modified time of the corresponding source file to determine its validity. See @@ -706,7 +760,9 @@ Glossary tuples. Such an object cannot be altered. A new object has to be created if a different value has to be stored. They play an important role in places where a constant hash value is needed, for example as a key - in a dictionary. + in a dictionary. Immutable objects are inherently :term:`thread-safe` + because their state cannot be modified after creation, eliminating concerns + about improperly synchronized :term:`concurrent modification`. import path A list of locations (or :term:`path entries `) that are @@ -796,8 +852,9 @@ Glossary CPython does not consistently apply the requirement that an iterator define :meth:`~iterator.__iter__`. - And also please note that the free-threading CPython does not guarantee - the thread-safety of iterator operations. + And also please note that :term:`free-threaded ` + CPython does not guarantee :term:`thread-safe` behavior of iterator + operations. key function @@ -835,10 +892,11 @@ Glossary :keyword:`if` statements. In a multi-threaded environment, the LBYL approach can risk introducing a - race condition between "the looking" and "the leaping". For example, the - code, ``if key in mapping: return mapping[key]`` can fail if another + :term:`race condition` between "the looking" and "the leaping". For example, + the code, ``if key in mapping: return mapping[key]`` can fail if another thread removes *key* from *mapping* after the test, but before the lookup. - This issue can be solved with locks or by using the EAFP approach. + This issue can be solved with :term:`locks ` or by using the + :term:`EAFP` approach. See also :term:`thread-safe`. 
lexical analyzer @@ -857,6 +915,19 @@ Glossary clause is optional. If omitted, all elements in ``range(256)`` are processed. + lock + A :term:`synchronization primitive` that allows only one thread at a + time to access a shared resource. A thread must acquire a lock before + accessing the protected resource and release it afterward. If a thread + attempts to acquire a lock that is already held by another thread, it + will block until the lock becomes available. Python's :mod:`threading` + module provides :class:`~threading.Lock` (a basic lock) and + :class:`~threading.RLock` (a :term:`reentrant` lock). Locks are used + to prevent :term:`race conditions ` and ensure + :term:`thread-safe` access to shared data. Alternative design patterns + to locks exist such as queues, producer/consumer patterns, and + thread-local state. See also :term:`deadlock`, and :term:`reentrant`. + loader An object that loads a module. It must define the :meth:`!exec_module` and :meth:`!create_module` methods @@ -942,8 +1013,11 @@ Glossary See :term:`method resolution order`. mutable - Mutable objects can change their value but keep their :func:`id`. See - also :term:`immutable`. + An :term:`object` with state that is allowed to change during the course + of the program. In multi-threaded programs, mutable objects that are + shared between threads require careful synchronization to avoid + :term:`race conditions `. See also :term:`immutable`, + :term:`thread-safe`, and :term:`concurrent modification`. named tuple The term "named tuple" applies to any type or class that inherits from @@ -995,6 +1069,13 @@ Glossary See also :term:`module`. + native code + Code that is compiled to machine instructions and runs directly on the + processor, as opposed to code that is interpreted or runs in a virtual + machine. In the context of Python, native code typically refers to + C, C++, Rust or Fortran code in :term:`extension modules ` + that can be called from Python. See also :term:`extension module`. 
+ nested scope The ability to refer to a variable in an enclosing definition. For instance, a function defined inside another function can refer to @@ -1011,6 +1092,15 @@ Glossary properties, :meth:`~object.__getattribute__`, class methods, and static methods. + non-deterministic + Behavior where the outcome of a program can vary between executions with + the same inputs. In multi-threaded programs, non-deterministic behavior + often results from :term:`race conditions ` where the + relative timing or interleaving of threads affects the result. + Proper synchronization using :term:`locks ` and other + :term:`synchronization primitives ` helps + ensure deterministic behavior. + object Any data with state (attributes or value) and defined behavior (methods). Also the ultimate base class of any :term:`new-style @@ -1041,6 +1131,16 @@ Glossary See also :term:`regular package` and :term:`namespace package`. + parallelism + Executing multiple operations at the same time (e.g. on multiple CPU + cores). In Python builds with the + :term:`global interpreter lock (GIL) `, only one + thread runs Python bytecode at a time, so taking advantage of multiple + CPU cores typically involves multiple processes + (e.g. :mod:`multiprocessing`) or native extensions that release the GIL. + In :term:`free-threaded ` Python, multiple Python threads + can run Python code simultaneously on different cores. + parameter A named entity in a :term:`function` (or method) definition that specifies an :term:`argument` (or in some cases, arguments) that the @@ -1215,6 +1315,18 @@ Glossary >>> email.mime.text.__name__ 'email.mime.text' + race condition + A condition of a program where the its behavior + depends on the relative timing or ordering of events, particularly in + multi-threaded programs. Race conditions can lead to + :term:`non-deterministic` behavior and bugs that are difficult to + reproduce. 
A :term:`data race` is a specific type of race condition + involving unsynchronized access to shared memory. The :term:`LBYL` + coding style is particularly susceptible to race conditions in + multi-threaded code. Using :term:`locks ` and other + :term:`synchronization primitives ` + helps prevent race conditions. + reference count The number of references to an object. When the reference count of an object drops to zero, it is deallocated. Some objects are @@ -1236,6 +1348,25 @@ Glossary See also :term:`namespace package`. + reentrant + A property of a function or :term:`lock` that allows it to be called or + acquired multiple times by the same thread without causing errors or a + :term:`deadlock`. + + For functions, reentrancy means the function can be safely called again + before a previous invocation has completed, which is important when + functions may be called recursively or from signal handlers. Thread-unsafe + functions may be :term:`non-deterministic` if they're called reentrantly in a + multithreaded program. + + For locks, Python's :class:`threading.RLock` (reentrant lock) is + reentrant, meaning a thread that already holds the lock can acquire it + again without blocking. In contrast, :class:`threading.Lock` is not + reentrant - attempting to acquire it twice from the same thread will cause + a deadlock. + + See also :term:`lock` and :term:`deadlock`. + REPL An acronym for the "read–eval–print loop", another name for the :term:`interactive` interpreter shell. @@ -1340,6 +1471,18 @@ Glossary See also :term:`borrowed reference`. + synchronization primitive + A basic building block for coordinating (synchronizing) the execution of + multiple threads to ensure :term:`thread-safe` access to shared resources. 
+ Python's :mod:`threading` module provides several synchronization primitives + including :class:`~threading.Lock`, :class:`~threading.RLock`, + :class:`~threading.Semaphore`, :class:`~threading.Condition`, + :class:`~threading.Event`, and :class:`~threading.Barrier`. Additionally, + the :mod:`queue` module provides multi-producer, multi-consumer queues + that are especially useful in multithreaded programs. These + primitives help prevent :term:`race conditions ` and + coordinate thread execution. See also :term:`lock`. + t-string t-strings String literals prefixed with ``t`` or ``T`` are commonly called @@ -1392,6 +1535,19 @@ Glossary See :ref:`Thread State and the Global Interpreter Lock ` for more information. + thread-safe + A module, function, or class that behaves correctly when used by multiple + threads concurrently. Thread-safe code uses appropriate + :term:`synchronization primitives ` like + :term:`locks ` to protect shared mutable state, or is designed + to avoid shared mutable state entirely. In the + :term:`free-threaded ` build, built-in types like + :class:`dict`, :class:`list`, and :class:`set` use internal locking + to make many operations thread-safe, although thread safety is not + necessarily guaranteed. Code that is not thread-safe may experience + :term:`race conditions ` and :term:`data races ` + when used in multi-threaded programs. 
+ token A small unit of source code, generated by the diff --git a/Doc/includes/capi-extension/spammodule-01.c b/Doc/includes/capi-extension/spammodule-01.c new file mode 100644 index 00000000000..86c9840359d --- /dev/null +++ b/Doc/includes/capi-extension/spammodule-01.c @@ -0,0 +1,55 @@ +/* This file needs to be kept in sync with the tutorial + * at Doc/extending/first-extension-module.rst + */ + +/// Includes + +#include <Python.h> +#include <stdlib.h> // for system() + +/// Implementation of spam.system + +static PyObject * +spam_system(PyObject *self, PyObject *arg) +{ + const char *command = PyUnicode_AsUTF8(arg); + if (command == NULL) { + return NULL; + } + int status = system(command); + PyObject *result = PyLong_FromLong(status); + return result; +} + +/// Module method table + +static PyMethodDef spam_methods[] = { + { + .ml_name="system", + .ml_meth=spam_system, + .ml_flags=METH_O, + .ml_doc="Execute a shell command.", + }, + {NULL, NULL, 0, NULL} /* Sentinel */ +}; + +/// Module slot table + +static PyModuleDef_Slot spam_slots[] = { + {Py_mod_name, "spam"}, + {Py_mod_doc, "A wonderful module with an example function"}, + {Py_mod_methods, spam_methods}, + {0, NULL} +}; + +/// Export hook prototype + +PyMODEXPORT_FUNC PyModExport_spam(void); + +/// Module export hook + +PyMODEXPORT_FUNC +PyModExport_spam(void) +{ + return spam_slots; +} diff --git a/Doc/library/ast.rst b/Doc/library/ast.rst index 2e7d0dbc26e..bf37540e5fa 100644 --- a/Doc/library/ast.rst +++ b/Doc/library/ast.rst @@ -139,12 +139,13 @@ Node classes The :meth:`~object.__repr__` output of :class:`~ast.AST` nodes includes the values of the node fields. -.. deprecated:: 3.8 +.. deprecated-removed:: 3.8 3.14 - Old classes :class:`!ast.Num`, :class:`!ast.Str`, :class:`!ast.Bytes`, - :class:`!ast.NameConstant` and :class:`!ast.Ellipsis` are still available, - but they will be removed in future Python releases. In the meantime, - instantiating them will return an instance of a different class.
+ Previous versions of Python provided the AST classes :class:`!ast.Num`, + :class:`!ast.Str`, :class:`!ast.Bytes`, :class:`!ast.NameConstant` and + :class:`!ast.Ellipsis`, which were deprecated in Python 3.8. These classes + were removed in Python 3.14, and their functionality has been replaced with + :class:`ast.Constant`. .. deprecated:: 3.9 @@ -2419,12 +2420,12 @@ and classes for traversing abstract syntax trees: during traversal. For this a special visitor exists (:class:`NodeTransformer`) that allows modifications. - .. deprecated:: 3.8 + .. deprecated-removed:: 3.8 3.14 Methods :meth:`!visit_Num`, :meth:`!visit_Str`, :meth:`!visit_Bytes`, - :meth:`!visit_NameConstant` and :meth:`!visit_Ellipsis` are deprecated - now and will not be called in future Python versions. Add the - :meth:`visit_Constant` method to handle all constant nodes. + :meth:`!visit_NameConstant` and :meth:`!visit_Ellipsis` will not be called + in Python 3.14+. Add the :meth:`visit_Constant` method instead to handle + all constant nodes. .. class:: NodeTransformer() diff --git a/Doc/library/asyncio-queue.rst b/Doc/library/asyncio-queue.rst index d481a1921d5..a9735ae8065 100644 --- a/Doc/library/asyncio-queue.rst +++ b/Doc/library/asyncio-queue.rst @@ -107,7 +107,7 @@ Queue The queue can no longer grow. Future calls to :meth:`~Queue.put` raise :exc:`QueueShutDown`. Currently blocked callers of :meth:`~Queue.put` will be unblocked - and will raise :exc:`QueueShutDown` in the formerly blocked thread. + and will raise :exc:`QueueShutDown` in the formerly awaiting task. 
If *immediate* is false (the default), the queue can be wound down normally with :meth:`~Queue.get` calls to extract tasks diff --git a/Doc/library/datetime.rst b/Doc/library/datetime.rst index 8ae1c1fb9e4..48e7080da6c 100644 --- a/Doc/library/datetime.rst +++ b/Doc/library/datetime.rst @@ -2651,9 +2651,42 @@ Broadly speaking, ``d.strftime(fmt)`` acts like the :mod:`time` module's ``time.strftime(fmt, d.timetuple())`` although not all objects support a :meth:`~date.timetuple` method. -For the :meth:`.datetime.strptime` class method, the default value is -``1900-01-01T00:00:00.000``: any components not specified in the format string -will be pulled from the default value. [#]_ +For the :meth:`.datetime.strptime` and :meth:`.date.strptime` class methods, +the default value is ``1900-01-01T00:00:00.000``: any components not specified +in the format string will be pulled from the default value. + +.. note:: + When used to parse partial dates lacking a year, :meth:`.datetime.strptime` + and :meth:`.date.strptime` will raise when encountering February 29 because + the default year of 1900 is *not* a leap year. Always add a default leap + year to partial date strings before parsing. + + +.. testsetup:: + + # doctest seems to turn the warning into an error which makes it + # show up and require matching and prevents the actual interesting + # exception from being raised. + # Manually apply the catch_warnings context manager + import warnings + catch_warnings = warnings.catch_warnings() + catch_warnings.__enter__() + warnings.simplefilter("ignore") + +.. testcleanup:: + + catch_warnings.__exit__() + +.. doctest:: + + >>> from datetime import datetime + >>> value = "2/29" + >>> datetime.strptime(value, "%m/%d") + Traceback (most recent call last): + ... 
+ ValueError: day 29 must be in range 1..28 for month 2 in year 1900 + >>> datetime.strptime(f"1904 {value}", "%Y %m/%d") + datetime.datetime(1904, 2, 29, 0, 0) Using ``datetime.strptime(date_string, format)`` is equivalent to:: @@ -2790,7 +2823,7 @@ Notes: include a year in the format. If the value you need to parse lacks a year, append an explicit dummy leap year. Otherwise your code will raise an exception when it encounters leap day because the default year used by the - parser is not a leap year. Users run into this bug every four years... + parser (1900) is not a leap year. Users run into that bug every leap year. .. doctest:: @@ -2817,5 +2850,3 @@ Notes: .. [#] See R. H. van Gent's `guide to the mathematics of the ISO 8601 calendar `_ for a good explanation. - -.. [#] Passing ``datetime.strptime('Feb 29', '%b %d')`` will fail since 1900 is not a leap year. diff --git a/Doc/library/enum.rst b/Doc/library/enum.rst index a8a7e671aad..0da27ba8e78 100644 --- a/Doc/library/enum.rst +++ b/Doc/library/enum.rst @@ -947,12 +947,13 @@ Utilities and Decorators the member's name. Care must be taken if mixing *auto()* with manually specified values. - *auto* instances are only resolved when at the top level of an assignment: + *auto* instances are only resolved when at the top level of an assignment, either by + itself or as part of a tuple: * ``FIRST = auto()`` will work (auto() is replaced with ``1``); * ``SECOND = auto(), -2`` will work (auto is replaced with ``2``, so ``2, -2`` is used to create the ``SECOND`` enum member; - * ``THREE = [auto(), -3]`` will *not* work (``, -3`` is used to + * ``THREE = [auto(), -3]`` will *not* work (``[, -3]`` is used to create the ``THREE`` enum member) .. versionchanged:: 3.11.1 diff --git a/Doc/library/mmap.rst b/Doc/library/mmap.rst index f32aa322c40..41b90f2c3b3 100644 --- a/Doc/library/mmap.rst +++ b/Doc/library/mmap.rst @@ -328,6 +328,17 @@ To map anonymous memory, -1 should be passed as the fileno along with the length .. 
versionadded:: 3.13 + .. method:: set_name(name, /) + + Annotate the memory mapping with the given *name* for easier identification + in ``/proc/<pid>/maps`` if the kernel supports the feature and :option:`-X dev <-X>` is passed + to Python or if Python is built in :ref:`debug mode <debug-build>`. + The length of *name* must not exceed 80 bytes including the ``'\0'`` terminator. + + .. availability:: Linux >= 5.17 (kernel built with ``CONFIG_ANON_VMA_NAME`` option) + + .. versionadded:: next + .. method:: size() Return the length of the file, which can be larger than the size of the diff --git a/Doc/library/pdb.rst b/Doc/library/pdb.rst index 0bbdc425352..8ab3e7ec9ef 100644 --- a/Doc/library/pdb.rst +++ b/Doc/library/pdb.rst @@ -520,7 +520,8 @@ can be overridden by the local file. To remove all commands from a breakpoint, type ``commands`` and follow it immediately with ``end``; that is, give no commands. - With no *bpnumber* argument, ``commands`` refers to the last breakpoint set. + With no *bpnumber* argument, ``commands`` refers to the most recently set + breakpoint that still exists. You can use breakpoint commands to start your program up again. Simply use the :pdbcmd:`continue` command, or :pdbcmd:`step`, diff --git a/Doc/library/profiling.sampling.rst b/Doc/library/profiling.sampling.rst index 1f60e2cb578..41cb254174d 100644 --- a/Doc/library/profiling.sampling.rst +++ b/Doc/library/profiling.sampling.rst @@ -200,6 +200,36 @@ On most systems, attaching to another process requires appropriate permissions. See :ref:`profiling-permissions` for platform-specific requirements. + +.. 
_replay-command: + +The ``replay`` command +---------------------- + +The ``replay`` command converts binary profile files to other output formats:: + + python -m profiling.sampling replay profile.bin + python -m profiling.sampling replay --flamegraph -o profile.html profile.bin + +This command is useful when you have captured profiling data in binary format +and want to analyze it later or convert it to a visualization format. Binary +profiles can be replayed multiple times to different formats without +re-profiling. + +:: + + # Convert binary to pstats (default, prints to stdout) + python -m profiling.sampling replay profile.bin + + # Convert binary to flame graph + python -m profiling.sampling replay --flamegraph -o output.html profile.bin + + # Convert binary to gecko format for Firefox Profiler + python -m profiling.sampling replay --gecko -o profile.json profile.bin + + # Convert binary to heatmap + python -m profiling.sampling replay --heatmap -o my_heatmap profile.bin + + Profiling in production ----------------------- @@ -1041,6 +1071,59 @@ intuitive view that shows exactly where time is spent without requiring interpretation of hierarchical visualizations. +Binary format +------------- + +Binary format (:option:`--binary`) produces a compact binary file for efficient +storage of profiling data:: + + python -m profiling.sampling run --binary -o profile.bin script.py + python -m profiling.sampling attach --binary -o profile.bin 12345 + +The :option:`--compression` option controls data compression: + +- ``auto`` (default): Use zstd compression if available, otherwise no + compression +- ``zstd``: Force zstd compression (requires :mod:`compression.zstd` support) +- ``none``: Disable compression + +:: + + python -m profiling.sampling run --binary --compression=zstd -o profile.bin script.py + +To analyze binary profiles, use the :ref:`replay-command` to convert them to +other formats like flame graphs or pstats output. 
+ + +Record and replay workflow +========================== + +The binary format combined with the replay command enables a record-and-replay +workflow that separates data capture from analysis. Rather than generating +visualizations during profiling, you capture raw data to a compact binary file +and convert it to different formats later. + +This approach has three main benefits: + +- Sampling runs faster because the work of building data structures for + visualization is deferred until replay. +- A single binary capture can be converted to multiple output formats + without re-profiling: pstats for a quick overview, flame graph for visual + exploration, heatmap for line-level detail. +- Binary files are compact and easy to share with colleagues who can convert + them to their preferred format. + +A typical workflow:: + + # Capture profile in production or during tests + python -m profiling.sampling attach --binary -o profile.bin 12345 + + # Later, analyze with different formats + python -m profiling.sampling replay profile.bin + python -m profiling.sampling replay --flamegraph -o profile.html profile.bin + python -m profiling.sampling replay --heatmap -o heatmap profile.bin + + Live mode ========= @@ -1252,6 +1335,10 @@ Global options Attach to and profile a running process by PID. +.. option:: replay + + Convert a binary profile file to another output format. + Sampling options ---------------- @@ -1335,12 +1422,22 @@ Output options Generate HTML heatmap with line-level sample counts. +.. option:: --binary + + Generate high-performance binary format for later conversion with the + ``replay`` command. + +.. option:: --compression + + Compression for binary format: ``auto`` (use zstd if available, default), + ``zstd``, or ``none``. + .. option:: -o , --output Output file or directory path. 
Default behavior varies by format: - ``--pstats`` writes to stdout, ``--flamegraph`` and ``--gecko`` generate - files like ``flamegraph.PID.html``, and ``--heatmap`` creates a directory - named ``heatmap_PID``. + :option:`--pstats` writes to stdout, while other formats generate a file + named ``_.`` (for example, ``flamegraph_12345.html``). + :option:`--heatmap` creates a directory named ``heatmap_``. pstats display options diff --git a/Doc/library/random.rst b/Doc/library/random.rst index 4e55e301b89..6bddf575a80 100644 --- a/Doc/library/random.rst +++ b/Doc/library/random.rst @@ -78,7 +78,7 @@ Bookkeeping functions instead of the system time (see the :func:`os.urandom` function for details on availability). - If *a* is an int, it is used directly. + If *a* is an int, its absolute value is used directly. With version 2 (the default), a :class:`str`, :class:`bytes`, or :class:`bytearray` object gets converted to an :class:`int` and all of its bits are used. diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index f33b73238ec..7eaa9f48ab5 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -46,8 +46,10 @@ Any object can be tested for truth value, for use in an :keyword:`if` or By default, an object is considered true unless its class defines either a :meth:`~object.__bool__` method that returns ``False`` or a :meth:`~object.__len__` method that -returns zero, when called with the object. [1]_ Here are most of the built-in -objects considered false: +returns zero, when called with the object. [1]_ If one of the methods raises an +exception when called, the exception is propagated and the object does +not have a truth value (for example, :data:`NotImplemented`). +Here are most of the built-in objects considered false: .. 
index:: single: None (Built-in object) diff --git a/Doc/tools/extensions/pydoc_topics.py b/Doc/tools/extensions/pydoc_topics.py index 01efbba6283..a65d77433b2 100644 --- a/Doc/tools/extensions/pydoc_topics.py +++ b/Doc/tools/extensions/pydoc_topics.py @@ -109,6 +109,7 @@ class PydocTopicsBuilder(TextBuilder): def init(self) -> None: super().init() self.topics: dict[str, str] = {} + self.module_docs: dict[str, str] = {} def get_outdated_docs(self) -> str: # Return a string describing what an update build will build. @@ -130,6 +131,15 @@ class PydocTopicsBuilder(TextBuilder): continue doc_labels.setdefault(docname, []).append((topic_label, label_id)) + py_domain = env.domains['py'] + for module_name, module_info in py_domain.data['modules'].items(): + docname = module_info[0] + if docname.startswith('library/'): + doc_file = docname.replace('library/', '') + self.module_docs[module_name] = ( + f"{doc_file}#module-{module_name}" + ) + for docname, label_ids in status_iterator( doc_labels.items(), "building topics... ", @@ -161,6 +171,22 @@ topics = {{ """ self.outdir.joinpath("topics.py").write_text(topics, encoding="utf-8") + module_docs_repr = "\n".join( + f" '{module}': '{doc_file}'," + for module, doc_file in sorted(self.module_docs.items()) + ) + module_docs = f"""\ +# Autogenerated by Sphinx on {asctime()} +# as part of the release process. 
+ +module_docs = {{ +{module_docs_repr} +}} +""" + self.outdir.joinpath("module_docs.py").write_text( + module_docs, encoding="utf-8" + ) + def _display_labels(item: tuple[str, Sequence[tuple[str, str]]]) -> str: _docname, label_ids = item diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 24a51f87c0f..aa138c9cacb 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -73,6 +73,7 @@ Summary -- Release highlights ` * :pep:`782`: :ref:`A new PyBytesWriter C API to create a Python bytes object ` +* :ref:`The JIT compiler has been significantly upgraded ` * :ref:`Improved error messages ` @@ -591,6 +592,11 @@ mmap not be duplicated. (Contributed by Serhiy Storchaka in :gh:`78502`.) +* Added the :meth:`mmap.mmap.set_name` method + to annotate an anonymous memory mapping + if Linux kernel supports :manpage:`PR_SET_VMA_ANON_NAME ` (Linux 5.17 or newer). + (Contributed by Donghee Na in :gh:`142419`.) + os -- @@ -843,6 +849,16 @@ zlib Optimizations ============= +* Builds using Visual Studio 2026 (MSVC 18) may now use the new + :ref:`tail-calling interpreter `. + Results on an early experimental MSVC compiler reported roughly 15% speedup + on the geometric mean of pyperformance on Windows x86-64 over + the switch-case interpreter. We have + observed speedups ranging from 15% for large pure-Python libraries + to 40% for long-running small pure-Python scripts on Windows. + (Contributed by Chris Eibl, Ken Jin, and Brandt Bucher in :gh:`143068`. + Special thanks to the MSVC team including Hulon Jenkins.) + csv --- @@ -850,6 +866,91 @@ csv (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.) +.. _whatsnew315-jit: + +Upgraded JIT compiler +===================== + +Results from the `pyperformance `__ +benchmark suite report +`3-4% `__ +geometric mean performance improvement for the JIT over the standard CPython +interpreter built with all optimizations enabled. 
The speedups for JIT +builds versus no JIT builds range from roughly 20% slowdown to over +100% speedup (ignoring the ``unpack_sequence`` microbenchmark) on +x86-64 Linux and AArch64 macOS systems. + +.. attention:: + These results are not yet final. + +The major upgrades to the JIT are: + +* LLVM 21 build-time dependency +* New tracing frontend +* Basic register allocation in the JIT +* More JIT optimizations +* Better machine code generation + +.. rubric:: LLVM 21 build-time dependency + +The JIT compiler now uses LLVM 21 for build-time stencil generation. As +always, LLVM is only needed when building CPython with the JIT enabled; +end users running Python do not need LLVM installed. Instructions for +installing LLVM can be found in the `JIT compiler documentation +`__ +for all supported platforms. + +(Contributed by Savannah Ostrowski in :gh:`140973`.) + +.. rubric:: A new tracing frontend + +The JIT compiler now supports significantly more bytecode operations and +control flow than in Python 3.14, enabling speedups on a wider variety of +code. For example, simple Python object creation is now understood by the +3.15 JIT compiler. Overloaded operations and generators are also partially +supported. This was made possible by an overhauled JIT tracing frontend +that records actual execution paths through code, rather than estimating +them as the previous implementation did. + +(Contributed by Ken Jin in :gh:`139109`. Support for Windows added by +Mark Shannon in :gh:`141703`.) + +.. rubric:: Basic register allocation in the JIT + +A basic form of register allocation has been added to the JIT compiler's +optimizer. This allows the JIT compiler to avoid certain stack operations +altogether and instead operate on registers. This allows the JIT to produce +more efficient traces by avoiding reads and writes to memory. + +(Contributed by Mark Shannon in :gh:`135379`.) + +.. rubric:: More JIT optimizations + +More `constant-propagation `__ +is now performed. 
This means when the JIT compiler detects that certain user +code results in constants, the code can be simplified by the JIT. + +(Contributed by Ken Jin and Savannah Ostrowski in :gh:`132732`.) + +The JIT avoids :term:`reference count`\ s where possible. This generally +reduces the cost of most operations in Python. + +(Contributed by Ken Jin, Donghee Na, Zheao Li, Savannah Ostrowski, +Noam Cohen, Tomas Roun, PuQing in :gh:`134584`.) + +.. rubric:: Better machine code generation + +The JIT compiler's machine code generator now produces better machine code +for x86-64 and AArch64 macOS and Linux targets. In general, users should +experience lower memory usage for generated machine code and more efficient +machine code versus the old JIT. + +(Contributed by Brandt Bucher in :gh:`136528`. +Implementation for AArch64 contributed by Mark Shannon in :gh:`139855`. +Additional optimizations for AArch64 contributed by Mark Shannon and +Diego Russo in :gh:`140683` and :gh:`142305`.) + + +Removed ======= @@ -1018,9 +1119,9 @@ New deprecations * ``__version__`` - * The ``__version__`` attribute has been deprecated in these standard library - modules and will be removed in Python 3.20. - Use :py:data:`sys.version_info` instead. + * The ``__version__``, ``version`` and ``VERSION`` attributes have been + deprecated in these standard library modules and will be removed in + Python 3.20. Use :py:data:`sys.version_info` instead. - :mod:`argparse` - :mod:`csv` @@ -1041,6 +1142,9 @@ New deprecations - :mod:`tkinter.font` - :mod:`tkinter.ttk` - :mod:`wsgiref.simple_server` + - :mod:`xml.etree.ElementTree` + - :mod:`!xml.sax.expatreader` + - :mod:`xml.sax.handler` - :mod:`zlib` (Contributed by Hugo van Kemenade and Stan Ulbrych in :gh:`76007`.)
diff --git a/Include/cpython/pyatomic.h b/Include/cpython/pyatomic.h index 790640309f1..ce907fd6a4c 100644 --- a/Include/cpython/pyatomic.h +++ b/Include/cpython/pyatomic.h @@ -523,6 +523,9 @@ _Py_atomic_store_uintptr_release(uintptr_t *obj, uintptr_t value); static inline void _Py_atomic_store_ssize_release(Py_ssize_t *obj, Py_ssize_t value); +static inline void +_Py_atomic_store_int8_release(int8_t *obj, int8_t value); + static inline void _Py_atomic_store_int_release(int *obj, int value); diff --git a/Include/cpython/pyatomic_gcc.h b/Include/cpython/pyatomic_gcc.h index 1566b83b9f6..c045213c898 100644 --- a/Include/cpython/pyatomic_gcc.h +++ b/Include/cpython/pyatomic_gcc.h @@ -572,6 +572,10 @@ static inline void _Py_atomic_store_int_release(int *obj, int value) { __atomic_store_n(obj, value, __ATOMIC_RELEASE); } +static inline void +_Py_atomic_store_int8_release(int8_t *obj, int8_t value) +{ __atomic_store_n(obj, value, __ATOMIC_RELEASE); } + static inline void _Py_atomic_store_ssize_release(Py_ssize_t *obj, Py_ssize_t value) { __atomic_store_n(obj, value, __ATOMIC_RELEASE); } diff --git a/Include/cpython/pyatomic_msc.h b/Include/cpython/pyatomic_msc.h index d155955df0c..8b9dd3eb0f8 100644 --- a/Include/cpython/pyatomic_msc.h +++ b/Include/cpython/pyatomic_msc.h @@ -1066,6 +1066,19 @@ _Py_atomic_store_int_release(int *obj, int value) #endif } +static inline void +_Py_atomic_store_int8_release(int8_t *obj, int8_t value) +{ +#if defined(_M_X64) || defined(_M_IX86) + *(int8_t volatile *)obj = value; +#elif defined(_M_ARM64) + _Py_atomic_ASSERT_ARG_TYPE(unsigned __int8); + __stlr8((unsigned __int8 volatile *)obj, (unsigned __int8)value); +#else +# error "no implementation of _Py_atomic_store_int8_release" +#endif +} + static inline void _Py_atomic_store_ssize_release(Py_ssize_t *obj, Py_ssize_t value) { diff --git a/Include/cpython/pyatomic_std.h b/Include/cpython/pyatomic_std.h index 7176f667a40..cfc8dbefc63 100644 --- a/Include/cpython/pyatomic_std.h +++ 
b/Include/cpython/pyatomic_std.h @@ -1023,6 +1023,14 @@ _Py_atomic_store_int_release(int *obj, int value) memory_order_release); } +static inline void +_Py_atomic_store_int8_release(int8_t *obj, int8_t value) +{ + _Py_USING_STD; + atomic_store_explicit((_Atomic(int8_t)*)obj, value, + memory_order_release); +} + static inline void _Py_atomic_store_uint_release(unsigned int *obj, unsigned int value) { diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index af53f2e7d6f..c6c82038d7c 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -123,7 +123,7 @@ _PyEval_EvalFrame(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwfl #ifdef _Py_TIER2 #ifdef _Py_JIT -_Py_CODEUNIT *_Py_LazyJitTrampoline( +_Py_CODEUNIT *_Py_LazyJitShim( struct _PyExecutorObject *current_executor, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate ); @@ -415,6 +415,17 @@ _Py_VectorCall_StackRefSteal( int total_args, _PyStackRef kwnames); +PyAPI_FUNC(PyObject*) +_Py_VectorCallInstrumentation_StackRefSteal( + _PyStackRef callable, + _PyStackRef* arguments, + int total_args, + _PyStackRef kwnames, + bool call_instrumentation, + _PyInterpreterFrame* frame, + _Py_CODEUNIT* this_instr, + PyThreadState* tstate); + PyAPI_FUNC(PyObject *) _Py_BuiltinCallFast_StackRefSteal( _PyStackRef callable, @@ -464,6 +475,11 @@ _Py_assert_within_stack_bounds( _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, const char *filename, int lineno); +// Like PyMapping_GetOptionalItem, but returns the PyObject* instead of taking +// it as an out parameter. This helps MSVC's escape analysis when used with +// tail calling. 
+PyAPI_FUNC(PyObject*) _PyMapping_GetOptionalItem2(PyObject* obj, PyObject* key, int* err); + #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_dict.h b/Include/internal/pycore_dict.h index 1193f496da1..a7005a3b8e2 100644 --- a/Include/internal/pycore_dict.h +++ b/Include/internal/pycore_dict.h @@ -272,8 +272,7 @@ _PyDict_SendEvent(int watcher_bits, PyObject *value); static inline void -_PyDict_NotifyEvent(PyInterpreterState *interp, - PyDict_WatchEvent event, +_PyDict_NotifyEvent(PyDict_WatchEvent event, PyDictObject *mp, PyObject *key, PyObject *value) diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 56bc003ac3e..e625bf2fef1 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -1653,9 +1653,11 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_varnames)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(code)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(col_offset)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(collector)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(command)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(comment_factory)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compile_mode)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compression)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(config)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(consts)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(context)); @@ -1718,7 +1720,9 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(event)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(eventmask)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_tb)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_type)); + 
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_val)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_value)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(excepthook)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exception)); @@ -1974,6 +1978,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(print_file_and_line)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(priority)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_callback)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_routine)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(proto)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(protocol)); @@ -2014,6 +2019,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reversed)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(rounding)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(salt)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sample_interval_us)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sched_priority)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(scheduler)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(script)); @@ -2053,8 +2059,10 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(spam)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src_dir_fd)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stack_frames)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stacklevel)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start_time_us)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(statement)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stats)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(status)); @@ -2095,6 +2103,7 @@ 
_PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(times)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timespec)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timestamp)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timestamp_us)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timetuple)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timeunit)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(top)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index 8be948b92ec..771f0f8cb4a 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -376,9 +376,11 @@ struct _Py_global_strings { STRUCT_FOR_ID(co_varnames) STRUCT_FOR_ID(code) STRUCT_FOR_ID(col_offset) + STRUCT_FOR_ID(collector) STRUCT_FOR_ID(command) STRUCT_FOR_ID(comment_factory) STRUCT_FOR_ID(compile_mode) + STRUCT_FOR_ID(compression) STRUCT_FOR_ID(config) STRUCT_FOR_ID(consts) STRUCT_FOR_ID(context) @@ -441,7 +443,9 @@ struct _Py_global_strings { STRUCT_FOR_ID(event) STRUCT_FOR_ID(eventmask) STRUCT_FOR_ID(exc) + STRUCT_FOR_ID(exc_tb) STRUCT_FOR_ID(exc_type) + STRUCT_FOR_ID(exc_val) STRUCT_FOR_ID(exc_value) STRUCT_FOR_ID(excepthook) STRUCT_FOR_ID(exception) @@ -697,6 +701,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(print_file_and_line) STRUCT_FOR_ID(priority) STRUCT_FOR_ID(progress) + STRUCT_FOR_ID(progress_callback) STRUCT_FOR_ID(progress_routine) STRUCT_FOR_ID(proto) STRUCT_FOR_ID(protocol) @@ -737,6 +742,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(reversed) STRUCT_FOR_ID(rounding) STRUCT_FOR_ID(salt) + STRUCT_FOR_ID(sample_interval_us) STRUCT_FOR_ID(sched_priority) STRUCT_FOR_ID(scheduler) STRUCT_FOR_ID(script) @@ -776,8 +782,10 @@ struct _Py_global_strings { STRUCT_FOR_ID(spam) STRUCT_FOR_ID(src) STRUCT_FOR_ID(src_dir_fd) + STRUCT_FOR_ID(stack_frames) STRUCT_FOR_ID(stacklevel) STRUCT_FOR_ID(start) + STRUCT_FOR_ID(start_time_us) 
STRUCT_FOR_ID(statement) STRUCT_FOR_ID(stats) STRUCT_FOR_ID(status) @@ -818,6 +826,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(times) STRUCT_FOR_ID(timespec) STRUCT_FOR_ID(timestamp) + STRUCT_FOR_ID(timestamp_us) STRUCT_FOR_ID(timetuple) STRUCT_FOR_ID(timeunit) STRUCT_FOR_ID(top) diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index 6b3d5711b92..818c4f15959 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -947,7 +947,6 @@ struct _is { struct _PyExecutorObject *executor_deletion_list_head; struct _PyExecutorObject *cold_executor; struct _PyExecutorObject *cold_dynamic_executor; - int executor_deletion_list_remaining_capacity; size_t executor_creation_counter; _rare_events rare_events; PyDict_WatchCallback builtins_dict_watcher; diff --git a/Include/internal/pycore_mmap.h b/Include/internal/pycore_mmap.h index 214fd4362a5..897816db010 100644 --- a/Include/internal/pycore_mmap.h +++ b/Include/internal/pycore_mmap.h @@ -17,25 +17,27 @@ extern "C" { #endif #if defined(HAVE_PR_SET_VMA_ANON_NAME) && defined(__linux__) -static inline void +static inline int _PyAnnotateMemoryMap(void *addr, size_t size, const char *name) { #ifndef Py_DEBUG if (!_Py_GetConfig()->dev_mode) { - return; + return 0; } #endif + // The name length cannot exceed 80 (including the '\0'). 
assert(strlen(name) < 80); - int old_errno = errno; - prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (unsigned long)addr, size, name); - /* Ignore errno from prctl */ - /* See: https://bugzilla.redhat.com/show_bug.cgi?id=2302746 */ - errno = old_errno; + int res = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (unsigned long)addr, size, name); + if (res < 0) { + return -1; + } + return 0; } #else -static inline void +static inline int _PyAnnotateMemoryMap(void *Py_UNUSED(addr), size_t Py_UNUSED(size), const char *Py_UNUSED(name)) { + return 0; } #endif diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index ce8a26c551b..e0d2e2a3c43 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1081,7 +1081,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[267] = { [BINARY_OP] = { true, INSTR_FMT_IBC0000, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, [BINARY_OP_ADD_FLOAT] = { true, INSTR_FMT_IXC0000, HAS_EXIT_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG }, [BINARY_OP_ADD_INT] = { true, INSTR_FMT_IXC0000, HAS_EXIT_FLAG }, - [BINARY_OP_ADD_UNICODE] = { true, INSTR_FMT_IXC0000, HAS_EXIT_FLAG | HAS_ERROR_FLAG }, + [BINARY_OP_ADD_UNICODE] = { true, INSTR_FMT_IXC0000, HAS_EXIT_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG }, [BINARY_OP_EXTEND] = { true, INSTR_FMT_IXC0000, HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, [BINARY_OP_INPLACE_ADD_UNICODE] = { true, INSTR_FMT_IXC0000, HAS_LOCAL_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [BINARY_OP_MULTIPLY_FLOAT] = { true, INSTR_FMT_IXC0000, HAS_EXIT_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG }, @@ -1331,16 +1331,16 @@ _PyOpcode_macro_expansion[256] = { [BINARY_OP] = { .nuops = 1, .uops = { { _BINARY_OP, OPARG_SIMPLE, 4 } } }, [BINARY_OP_ADD_FLOAT] = { .nuops = 5, .uops = { { _GUARD_TOS_FLOAT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_FLOAT, OPARG_SIMPLE, 0 }, { _BINARY_OP_ADD_FLOAT, 
OPARG_SIMPLE, 5 }, { _POP_TOP_FLOAT, OPARG_SIMPLE, 5 }, { _POP_TOP_FLOAT, OPARG_SIMPLE, 5 } } }, [BINARY_OP_ADD_INT] = { .nuops = 5, .uops = { { _GUARD_TOS_INT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_INT, OPARG_SIMPLE, 0 }, { _BINARY_OP_ADD_INT, OPARG_SIMPLE, 5 }, { _POP_TOP_INT, OPARG_SIMPLE, 5 }, { _POP_TOP_INT, OPARG_SIMPLE, 5 } } }, - [BINARY_OP_ADD_UNICODE] = { .nuops = 3, .uops = { { _GUARD_TOS_UNICODE, OPARG_SIMPLE, 0 }, { _GUARD_NOS_UNICODE, OPARG_SIMPLE, 0 }, { _BINARY_OP_ADD_UNICODE, OPARG_SIMPLE, 5 } } }, + [BINARY_OP_ADD_UNICODE] = { .nuops = 5, .uops = { { _GUARD_TOS_UNICODE, OPARG_SIMPLE, 0 }, { _GUARD_NOS_UNICODE, OPARG_SIMPLE, 0 }, { _BINARY_OP_ADD_UNICODE, OPARG_SIMPLE, 5 }, { _POP_TOP_UNICODE, OPARG_SIMPLE, 5 }, { _POP_TOP_UNICODE, OPARG_SIMPLE, 5 } } }, [BINARY_OP_EXTEND] = { .nuops = 2, .uops = { { _GUARD_BINARY_OP_EXTEND, 4, 1 }, { _BINARY_OP_EXTEND, 4, 1 } } }, [BINARY_OP_INPLACE_ADD_UNICODE] = { .nuops = 3, .uops = { { _GUARD_TOS_UNICODE, OPARG_SIMPLE, 0 }, { _GUARD_NOS_UNICODE, OPARG_SIMPLE, 0 }, { _BINARY_OP_INPLACE_ADD_UNICODE, OPARG_SIMPLE, 5 } } }, [BINARY_OP_MULTIPLY_FLOAT] = { .nuops = 5, .uops = { { _GUARD_TOS_FLOAT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_FLOAT, OPARG_SIMPLE, 0 }, { _BINARY_OP_MULTIPLY_FLOAT, OPARG_SIMPLE, 5 }, { _POP_TOP_FLOAT, OPARG_SIMPLE, 5 }, { _POP_TOP_FLOAT, OPARG_SIMPLE, 5 } } }, [BINARY_OP_MULTIPLY_INT] = { .nuops = 5, .uops = { { _GUARD_TOS_INT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_INT, OPARG_SIMPLE, 0 }, { _BINARY_OP_MULTIPLY_INT, OPARG_SIMPLE, 5 }, { _POP_TOP_INT, OPARG_SIMPLE, 5 }, { _POP_TOP_INT, OPARG_SIMPLE, 5 } } }, [BINARY_OP_SUBSCR_DICT] = { .nuops = 2, .uops = { { _GUARD_NOS_DICT, OPARG_SIMPLE, 0 }, { _BINARY_OP_SUBSCR_DICT, OPARG_SIMPLE, 5 } } }, [BINARY_OP_SUBSCR_GETITEM] = { .nuops = 4, .uops = { { _CHECK_PEP_523, OPARG_SIMPLE, 5 }, { _BINARY_OP_SUBSCR_CHECK_FUNC, OPARG_SIMPLE, 5 }, { _BINARY_OP_SUBSCR_INIT_CALL, OPARG_SIMPLE, 5 }, { _PUSH_FRAME, OPARG_SIMPLE, 5 } } }, - [BINARY_OP_SUBSCR_LIST_INT] = { .nuops = 3, 
.uops = { { _GUARD_TOS_INT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_LIST, OPARG_SIMPLE, 0 }, { _BINARY_OP_SUBSCR_LIST_INT, OPARG_SIMPLE, 5 } } }, + [BINARY_OP_SUBSCR_LIST_INT] = { .nuops = 5, .uops = { { _GUARD_TOS_INT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_LIST, OPARG_SIMPLE, 0 }, { _BINARY_OP_SUBSCR_LIST_INT, OPARG_SIMPLE, 5 }, { _POP_TOP_INT, OPARG_SIMPLE, 5 }, { _POP_TOP, OPARG_SIMPLE, 5 } } }, [BINARY_OP_SUBSCR_LIST_SLICE] = { .nuops = 3, .uops = { { _GUARD_TOS_SLICE, OPARG_SIMPLE, 0 }, { _GUARD_NOS_LIST, OPARG_SIMPLE, 0 }, { _BINARY_OP_SUBSCR_LIST_SLICE, OPARG_SIMPLE, 5 } } }, - [BINARY_OP_SUBSCR_STR_INT] = { .nuops = 3, .uops = { { _GUARD_TOS_INT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_UNICODE, OPARG_SIMPLE, 0 }, { _BINARY_OP_SUBSCR_STR_INT, OPARG_SIMPLE, 5 } } }, + [BINARY_OP_SUBSCR_STR_INT] = { .nuops = 5, .uops = { { _GUARD_TOS_INT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_UNICODE, OPARG_SIMPLE, 0 }, { _BINARY_OP_SUBSCR_STR_INT, OPARG_SIMPLE, 5 }, { _POP_TOP_INT, OPARG_SIMPLE, 5 }, { _POP_TOP, OPARG_SIMPLE, 5 } } }, [BINARY_OP_SUBSCR_TUPLE_INT] = { .nuops = 3, .uops = { { _GUARD_TOS_INT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_TUPLE, OPARG_SIMPLE, 0 }, { _BINARY_OP_SUBSCR_TUPLE_INT, OPARG_SIMPLE, 5 } } }, [BINARY_OP_SUBTRACT_FLOAT] = { .nuops = 5, .uops = { { _GUARD_TOS_FLOAT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_FLOAT, OPARG_SIMPLE, 0 }, { _BINARY_OP_SUBTRACT_FLOAT, OPARG_SIMPLE, 5 }, { _POP_TOP_FLOAT, OPARG_SIMPLE, 5 }, { _POP_TOP_FLOAT, OPARG_SIMPLE, 5 } } }, [BINARY_OP_SUBTRACT_INT] = { .nuops = 5, .uops = { { _GUARD_TOS_INT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_INT, OPARG_SIMPLE, 0 }, { _BINARY_OP_SUBTRACT_INT, OPARG_SIMPLE, 5 }, { _POP_TOP_INT, OPARG_SIMPLE, 5 }, { _POP_TOP_INT, OPARG_SIMPLE, 5 } } }, @@ -1425,7 +1425,7 @@ _PyOpcode_macro_expansion[256] = { [LOAD_ATTR] = { .nuops = 1, .uops = { { _LOAD_ATTR, OPARG_SIMPLE, 8 } } }, [LOAD_ATTR_CLASS] = { .nuops = 3, .uops = { { _CHECK_ATTR_CLASS, 2, 1 }, { _LOAD_ATTR_CLASS, 4, 5 }, { _PUSH_NULL_CONDITIONAL, OPARG_SIMPLE, 9 } } }, 
[LOAD_ATTR_CLASS_WITH_METACLASS_CHECK] = { .nuops = 4, .uops = { { _CHECK_ATTR_CLASS, 2, 1 }, { _GUARD_TYPE_VERSION, 2, 3 }, { _LOAD_ATTR_CLASS, 4, 5 }, { _PUSH_NULL_CONDITIONAL, OPARG_SIMPLE, 9 } } }, - [LOAD_ATTR_INSTANCE_VALUE] = { .nuops = 4, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _CHECK_MANAGED_OBJECT_HAS_VALUES, OPARG_SIMPLE, 3 }, { _LOAD_ATTR_INSTANCE_VALUE, 1, 3 }, { _PUSH_NULL_CONDITIONAL, OPARG_SIMPLE, 9 } } }, + [LOAD_ATTR_INSTANCE_VALUE] = { .nuops = 5, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _CHECK_MANAGED_OBJECT_HAS_VALUES, OPARG_SIMPLE, 3 }, { _LOAD_ATTR_INSTANCE_VALUE, 1, 3 }, { _POP_TOP, OPARG_SIMPLE, 4 }, { _PUSH_NULL_CONDITIONAL, OPARG_SIMPLE, 9 } } }, [LOAD_ATTR_METHOD_LAZY_DICT] = { .nuops = 3, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _CHECK_ATTR_METHOD_LAZY_DICT, 1, 3 }, { _LOAD_ATTR_METHOD_LAZY_DICT, 4, 5 } } }, [LOAD_ATTR_METHOD_NO_DICT] = { .nuops = 2, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _LOAD_ATTR_METHOD_NO_DICT, 4, 5 } } }, [LOAD_ATTR_METHOD_WITH_VALUES] = { .nuops = 4, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT, OPARG_SIMPLE, 3 }, { _GUARD_KEYS_VERSION, 2, 3 }, { _LOAD_ATTR_METHOD_WITH_VALUES, 4, 5 } } }, @@ -1434,7 +1434,7 @@ _PyOpcode_macro_expansion[256] = { [LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES] = { .nuops = 4, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT, OPARG_SIMPLE, 3 }, { _GUARD_KEYS_VERSION, 2, 3 }, { _LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES, 4, 5 } } }, [LOAD_ATTR_PROPERTY] = { .nuops = 5, .uops = { { _CHECK_PEP_523, OPARG_SIMPLE, 1 }, { _GUARD_TYPE_VERSION, 2, 1 }, { _LOAD_ATTR_PROPERTY_FRAME, 4, 5 }, { _SAVE_RETURN_OFFSET, OPARG_SAVE_RETURN_OFFSET, 9 }, { _PUSH_FRAME, OPARG_SIMPLE, 9 } } }, [LOAD_ATTR_SLOT] = { .nuops = 3, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _LOAD_ATTR_SLOT, 1, 3 }, { _PUSH_NULL_CONDITIONAL, OPARG_SIMPLE, 9 } } }, - [LOAD_ATTR_WITH_HINT] = { .nuops = 3, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { 
_LOAD_ATTR_WITH_HINT, 1, 3 }, { _PUSH_NULL_CONDITIONAL, OPARG_SIMPLE, 9 } } }, + [LOAD_ATTR_WITH_HINT] = { .nuops = 4, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _LOAD_ATTR_WITH_HINT, 1, 3 }, { _POP_TOP, OPARG_SIMPLE, 4 }, { _PUSH_NULL_CONDITIONAL, OPARG_SIMPLE, 9 } } }, [LOAD_BUILD_CLASS] = { .nuops = 1, .uops = { { _LOAD_BUILD_CLASS, OPARG_SIMPLE, 0 } } }, [LOAD_COMMON_CONSTANT] = { .nuops = 1, .uops = { { _LOAD_COMMON_CONSTANT, OPARG_SIMPLE, 0 } } }, [LOAD_CONST] = { .nuops = 1, .uops = { { _LOAD_CONST, OPARG_SIMPLE, 0 } } }, @@ -1484,7 +1484,7 @@ _PyOpcode_macro_expansion[256] = { [STORE_ATTR] = { .nuops = 1, .uops = { { _STORE_ATTR, OPARG_SIMPLE, 3 } } }, [STORE_ATTR_INSTANCE_VALUE] = { .nuops = 4, .uops = { { _GUARD_TYPE_VERSION_AND_LOCK, 2, 1 }, { _GUARD_DORV_NO_DICT, OPARG_SIMPLE, 3 }, { _STORE_ATTR_INSTANCE_VALUE, 1, 3 }, { _POP_TOP, OPARG_SIMPLE, 4 } } }, [STORE_ATTR_SLOT] = { .nuops = 3, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _STORE_ATTR_SLOT, 1, 3 }, { _POP_TOP, OPARG_SIMPLE, 4 } } }, - [STORE_ATTR_WITH_HINT] = { .nuops = 2, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _STORE_ATTR_WITH_HINT, 1, 3 } } }, + [STORE_ATTR_WITH_HINT] = { .nuops = 3, .uops = { { _GUARD_TYPE_VERSION, 2, 1 }, { _STORE_ATTR_WITH_HINT, 1, 3 }, { _POP_TOP, OPARG_SIMPLE, 4 } } }, [STORE_DEREF] = { .nuops = 1, .uops = { { _STORE_DEREF, OPARG_SIMPLE, 0 } } }, [STORE_FAST] = { .nuops = 1, .uops = { { _STORE_FAST, OPARG_SIMPLE, 0 } } }, [STORE_FAST_LOAD_FAST] = { .nuops = 2, .uops = { { _STORE_FAST, OPARG_TOP, 0 }, { _LOAD_FAST, OPARG_BOTTOM, 0 } } }, diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 295d4909e14..3ee62f17283 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -25,7 +25,6 @@ typedef struct { uint8_t opcode; uint8_t oparg; uint8_t valid; - uint8_t linked; uint8_t chain_depth; // Must be big enough for MAX_CHAIN_DEPTH - 1. 
bool warm; int32_t index; // Index of ENTER_EXECUTOR (if code isn't NULL, below). @@ -55,11 +54,6 @@ typedef struct _PyExecutorObject { _PyExitData exits[1]; } _PyExecutorObject; -/* If pending deletion list gets large enough, then scan, - * and free any executors that aren't executing - * i.e. any that aren't a thread's current_executor. */ -#define EXECUTOR_DELETE_LIST_MAX 100 - // Export for '_opcode' shared extension (JIT compiler). PyAPI_FUNC(_PyExecutorObject*) _Py_GetExecutor(PyCodeObject *code, int offset); @@ -80,7 +74,6 @@ PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp); #else # define _Py_Executors_InvalidateDependency(A, B, C) ((void)0) # define _Py_Executors_InvalidateAll(A, B) ((void)0) -# define _Py_Executors_InvalidateCold(A) ((void)0) #endif diff --git a/Include/internal/pycore_parser.h b/Include/internal/pycore_parser.h index 2c46f59ab7d..b89d02035db 100644 --- a/Include/internal/pycore_parser.h +++ b/Include/internal/pycore_parser.h @@ -14,21 +14,6 @@ extern "C" { #include "pycore_pyarena.h" // PyArena _Py_DECLARE_STR(empty, "") -#if defined(Py_DEBUG) && defined(Py_GIL_DISABLED) -#define _parser_runtime_state_INIT \ - { \ - .mutex = {0}, \ - .dummy_name = { \ - .kind = Name_kind, \ - .v.Name.id = &_Py_STR(empty), \ - .v.Name.ctx = Load, \ - .lineno = 1, \ - .col_offset = 0, \ - .end_lineno = 1, \ - .end_col_offset = 0, \ - }, \ - } -#else #define _parser_runtime_state_INIT \ { \ .dummy_name = { \ @@ -41,7 +26,6 @@ _Py_DECLARE_STR(empty, "") .end_col_offset = 0, \ }, \ } -#endif extern struct _mod* _PyParser_ASTFromString( const char *str, diff --git a/Include/internal/pycore_pyatomic_ft_wrappers.h b/Include/internal/pycore_pyatomic_ft_wrappers.h index 1a6d5075361..70a32db663b 100644 --- a/Include/internal/pycore_pyatomic_ft_wrappers.h +++ b/Include/internal/pycore_pyatomic_ft_wrappers.h @@ -41,6 +41,8 @@ extern "C" { _Py_atomic_load_uint8(&value) #define FT_ATOMIC_STORE_UINT8(value, new_value) \ 
_Py_atomic_store_uint8(&value, new_value) +#define FT_ATOMIC_LOAD_INT8_RELAXED(value) \ + _Py_atomic_load_int8_relaxed(&value) #define FT_ATOMIC_LOAD_UINT8_RELAXED(value) \ _Py_atomic_load_uint8_relaxed(&value) #define FT_ATOMIC_LOAD_UINT16_RELAXED(value) \ @@ -55,6 +57,10 @@ extern "C" { _Py_atomic_store_ptr_release(&value, new_value) #define FT_ATOMIC_STORE_UINTPTR_RELEASE(value, new_value) \ _Py_atomic_store_uintptr_release(&value, new_value) +#define FT_ATOMIC_STORE_INT8_RELAXED(value, new_value) \ + _Py_atomic_store_int8_relaxed(&value, new_value) +#define FT_ATOMIC_STORE_INT8_RELEASE(value, new_value) \ + _Py_atomic_store_int8_release(&value, new_value) #define FT_ATOMIC_STORE_SSIZE_RELAXED(value, new_value) \ _Py_atomic_store_ssize_relaxed(&value, new_value) #define FT_ATOMIC_STORE_SSIZE_RELEASE(value, new_value) \ @@ -134,6 +140,7 @@ extern "C" { #define FT_ATOMIC_LOAD_PTR_RELAXED(value) value #define FT_ATOMIC_LOAD_UINT8(value) value #define FT_ATOMIC_STORE_UINT8(value, new_value) value = new_value +#define FT_ATOMIC_LOAD_INT8_RELAXED(value) value #define FT_ATOMIC_LOAD_UINT8_RELAXED(value) value #define FT_ATOMIC_LOAD_UINT16_RELAXED(value) value #define FT_ATOMIC_LOAD_UINT32_RELAXED(value) value @@ -141,6 +148,8 @@ extern "C" { #define FT_ATOMIC_STORE_PTR_RELAXED(value, new_value) value = new_value #define FT_ATOMIC_STORE_PTR_RELEASE(value, new_value) value = new_value #define FT_ATOMIC_STORE_UINTPTR_RELEASE(value, new_value) value = new_value +#define FT_ATOMIC_STORE_INT8_RELAXED(value, new_value) value = new_value +#define FT_ATOMIC_STORE_INT8_RELEASE(value, new_value) value = new_value #define FT_ATOMIC_STORE_SSIZE_RELAXED(value, new_value) value = new_value #define FT_ATOMIC_STORE_SSIZE_RELEASE(value, new_value) value = new_value #define FT_ATOMIC_STORE_UINT8_RELAXED(value, new_value) value = new_value diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index d381fb9d2d4..499a2569b9a 100644 
--- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -1651,9 +1651,11 @@ extern "C" { INIT_ID(co_varnames), \ INIT_ID(code), \ INIT_ID(col_offset), \ + INIT_ID(collector), \ INIT_ID(command), \ INIT_ID(comment_factory), \ INIT_ID(compile_mode), \ + INIT_ID(compression), \ INIT_ID(config), \ INIT_ID(consts), \ INIT_ID(context), \ @@ -1716,7 +1718,9 @@ extern "C" { INIT_ID(event), \ INIT_ID(eventmask), \ INIT_ID(exc), \ + INIT_ID(exc_tb), \ INIT_ID(exc_type), \ + INIT_ID(exc_val), \ INIT_ID(exc_value), \ INIT_ID(excepthook), \ INIT_ID(exception), \ @@ -1972,6 +1976,7 @@ extern "C" { INIT_ID(print_file_and_line), \ INIT_ID(priority), \ INIT_ID(progress), \ + INIT_ID(progress_callback), \ INIT_ID(progress_routine), \ INIT_ID(proto), \ INIT_ID(protocol), \ @@ -2012,6 +2017,7 @@ extern "C" { INIT_ID(reversed), \ INIT_ID(rounding), \ INIT_ID(salt), \ + INIT_ID(sample_interval_us), \ INIT_ID(sched_priority), \ INIT_ID(scheduler), \ INIT_ID(script), \ @@ -2051,8 +2057,10 @@ extern "C" { INIT_ID(spam), \ INIT_ID(src), \ INIT_ID(src_dir_fd), \ + INIT_ID(stack_frames), \ INIT_ID(stacklevel), \ INIT_ID(start), \ + INIT_ID(start_time_us), \ INIT_ID(statement), \ INIT_ID(stats), \ INIT_ID(status), \ @@ -2093,6 +2101,7 @@ extern "C" { INIT_ID(times), \ INIT_ID(timespec), \ INIT_ID(timestamp), \ + INIT_ID(timestamp_us), \ INIT_ID(timetuple), \ INIT_ID(timeunit), \ INIT_ID(top), \ diff --git a/Include/internal/pycore_runtime_structs.h b/Include/internal/pycore_runtime_structs.h index 995f49e78dc..92387031ad7 100644 --- a/Include/internal/pycore_runtime_structs.h +++ b/Include/internal/pycore_runtime_structs.h @@ -77,9 +77,7 @@ struct _fileutils_state { struct _parser_runtime_state { #ifdef Py_DEBUG long memo_statistics[_PYPEGEN_NSTATISTICS]; -#ifdef Py_GIL_DISABLED PyMutex mutex; -#endif #else int _not_used; #endif diff --git a/Include/internal/pycore_tracemalloc.h b/Include/internal/pycore_tracemalloc.h index 
693385f9a46..9974ea3c414 100644 --- a/Include/internal/pycore_tracemalloc.h +++ b/Include/internal/pycore_tracemalloc.h @@ -21,7 +21,10 @@ struct _PyTraceMalloc_Config { } initialized; /* Is tracemalloc tracing memory allocations? - Variable protected by the TABLES_LOCK(). */ + Variable protected by the TABLES_LOCK() and stored atomically. + Atomic store is used so that it can read without locking for the + general case of checking if tracemalloc is enabled. + */ int tracing; /* limit of the number of frames in a traceback, 1 by default. diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index c4f723ac8ab..a57f1f45c13 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -82,6 +82,13 @@ typedef struct _PyThreadStateImpl { PyObject *asyncio_running_loop; // Strong reference PyObject *asyncio_running_task; // Strong reference + // Distinguishes between yield and return from PyEval_EvalFrame(). + // See gen_send_ex2() in Objects/genobject.c + enum { + GENERATOR_RETURN = 0, + GENERATOR_YIELD = 1, + } generator_return_kind; + /* Head of circular linked-list of all tasks which are instances of `asyncio.Task` or subclasses of it used in `asyncio.all_tasks`. 
*/ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index 24e50828935..1375f46018f 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1284,6 +1284,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(collector); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(command); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -1296,6 +1300,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(compression); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(config); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -1544,10 +1552,18 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(exc_tb); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(exc_type); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(exc_val); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = 
&_Py_ID(exc_value); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2568,6 +2584,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(progress_callback); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(progress_routine); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2728,6 +2748,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(sample_interval_us); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(sched_priority); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2884,6 +2908,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(stack_frames); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(stacklevel); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -2892,6 +2920,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(start_time_us); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); 
+ assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(statement); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); @@ -3052,6 +3084,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(timestamp_us); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(timetuple); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index df623f49b0d..c8aa2765a34 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -314,61 +314,61 @@ extern "C" { #define _SET_ADD SET_ADD #define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE #define _SET_UPDATE SET_UPDATE -#define _SPILL_OR_RELOAD 524 -#define _START_EXECUTOR 525 -#define _STORE_ATTR 526 -#define _STORE_ATTR_INSTANCE_VALUE 527 -#define _STORE_ATTR_SLOT 528 -#define _STORE_ATTR_WITH_HINT 529 +#define _SHUFFLE_3_LOAD_CONST_INLINE_BORROW 524 +#define _SPILL_OR_RELOAD 525 +#define _START_EXECUTOR 526 +#define _STORE_ATTR 527 +#define _STORE_ATTR_INSTANCE_VALUE 528 +#define _STORE_ATTR_SLOT 529 +#define _STORE_ATTR_WITH_HINT 530 #define _STORE_DEREF STORE_DEREF -#define _STORE_FAST 530 -#define _STORE_FAST_0 531 -#define _STORE_FAST_1 532 -#define _STORE_FAST_2 533 -#define _STORE_FAST_3 534 -#define _STORE_FAST_4 535 -#define _STORE_FAST_5 536 -#define _STORE_FAST_6 537 -#define _STORE_FAST_7 538 +#define _STORE_FAST 531 +#define _STORE_FAST_0 532 +#define _STORE_FAST_1 533 +#define _STORE_FAST_2 534 +#define _STORE_FAST_3 535 +#define _STORE_FAST_4 536 +#define _STORE_FAST_5 537 +#define _STORE_FAST_6 538 +#define _STORE_FAST_7 539 #define _STORE_GLOBAL STORE_GLOBAL 
#define _STORE_NAME STORE_NAME -#define _STORE_SLICE 539 -#define _STORE_SUBSCR 540 -#define _STORE_SUBSCR_DICT 541 -#define _STORE_SUBSCR_LIST_INT 542 -#define _SWAP 543 -#define _SWAP_2 544 -#define _SWAP_3 545 -#define _TIER2_RESUME_CHECK 546 -#define _TO_BOOL 547 +#define _STORE_SLICE 540 +#define _STORE_SUBSCR 541 +#define _STORE_SUBSCR_DICT 542 +#define _STORE_SUBSCR_LIST_INT 543 +#define _SWAP 544 +#define _SWAP_2 545 +#define _SWAP_3 546 +#define _TIER2_RESUME_CHECK 547 +#define _TO_BOOL 548 #define _TO_BOOL_BOOL TO_BOOL_BOOL #define _TO_BOOL_INT TO_BOOL_INT -#define _TO_BOOL_LIST 548 +#define _TO_BOOL_LIST 549 #define _TO_BOOL_NONE TO_BOOL_NONE -#define _TO_BOOL_STR 549 +#define _TO_BOOL_STR 550 #define _TRACE_RECORD TRACE_RECORD #define _UNARY_INVERT UNARY_INVERT #define _UNARY_NEGATIVE UNARY_NEGATIVE #define _UNARY_NOT UNARY_NOT #define _UNPACK_EX UNPACK_EX -#define _UNPACK_SEQUENCE 550 -#define _UNPACK_SEQUENCE_LIST 551 -#define _UNPACK_SEQUENCE_TUPLE 552 -#define _UNPACK_SEQUENCE_TWO_TUPLE 553 +#define _UNPACK_SEQUENCE 551 +#define _UNPACK_SEQUENCE_LIST 552 +#define _UNPACK_SEQUENCE_TUPLE 553 +#define _UNPACK_SEQUENCE_TWO_TUPLE 554 #define _WITH_EXCEPT_START WITH_EXCEPT_START #define _YIELD_VALUE YIELD_VALUE -#define MAX_UOP_ID 553 -#define _BINARY_OP_r21 554 -#define _BINARY_OP_ADD_FLOAT_r03 555 -#define _BINARY_OP_ADD_FLOAT_r13 556 -#define _BINARY_OP_ADD_FLOAT_r23 557 -#define _BINARY_OP_ADD_INT_r03 558 -#define _BINARY_OP_ADD_INT_r13 559 -#define _BINARY_OP_ADD_INT_r23 560 -#define _BINARY_OP_ADD_UNICODE_r01 561 -#define _BINARY_OP_ADD_UNICODE_r11 562 -#define _BINARY_OP_ADD_UNICODE_r21 563 -#define _BINARY_OP_ADD_UNICODE_r32 564 +#define MAX_UOP_ID 554 +#define _BINARY_OP_r21 555 +#define _BINARY_OP_ADD_FLOAT_r03 556 +#define _BINARY_OP_ADD_FLOAT_r13 557 +#define _BINARY_OP_ADD_FLOAT_r23 558 +#define _BINARY_OP_ADD_INT_r03 559 +#define _BINARY_OP_ADD_INT_r13 560 +#define _BINARY_OP_ADD_INT_r23 561 +#define _BINARY_OP_ADD_UNICODE_r03 562 +#define 
_BINARY_OP_ADD_UNICODE_r13 563 +#define _BINARY_OP_ADD_UNICODE_r23 564 #define _BINARY_OP_EXTEND_r21 565 #define _BINARY_OP_INPLACE_ADD_UNICODE_r20 566 #define _BINARY_OP_MULTIPLY_FLOAT_r03 567 @@ -383,9 +383,9 @@ extern "C" { #define _BINARY_OP_SUBSCR_INIT_CALL_r11 576 #define _BINARY_OP_SUBSCR_INIT_CALL_r21 577 #define _BINARY_OP_SUBSCR_INIT_CALL_r31 578 -#define _BINARY_OP_SUBSCR_LIST_INT_r21 579 +#define _BINARY_OP_SUBSCR_LIST_INT_r23 579 #define _BINARY_OP_SUBSCR_LIST_SLICE_r21 580 -#define _BINARY_OP_SUBSCR_STR_INT_r21 581 +#define _BINARY_OP_SUBSCR_STR_INT_r23 581 #define _BINARY_OP_SUBSCR_TUPLE_INT_r21 582 #define _BINARY_OP_SUBTRACT_FLOAT_r03 583 #define _BINARY_OP_SUBTRACT_FLOAT_r13 584 @@ -535,566 +535,574 @@ extern "C" { #define _FORMAT_SIMPLE_r11 728 #define _FORMAT_WITH_SPEC_r21 729 #define _FOR_ITER_r23 730 -#define _FOR_ITER_GEN_FRAME_r23 731 -#define _FOR_ITER_TIER_TWO_r23 732 -#define _GET_AITER_r11 733 -#define _GET_ANEXT_r12 734 -#define _GET_AWAITABLE_r11 735 -#define _GET_ITER_r12 736 -#define _GET_LEN_r12 737 -#define _GET_YIELD_FROM_ITER_r11 738 -#define _GUARD_BINARY_OP_EXTEND_r22 739 -#define _GUARD_CALLABLE_ISINSTANCE_r03 740 -#define _GUARD_CALLABLE_ISINSTANCE_r13 741 -#define _GUARD_CALLABLE_ISINSTANCE_r23 742 -#define _GUARD_CALLABLE_ISINSTANCE_r33 743 -#define _GUARD_CALLABLE_LEN_r03 744 -#define _GUARD_CALLABLE_LEN_r13 745 -#define _GUARD_CALLABLE_LEN_r23 746 -#define _GUARD_CALLABLE_LEN_r33 747 -#define _GUARD_CALLABLE_LIST_APPEND_r03 748 -#define _GUARD_CALLABLE_LIST_APPEND_r13 749 -#define _GUARD_CALLABLE_LIST_APPEND_r23 750 -#define _GUARD_CALLABLE_LIST_APPEND_r33 751 -#define _GUARD_CALLABLE_STR_1_r03 752 -#define _GUARD_CALLABLE_STR_1_r13 753 -#define _GUARD_CALLABLE_STR_1_r23 754 -#define _GUARD_CALLABLE_STR_1_r33 755 -#define _GUARD_CALLABLE_TUPLE_1_r03 756 -#define _GUARD_CALLABLE_TUPLE_1_r13 757 -#define _GUARD_CALLABLE_TUPLE_1_r23 758 -#define _GUARD_CALLABLE_TUPLE_1_r33 759 -#define _GUARD_CALLABLE_TYPE_1_r03 760 -#define 
_GUARD_CALLABLE_TYPE_1_r13 761 -#define _GUARD_CALLABLE_TYPE_1_r23 762 -#define _GUARD_CALLABLE_TYPE_1_r33 763 -#define _GUARD_DORV_NO_DICT_r01 764 -#define _GUARD_DORV_NO_DICT_r11 765 -#define _GUARD_DORV_NO_DICT_r22 766 -#define _GUARD_DORV_NO_DICT_r33 767 -#define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT_r01 768 -#define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT_r11 769 -#define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT_r22 770 -#define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT_r33 771 -#define _GUARD_GLOBALS_VERSION_r00 772 -#define _GUARD_GLOBALS_VERSION_r11 773 -#define _GUARD_GLOBALS_VERSION_r22 774 -#define _GUARD_GLOBALS_VERSION_r33 775 -#define _GUARD_IP_RETURN_GENERATOR_r00 776 -#define _GUARD_IP_RETURN_GENERATOR_r11 777 -#define _GUARD_IP_RETURN_GENERATOR_r22 778 -#define _GUARD_IP_RETURN_GENERATOR_r33 779 -#define _GUARD_IP_RETURN_VALUE_r00 780 -#define _GUARD_IP_RETURN_VALUE_r11 781 -#define _GUARD_IP_RETURN_VALUE_r22 782 -#define _GUARD_IP_RETURN_VALUE_r33 783 -#define _GUARD_IP_YIELD_VALUE_r00 784 -#define _GUARD_IP_YIELD_VALUE_r11 785 -#define _GUARD_IP_YIELD_VALUE_r22 786 -#define _GUARD_IP_YIELD_VALUE_r33 787 -#define _GUARD_IP__PUSH_FRAME_r00 788 -#define _GUARD_IP__PUSH_FRAME_r11 789 -#define _GUARD_IP__PUSH_FRAME_r22 790 -#define _GUARD_IP__PUSH_FRAME_r33 791 -#define _GUARD_IS_FALSE_POP_r00 792 -#define _GUARD_IS_FALSE_POP_r10 793 -#define _GUARD_IS_FALSE_POP_r21 794 -#define _GUARD_IS_FALSE_POP_r32 795 -#define _GUARD_IS_NONE_POP_r00 796 -#define _GUARD_IS_NONE_POP_r10 797 -#define _GUARD_IS_NONE_POP_r21 798 -#define _GUARD_IS_NONE_POP_r32 799 -#define _GUARD_IS_NOT_NONE_POP_r10 800 -#define _GUARD_IS_TRUE_POP_r00 801 -#define _GUARD_IS_TRUE_POP_r10 802 -#define _GUARD_IS_TRUE_POP_r21 803 -#define _GUARD_IS_TRUE_POP_r32 804 -#define _GUARD_KEYS_VERSION_r01 805 -#define _GUARD_KEYS_VERSION_r11 806 -#define _GUARD_KEYS_VERSION_r22 807 -#define _GUARD_KEYS_VERSION_r33 808 -#define _GUARD_NOS_DICT_r02 809 -#define _GUARD_NOS_DICT_r12 810 -#define 
_GUARD_NOS_DICT_r22 811 -#define _GUARD_NOS_DICT_r33 812 -#define _GUARD_NOS_FLOAT_r02 813 -#define _GUARD_NOS_FLOAT_r12 814 -#define _GUARD_NOS_FLOAT_r22 815 -#define _GUARD_NOS_FLOAT_r33 816 -#define _GUARD_NOS_INT_r02 817 -#define _GUARD_NOS_INT_r12 818 -#define _GUARD_NOS_INT_r22 819 -#define _GUARD_NOS_INT_r33 820 -#define _GUARD_NOS_LIST_r02 821 -#define _GUARD_NOS_LIST_r12 822 -#define _GUARD_NOS_LIST_r22 823 -#define _GUARD_NOS_LIST_r33 824 -#define _GUARD_NOS_NOT_NULL_r02 825 -#define _GUARD_NOS_NOT_NULL_r12 826 -#define _GUARD_NOS_NOT_NULL_r22 827 -#define _GUARD_NOS_NOT_NULL_r33 828 -#define _GUARD_NOS_NULL_r02 829 -#define _GUARD_NOS_NULL_r12 830 -#define _GUARD_NOS_NULL_r22 831 -#define _GUARD_NOS_NULL_r33 832 -#define _GUARD_NOS_OVERFLOWED_r02 833 -#define _GUARD_NOS_OVERFLOWED_r12 834 -#define _GUARD_NOS_OVERFLOWED_r22 835 -#define _GUARD_NOS_OVERFLOWED_r33 836 -#define _GUARD_NOS_TUPLE_r02 837 -#define _GUARD_NOS_TUPLE_r12 838 -#define _GUARD_NOS_TUPLE_r22 839 -#define _GUARD_NOS_TUPLE_r33 840 -#define _GUARD_NOS_UNICODE_r02 841 -#define _GUARD_NOS_UNICODE_r12 842 -#define _GUARD_NOS_UNICODE_r22 843 -#define _GUARD_NOS_UNICODE_r33 844 -#define _GUARD_NOT_EXHAUSTED_LIST_r02 845 -#define _GUARD_NOT_EXHAUSTED_LIST_r12 846 -#define _GUARD_NOT_EXHAUSTED_LIST_r22 847 -#define _GUARD_NOT_EXHAUSTED_LIST_r33 848 -#define _GUARD_NOT_EXHAUSTED_RANGE_r02 849 -#define _GUARD_NOT_EXHAUSTED_RANGE_r12 850 -#define _GUARD_NOT_EXHAUSTED_RANGE_r22 851 -#define _GUARD_NOT_EXHAUSTED_RANGE_r33 852 -#define _GUARD_NOT_EXHAUSTED_TUPLE_r02 853 -#define _GUARD_NOT_EXHAUSTED_TUPLE_r12 854 -#define _GUARD_NOT_EXHAUSTED_TUPLE_r22 855 -#define _GUARD_NOT_EXHAUSTED_TUPLE_r33 856 -#define _GUARD_THIRD_NULL_r03 857 -#define _GUARD_THIRD_NULL_r13 858 -#define _GUARD_THIRD_NULL_r23 859 -#define _GUARD_THIRD_NULL_r33 860 -#define _GUARD_TOS_ANY_SET_r01 861 -#define _GUARD_TOS_ANY_SET_r11 862 -#define _GUARD_TOS_ANY_SET_r22 863 -#define _GUARD_TOS_ANY_SET_r33 864 -#define 
_GUARD_TOS_DICT_r01 865 -#define _GUARD_TOS_DICT_r11 866 -#define _GUARD_TOS_DICT_r22 867 -#define _GUARD_TOS_DICT_r33 868 -#define _GUARD_TOS_FLOAT_r01 869 -#define _GUARD_TOS_FLOAT_r11 870 -#define _GUARD_TOS_FLOAT_r22 871 -#define _GUARD_TOS_FLOAT_r33 872 -#define _GUARD_TOS_INT_r01 873 -#define _GUARD_TOS_INT_r11 874 -#define _GUARD_TOS_INT_r22 875 -#define _GUARD_TOS_INT_r33 876 -#define _GUARD_TOS_LIST_r01 877 -#define _GUARD_TOS_LIST_r11 878 -#define _GUARD_TOS_LIST_r22 879 -#define _GUARD_TOS_LIST_r33 880 -#define _GUARD_TOS_OVERFLOWED_r01 881 -#define _GUARD_TOS_OVERFLOWED_r11 882 -#define _GUARD_TOS_OVERFLOWED_r22 883 -#define _GUARD_TOS_OVERFLOWED_r33 884 -#define _GUARD_TOS_SLICE_r01 885 -#define _GUARD_TOS_SLICE_r11 886 -#define _GUARD_TOS_SLICE_r22 887 -#define _GUARD_TOS_SLICE_r33 888 -#define _GUARD_TOS_TUPLE_r01 889 -#define _GUARD_TOS_TUPLE_r11 890 -#define _GUARD_TOS_TUPLE_r22 891 -#define _GUARD_TOS_TUPLE_r33 892 -#define _GUARD_TOS_UNICODE_r01 893 -#define _GUARD_TOS_UNICODE_r11 894 -#define _GUARD_TOS_UNICODE_r22 895 -#define _GUARD_TOS_UNICODE_r33 896 -#define _GUARD_TYPE_VERSION_r01 897 -#define _GUARD_TYPE_VERSION_r11 898 -#define _GUARD_TYPE_VERSION_r22 899 -#define _GUARD_TYPE_VERSION_r33 900 -#define _GUARD_TYPE_VERSION_AND_LOCK_r01 901 -#define _GUARD_TYPE_VERSION_AND_LOCK_r11 902 -#define _GUARD_TYPE_VERSION_AND_LOCK_r22 903 -#define _GUARD_TYPE_VERSION_AND_LOCK_r33 904 -#define _HANDLE_PENDING_AND_DEOPT_r00 905 -#define _HANDLE_PENDING_AND_DEOPT_r10 906 -#define _HANDLE_PENDING_AND_DEOPT_r20 907 -#define _HANDLE_PENDING_AND_DEOPT_r30 908 -#define _IMPORT_FROM_r12 909 -#define _IMPORT_NAME_r21 910 -#define _INIT_CALL_BOUND_METHOD_EXACT_ARGS_r00 911 -#define _INIT_CALL_PY_EXACT_ARGS_r01 912 -#define _INIT_CALL_PY_EXACT_ARGS_0_r01 913 -#define _INIT_CALL_PY_EXACT_ARGS_1_r01 914 -#define _INIT_CALL_PY_EXACT_ARGS_2_r01 915 -#define _INIT_CALL_PY_EXACT_ARGS_3_r01 916 -#define _INIT_CALL_PY_EXACT_ARGS_4_r01 917 -#define _INSERT_NULL_r10 918 
-#define _INSTRUMENTED_FOR_ITER_r23 919 -#define _INSTRUMENTED_INSTRUCTION_r00 920 -#define _INSTRUMENTED_JUMP_FORWARD_r00 921 -#define _INSTRUMENTED_JUMP_FORWARD_r11 922 -#define _INSTRUMENTED_JUMP_FORWARD_r22 923 -#define _INSTRUMENTED_JUMP_FORWARD_r33 924 -#define _INSTRUMENTED_LINE_r00 925 -#define _INSTRUMENTED_NOT_TAKEN_r00 926 -#define _INSTRUMENTED_NOT_TAKEN_r11 927 -#define _INSTRUMENTED_NOT_TAKEN_r22 928 -#define _INSTRUMENTED_NOT_TAKEN_r33 929 -#define _INSTRUMENTED_POP_JUMP_IF_FALSE_r00 930 -#define _INSTRUMENTED_POP_JUMP_IF_FALSE_r10 931 -#define _INSTRUMENTED_POP_JUMP_IF_FALSE_r21 932 -#define _INSTRUMENTED_POP_JUMP_IF_FALSE_r32 933 -#define _INSTRUMENTED_POP_JUMP_IF_NONE_r10 934 -#define _INSTRUMENTED_POP_JUMP_IF_NOT_NONE_r10 935 -#define _INSTRUMENTED_POP_JUMP_IF_TRUE_r00 936 -#define _INSTRUMENTED_POP_JUMP_IF_TRUE_r10 937 -#define _INSTRUMENTED_POP_JUMP_IF_TRUE_r21 938 -#define _INSTRUMENTED_POP_JUMP_IF_TRUE_r32 939 -#define _IS_NONE_r11 940 -#define _IS_OP_r21 941 -#define _ITER_CHECK_LIST_r02 942 -#define _ITER_CHECK_LIST_r12 943 -#define _ITER_CHECK_LIST_r22 944 -#define _ITER_CHECK_LIST_r33 945 -#define _ITER_CHECK_RANGE_r02 946 -#define _ITER_CHECK_RANGE_r12 947 -#define _ITER_CHECK_RANGE_r22 948 -#define _ITER_CHECK_RANGE_r33 949 -#define _ITER_CHECK_TUPLE_r02 950 -#define _ITER_CHECK_TUPLE_r12 951 -#define _ITER_CHECK_TUPLE_r22 952 -#define _ITER_CHECK_TUPLE_r33 953 -#define _ITER_JUMP_LIST_r02 954 -#define _ITER_JUMP_LIST_r12 955 -#define _ITER_JUMP_LIST_r22 956 -#define _ITER_JUMP_LIST_r33 957 -#define _ITER_JUMP_RANGE_r02 958 -#define _ITER_JUMP_RANGE_r12 959 -#define _ITER_JUMP_RANGE_r22 960 -#define _ITER_JUMP_RANGE_r33 961 -#define _ITER_JUMP_TUPLE_r02 962 -#define _ITER_JUMP_TUPLE_r12 963 -#define _ITER_JUMP_TUPLE_r22 964 -#define _ITER_JUMP_TUPLE_r33 965 -#define _ITER_NEXT_LIST_r23 966 -#define _ITER_NEXT_LIST_TIER_TWO_r23 967 -#define _ITER_NEXT_RANGE_r03 968 -#define _ITER_NEXT_RANGE_r13 969 -#define _ITER_NEXT_RANGE_r23 970 
-#define _ITER_NEXT_TUPLE_r03 971 -#define _ITER_NEXT_TUPLE_r13 972 -#define _ITER_NEXT_TUPLE_r23 973 -#define _JUMP_BACKWARD_NO_INTERRUPT_r00 974 -#define _JUMP_BACKWARD_NO_INTERRUPT_r11 975 -#define _JUMP_BACKWARD_NO_INTERRUPT_r22 976 -#define _JUMP_BACKWARD_NO_INTERRUPT_r33 977 -#define _JUMP_TO_TOP_r00 978 -#define _LIST_APPEND_r10 979 -#define _LIST_EXTEND_r10 980 -#define _LOAD_ATTR_r10 981 -#define _LOAD_ATTR_CLASS_r11 982 -#define _LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN_r11 983 -#define _LOAD_ATTR_INSTANCE_VALUE_r11 984 -#define _LOAD_ATTR_METHOD_LAZY_DICT_r02 985 -#define _LOAD_ATTR_METHOD_LAZY_DICT_r12 986 -#define _LOAD_ATTR_METHOD_LAZY_DICT_r23 987 -#define _LOAD_ATTR_METHOD_NO_DICT_r02 988 -#define _LOAD_ATTR_METHOD_NO_DICT_r12 989 -#define _LOAD_ATTR_METHOD_NO_DICT_r23 990 -#define _LOAD_ATTR_METHOD_WITH_VALUES_r02 991 -#define _LOAD_ATTR_METHOD_WITH_VALUES_r12 992 -#define _LOAD_ATTR_METHOD_WITH_VALUES_r23 993 -#define _LOAD_ATTR_MODULE_r11 994 -#define _LOAD_ATTR_NONDESCRIPTOR_NO_DICT_r11 995 -#define _LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES_r11 996 -#define _LOAD_ATTR_PROPERTY_FRAME_r11 997 -#define _LOAD_ATTR_SLOT_r11 998 -#define _LOAD_ATTR_WITH_HINT_r11 999 -#define _LOAD_BUILD_CLASS_r01 1000 -#define _LOAD_BYTECODE_r00 1001 -#define _LOAD_COMMON_CONSTANT_r01 1002 -#define _LOAD_COMMON_CONSTANT_r12 1003 -#define _LOAD_COMMON_CONSTANT_r23 1004 -#define _LOAD_CONST_r01 1005 -#define _LOAD_CONST_r12 1006 -#define _LOAD_CONST_r23 1007 -#define _LOAD_CONST_INLINE_r01 1008 -#define _LOAD_CONST_INLINE_r12 1009 -#define _LOAD_CONST_INLINE_r23 1010 -#define _LOAD_CONST_INLINE_BORROW_r01 1011 -#define _LOAD_CONST_INLINE_BORROW_r12 1012 -#define _LOAD_CONST_INLINE_BORROW_r23 1013 -#define _LOAD_CONST_UNDER_INLINE_r02 1014 -#define _LOAD_CONST_UNDER_INLINE_r12 1015 -#define _LOAD_CONST_UNDER_INLINE_r23 1016 -#define _LOAD_CONST_UNDER_INLINE_BORROW_r02 1017 -#define _LOAD_CONST_UNDER_INLINE_BORROW_r12 1018 -#define _LOAD_CONST_UNDER_INLINE_BORROW_r23 1019 -#define 
_LOAD_DEREF_r01 1020 -#define _LOAD_FAST_r01 1021 -#define _LOAD_FAST_r12 1022 -#define _LOAD_FAST_r23 1023 -#define _LOAD_FAST_0_r01 1024 -#define _LOAD_FAST_0_r12 1025 -#define _LOAD_FAST_0_r23 1026 -#define _LOAD_FAST_1_r01 1027 -#define _LOAD_FAST_1_r12 1028 -#define _LOAD_FAST_1_r23 1029 -#define _LOAD_FAST_2_r01 1030 -#define _LOAD_FAST_2_r12 1031 -#define _LOAD_FAST_2_r23 1032 -#define _LOAD_FAST_3_r01 1033 -#define _LOAD_FAST_3_r12 1034 -#define _LOAD_FAST_3_r23 1035 -#define _LOAD_FAST_4_r01 1036 -#define _LOAD_FAST_4_r12 1037 -#define _LOAD_FAST_4_r23 1038 -#define _LOAD_FAST_5_r01 1039 -#define _LOAD_FAST_5_r12 1040 -#define _LOAD_FAST_5_r23 1041 -#define _LOAD_FAST_6_r01 1042 -#define _LOAD_FAST_6_r12 1043 -#define _LOAD_FAST_6_r23 1044 -#define _LOAD_FAST_7_r01 1045 -#define _LOAD_FAST_7_r12 1046 -#define _LOAD_FAST_7_r23 1047 -#define _LOAD_FAST_AND_CLEAR_r01 1048 -#define _LOAD_FAST_AND_CLEAR_r12 1049 -#define _LOAD_FAST_AND_CLEAR_r23 1050 -#define _LOAD_FAST_BORROW_r01 1051 -#define _LOAD_FAST_BORROW_r12 1052 -#define _LOAD_FAST_BORROW_r23 1053 -#define _LOAD_FAST_BORROW_0_r01 1054 -#define _LOAD_FAST_BORROW_0_r12 1055 -#define _LOAD_FAST_BORROW_0_r23 1056 -#define _LOAD_FAST_BORROW_1_r01 1057 -#define _LOAD_FAST_BORROW_1_r12 1058 -#define _LOAD_FAST_BORROW_1_r23 1059 -#define _LOAD_FAST_BORROW_2_r01 1060 -#define _LOAD_FAST_BORROW_2_r12 1061 -#define _LOAD_FAST_BORROW_2_r23 1062 -#define _LOAD_FAST_BORROW_3_r01 1063 -#define _LOAD_FAST_BORROW_3_r12 1064 -#define _LOAD_FAST_BORROW_3_r23 1065 -#define _LOAD_FAST_BORROW_4_r01 1066 -#define _LOAD_FAST_BORROW_4_r12 1067 -#define _LOAD_FAST_BORROW_4_r23 1068 -#define _LOAD_FAST_BORROW_5_r01 1069 -#define _LOAD_FAST_BORROW_5_r12 1070 -#define _LOAD_FAST_BORROW_5_r23 1071 -#define _LOAD_FAST_BORROW_6_r01 1072 -#define _LOAD_FAST_BORROW_6_r12 1073 -#define _LOAD_FAST_BORROW_6_r23 1074 -#define _LOAD_FAST_BORROW_7_r01 1075 -#define _LOAD_FAST_BORROW_7_r12 1076 -#define _LOAD_FAST_BORROW_7_r23 1077 -#define 
_LOAD_FAST_BORROW_LOAD_FAST_BORROW_r02 1078 -#define _LOAD_FAST_BORROW_LOAD_FAST_BORROW_r13 1079 -#define _LOAD_FAST_CHECK_r01 1080 -#define _LOAD_FAST_CHECK_r12 1081 -#define _LOAD_FAST_CHECK_r23 1082 -#define _LOAD_FAST_LOAD_FAST_r02 1083 -#define _LOAD_FAST_LOAD_FAST_r13 1084 -#define _LOAD_FROM_DICT_OR_DEREF_r11 1085 -#define _LOAD_FROM_DICT_OR_GLOBALS_r11 1086 -#define _LOAD_GLOBAL_r00 1087 -#define _LOAD_GLOBAL_BUILTINS_r01 1088 -#define _LOAD_GLOBAL_MODULE_r01 1089 -#define _LOAD_LOCALS_r01 1090 -#define _LOAD_LOCALS_r12 1091 -#define _LOAD_LOCALS_r23 1092 -#define _LOAD_NAME_r01 1093 -#define _LOAD_SMALL_INT_r01 1094 -#define _LOAD_SMALL_INT_r12 1095 -#define _LOAD_SMALL_INT_r23 1096 -#define _LOAD_SMALL_INT_0_r01 1097 -#define _LOAD_SMALL_INT_0_r12 1098 -#define _LOAD_SMALL_INT_0_r23 1099 -#define _LOAD_SMALL_INT_1_r01 1100 -#define _LOAD_SMALL_INT_1_r12 1101 -#define _LOAD_SMALL_INT_1_r23 1102 -#define _LOAD_SMALL_INT_2_r01 1103 -#define _LOAD_SMALL_INT_2_r12 1104 -#define _LOAD_SMALL_INT_2_r23 1105 -#define _LOAD_SMALL_INT_3_r01 1106 -#define _LOAD_SMALL_INT_3_r12 1107 -#define _LOAD_SMALL_INT_3_r23 1108 -#define _LOAD_SPECIAL_r00 1109 -#define _LOAD_SUPER_ATTR_ATTR_r31 1110 -#define _LOAD_SUPER_ATTR_METHOD_r32 1111 -#define _MAKE_CALLARGS_A_TUPLE_r33 1112 -#define _MAKE_CELL_r00 1113 -#define _MAKE_FUNCTION_r11 1114 -#define _MAKE_WARM_r00 1115 -#define _MAKE_WARM_r11 1116 -#define _MAKE_WARM_r22 1117 -#define _MAKE_WARM_r33 1118 -#define _MAP_ADD_r20 1119 -#define _MATCH_CLASS_r31 1120 -#define _MATCH_KEYS_r23 1121 -#define _MATCH_MAPPING_r02 1122 -#define _MATCH_MAPPING_r12 1123 -#define _MATCH_MAPPING_r23 1124 -#define _MATCH_SEQUENCE_r02 1125 -#define _MATCH_SEQUENCE_r12 1126 -#define _MATCH_SEQUENCE_r23 1127 -#define _MAYBE_EXPAND_METHOD_r00 1128 -#define _MAYBE_EXPAND_METHOD_KW_r11 1129 -#define _MONITOR_CALL_r00 1130 -#define _MONITOR_CALL_KW_r11 1131 -#define _MONITOR_JUMP_BACKWARD_r00 1132 -#define _MONITOR_JUMP_BACKWARD_r11 1133 -#define 
_MONITOR_JUMP_BACKWARD_r22 1134 -#define _MONITOR_JUMP_BACKWARD_r33 1135 -#define _MONITOR_RESUME_r00 1136 -#define _NOP_r00 1137 -#define _NOP_r11 1138 -#define _NOP_r22 1139 -#define _NOP_r33 1140 -#define _POP_CALL_r20 1141 -#define _POP_CALL_LOAD_CONST_INLINE_BORROW_r21 1142 -#define _POP_CALL_ONE_r30 1143 -#define _POP_CALL_ONE_LOAD_CONST_INLINE_BORROW_r31 1144 -#define _POP_CALL_TWO_r30 1145 -#define _POP_CALL_TWO_LOAD_CONST_INLINE_BORROW_r31 1146 -#define _POP_EXCEPT_r10 1147 -#define _POP_ITER_r20 1148 -#define _POP_JUMP_IF_FALSE_r00 1149 -#define _POP_JUMP_IF_FALSE_r10 1150 -#define _POP_JUMP_IF_FALSE_r21 1151 -#define _POP_JUMP_IF_FALSE_r32 1152 -#define _POP_JUMP_IF_TRUE_r00 1153 -#define _POP_JUMP_IF_TRUE_r10 1154 -#define _POP_JUMP_IF_TRUE_r21 1155 -#define _POP_JUMP_IF_TRUE_r32 1156 -#define _POP_TOP_r10 1157 -#define _POP_TOP_FLOAT_r00 1158 -#define _POP_TOP_FLOAT_r10 1159 -#define _POP_TOP_FLOAT_r21 1160 -#define _POP_TOP_FLOAT_r32 1161 -#define _POP_TOP_INT_r00 1162 -#define _POP_TOP_INT_r10 1163 -#define _POP_TOP_INT_r21 1164 -#define _POP_TOP_INT_r32 1165 -#define _POP_TOP_LOAD_CONST_INLINE_r11 1166 -#define _POP_TOP_LOAD_CONST_INLINE_BORROW_r11 1167 -#define _POP_TOP_NOP_r00 1168 -#define _POP_TOP_NOP_r10 1169 -#define _POP_TOP_NOP_r21 1170 -#define _POP_TOP_NOP_r32 1171 -#define _POP_TOP_UNICODE_r00 1172 -#define _POP_TOP_UNICODE_r10 1173 -#define _POP_TOP_UNICODE_r21 1174 -#define _POP_TOP_UNICODE_r32 1175 -#define _POP_TWO_r20 1176 -#define _POP_TWO_LOAD_CONST_INLINE_BORROW_r21 1177 -#define _PUSH_EXC_INFO_r02 1178 -#define _PUSH_EXC_INFO_r12 1179 -#define _PUSH_EXC_INFO_r23 1180 -#define _PUSH_FRAME_r10 1181 -#define _PUSH_NULL_r01 1182 -#define _PUSH_NULL_r12 1183 -#define _PUSH_NULL_r23 1184 -#define _PUSH_NULL_CONDITIONAL_r00 1185 -#define _PY_FRAME_GENERAL_r01 1186 -#define _PY_FRAME_KW_r11 1187 -#define _QUICKEN_RESUME_r00 1188 -#define _QUICKEN_RESUME_r11 1189 -#define _QUICKEN_RESUME_r22 1190 -#define _QUICKEN_RESUME_r33 1191 -#define 
_REPLACE_WITH_TRUE_r11 1192 -#define _RESUME_CHECK_r00 1193 -#define _RESUME_CHECK_r11 1194 -#define _RESUME_CHECK_r22 1195 -#define _RESUME_CHECK_r33 1196 -#define _RETURN_GENERATOR_r01 1197 -#define _RETURN_VALUE_r11 1198 -#define _SAVE_RETURN_OFFSET_r00 1199 -#define _SAVE_RETURN_OFFSET_r11 1200 -#define _SAVE_RETURN_OFFSET_r22 1201 -#define _SAVE_RETURN_OFFSET_r33 1202 -#define _SEND_r22 1203 -#define _SEND_GEN_FRAME_r22 1204 -#define _SETUP_ANNOTATIONS_r00 1205 -#define _SET_ADD_r10 1206 -#define _SET_FUNCTION_ATTRIBUTE_r01 1207 -#define _SET_FUNCTION_ATTRIBUTE_r11 1208 -#define _SET_FUNCTION_ATTRIBUTE_r21 1209 -#define _SET_FUNCTION_ATTRIBUTE_r32 1210 -#define _SET_IP_r00 1211 -#define _SET_IP_r11 1212 -#define _SET_IP_r22 1213 -#define _SET_IP_r33 1214 -#define _SET_UPDATE_r10 1215 -#define _SPILL_OR_RELOAD_r01 1216 -#define _SPILL_OR_RELOAD_r02 1217 -#define _SPILL_OR_RELOAD_r03 1218 -#define _SPILL_OR_RELOAD_r10 1219 -#define _SPILL_OR_RELOAD_r12 1220 -#define _SPILL_OR_RELOAD_r13 1221 -#define _SPILL_OR_RELOAD_r20 1222 -#define _SPILL_OR_RELOAD_r21 1223 -#define _SPILL_OR_RELOAD_r23 1224 -#define _SPILL_OR_RELOAD_r30 1225 -#define _SPILL_OR_RELOAD_r31 1226 -#define _SPILL_OR_RELOAD_r32 1227 -#define _START_EXECUTOR_r00 1228 -#define _STORE_ATTR_r20 1229 -#define _STORE_ATTR_INSTANCE_VALUE_r21 1230 -#define _STORE_ATTR_SLOT_r21 1231 -#define _STORE_ATTR_WITH_HINT_r20 1232 -#define _STORE_DEREF_r10 1233 -#define _STORE_FAST_r10 1234 -#define _STORE_FAST_0_r10 1235 -#define _STORE_FAST_1_r10 1236 -#define _STORE_FAST_2_r10 1237 -#define _STORE_FAST_3_r10 1238 -#define _STORE_FAST_4_r10 1239 -#define _STORE_FAST_5_r10 1240 -#define _STORE_FAST_6_r10 1241 -#define _STORE_FAST_7_r10 1242 -#define _STORE_FAST_LOAD_FAST_r11 1243 -#define _STORE_FAST_STORE_FAST_r20 1244 -#define _STORE_GLOBAL_r10 1245 -#define _STORE_NAME_r10 1246 -#define _STORE_SLICE_r30 1247 -#define _STORE_SUBSCR_r30 1248 -#define _STORE_SUBSCR_DICT_r31 1249 -#define _STORE_SUBSCR_LIST_INT_r32 
1250 -#define _SWAP_r11 1251 -#define _SWAP_2_r02 1252 -#define _SWAP_2_r12 1253 -#define _SWAP_2_r22 1254 -#define _SWAP_2_r33 1255 -#define _SWAP_3_r03 1256 -#define _SWAP_3_r13 1257 -#define _SWAP_3_r23 1258 -#define _SWAP_3_r33 1259 -#define _TIER2_RESUME_CHECK_r00 1260 -#define _TIER2_RESUME_CHECK_r11 1261 -#define _TIER2_RESUME_CHECK_r22 1262 -#define _TIER2_RESUME_CHECK_r33 1263 -#define _TO_BOOL_r11 1264 -#define _TO_BOOL_BOOL_r01 1265 -#define _TO_BOOL_BOOL_r11 1266 -#define _TO_BOOL_BOOL_r22 1267 -#define _TO_BOOL_BOOL_r33 1268 -#define _TO_BOOL_INT_r11 1269 -#define _TO_BOOL_LIST_r11 1270 -#define _TO_BOOL_NONE_r01 1271 -#define _TO_BOOL_NONE_r11 1272 -#define _TO_BOOL_NONE_r22 1273 -#define _TO_BOOL_NONE_r33 1274 -#define _TO_BOOL_STR_r11 1275 -#define _TRACE_RECORD_r00 1276 -#define _UNARY_INVERT_r11 1277 -#define _UNARY_NEGATIVE_r11 1278 -#define _UNARY_NOT_r01 1279 -#define _UNARY_NOT_r11 1280 -#define _UNARY_NOT_r22 1281 -#define _UNARY_NOT_r33 1282 -#define _UNPACK_EX_r10 1283 -#define _UNPACK_SEQUENCE_r10 1284 -#define _UNPACK_SEQUENCE_LIST_r10 1285 -#define _UNPACK_SEQUENCE_TUPLE_r10 1286 -#define _UNPACK_SEQUENCE_TWO_TUPLE_r12 1287 -#define _WITH_EXCEPT_START_r33 1288 -#define _YIELD_VALUE_r11 1289 -#define MAX_UOP_REGS_ID 1289 +#define _FOR_ITER_GEN_FRAME_r03 731 +#define _FOR_ITER_GEN_FRAME_r13 732 +#define _FOR_ITER_GEN_FRAME_r23 733 +#define _FOR_ITER_TIER_TWO_r23 734 +#define _GET_AITER_r11 735 +#define _GET_ANEXT_r12 736 +#define _GET_AWAITABLE_r11 737 +#define _GET_ITER_r12 738 +#define _GET_LEN_r12 739 +#define _GET_YIELD_FROM_ITER_r11 740 +#define _GUARD_BINARY_OP_EXTEND_r22 741 +#define _GUARD_CALLABLE_ISINSTANCE_r03 742 +#define _GUARD_CALLABLE_ISINSTANCE_r13 743 +#define _GUARD_CALLABLE_ISINSTANCE_r23 744 +#define _GUARD_CALLABLE_ISINSTANCE_r33 745 +#define _GUARD_CALLABLE_LEN_r03 746 +#define _GUARD_CALLABLE_LEN_r13 747 +#define _GUARD_CALLABLE_LEN_r23 748 +#define _GUARD_CALLABLE_LEN_r33 749 +#define _GUARD_CALLABLE_LIST_APPEND_r03 
750 +#define _GUARD_CALLABLE_LIST_APPEND_r13 751 +#define _GUARD_CALLABLE_LIST_APPEND_r23 752 +#define _GUARD_CALLABLE_LIST_APPEND_r33 753 +#define _GUARD_CALLABLE_STR_1_r03 754 +#define _GUARD_CALLABLE_STR_1_r13 755 +#define _GUARD_CALLABLE_STR_1_r23 756 +#define _GUARD_CALLABLE_STR_1_r33 757 +#define _GUARD_CALLABLE_TUPLE_1_r03 758 +#define _GUARD_CALLABLE_TUPLE_1_r13 759 +#define _GUARD_CALLABLE_TUPLE_1_r23 760 +#define _GUARD_CALLABLE_TUPLE_1_r33 761 +#define _GUARD_CALLABLE_TYPE_1_r03 762 +#define _GUARD_CALLABLE_TYPE_1_r13 763 +#define _GUARD_CALLABLE_TYPE_1_r23 764 +#define _GUARD_CALLABLE_TYPE_1_r33 765 +#define _GUARD_DORV_NO_DICT_r01 766 +#define _GUARD_DORV_NO_DICT_r11 767 +#define _GUARD_DORV_NO_DICT_r22 768 +#define _GUARD_DORV_NO_DICT_r33 769 +#define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT_r01 770 +#define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT_r11 771 +#define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT_r22 772 +#define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT_r33 773 +#define _GUARD_GLOBALS_VERSION_r00 774 +#define _GUARD_GLOBALS_VERSION_r11 775 +#define _GUARD_GLOBALS_VERSION_r22 776 +#define _GUARD_GLOBALS_VERSION_r33 777 +#define _GUARD_IP_RETURN_GENERATOR_r00 778 +#define _GUARD_IP_RETURN_GENERATOR_r11 779 +#define _GUARD_IP_RETURN_GENERATOR_r22 780 +#define _GUARD_IP_RETURN_GENERATOR_r33 781 +#define _GUARD_IP_RETURN_VALUE_r00 782 +#define _GUARD_IP_RETURN_VALUE_r11 783 +#define _GUARD_IP_RETURN_VALUE_r22 784 +#define _GUARD_IP_RETURN_VALUE_r33 785 +#define _GUARD_IP_YIELD_VALUE_r00 786 +#define _GUARD_IP_YIELD_VALUE_r11 787 +#define _GUARD_IP_YIELD_VALUE_r22 788 +#define _GUARD_IP_YIELD_VALUE_r33 789 +#define _GUARD_IP__PUSH_FRAME_r00 790 +#define _GUARD_IP__PUSH_FRAME_r11 791 +#define _GUARD_IP__PUSH_FRAME_r22 792 +#define _GUARD_IP__PUSH_FRAME_r33 793 +#define _GUARD_IS_FALSE_POP_r00 794 +#define _GUARD_IS_FALSE_POP_r10 795 +#define _GUARD_IS_FALSE_POP_r21 796 +#define _GUARD_IS_FALSE_POP_r32 797 +#define _GUARD_IS_NONE_POP_r00 798 +#define 
_GUARD_IS_NONE_POP_r10 799 +#define _GUARD_IS_NONE_POP_r21 800 +#define _GUARD_IS_NONE_POP_r32 801 +#define _GUARD_IS_NOT_NONE_POP_r10 802 +#define _GUARD_IS_TRUE_POP_r00 803 +#define _GUARD_IS_TRUE_POP_r10 804 +#define _GUARD_IS_TRUE_POP_r21 805 +#define _GUARD_IS_TRUE_POP_r32 806 +#define _GUARD_KEYS_VERSION_r01 807 +#define _GUARD_KEYS_VERSION_r11 808 +#define _GUARD_KEYS_VERSION_r22 809 +#define _GUARD_KEYS_VERSION_r33 810 +#define _GUARD_NOS_DICT_r02 811 +#define _GUARD_NOS_DICT_r12 812 +#define _GUARD_NOS_DICT_r22 813 +#define _GUARD_NOS_DICT_r33 814 +#define _GUARD_NOS_FLOAT_r02 815 +#define _GUARD_NOS_FLOAT_r12 816 +#define _GUARD_NOS_FLOAT_r22 817 +#define _GUARD_NOS_FLOAT_r33 818 +#define _GUARD_NOS_INT_r02 819 +#define _GUARD_NOS_INT_r12 820 +#define _GUARD_NOS_INT_r22 821 +#define _GUARD_NOS_INT_r33 822 +#define _GUARD_NOS_LIST_r02 823 +#define _GUARD_NOS_LIST_r12 824 +#define _GUARD_NOS_LIST_r22 825 +#define _GUARD_NOS_LIST_r33 826 +#define _GUARD_NOS_NOT_NULL_r02 827 +#define _GUARD_NOS_NOT_NULL_r12 828 +#define _GUARD_NOS_NOT_NULL_r22 829 +#define _GUARD_NOS_NOT_NULL_r33 830 +#define _GUARD_NOS_NULL_r02 831 +#define _GUARD_NOS_NULL_r12 832 +#define _GUARD_NOS_NULL_r22 833 +#define _GUARD_NOS_NULL_r33 834 +#define _GUARD_NOS_OVERFLOWED_r02 835 +#define _GUARD_NOS_OVERFLOWED_r12 836 +#define _GUARD_NOS_OVERFLOWED_r22 837 +#define _GUARD_NOS_OVERFLOWED_r33 838 +#define _GUARD_NOS_TUPLE_r02 839 +#define _GUARD_NOS_TUPLE_r12 840 +#define _GUARD_NOS_TUPLE_r22 841 +#define _GUARD_NOS_TUPLE_r33 842 +#define _GUARD_NOS_UNICODE_r02 843 +#define _GUARD_NOS_UNICODE_r12 844 +#define _GUARD_NOS_UNICODE_r22 845 +#define _GUARD_NOS_UNICODE_r33 846 +#define _GUARD_NOT_EXHAUSTED_LIST_r02 847 +#define _GUARD_NOT_EXHAUSTED_LIST_r12 848 +#define _GUARD_NOT_EXHAUSTED_LIST_r22 849 +#define _GUARD_NOT_EXHAUSTED_LIST_r33 850 +#define _GUARD_NOT_EXHAUSTED_RANGE_r02 851 +#define _GUARD_NOT_EXHAUSTED_RANGE_r12 852 +#define _GUARD_NOT_EXHAUSTED_RANGE_r22 853 +#define 
_GUARD_NOT_EXHAUSTED_RANGE_r33 854 +#define _GUARD_NOT_EXHAUSTED_TUPLE_r02 855 +#define _GUARD_NOT_EXHAUSTED_TUPLE_r12 856 +#define _GUARD_NOT_EXHAUSTED_TUPLE_r22 857 +#define _GUARD_NOT_EXHAUSTED_TUPLE_r33 858 +#define _GUARD_THIRD_NULL_r03 859 +#define _GUARD_THIRD_NULL_r13 860 +#define _GUARD_THIRD_NULL_r23 861 +#define _GUARD_THIRD_NULL_r33 862 +#define _GUARD_TOS_ANY_SET_r01 863 +#define _GUARD_TOS_ANY_SET_r11 864 +#define _GUARD_TOS_ANY_SET_r22 865 +#define _GUARD_TOS_ANY_SET_r33 866 +#define _GUARD_TOS_DICT_r01 867 +#define _GUARD_TOS_DICT_r11 868 +#define _GUARD_TOS_DICT_r22 869 +#define _GUARD_TOS_DICT_r33 870 +#define _GUARD_TOS_FLOAT_r01 871 +#define _GUARD_TOS_FLOAT_r11 872 +#define _GUARD_TOS_FLOAT_r22 873 +#define _GUARD_TOS_FLOAT_r33 874 +#define _GUARD_TOS_INT_r01 875 +#define _GUARD_TOS_INT_r11 876 +#define _GUARD_TOS_INT_r22 877 +#define _GUARD_TOS_INT_r33 878 +#define _GUARD_TOS_LIST_r01 879 +#define _GUARD_TOS_LIST_r11 880 +#define _GUARD_TOS_LIST_r22 881 +#define _GUARD_TOS_LIST_r33 882 +#define _GUARD_TOS_OVERFLOWED_r01 883 +#define _GUARD_TOS_OVERFLOWED_r11 884 +#define _GUARD_TOS_OVERFLOWED_r22 885 +#define _GUARD_TOS_OVERFLOWED_r33 886 +#define _GUARD_TOS_SLICE_r01 887 +#define _GUARD_TOS_SLICE_r11 888 +#define _GUARD_TOS_SLICE_r22 889 +#define _GUARD_TOS_SLICE_r33 890 +#define _GUARD_TOS_TUPLE_r01 891 +#define _GUARD_TOS_TUPLE_r11 892 +#define _GUARD_TOS_TUPLE_r22 893 +#define _GUARD_TOS_TUPLE_r33 894 +#define _GUARD_TOS_UNICODE_r01 895 +#define _GUARD_TOS_UNICODE_r11 896 +#define _GUARD_TOS_UNICODE_r22 897 +#define _GUARD_TOS_UNICODE_r33 898 +#define _GUARD_TYPE_VERSION_r01 899 +#define _GUARD_TYPE_VERSION_r11 900 +#define _GUARD_TYPE_VERSION_r22 901 +#define _GUARD_TYPE_VERSION_r33 902 +#define _GUARD_TYPE_VERSION_AND_LOCK_r01 903 +#define _GUARD_TYPE_VERSION_AND_LOCK_r11 904 +#define _GUARD_TYPE_VERSION_AND_LOCK_r22 905 +#define _GUARD_TYPE_VERSION_AND_LOCK_r33 906 +#define _HANDLE_PENDING_AND_DEOPT_r00 907 +#define 
_HANDLE_PENDING_AND_DEOPT_r10 908 +#define _HANDLE_PENDING_AND_DEOPT_r20 909 +#define _HANDLE_PENDING_AND_DEOPT_r30 910 +#define _IMPORT_FROM_r12 911 +#define _IMPORT_NAME_r21 912 +#define _INIT_CALL_BOUND_METHOD_EXACT_ARGS_r00 913 +#define _INIT_CALL_PY_EXACT_ARGS_r01 914 +#define _INIT_CALL_PY_EXACT_ARGS_0_r01 915 +#define _INIT_CALL_PY_EXACT_ARGS_1_r01 916 +#define _INIT_CALL_PY_EXACT_ARGS_2_r01 917 +#define _INIT_CALL_PY_EXACT_ARGS_3_r01 918 +#define _INIT_CALL_PY_EXACT_ARGS_4_r01 919 +#define _INSERT_NULL_r10 920 +#define _INSTRUMENTED_FOR_ITER_r23 921 +#define _INSTRUMENTED_INSTRUCTION_r00 922 +#define _INSTRUMENTED_JUMP_FORWARD_r00 923 +#define _INSTRUMENTED_JUMP_FORWARD_r11 924 +#define _INSTRUMENTED_JUMP_FORWARD_r22 925 +#define _INSTRUMENTED_JUMP_FORWARD_r33 926 +#define _INSTRUMENTED_LINE_r00 927 +#define _INSTRUMENTED_NOT_TAKEN_r00 928 +#define _INSTRUMENTED_NOT_TAKEN_r11 929 +#define _INSTRUMENTED_NOT_TAKEN_r22 930 +#define _INSTRUMENTED_NOT_TAKEN_r33 931 +#define _INSTRUMENTED_POP_JUMP_IF_FALSE_r00 932 +#define _INSTRUMENTED_POP_JUMP_IF_FALSE_r10 933 +#define _INSTRUMENTED_POP_JUMP_IF_FALSE_r21 934 +#define _INSTRUMENTED_POP_JUMP_IF_FALSE_r32 935 +#define _INSTRUMENTED_POP_JUMP_IF_NONE_r10 936 +#define _INSTRUMENTED_POP_JUMP_IF_NOT_NONE_r10 937 +#define _INSTRUMENTED_POP_JUMP_IF_TRUE_r00 938 +#define _INSTRUMENTED_POP_JUMP_IF_TRUE_r10 939 +#define _INSTRUMENTED_POP_JUMP_IF_TRUE_r21 940 +#define _INSTRUMENTED_POP_JUMP_IF_TRUE_r32 941 +#define _IS_NONE_r11 942 +#define _IS_OP_r21 943 +#define _ITER_CHECK_LIST_r02 944 +#define _ITER_CHECK_LIST_r12 945 +#define _ITER_CHECK_LIST_r22 946 +#define _ITER_CHECK_LIST_r33 947 +#define _ITER_CHECK_RANGE_r02 948 +#define _ITER_CHECK_RANGE_r12 949 +#define _ITER_CHECK_RANGE_r22 950 +#define _ITER_CHECK_RANGE_r33 951 +#define _ITER_CHECK_TUPLE_r02 952 +#define _ITER_CHECK_TUPLE_r12 953 +#define _ITER_CHECK_TUPLE_r22 954 +#define _ITER_CHECK_TUPLE_r33 955 +#define _ITER_JUMP_LIST_r02 956 +#define _ITER_JUMP_LIST_r12 
957 +#define _ITER_JUMP_LIST_r22 958 +#define _ITER_JUMP_LIST_r33 959 +#define _ITER_JUMP_RANGE_r02 960 +#define _ITER_JUMP_RANGE_r12 961 +#define _ITER_JUMP_RANGE_r22 962 +#define _ITER_JUMP_RANGE_r33 963 +#define _ITER_JUMP_TUPLE_r02 964 +#define _ITER_JUMP_TUPLE_r12 965 +#define _ITER_JUMP_TUPLE_r22 966 +#define _ITER_JUMP_TUPLE_r33 967 +#define _ITER_NEXT_LIST_r23 968 +#define _ITER_NEXT_LIST_TIER_TWO_r23 969 +#define _ITER_NEXT_RANGE_r03 970 +#define _ITER_NEXT_RANGE_r13 971 +#define _ITER_NEXT_RANGE_r23 972 +#define _ITER_NEXT_TUPLE_r03 973 +#define _ITER_NEXT_TUPLE_r13 974 +#define _ITER_NEXT_TUPLE_r23 975 +#define _JUMP_BACKWARD_NO_INTERRUPT_r00 976 +#define _JUMP_BACKWARD_NO_INTERRUPT_r11 977 +#define _JUMP_BACKWARD_NO_INTERRUPT_r22 978 +#define _JUMP_BACKWARD_NO_INTERRUPT_r33 979 +#define _JUMP_TO_TOP_r00 980 +#define _LIST_APPEND_r10 981 +#define _LIST_EXTEND_r10 982 +#define _LOAD_ATTR_r10 983 +#define _LOAD_ATTR_CLASS_r11 984 +#define _LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN_r11 985 +#define _LOAD_ATTR_INSTANCE_VALUE_r02 986 +#define _LOAD_ATTR_INSTANCE_VALUE_r12 987 +#define _LOAD_ATTR_INSTANCE_VALUE_r23 988 +#define _LOAD_ATTR_METHOD_LAZY_DICT_r02 989 +#define _LOAD_ATTR_METHOD_LAZY_DICT_r12 990 +#define _LOAD_ATTR_METHOD_LAZY_DICT_r23 991 +#define _LOAD_ATTR_METHOD_NO_DICT_r02 992 +#define _LOAD_ATTR_METHOD_NO_DICT_r12 993 +#define _LOAD_ATTR_METHOD_NO_DICT_r23 994 +#define _LOAD_ATTR_METHOD_WITH_VALUES_r02 995 +#define _LOAD_ATTR_METHOD_WITH_VALUES_r12 996 +#define _LOAD_ATTR_METHOD_WITH_VALUES_r23 997 +#define _LOAD_ATTR_MODULE_r11 998 +#define _LOAD_ATTR_NONDESCRIPTOR_NO_DICT_r11 999 +#define _LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES_r11 1000 +#define _LOAD_ATTR_PROPERTY_FRAME_r11 1001 +#define _LOAD_ATTR_SLOT_r11 1002 +#define _LOAD_ATTR_WITH_HINT_r12 1003 +#define _LOAD_BUILD_CLASS_r01 1004 +#define _LOAD_BYTECODE_r00 1005 +#define _LOAD_COMMON_CONSTANT_r01 1006 +#define _LOAD_COMMON_CONSTANT_r12 1007 +#define _LOAD_COMMON_CONSTANT_r23 1008 +#define 
_LOAD_CONST_r01 1009 +#define _LOAD_CONST_r12 1010 +#define _LOAD_CONST_r23 1011 +#define _LOAD_CONST_INLINE_r01 1012 +#define _LOAD_CONST_INLINE_r12 1013 +#define _LOAD_CONST_INLINE_r23 1014 +#define _LOAD_CONST_INLINE_BORROW_r01 1015 +#define _LOAD_CONST_INLINE_BORROW_r12 1016 +#define _LOAD_CONST_INLINE_BORROW_r23 1017 +#define _LOAD_CONST_UNDER_INLINE_r02 1018 +#define _LOAD_CONST_UNDER_INLINE_r12 1019 +#define _LOAD_CONST_UNDER_INLINE_r23 1020 +#define _LOAD_CONST_UNDER_INLINE_BORROW_r02 1021 +#define _LOAD_CONST_UNDER_INLINE_BORROW_r12 1022 +#define _LOAD_CONST_UNDER_INLINE_BORROW_r23 1023 +#define _LOAD_DEREF_r01 1024 +#define _LOAD_FAST_r01 1025 +#define _LOAD_FAST_r12 1026 +#define _LOAD_FAST_r23 1027 +#define _LOAD_FAST_0_r01 1028 +#define _LOAD_FAST_0_r12 1029 +#define _LOAD_FAST_0_r23 1030 +#define _LOAD_FAST_1_r01 1031 +#define _LOAD_FAST_1_r12 1032 +#define _LOAD_FAST_1_r23 1033 +#define _LOAD_FAST_2_r01 1034 +#define _LOAD_FAST_2_r12 1035 +#define _LOAD_FAST_2_r23 1036 +#define _LOAD_FAST_3_r01 1037 +#define _LOAD_FAST_3_r12 1038 +#define _LOAD_FAST_3_r23 1039 +#define _LOAD_FAST_4_r01 1040 +#define _LOAD_FAST_4_r12 1041 +#define _LOAD_FAST_4_r23 1042 +#define _LOAD_FAST_5_r01 1043 +#define _LOAD_FAST_5_r12 1044 +#define _LOAD_FAST_5_r23 1045 +#define _LOAD_FAST_6_r01 1046 +#define _LOAD_FAST_6_r12 1047 +#define _LOAD_FAST_6_r23 1048 +#define _LOAD_FAST_7_r01 1049 +#define _LOAD_FAST_7_r12 1050 +#define _LOAD_FAST_7_r23 1051 +#define _LOAD_FAST_AND_CLEAR_r01 1052 +#define _LOAD_FAST_AND_CLEAR_r12 1053 +#define _LOAD_FAST_AND_CLEAR_r23 1054 +#define _LOAD_FAST_BORROW_r01 1055 +#define _LOAD_FAST_BORROW_r12 1056 +#define _LOAD_FAST_BORROW_r23 1057 +#define _LOAD_FAST_BORROW_0_r01 1058 +#define _LOAD_FAST_BORROW_0_r12 1059 +#define _LOAD_FAST_BORROW_0_r23 1060 +#define _LOAD_FAST_BORROW_1_r01 1061 +#define _LOAD_FAST_BORROW_1_r12 1062 +#define _LOAD_FAST_BORROW_1_r23 1063 +#define _LOAD_FAST_BORROW_2_r01 1064 +#define _LOAD_FAST_BORROW_2_r12 1065 
+#define _LOAD_FAST_BORROW_2_r23 1066 +#define _LOAD_FAST_BORROW_3_r01 1067 +#define _LOAD_FAST_BORROW_3_r12 1068 +#define _LOAD_FAST_BORROW_3_r23 1069 +#define _LOAD_FAST_BORROW_4_r01 1070 +#define _LOAD_FAST_BORROW_4_r12 1071 +#define _LOAD_FAST_BORROW_4_r23 1072 +#define _LOAD_FAST_BORROW_5_r01 1073 +#define _LOAD_FAST_BORROW_5_r12 1074 +#define _LOAD_FAST_BORROW_5_r23 1075 +#define _LOAD_FAST_BORROW_6_r01 1076 +#define _LOAD_FAST_BORROW_6_r12 1077 +#define _LOAD_FAST_BORROW_6_r23 1078 +#define _LOAD_FAST_BORROW_7_r01 1079 +#define _LOAD_FAST_BORROW_7_r12 1080 +#define _LOAD_FAST_BORROW_7_r23 1081 +#define _LOAD_FAST_BORROW_LOAD_FAST_BORROW_r02 1082 +#define _LOAD_FAST_BORROW_LOAD_FAST_BORROW_r13 1083 +#define _LOAD_FAST_CHECK_r01 1084 +#define _LOAD_FAST_CHECK_r12 1085 +#define _LOAD_FAST_CHECK_r23 1086 +#define _LOAD_FAST_LOAD_FAST_r02 1087 +#define _LOAD_FAST_LOAD_FAST_r13 1088 +#define _LOAD_FROM_DICT_OR_DEREF_r11 1089 +#define _LOAD_FROM_DICT_OR_GLOBALS_r11 1090 +#define _LOAD_GLOBAL_r00 1091 +#define _LOAD_GLOBAL_BUILTINS_r01 1092 +#define _LOAD_GLOBAL_MODULE_r01 1093 +#define _LOAD_LOCALS_r01 1094 +#define _LOAD_LOCALS_r12 1095 +#define _LOAD_LOCALS_r23 1096 +#define _LOAD_NAME_r01 1097 +#define _LOAD_SMALL_INT_r01 1098 +#define _LOAD_SMALL_INT_r12 1099 +#define _LOAD_SMALL_INT_r23 1100 +#define _LOAD_SMALL_INT_0_r01 1101 +#define _LOAD_SMALL_INT_0_r12 1102 +#define _LOAD_SMALL_INT_0_r23 1103 +#define _LOAD_SMALL_INT_1_r01 1104 +#define _LOAD_SMALL_INT_1_r12 1105 +#define _LOAD_SMALL_INT_1_r23 1106 +#define _LOAD_SMALL_INT_2_r01 1107 +#define _LOAD_SMALL_INT_2_r12 1108 +#define _LOAD_SMALL_INT_2_r23 1109 +#define _LOAD_SMALL_INT_3_r01 1110 +#define _LOAD_SMALL_INT_3_r12 1111 +#define _LOAD_SMALL_INT_3_r23 1112 +#define _LOAD_SPECIAL_r00 1113 +#define _LOAD_SUPER_ATTR_ATTR_r31 1114 +#define _LOAD_SUPER_ATTR_METHOD_r32 1115 +#define _MAKE_CALLARGS_A_TUPLE_r33 1116 +#define _MAKE_CELL_r00 1117 +#define _MAKE_FUNCTION_r11 1118 +#define _MAKE_WARM_r00 1119 
+#define _MAKE_WARM_r11 1120 +#define _MAKE_WARM_r22 1121 +#define _MAKE_WARM_r33 1122 +#define _MAP_ADD_r20 1123 +#define _MATCH_CLASS_r31 1124 +#define _MATCH_KEYS_r23 1125 +#define _MATCH_MAPPING_r02 1126 +#define _MATCH_MAPPING_r12 1127 +#define _MATCH_MAPPING_r23 1128 +#define _MATCH_SEQUENCE_r02 1129 +#define _MATCH_SEQUENCE_r12 1130 +#define _MATCH_SEQUENCE_r23 1131 +#define _MAYBE_EXPAND_METHOD_r00 1132 +#define _MAYBE_EXPAND_METHOD_KW_r11 1133 +#define _MONITOR_CALL_r00 1134 +#define _MONITOR_CALL_KW_r11 1135 +#define _MONITOR_JUMP_BACKWARD_r00 1136 +#define _MONITOR_JUMP_BACKWARD_r11 1137 +#define _MONITOR_JUMP_BACKWARD_r22 1138 +#define _MONITOR_JUMP_BACKWARD_r33 1139 +#define _MONITOR_RESUME_r00 1140 +#define _NOP_r00 1141 +#define _NOP_r11 1142 +#define _NOP_r22 1143 +#define _NOP_r33 1144 +#define _POP_CALL_r20 1145 +#define _POP_CALL_LOAD_CONST_INLINE_BORROW_r21 1146 +#define _POP_CALL_ONE_r30 1147 +#define _POP_CALL_ONE_LOAD_CONST_INLINE_BORROW_r31 1148 +#define _POP_CALL_TWO_r30 1149 +#define _POP_CALL_TWO_LOAD_CONST_INLINE_BORROW_r31 1150 +#define _POP_EXCEPT_r10 1151 +#define _POP_ITER_r20 1152 +#define _POP_JUMP_IF_FALSE_r00 1153 +#define _POP_JUMP_IF_FALSE_r10 1154 +#define _POP_JUMP_IF_FALSE_r21 1155 +#define _POP_JUMP_IF_FALSE_r32 1156 +#define _POP_JUMP_IF_TRUE_r00 1157 +#define _POP_JUMP_IF_TRUE_r10 1158 +#define _POP_JUMP_IF_TRUE_r21 1159 +#define _POP_JUMP_IF_TRUE_r32 1160 +#define _POP_TOP_r10 1161 +#define _POP_TOP_FLOAT_r00 1162 +#define _POP_TOP_FLOAT_r10 1163 +#define _POP_TOP_FLOAT_r21 1164 +#define _POP_TOP_FLOAT_r32 1165 +#define _POP_TOP_INT_r00 1166 +#define _POP_TOP_INT_r10 1167 +#define _POP_TOP_INT_r21 1168 +#define _POP_TOP_INT_r32 1169 +#define _POP_TOP_LOAD_CONST_INLINE_r11 1170 +#define _POP_TOP_LOAD_CONST_INLINE_BORROW_r11 1171 +#define _POP_TOP_NOP_r00 1172 +#define _POP_TOP_NOP_r10 1173 +#define _POP_TOP_NOP_r21 1174 +#define _POP_TOP_NOP_r32 1175 +#define _POP_TOP_UNICODE_r00 1176 +#define _POP_TOP_UNICODE_r10 1177 
+#define _POP_TOP_UNICODE_r21 1178 +#define _POP_TOP_UNICODE_r32 1179 +#define _POP_TWO_r20 1180 +#define _POP_TWO_LOAD_CONST_INLINE_BORROW_r21 1181 +#define _PUSH_EXC_INFO_r02 1182 +#define _PUSH_EXC_INFO_r12 1183 +#define _PUSH_EXC_INFO_r23 1184 +#define _PUSH_FRAME_r10 1185 +#define _PUSH_NULL_r01 1186 +#define _PUSH_NULL_r12 1187 +#define _PUSH_NULL_r23 1188 +#define _PUSH_NULL_CONDITIONAL_r00 1189 +#define _PY_FRAME_GENERAL_r01 1190 +#define _PY_FRAME_KW_r11 1191 +#define _QUICKEN_RESUME_r00 1192 +#define _QUICKEN_RESUME_r11 1193 +#define _QUICKEN_RESUME_r22 1194 +#define _QUICKEN_RESUME_r33 1195 +#define _REPLACE_WITH_TRUE_r11 1196 +#define _RESUME_CHECK_r00 1197 +#define _RESUME_CHECK_r11 1198 +#define _RESUME_CHECK_r22 1199 +#define _RESUME_CHECK_r33 1200 +#define _RETURN_GENERATOR_r01 1201 +#define _RETURN_VALUE_r11 1202 +#define _SAVE_RETURN_OFFSET_r00 1203 +#define _SAVE_RETURN_OFFSET_r11 1204 +#define _SAVE_RETURN_OFFSET_r22 1205 +#define _SAVE_RETURN_OFFSET_r33 1206 +#define _SEND_r22 1207 +#define _SEND_GEN_FRAME_r22 1208 +#define _SETUP_ANNOTATIONS_r00 1209 +#define _SET_ADD_r10 1210 +#define _SET_FUNCTION_ATTRIBUTE_r01 1211 +#define _SET_FUNCTION_ATTRIBUTE_r11 1212 +#define _SET_FUNCTION_ATTRIBUTE_r21 1213 +#define _SET_FUNCTION_ATTRIBUTE_r32 1214 +#define _SET_IP_r00 1215 +#define _SET_IP_r11 1216 +#define _SET_IP_r22 1217 +#define _SET_IP_r33 1218 +#define _SET_UPDATE_r10 1219 +#define _SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r03 1220 +#define _SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r13 1221 +#define _SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r23 1222 +#define _SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r33 1223 +#define _SPILL_OR_RELOAD_r01 1224 +#define _SPILL_OR_RELOAD_r02 1225 +#define _SPILL_OR_RELOAD_r03 1226 +#define _SPILL_OR_RELOAD_r10 1227 +#define _SPILL_OR_RELOAD_r12 1228 +#define _SPILL_OR_RELOAD_r13 1229 +#define _SPILL_OR_RELOAD_r20 1230 +#define _SPILL_OR_RELOAD_r21 1231 +#define _SPILL_OR_RELOAD_r23 1232 +#define _SPILL_OR_RELOAD_r30 1233 +#define 
_SPILL_OR_RELOAD_r31 1234 +#define _SPILL_OR_RELOAD_r32 1235 +#define _START_EXECUTOR_r00 1236 +#define _STORE_ATTR_r20 1237 +#define _STORE_ATTR_INSTANCE_VALUE_r21 1238 +#define _STORE_ATTR_SLOT_r21 1239 +#define _STORE_ATTR_WITH_HINT_r21 1240 +#define _STORE_DEREF_r10 1241 +#define _STORE_FAST_r10 1242 +#define _STORE_FAST_0_r10 1243 +#define _STORE_FAST_1_r10 1244 +#define _STORE_FAST_2_r10 1245 +#define _STORE_FAST_3_r10 1246 +#define _STORE_FAST_4_r10 1247 +#define _STORE_FAST_5_r10 1248 +#define _STORE_FAST_6_r10 1249 +#define _STORE_FAST_7_r10 1250 +#define _STORE_FAST_LOAD_FAST_r11 1251 +#define _STORE_FAST_STORE_FAST_r20 1252 +#define _STORE_GLOBAL_r10 1253 +#define _STORE_NAME_r10 1254 +#define _STORE_SLICE_r30 1255 +#define _STORE_SUBSCR_r30 1256 +#define _STORE_SUBSCR_DICT_r31 1257 +#define _STORE_SUBSCR_LIST_INT_r32 1258 +#define _SWAP_r11 1259 +#define _SWAP_2_r02 1260 +#define _SWAP_2_r12 1261 +#define _SWAP_2_r22 1262 +#define _SWAP_2_r33 1263 +#define _SWAP_3_r03 1264 +#define _SWAP_3_r13 1265 +#define _SWAP_3_r23 1266 +#define _SWAP_3_r33 1267 +#define _TIER2_RESUME_CHECK_r00 1268 +#define _TIER2_RESUME_CHECK_r11 1269 +#define _TIER2_RESUME_CHECK_r22 1270 +#define _TIER2_RESUME_CHECK_r33 1271 +#define _TO_BOOL_r11 1272 +#define _TO_BOOL_BOOL_r01 1273 +#define _TO_BOOL_BOOL_r11 1274 +#define _TO_BOOL_BOOL_r22 1275 +#define _TO_BOOL_BOOL_r33 1276 +#define _TO_BOOL_INT_r11 1277 +#define _TO_BOOL_LIST_r11 1278 +#define _TO_BOOL_NONE_r01 1279 +#define _TO_BOOL_NONE_r11 1280 +#define _TO_BOOL_NONE_r22 1281 +#define _TO_BOOL_NONE_r33 1282 +#define _TO_BOOL_STR_r11 1283 +#define _TRACE_RECORD_r00 1284 +#define _UNARY_INVERT_r11 1285 +#define _UNARY_NEGATIVE_r11 1286 +#define _UNARY_NOT_r01 1287 +#define _UNARY_NOT_r11 1288 +#define _UNARY_NOT_r22 1289 +#define _UNARY_NOT_r33 1290 +#define _UNPACK_EX_r10 1291 +#define _UNPACK_SEQUENCE_r10 1292 +#define _UNPACK_SEQUENCE_LIST_r10 1293 +#define _UNPACK_SEQUENCE_TUPLE_r10 1294 +#define 
_UNPACK_SEQUENCE_TWO_TUPLE_r12 1295 +#define _WITH_EXCEPT_START_r33 1296 +#define _YIELD_VALUE_r11 1297 +#define MAX_UOP_REGS_ID 1297 #ifdef __cplusplus } diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index f600468c321..d84c88c9243 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -111,7 +111,7 @@ const uint32_t _PyUop_Flags[MAX_UOP_ID+1] = { [_BINARY_OP_MULTIPLY_FLOAT] = HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_PURE_FLAG, [_BINARY_OP_ADD_FLOAT] = HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_PURE_FLAG, [_BINARY_OP_SUBTRACT_FLOAT] = HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_PURE_FLAG, - [_BINARY_OP_ADD_UNICODE] = HAS_ERROR_FLAG | HAS_PURE_FLAG, + [_BINARY_OP_ADD_UNICODE] = HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_PURE_FLAG, [_BINARY_OP_INPLACE_ADD_UNICODE] = HAS_LOCAL_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_GUARD_BINARY_OP_EXTEND] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, [_BINARY_OP_EXTEND] = HAS_ESCAPES_FLAG, @@ -119,7 +119,7 @@ const uint32_t _PyUop_Flags[MAX_UOP_ID+1] = { [_STORE_SLICE] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_BINARY_OP_SUBSCR_LIST_INT] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, [_BINARY_OP_SUBSCR_LIST_SLICE] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, - [_BINARY_OP_SUBSCR_STR_INT] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, + [_BINARY_OP_SUBSCR_STR_INT] = HAS_DEOPT_FLAG, [_GUARD_NOS_TUPLE] = HAS_EXIT_FLAG, [_GUARD_TOS_TUPLE] = HAS_EXIT_FLAG, [_BINARY_OP_SUBSCR_TUPLE_INT] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, @@ -189,9 +189,9 @@ const uint32_t _PyUop_Flags[MAX_UOP_ID+1] = { [_GUARD_TYPE_VERSION] = HAS_EXIT_FLAG, [_GUARD_TYPE_VERSION_AND_LOCK] = HAS_EXIT_FLAG, [_CHECK_MANAGED_OBJECT_HAS_VALUES] = HAS_DEOPT_FLAG, - [_LOAD_ATTR_INSTANCE_VALUE] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, + [_LOAD_ATTR_INSTANCE_VALUE] = HAS_DEOPT_FLAG, [_LOAD_ATTR_MODULE] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, - [_LOAD_ATTR_WITH_HINT] = HAS_ARG_FLAG | HAS_NAME_FLAG | 
HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, + [_LOAD_ATTR_WITH_HINT] = HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG, [_LOAD_ATTR_SLOT] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, [_CHECK_ATTR_CLASS] = HAS_EXIT_FLAG, [_LOAD_ATTR_CLASS] = HAS_ESCAPES_FLAG, @@ -335,6 +335,7 @@ const uint32_t _PyUop_Flags[MAX_UOP_ID+1] = { [_POP_TWO_LOAD_CONST_INLINE_BORROW] = HAS_ESCAPES_FLAG, [_POP_CALL_LOAD_CONST_INLINE_BORROW] = HAS_ESCAPES_FLAG, [_POP_CALL_ONE_LOAD_CONST_INLINE_BORROW] = HAS_ESCAPES_FLAG, + [_SHUFFLE_3_LOAD_CONST_INLINE_BORROW] = 0, [_POP_CALL_TWO_LOAD_CONST_INLINE_BORROW] = HAS_ESCAPES_FLAG, [_LOAD_CONST_UNDER_INLINE] = 0, [_LOAD_CONST_UNDER_INLINE_BORROW] = 0, @@ -1050,12 +1051,12 @@ const _PyUopCachingInfo _PyUop_Caching[MAX_UOP_ID+1] = { }, }, [_BINARY_OP_ADD_UNICODE] = { - .best = { 0, 1, 2, 3 }, + .best = { 0, 1, 2, 2 }, .entries = { - { 1, 0, _BINARY_OP_ADD_UNICODE_r01 }, - { 1, 1, _BINARY_OP_ADD_UNICODE_r11 }, - { 1, 2, _BINARY_OP_ADD_UNICODE_r21 }, - { 2, 3, _BINARY_OP_ADD_UNICODE_r32 }, + { 3, 0, _BINARY_OP_ADD_UNICODE_r03 }, + { 3, 1, _BINARY_OP_ADD_UNICODE_r13 }, + { 3, 2, _BINARY_OP_ADD_UNICODE_r23 }, + { -1, -1, -1 }, }, }, [_BINARY_OP_INPLACE_ADD_UNICODE] = { @@ -1108,7 +1109,7 @@ const _PyUopCachingInfo _PyUop_Caching[MAX_UOP_ID+1] = { .entries = { { -1, -1, -1 }, { -1, -1, -1 }, - { 1, 2, _BINARY_OP_SUBSCR_LIST_INT_r21 }, + { 3, 2, _BINARY_OP_SUBSCR_LIST_INT_r23 }, { -1, -1, -1 }, }, }, @@ -1126,7 +1127,7 @@ const _PyUopCachingInfo _PyUop_Caching[MAX_UOP_ID+1] = { .entries = { { -1, -1, -1 }, { -1, -1, -1 }, - { 1, 2, _BINARY_OP_SUBSCR_STR_INT_r21 }, + { 3, 2, _BINARY_OP_SUBSCR_STR_INT_r23 }, { -1, -1, -1 }, }, }, @@ -1752,11 +1753,11 @@ const _PyUopCachingInfo _PyUop_Caching[MAX_UOP_ID+1] = { }, }, [_LOAD_ATTR_INSTANCE_VALUE] = { - .best = { 1, 1, 1, 1 }, + .best = { 0, 1, 2, 2 }, .entries = { - { -1, -1, -1 }, - { 1, 1, _LOAD_ATTR_INSTANCE_VALUE_r11 }, - { -1, -1, -1 }, + { 2, 0, _LOAD_ATTR_INSTANCE_VALUE_r02 }, + { 2, 1, _LOAD_ATTR_INSTANCE_VALUE_r12 }, + { 
3, 2, _LOAD_ATTR_INSTANCE_VALUE_r23 }, { -1, -1, -1 }, }, }, @@ -1773,7 +1774,7 @@ const _PyUopCachingInfo _PyUop_Caching[MAX_UOP_ID+1] = { .best = { 1, 1, 1, 1 }, .entries = { { -1, -1, -1 }, - { 1, 1, _LOAD_ATTR_WITH_HINT_r11 }, + { 2, 1, _LOAD_ATTR_WITH_HINT_r12 }, { -1, -1, -1 }, { -1, -1, -1 }, }, @@ -1837,7 +1838,7 @@ const _PyUopCachingInfo _PyUop_Caching[MAX_UOP_ID+1] = { .entries = { { -1, -1, -1 }, { -1, -1, -1 }, - { 0, 2, _STORE_ATTR_WITH_HINT_r20 }, + { 1, 2, _STORE_ATTR_WITH_HINT_r21 }, { -1, -1, -1 }, }, }, @@ -2130,10 +2131,10 @@ const _PyUopCachingInfo _PyUop_Caching[MAX_UOP_ID+1] = { }, }, [_FOR_ITER_GEN_FRAME] = { - .best = { 2, 2, 2, 2 }, + .best = { 0, 1, 2, 2 }, .entries = { - { -1, -1, -1 }, - { -1, -1, -1 }, + { 3, 0, _FOR_ITER_GEN_FRAME_r03 }, + { 3, 1, _FOR_ITER_GEN_FRAME_r13 }, { 3, 2, _FOR_ITER_GEN_FRAME_r23 }, { -1, -1, -1 }, }, @@ -3065,6 +3066,15 @@ const _PyUopCachingInfo _PyUop_Caching[MAX_UOP_ID+1] = { { 1, 3, _POP_CALL_ONE_LOAD_CONST_INLINE_BORROW_r31 }, }, }, + [_SHUFFLE_3_LOAD_CONST_INLINE_BORROW] = { + .best = { 0, 1, 2, 3 }, + .entries = { + { 3, 0, _SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r03 }, + { 3, 1, _SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r13 }, + { 3, 2, _SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r23 }, + { 3, 3, _SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r33 }, + }, + }, [_POP_CALL_TWO_LOAD_CONST_INLINE_BORROW] = { .best = { 3, 3, 3, 3 }, .entries = { @@ -3414,18 +3424,17 @@ const uint16_t _PyUop_Uncached[MAX_UOP_REGS_ID+1] = { [_BINARY_OP_SUBTRACT_FLOAT_r03] = _BINARY_OP_SUBTRACT_FLOAT, [_BINARY_OP_SUBTRACT_FLOAT_r13] = _BINARY_OP_SUBTRACT_FLOAT, [_BINARY_OP_SUBTRACT_FLOAT_r23] = _BINARY_OP_SUBTRACT_FLOAT, - [_BINARY_OP_ADD_UNICODE_r01] = _BINARY_OP_ADD_UNICODE, - [_BINARY_OP_ADD_UNICODE_r11] = _BINARY_OP_ADD_UNICODE, - [_BINARY_OP_ADD_UNICODE_r21] = _BINARY_OP_ADD_UNICODE, - [_BINARY_OP_ADD_UNICODE_r32] = _BINARY_OP_ADD_UNICODE, + [_BINARY_OP_ADD_UNICODE_r03] = _BINARY_OP_ADD_UNICODE, + [_BINARY_OP_ADD_UNICODE_r13] = 
_BINARY_OP_ADD_UNICODE, + [_BINARY_OP_ADD_UNICODE_r23] = _BINARY_OP_ADD_UNICODE, [_BINARY_OP_INPLACE_ADD_UNICODE_r20] = _BINARY_OP_INPLACE_ADD_UNICODE, [_GUARD_BINARY_OP_EXTEND_r22] = _GUARD_BINARY_OP_EXTEND, [_BINARY_OP_EXTEND_r21] = _BINARY_OP_EXTEND, [_BINARY_SLICE_r31] = _BINARY_SLICE, [_STORE_SLICE_r30] = _STORE_SLICE, - [_BINARY_OP_SUBSCR_LIST_INT_r21] = _BINARY_OP_SUBSCR_LIST_INT, + [_BINARY_OP_SUBSCR_LIST_INT_r23] = _BINARY_OP_SUBSCR_LIST_INT, [_BINARY_OP_SUBSCR_LIST_SLICE_r21] = _BINARY_OP_SUBSCR_LIST_SLICE, - [_BINARY_OP_SUBSCR_STR_INT_r21] = _BINARY_OP_SUBSCR_STR_INT, + [_BINARY_OP_SUBSCR_STR_INT_r23] = _BINARY_OP_SUBSCR_STR_INT, [_GUARD_NOS_TUPLE_r02] = _GUARD_NOS_TUPLE, [_GUARD_NOS_TUPLE_r12] = _GUARD_NOS_TUPLE, [_GUARD_NOS_TUPLE_r22] = _GUARD_NOS_TUPLE, @@ -3529,9 +3538,11 @@ const uint16_t _PyUop_Uncached[MAX_UOP_REGS_ID+1] = { [_CHECK_MANAGED_OBJECT_HAS_VALUES_r11] = _CHECK_MANAGED_OBJECT_HAS_VALUES, [_CHECK_MANAGED_OBJECT_HAS_VALUES_r22] = _CHECK_MANAGED_OBJECT_HAS_VALUES, [_CHECK_MANAGED_OBJECT_HAS_VALUES_r33] = _CHECK_MANAGED_OBJECT_HAS_VALUES, - [_LOAD_ATTR_INSTANCE_VALUE_r11] = _LOAD_ATTR_INSTANCE_VALUE, + [_LOAD_ATTR_INSTANCE_VALUE_r02] = _LOAD_ATTR_INSTANCE_VALUE, + [_LOAD_ATTR_INSTANCE_VALUE_r12] = _LOAD_ATTR_INSTANCE_VALUE, + [_LOAD_ATTR_INSTANCE_VALUE_r23] = _LOAD_ATTR_INSTANCE_VALUE, [_LOAD_ATTR_MODULE_r11] = _LOAD_ATTR_MODULE, - [_LOAD_ATTR_WITH_HINT_r11] = _LOAD_ATTR_WITH_HINT, + [_LOAD_ATTR_WITH_HINT_r12] = _LOAD_ATTR_WITH_HINT, [_LOAD_ATTR_SLOT_r11] = _LOAD_ATTR_SLOT, [_CHECK_ATTR_CLASS_r01] = _CHECK_ATTR_CLASS, [_CHECK_ATTR_CLASS_r11] = _CHECK_ATTR_CLASS, @@ -3544,7 +3555,7 @@ const uint16_t _PyUop_Uncached[MAX_UOP_REGS_ID+1] = { [_GUARD_DORV_NO_DICT_r22] = _GUARD_DORV_NO_DICT, [_GUARD_DORV_NO_DICT_r33] = _GUARD_DORV_NO_DICT, [_STORE_ATTR_INSTANCE_VALUE_r21] = _STORE_ATTR_INSTANCE_VALUE, - [_STORE_ATTR_WITH_HINT_r20] = _STORE_ATTR_WITH_HINT, + [_STORE_ATTR_WITH_HINT_r21] = _STORE_ATTR_WITH_HINT, [_STORE_ATTR_SLOT_r21] = 
_STORE_ATTR_SLOT, [_COMPARE_OP_r21] = _COMPARE_OP, [_COMPARE_OP_FLOAT_r01] = _COMPARE_OP_FLOAT, @@ -3609,6 +3620,8 @@ const uint16_t _PyUop_Uncached[MAX_UOP_REGS_ID+1] = { [_ITER_NEXT_RANGE_r03] = _ITER_NEXT_RANGE, [_ITER_NEXT_RANGE_r13] = _ITER_NEXT_RANGE, [_ITER_NEXT_RANGE_r23] = _ITER_NEXT_RANGE, + [_FOR_ITER_GEN_FRAME_r03] = _FOR_ITER_GEN_FRAME, + [_FOR_ITER_GEN_FRAME_r13] = _FOR_ITER_GEN_FRAME, [_FOR_ITER_GEN_FRAME_r23] = _FOR_ITER_GEN_FRAME, [_INSERT_NULL_r10] = _INSERT_NULL, [_LOAD_SPECIAL_r00] = _LOAD_SPECIAL, @@ -3816,6 +3829,10 @@ const uint16_t _PyUop_Uncached[MAX_UOP_REGS_ID+1] = { [_POP_TWO_LOAD_CONST_INLINE_BORROW_r21] = _POP_TWO_LOAD_CONST_INLINE_BORROW, [_POP_CALL_LOAD_CONST_INLINE_BORROW_r21] = _POP_CALL_LOAD_CONST_INLINE_BORROW, [_POP_CALL_ONE_LOAD_CONST_INLINE_BORROW_r31] = _POP_CALL_ONE_LOAD_CONST_INLINE_BORROW, + [_SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r03] = _SHUFFLE_3_LOAD_CONST_INLINE_BORROW, + [_SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r13] = _SHUFFLE_3_LOAD_CONST_INLINE_BORROW, + [_SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r23] = _SHUFFLE_3_LOAD_CONST_INLINE_BORROW, + [_SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r33] = _SHUFFLE_3_LOAD_CONST_INLINE_BORROW, [_POP_CALL_TWO_LOAD_CONST_INLINE_BORROW_r31] = _POP_CALL_TWO_LOAD_CONST_INLINE_BORROW, [_LOAD_CONST_UNDER_INLINE_r02] = _LOAD_CONST_UNDER_INLINE, [_LOAD_CONST_UNDER_INLINE_r12] = _LOAD_CONST_UNDER_INLINE, @@ -3904,10 +3921,9 @@ const char *const _PyOpcode_uop_name[MAX_UOP_REGS_ID+1] = { [_BINARY_OP_ADD_INT_r13] = "_BINARY_OP_ADD_INT_r13", [_BINARY_OP_ADD_INT_r23] = "_BINARY_OP_ADD_INT_r23", [_BINARY_OP_ADD_UNICODE] = "_BINARY_OP_ADD_UNICODE", - [_BINARY_OP_ADD_UNICODE_r01] = "_BINARY_OP_ADD_UNICODE_r01", - [_BINARY_OP_ADD_UNICODE_r11] = "_BINARY_OP_ADD_UNICODE_r11", - [_BINARY_OP_ADD_UNICODE_r21] = "_BINARY_OP_ADD_UNICODE_r21", - [_BINARY_OP_ADD_UNICODE_r32] = "_BINARY_OP_ADD_UNICODE_r32", + [_BINARY_OP_ADD_UNICODE_r03] = "_BINARY_OP_ADD_UNICODE_r03", + [_BINARY_OP_ADD_UNICODE_r13] = 
"_BINARY_OP_ADD_UNICODE_r13", + [_BINARY_OP_ADD_UNICODE_r23] = "_BINARY_OP_ADD_UNICODE_r23", [_BINARY_OP_EXTEND] = "_BINARY_OP_EXTEND", [_BINARY_OP_EXTEND_r21] = "_BINARY_OP_EXTEND_r21", [_BINARY_OP_INPLACE_ADD_UNICODE] = "_BINARY_OP_INPLACE_ADD_UNICODE", @@ -3930,11 +3946,11 @@ const char *const _PyOpcode_uop_name[MAX_UOP_REGS_ID+1] = { [_BINARY_OP_SUBSCR_INIT_CALL_r21] = "_BINARY_OP_SUBSCR_INIT_CALL_r21", [_BINARY_OP_SUBSCR_INIT_CALL_r31] = "_BINARY_OP_SUBSCR_INIT_CALL_r31", [_BINARY_OP_SUBSCR_LIST_INT] = "_BINARY_OP_SUBSCR_LIST_INT", - [_BINARY_OP_SUBSCR_LIST_INT_r21] = "_BINARY_OP_SUBSCR_LIST_INT_r21", + [_BINARY_OP_SUBSCR_LIST_INT_r23] = "_BINARY_OP_SUBSCR_LIST_INT_r23", [_BINARY_OP_SUBSCR_LIST_SLICE] = "_BINARY_OP_SUBSCR_LIST_SLICE", [_BINARY_OP_SUBSCR_LIST_SLICE_r21] = "_BINARY_OP_SUBSCR_LIST_SLICE_r21", [_BINARY_OP_SUBSCR_STR_INT] = "_BINARY_OP_SUBSCR_STR_INT", - [_BINARY_OP_SUBSCR_STR_INT_r21] = "_BINARY_OP_SUBSCR_STR_INT_r21", + [_BINARY_OP_SUBSCR_STR_INT_r23] = "_BINARY_OP_SUBSCR_STR_INT_r23", [_BINARY_OP_SUBSCR_TUPLE_INT] = "_BINARY_OP_SUBSCR_TUPLE_INT", [_BINARY_OP_SUBSCR_TUPLE_INT_r21] = "_BINARY_OP_SUBSCR_TUPLE_INT_r21", [_BINARY_OP_SUBTRACT_FLOAT] = "_BINARY_OP_SUBTRACT_FLOAT", @@ -4168,6 +4184,8 @@ const char *const _PyOpcode_uop_name[MAX_UOP_REGS_ID+1] = { [_FORMAT_WITH_SPEC] = "_FORMAT_WITH_SPEC", [_FORMAT_WITH_SPEC_r21] = "_FORMAT_WITH_SPEC_r21", [_FOR_ITER_GEN_FRAME] = "_FOR_ITER_GEN_FRAME", + [_FOR_ITER_GEN_FRAME_r03] = "_FOR_ITER_GEN_FRAME_r03", + [_FOR_ITER_GEN_FRAME_r13] = "_FOR_ITER_GEN_FRAME_r13", [_FOR_ITER_GEN_FRAME_r23] = "_FOR_ITER_GEN_FRAME_r23", [_FOR_ITER_TIER_TWO] = "_FOR_ITER_TIER_TWO", [_FOR_ITER_TIER_TWO_r23] = "_FOR_ITER_TIER_TWO_r23", @@ -4457,7 +4475,9 @@ const char *const _PyOpcode_uop_name[MAX_UOP_REGS_ID+1] = { [_LOAD_ATTR_CLASS] = "_LOAD_ATTR_CLASS", [_LOAD_ATTR_CLASS_r11] = "_LOAD_ATTR_CLASS_r11", [_LOAD_ATTR_INSTANCE_VALUE] = "_LOAD_ATTR_INSTANCE_VALUE", - [_LOAD_ATTR_INSTANCE_VALUE_r11] = 
"_LOAD_ATTR_INSTANCE_VALUE_r11", + [_LOAD_ATTR_INSTANCE_VALUE_r02] = "_LOAD_ATTR_INSTANCE_VALUE_r02", + [_LOAD_ATTR_INSTANCE_VALUE_r12] = "_LOAD_ATTR_INSTANCE_VALUE_r12", + [_LOAD_ATTR_INSTANCE_VALUE_r23] = "_LOAD_ATTR_INSTANCE_VALUE_r23", [_LOAD_ATTR_METHOD_LAZY_DICT] = "_LOAD_ATTR_METHOD_LAZY_DICT", [_LOAD_ATTR_METHOD_LAZY_DICT_r02] = "_LOAD_ATTR_METHOD_LAZY_DICT_r02", [_LOAD_ATTR_METHOD_LAZY_DICT_r12] = "_LOAD_ATTR_METHOD_LAZY_DICT_r12", @@ -4481,7 +4501,7 @@ const char *const _PyOpcode_uop_name[MAX_UOP_REGS_ID+1] = { [_LOAD_ATTR_SLOT] = "_LOAD_ATTR_SLOT", [_LOAD_ATTR_SLOT_r11] = "_LOAD_ATTR_SLOT_r11", [_LOAD_ATTR_WITH_HINT] = "_LOAD_ATTR_WITH_HINT", - [_LOAD_ATTR_WITH_HINT_r11] = "_LOAD_ATTR_WITH_HINT_r11", + [_LOAD_ATTR_WITH_HINT_r12] = "_LOAD_ATTR_WITH_HINT_r12", [_LOAD_BUILD_CLASS] = "_LOAD_BUILD_CLASS", [_LOAD_BUILD_CLASS_r01] = "_LOAD_BUILD_CLASS_r01", [_LOAD_COMMON_CONSTANT] = "_LOAD_COMMON_CONSTANT", @@ -4760,6 +4780,11 @@ const char *const _PyOpcode_uop_name[MAX_UOP_REGS_ID+1] = { [_SET_IP_r33] = "_SET_IP_r33", [_SET_UPDATE] = "_SET_UPDATE", [_SET_UPDATE_r10] = "_SET_UPDATE_r10", + [_SHUFFLE_3_LOAD_CONST_INLINE_BORROW] = "_SHUFFLE_3_LOAD_CONST_INLINE_BORROW", + [_SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r03] = "_SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r03", + [_SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r13] = "_SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r13", + [_SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r23] = "_SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r23", + [_SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r33] = "_SHUFFLE_3_LOAD_CONST_INLINE_BORROW_r33", [_SPILL_OR_RELOAD] = "_SPILL_OR_RELOAD", [_SPILL_OR_RELOAD_r01] = "_SPILL_OR_RELOAD_r01", [_SPILL_OR_RELOAD_r02] = "_SPILL_OR_RELOAD_r02", @@ -4782,7 +4807,7 @@ const char *const _PyOpcode_uop_name[MAX_UOP_REGS_ID+1] = { [_STORE_ATTR_SLOT] = "_STORE_ATTR_SLOT", [_STORE_ATTR_SLOT_r21] = "_STORE_ATTR_SLOT_r21", [_STORE_ATTR_WITH_HINT] = "_STORE_ATTR_WITH_HINT", - [_STORE_ATTR_WITH_HINT_r20] = "_STORE_ATTR_WITH_HINT_r20", + [_STORE_ATTR_WITH_HINT_r21] 
= "_STORE_ATTR_WITH_HINT_r21", [_STORE_DEREF] = "_STORE_DEREF", [_STORE_DEREF_r10] = "_STORE_DEREF_r10", [_STORE_FAST] = "_STORE_FAST", @@ -5477,6 +5502,8 @@ int _PyUop_num_popped(int opcode, int oparg) return 2; case _POP_CALL_ONE_LOAD_CONST_INLINE_BORROW: return 3; + case _SHUFFLE_3_LOAD_CONST_INLINE_BORROW: + return 3; case _POP_CALL_TWO_LOAD_CONST_INLINE_BORROW: return 4; case _LOAD_CONST_UNDER_INLINE: diff --git a/Include/patchlevel.h b/Include/patchlevel.h index 610d5bb4a12..90a73c8f2b1 100644 --- a/Include/patchlevel.h +++ b/Include/patchlevel.h @@ -27,7 +27,7 @@ #define PY_RELEASE_SERIAL 3 /* Version as a string */ -#define PY_VERSION "3.15.0a3" +#define PY_VERSION "3.15.0a3+" /*--end constants--*/ diff --git a/InternalDocs/profiling_binary_format.md b/InternalDocs/profiling_binary_format.md new file mode 100644 index 00000000000..b3ebdfd22ed --- /dev/null +++ b/InternalDocs/profiling_binary_format.md @@ -0,0 +1,489 @@ +# Profiling Binary Format + +The profiling module includes a binary file format for storing sampling +profiler data. This document describes the format's structure and the +design decisions behind it. + +The implementation is in +[`Modules/_remote_debugging/binary_io_writer.c`](../Modules/_remote_debugging/binary_io_writer.c) +and [`Modules/_remote_debugging/binary_io_reader.c`](../Modules/_remote_debugging/binary_io_reader.c), +with declarations in +[`Modules/_remote_debugging/binary_io.h`](../Modules/_remote_debugging/binary_io.h). + +## Overview + +The sampling profiler can generate enormous amounts of data. A typical +profiling session sampling at 1000 Hz for 60 seconds produces 60,000 samples. +Each sample contains a full call stack, often 20-50 frames deep, and each +frame includes a filename, function name, and line number. In a text-based +format like collapsed stacks, this would mean repeating the same long file +paths and function names thousands of times. + +The binary format addresses this through two key strategies: + +1. 
**Deduplication**: Strings and frames are stored once in lookup tables, + then referenced by small integer indices. A 100-character file path that + appears in 50,000 samples is stored once, not 50,000 times. + +2. **Compact encoding**: Variable-length integers (varints) encode small + values in fewer bytes. Since most indices are small (under 128), they + typically need only one byte instead of four. + +Together with optional zstd compression, these techniques reduce file sizes +by 10-50x compared to text formats while also enabling faster I/O. + +## File Layout + +The file consists of five sections: + +``` ++------------------+ Offset 0 +| Header | 64 bytes (fixed) ++------------------+ Offset 64 +| | +| Sample Data | Variable size (optionally compressed) +| | ++------------------+ string_table_offset +| String Table | Variable size ++------------------+ frame_table_offset +| Frame Table | Variable size ++------------------+ file_size - 32 +| Footer | 32 bytes (fixed) ++------------------+ file_size +``` + +The layout is designed for streaming writes during profiling. The profiler +cannot know in advance how many unique strings or frames will be encountered, +so these tables must be built incrementally and written at the end. + +The header comes first so readers can quickly validate the file and locate +the metadata tables. The sample data follows immediately, allowing the writer +to stream samples directly to disk (or through a compression stream) without +buffering the entire dataset in memory. + +The string and frame tables are placed after sample data because they grow +as new unique entries are discovered during profiling. By deferring their +output until finalization, the writer avoids the complexity of reserving +space or rewriting portions of the file. + +The footer at the end contains counts needed to allocate arrays before +parsing the tables. 
Placing it at a fixed offset from the end (rather than +at a variable offset recorded in the header) means readers can locate it +with a single seek to `file_size - 32`, without first reading the header. + +## Header + +``` + Offset Size Type Description ++--------+------+---------+----------------------------------------+ +| 0 | 4 | uint32 | Magic number (0x54414348 = "TACH") | +| 4 | 4 | uint32 | Format version | +| 8 | 4 | bytes | Python version (major, minor, micro, | +| | | | reserved) | +| 12 | 8 | uint64 | Start timestamp (microseconds) | +| 20 | 8 | uint64 | Sample interval (microseconds) | +| 28 | 4 | uint32 | Total sample count | +| 32 | 4 | uint32 | Thread count | +| 36 | 8 | uint64 | String table offset | +| 44 | 8 | uint64 | Frame table offset | +| 52 | 4 | uint32 | Compression type (0=none, 1=zstd) | +| 56 | 8 | bytes | Reserved (zero-filled) | ++--------+------+---------+----------------------------------------+ +``` + +The magic number `0x54414348` ("TACH" for Tachyon) identifies the file format +and also serves as an **endianness marker**. When read on a system with +different byte order than the writer, it appears as `0x48434154`. The reader +uses this to detect cross-endian files and automatically byte-swap all +multi-byte integer fields. + +The Python version field records the major, minor, and micro version numbers +of the Python interpreter that generated the file. This allows analysis tools +to detect version mismatches when replaying data collected on a different +Python version, which may have different internal structures or behaviors. + +The header is written as zeros initially, then overwritten with actual values +during finalization. This requires the output stream to be seekable, which +is acceptable since the format targets regular files rather than pipes or +network streams. + +## Sample Data + +Sample data begins at offset 64 and extends to `string_table_offset`. 
Samples +use delta compression to minimize redundancy when consecutive samples from the +same thread have identical or similar call stacks. + +### Stack Encoding Types + +Each sample record begins with thread identification, then an encoding byte: + +| Code | Name | Description | +|------|------|-------------| +| 0x00 | REPEAT | RLE: identical stack repeated N times | +| 0x01 | FULL | Complete stack (first sample or no match) | +| 0x02 | SUFFIX | Shares N frames from bottom of previous stack | +| 0x03 | POP_PUSH | Remove M frames from top, add N new frames | + +### Record Formats + +**REPEAT (0x00) - Run-Length Encoded Identical Stacks:** +``` ++-----------------+-----------+----------------------------------------+ +| thread_id | 8 bytes | Thread identifier (uint64, fixed) | +| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) | +| encoding | 1 byte | 0x00 (REPEAT) | +| count | varint | Number of samples in this RLE group | +| samples | varies | Interleaved: [delta: varint, status: 1]| +| | | repeated count times | ++-----------------+-----------+----------------------------------------+ +``` +The stack is inherited from this thread's previous sample. Each sample in the +group gets its own timestamp delta and status byte, stored as interleaved pairs +(delta1, status1, delta2, status2, ...) rather than separate arrays. 
+ +**FULL (0x01) - Complete Stack:** +``` ++-----------------+-----------+----------------------------------------+ +| thread_id | 8 bytes | Thread identifier (uint64, fixed) | +| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) | +| encoding | 1 byte | 0x01 (FULL) | +| timestamp_delta | varint | Microseconds since thread's last sample| +| status | 1 byte | Thread state flags | +| stack_depth | varint | Number of frames in call stack | +| frame_indices | varint[] | Array of frame table indices | ++-----------------+-----------+----------------------------------------+ +``` +Used for the first sample from a thread, or when delta encoding would not +provide savings. + +**SUFFIX (0x02) - Shared Suffix Match:** +``` ++-----------------+-----------+----------------------------------------+ +| thread_id | 8 bytes | Thread identifier (uint64, fixed) | +| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) | +| encoding | 1 byte | 0x02 (SUFFIX) | +| timestamp_delta | varint | Microseconds since thread's last sample| +| status | 1 byte | Thread state flags | +| shared_count | varint | Frames shared from bottom of prev stack| +| new_count | varint | New frames at top of stack | +| new_frames | varint[] | Array of new_count frame indices | ++-----------------+-----------+----------------------------------------+ +``` +Used when a function call added frames to the top of the stack. The shared +frames from the previous stack are kept, and new frames are prepended. 
+ +**POP_PUSH (0x03) - Pop and Push:** +``` ++-----------------+-----------+----------------------------------------+ +| thread_id | 8 bytes | Thread identifier (uint64, fixed) | +| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) | +| encoding | 1 byte | 0x03 (POP_PUSH) | +| timestamp_delta | varint | Microseconds since thread's last sample| +| status | 1 byte | Thread state flags | +| pop_count | varint | Frames to remove from top of prev stack| +| push_count | varint | New frames to add at top | +| new_frames | varint[] | Array of push_count frame indices | ++-----------------+-----------+----------------------------------------+ +``` +Used when the code path changed: some frames were popped (function returns) +and new frames were pushed (different function calls). + +### Thread and Interpreter Identification + +Thread IDs are 64-bit values that can be large (memory addresses on some +platforms) and vary unpredictably. Using a fixed 8-byte encoding avoids +the overhead of varint encoding for large values and simplifies parsing +since the reader knows exactly where each field begins. + +The interpreter ID identifies which Python sub-interpreter the thread +belongs to, allowing analysis tools to separate activity across interpreters +in processes using multiple sub-interpreters. + +### Status Byte + +The status byte is a bitfield encoding thread state at sample time: + +| Bit | Flag | Meaning | +|-----|-----------------------|--------------------------------------------| +| 0 | THREAD_STATUS_HAS_GIL | Thread holds the GIL (Global Interpreter Lock) | +| 1 | THREAD_STATUS_ON_CPU | Thread is actively running on a CPU core | +| 2 | THREAD_STATUS_UNKNOWN | Thread state could not be determined | +| 3 | THREAD_STATUS_GIL_REQUESTED | Thread is waiting to acquire the GIL | +| 4 | THREAD_STATUS_HAS_EXCEPTION | Thread has a pending exception | + +Multiple flags can be set simultaneously (e.g., a thread can hold the GIL +while also running on CPU). 
Analysis tools use these to filter samples or +visualize thread states over time. + +### Timestamp Delta Encoding + +Timestamps use delta encoding rather than absolute values. Absolute +timestamps in microseconds require 8 bytes each, but consecutive samples +from the same thread are typically separated by the sampling interval +(e.g., 1000 microseconds), so the delta between them is small and fits +in 1-2 varint bytes. The writer tracks the previous timestamp for each +thread separately. The first sample from a thread encodes its delta from +the profiling start time; subsequent samples encode the delta from that +thread's previous sample. This per-thread tracking is necessary because +samples are interleaved across threads in arrival order, not grouped by +thread. + +For REPEAT (RLE) records, timestamp deltas and status bytes are stored as +interleaved pairs (delta, status, delta, status, ...) - one pair per +repeated sample - allowing efficient batching while preserving the exact +timing and state of each sample. + +### Frame Indexing + +Each frame in a call stack is represented by an index into the frame table +rather than inline data. This provides massive space savings because call +stacks are highly repetitive: the same function appears in many samples +(hot functions), call stacks often share common prefixes (main -> app -> +handler -> ...), and recursive functions create repeated frame sequences. +A frame index is typically 1-2 varint bytes. Inline frame data would be +20-200+ bytes (two strings plus a line number). For a profile with 100,000 +samples averaging 30 frames each, this reduces frame data from potentially +gigabytes to tens of megabytes. + +Frame indices are written innermost-first (the currently executing frame +has index 0 in the array). This ordering works well with delta compression: +function calls typically add frames at the top (index 0), while shared +frames remain at the bottom. 
+ +## String Table + +The string table stores deduplicated UTF-8 strings (filenames and function +names). It begins at `string_table_offset` and contains entries in order of +their assignment during writing: + +``` ++----------------+ +| length: varint | +| data: bytes | ++----------------+ (repeated for each string) +``` + +Strings are stored in the order they were first encountered during writing. +The first unique filename gets index 0, the second gets index 1, and so on. +Length-prefixing (rather than null-termination) allows strings containing +null bytes and enables readers to allocate exact-sized buffers. The varint +length encoding means short strings (under 128 bytes) need only one length +byte. + +## Frame Table + +The frame table stores deduplicated frame entries: + +``` ++----------------------+ +| filename_idx: varint | +| funcname_idx: varint | +| lineno: svarint | ++----------------------+ (repeated for each frame) +``` + +Each unique (filename, funcname, lineno) combination gets one entry. Two +calls to the same function at different line numbers produce different +frame entries; two calls at the same line number share one entry. + +Strings and frames are deduplicated separately because they have different +cardinalities and reference patterns. A codebase might have hundreds of +unique source files but thousands of unique functions. Many functions share +the same filename, so storing the filename index in each frame entry (rather +than the full string) provides an additional layer of deduplication. A frame +entry is just three varints (typically 3-6 bytes) rather than two full +strings plus a line number. + +Line numbers use signed varint (zigzag encoding) rather than unsigned to +handle edge cases. Synthetic frames—generated frames that don't correspond +directly to Python source code, such as C extension boundaries or internal +interpreter frames—use line number 0 or -1 to indicate the absence of a +source location. 
Zigzag encoding ensures these small negative values encode +efficiently (−1 becomes 1, which is one byte) rather than requiring the +maximum varint length. + +## Footer + +``` + Offset Size Type Description ++--------+------+---------+----------------------------------------+ +| 0 | 4 | uint32 | String count | +| 4 | 4 | uint32 | Frame count | +| 8 | 8 | uint64 | Total file size | +| 16 | 16 | bytes | Checksum (reserved, currently zeros) | ++--------+------+---------+----------------------------------------+ +``` + +The string and frame counts allow readers to pre-allocate arrays of the +correct size before parsing the tables. Without these counts, readers would +need to either scan the tables twice (once to count, once to parse) or use +dynamically-growing arrays. + +The file size field provides a consistency check: if the actual file size +does not match, the file may be truncated or corrupted. + +The checksum field is reserved for future use. A checksum would allow +detection of corruption but adds complexity and computation cost. The +current implementation leaves this as zeros. + +## Variable-Length Integer Encoding + +The format uses LEB128 (Little Endian Base 128) for unsigned integers and +zigzag + LEB128 for signed integers. These encodings are widely used +(Protocol Buffers, DWARF debug info, WebAssembly) and well-understood. + +### Unsigned Varint (LEB128) + +Each byte stores 7 bits of data. The high bit indicates whether more bytes +follow: + +``` +Value Encoded bytes +0-127 [0xxxxxxx] (1 byte) +128-16383 [1xxxxxxx] [0xxxxxxx] (2 bytes) +16384+ [1xxxxxxx] [1xxxxxxx] ... (3+ bytes) +``` + +Most indices in profiling data are small. A profile with 1000 unique frames +needs at most 2 bytes per frame index. The common case (indices under 128) +needs only 1 byte. + +### Signed Varint (Zigzag) + +Standard LEB128 encodes −1 as a very large unsigned value, requiring many +bytes. 
Zigzag encoding interleaves positive and negative values: + +``` + 0 -> 0 -1 -> 1 1 -> 2 -2 -> 3 2 -> 4 +``` + +This ensures small-magnitude values (whether positive or negative) encode +in few bytes. + +## Compression + +When compression is enabled, the sample data region contains a zstd stream. +The string table, frame table, and footer remain uncompressed so readers can +access metadata without decompressing the entire file. A tool that only needs +to report "this file contains 50,000 samples of 3 threads" can read the header +and footer without touching the compressed sample data. This also simplifies +the format: the header's offset fields point directly to the tables rather +than to positions within a decompressed stream. + +Zstd provides an excellent balance of compression ratio and speed. Profiling +data compresses very well (often 5-10x) due to repetitive patterns: the same +small set of frame indices appears repeatedly, and delta-encoded timestamps +cluster around the sampling interval. Zstd's streaming API allows compression +without buffering the entire dataset. The writer feeds sample data through +the compressor incrementally, flushing compressed chunks to disk as they +become available. + +Level 5 compression is used as a default. Lower levels (1-3) are faster but +compress less; higher levels (6+) compress more but slow down writing. Level +5 provides good compression with minimal impact on profiling overhead. + +## Reading and Writing + +### Writing + +1. Open the output file and write 64 zero bytes as a placeholder header +2. Initialize empty string and frame dictionaries for deduplication +3. For each sample: + - Intern any new strings, assigning sequential indices + - Intern any new frames, assigning sequential indices + - Encode the sample record and write to the buffer + - Flush the buffer through compression (if enabled) when full +4. Flush remaining buffered data and finalize compression +5. 
Write the string table (length-prefixed strings in index order) +6. Write the frame table (varint-encoded entries in index order) +7. Write the footer with final counts +8. Seek to offset 0 and write the header with actual values + +The writer maintains two dictionaries: one mapping strings to indices, one +mapping (filename_idx, funcname_idx, lineno) tuples to frame indices. These +enable O(1) lookup during interning. + +### Reading + +1. Read the header magic number to detect endianness (set `needs_swap` flag + if the magic appears byte-swapped) +2. Validate version and read remaining header fields (byte-swapping if needed) +3. Seek to end − 32 and read the footer (byte-swapping counts if needed) +4. Allocate string array of `string_count` elements +5. Parse the string table, populating the array +6. Allocate frame array of `frame_count * 3` uint32 elements +7. Parse the frame table, populating the array +8. If compressed, decompress the sample data region +9. Iterate through samples, resolving indices to strings/frames + (byte-swapping thread_id and interpreter_id if needed) + +The reader builds lookup arrays rather than dictionaries since it only needs +index-to-value mapping, not value-to-index. + +## Platform Considerations + +### Byte Ordering and Cross-Platform Portability + +The binary format uses **native byte order** for all multi-byte integer +fields when writing. However, the reader supports **cross-endian reading**: +files written on a little-endian system (x86, ARM) can be read on a +big-endian system (s390x, PowerPC), and vice versa. + +The magic number doubles as an endianness marker. When read on a system with +different byte order, it appears byte-swapped (`0x48434154` instead of +`0x54414348`). The reader detects this and automatically byte-swaps all +fixed-width integer fields during parsing. + +Writers must use `memcpy()` from properly-sized integer types when writing +fixed-width integer fields. 
When the source variable's type differs from the +field width (e.g., `size_t` written as 4 bytes), explicit casting to the +correct type (e.g., `uint32_t`) is required before `memcpy()`. On big-endian +systems, copying from an oversized type would copy the wrong bytes—high-order +zeros instead of the actual value. + +The reader tracks whether byte-swapping is needed via a `needs_swap` flag set +during header parsing. All fixed-width fields in the header, footer, and +sample data are conditionally byte-swapped using Python's internal byte-swap +functions (`_Py_bswap32`, `_Py_bswap64` from `pycore_bitutils.h`). + +Variable-length integers (varints) are byte-order independent since they +encode values one byte at a time using the LEB128 scheme, so they require +no special handling for cross-endian reading. + +### Memory-Mapped I/O + +On Unix systems (Linux, macOS), the reader uses `mmap()` to map the file +into the process address space. The kernel handles paging data in and out +as needed, no explicit read() calls or buffer management are required, +multiple readers can share the same physical pages, and sequential access +patterns benefit from kernel read-ahead. + +The implementation uses `madvise()` to hint the access pattern to the kernel: +`MADV_SEQUENTIAL` indicates the file will be read linearly, enabling +aggressive read-ahead. `MADV_WILLNEED` requests pre-faulting of pages. +On Linux, `MAP_POPULATE` pre-faults all pages at mmap time rather than on +first access, moving page fault overhead from the parsing loop to the +initial mapping for more predictable performance. For large files (over +32 MB), `MADV_HUGEPAGE` requests transparent huge pages (2 MB instead of +4 KB) to reduce TLB pressure when accessing large amounts of data. + +On Windows, the implementation falls back to standard file I/O with full +file buffering. Profiling data files are typically small enough (tens to +hundreds of megabytes) that this is acceptable. 
+ +The writer uses a 512 KB buffer to batch small writes. Each sample record +is typically tens of bytes; writing these individually would incur excessive +syscall overhead. The buffer accumulates data until full, then flushes in +one write() call (or feeds through the compression stream). + +## Future Considerations + +The format reserves space for future extensions. The 12 reserved bytes in +the header could hold additional metadata. The 16-byte checksum field in +the footer is currently unused. The version field allows incompatible +changes with graceful rejection. New compression types could be added +(compression_type > 1). + +Any changes that alter the meaning of existing fields or the parsing logic +should increment the version number to prevent older readers from +misinterpreting new files. diff --git a/Lib/asyncio/__main__.py b/Lib/asyncio/__main__.py index d078ebfa4ce..afbb70bbcab 100644 --- a/Lib/asyncio/__main__.py +++ b/Lib/asyncio/__main__.py @@ -86,14 +86,15 @@ class REPLThread(threading.Thread): global return_code try: - banner = ( - f'asyncio REPL {sys.version} on {sys.platform}\n' - f'Use "await" directly instead of "asyncio.run()".\n' - f'Type "help", "copyright", "credits" or "license" ' - f'for more information.\n' - ) + if not sys.flags.quiet: + banner = ( + f'asyncio REPL {sys.version} on {sys.platform}\n' + f'Use "await" directly instead of "asyncio.run()".\n' + f'Type "help", "copyright", "credits" or "license" ' + f'for more information.\n' + ) - console.write(banner) + console.write(banner) if startup_path := os.getenv("PYTHONSTARTUP"): sys.audit("cpython.run_startup", startup_path) @@ -240,4 +241,5 @@ if __name__ == '__main__': break console.write('exiting asyncio REPL...\n') + loop.close() sys.exit(return_code) diff --git a/Lib/configparser.py b/Lib/configparser.py index 18af1eadaad..d435a5c2fe0 100644 --- a/Lib/configparser.py +++ b/Lib/configparser.py @@ -794,7 +794,8 @@ class RawConfigParser(MutableMapping): """ elements_added = 
set() for section, keys in dictionary.items(): - section = str(section) + if section is not UNNAMED_SECTION: + section = str(section) try: self.add_section(section) except (DuplicateSectionError, ValueError): diff --git a/Lib/locale.py b/Lib/locale.py index 37cafb4a601..0f1b429ea41 100644 --- a/Lib/locale.py +++ b/Lib/locale.py @@ -214,7 +214,7 @@ def format_string(f, val, grouping=False, monetary=False): Grouping is applied if the third parameter is true. Conversion uses monetary thousands separator and grouping strings if - forth parameter monetary is true.""" + fourth parameter monetary is true.""" global _percent_re if _percent_re is None: import re diff --git a/Lib/mailbox.py b/Lib/mailbox.py index 4a44642765c..65923e9c5de 100644 --- a/Lib/mailbox.py +++ b/Lib/mailbox.py @@ -2181,11 +2181,7 @@ def _unlock_file(f): def _create_carefully(path): """Create a file if it doesn't exist and open for reading and writing.""" - fd = os.open(path, os.O_CREAT | os.O_EXCL | os.O_RDWR, 0o666) - try: - return open(path, 'rb+') - finally: - os.close(fd) + return open(path, 'xb+') def _create_temporary(path): """Create a temp file based on path and open for reading and writing.""" diff --git a/Lib/pdb.py b/Lib/pdb.py index c1a5db080dc..eee0273fdc4 100644 --- a/Lib/pdb.py +++ b/Lib/pdb.py @@ -391,17 +391,22 @@ class Pdb(bdb.Bdb, cmd.Cmd): # Read ~/.pdbrc and ./.pdbrc self.rcLines = [] if readrc: + home_rcfile = os.path.expanduser("~/.pdbrc") + local_rcfile = os.path.abspath(".pdbrc") + try: - with open(os.path.expanduser('~/.pdbrc'), encoding='utf-8') as rcFile: - self.rcLines.extend(rcFile) - except OSError: - pass - try: - with open(".pdbrc", encoding='utf-8') as rcFile: - self.rcLines.extend(rcFile) + with open(home_rcfile, encoding='utf-8') as rcfile: + self.rcLines.extend(rcfile) except OSError: pass + if local_rcfile != home_rcfile: + try: + with open(local_rcfile, encoding='utf-8') as rcfile: + self.rcLines.extend(rcfile) + except OSError: + pass + self.commands = {} # 
associates a command list to breakpoint numbers self.commands_defining = False # True while in the process of defining # a command list @@ -1315,7 +1320,14 @@ class Pdb(bdb.Bdb, cmd.Cmd): reached. """ if not arg: - bnum = len(bdb.Breakpoint.bpbynumber) - 1 + for bp in reversed(bdb.Breakpoint.bpbynumber): + if bp is None: + continue + bnum = bp.number + break + else: + self.error('cannot set commands: no existing breakpoint') + return else: try: bnum = int(arg) diff --git a/Lib/profiling/sampling/__main__.py b/Lib/profiling/sampling/__main__.py index 47bd3a0113e..a45b645eae0 100644 --- a/Lib/profiling/sampling/__main__.py +++ b/Lib/profiling/sampling/__main__.py @@ -46,6 +46,7 @@ system restrictions or missing privileges. """ from .cli import main +from .errors import SamplingUnknownProcessError, SamplingModuleNotFoundError, SamplingScriptNotFoundError def handle_permission_error(): """Handle PermissionError by displaying appropriate error message.""" @@ -64,3 +65,9 @@ if __name__ == '__main__': main() except PermissionError: handle_permission_error() + except SamplingUnknownProcessError as err: + print(f"Tachyon cannot find the process: {err}", file=sys.stderr) + sys.exit(1) + except (SamplingModuleNotFoundError, SamplingScriptNotFoundError) as err: + print(f"Tachyon cannot find the target: {err}", file=sys.stderr) + sys.exit(1) diff --git a/Lib/profiling/sampling/_format_utils.py b/Lib/profiling/sampling/_format_utils.py new file mode 100644 index 00000000000..237a4f4186b --- /dev/null +++ b/Lib/profiling/sampling/_format_utils.py @@ -0,0 +1,5 @@ +import locale + + +def fmt(value: int | float, decimals: int = 1) -> str: + return locale.format_string(f'%.{decimals}f', value, grouping=True) diff --git a/Lib/profiling/sampling/_heatmap_assets/heatmap.css b/Lib/profiling/sampling/_heatmap_assets/heatmap.css index 4fba9d866ac..9999cd6760f 100644 --- a/Lib/profiling/sampling/_heatmap_assets/heatmap.css +++ b/Lib/profiling/sampling/_heatmap_assets/heatmap.css @@ -5,6 
+5,18 @@ This file extends the shared foundation with heatmap-specific styles. ========================================================================== */ +/* Heatmap heat colors - using base.css colors with 60% opacity */ +[data-theme="dark"] { + --heat-1: rgba(90, 123, 167, 0.60); + --heat-2: rgba(106, 148, 168, 0.60); + --heat-3: rgba(122, 172, 172, 0.60); + --heat-4: rgba(142, 196, 152, 0.60); + --heat-5: rgba(168, 216, 136, 0.60); + --heat-6: rgba(200, 222, 122, 0.60); + --heat-7: rgba(244, 212, 93, 0.60); + --heat-8: rgba(255, 122, 69, 0.60); +} + /* -------------------------------------------------------------------------- Layout Overrides (Heatmap-specific) -------------------------------------------------------------------------- */ @@ -1129,6 +1141,10 @@ .line-samples-cumulative { padding: 0 4px; } + + .bytecode-panel { + margin: 8px 10px 8px 160px; + } } .bytecode-toggle { @@ -1160,13 +1176,77 @@ } .bytecode-panel { - margin-left: 90px; - padding: 8px 15px; - background: var(--bg-secondary); - border-left: 3px solid var(--accent); + background: var(--bg-primary); + border: 1px solid var(--border); + border-radius: 8px; + box-shadow: var(--shadow-md); font-family: var(--font-mono); font-size: 12px; - margin-bottom: 4px; + color: var(--text-primary); + line-height: 1.5; + word-wrap: break-word; + overflow-wrap: break-word; + padding: 0; + margin: 8px 10px 8px 250px; + position: relative; + z-index: 1; + overflow-y: auto; + max-height: 500px; + flex: 1; + transition: padding 0.3s cubic-bezier(0.4, 0, 0.2, 1); +} + +.bytecode-panel.expanded { + padding: 14px; +} + +.bytecode-wrapper { + position: relative; + display: flex; + overflow: visible; + max-height: 0; + opacity: 0; + transition: max-height 0.4s cubic-bezier(0.4, 0, 0.2, 1), opacity 0.3s ease-in-out; +} + +.bytecode-wrapper.expanded { + max-height: 600px; + opacity: 1; + transition: max-height 0.5s cubic-bezier(0.4, 0, 0.2, 1), opacity 0.4s ease-in-out; +} + +/* Column backdrop matching table 
header columns (line/self/total) */ +.bytecode-columns { + display: none; + position: absolute; + left: 0; + overflow: hidden; + pointer-events: none; + z-index: 0; +} + +.bytecode-wrapper.expanded .bytecode-columns { + display: flex; + top: 0; + bottom: 0; +} + +.bytecode-panel::-webkit-scrollbar { + width: 8px; +} + +.bytecode-panel::-webkit-scrollbar-track { + background: var(--bg-secondary); + border-radius: 4px; +} + +.bytecode-panel::-webkit-scrollbar-thumb { + background: var(--border); + border-radius: 4px; +} + +.bytecode-panel::-webkit-scrollbar-thumb:hover { + background: var(--text-muted); } /* Specialization summary bar */ diff --git a/Lib/profiling/sampling/_heatmap_assets/heatmap.js b/Lib/profiling/sampling/_heatmap_assets/heatmap.js index 8ac4ef43e53..53928b7b20f 100644 --- a/Lib/profiling/sampling/_heatmap_assets/heatmap.js +++ b/Lib/profiling/sampling/_heatmap_assets/heatmap.js @@ -15,37 +15,13 @@ let coldCodeHidden = false; // ============================================================================ function toggleTheme() { - const html = document.documentElement; - const current = html.getAttribute('data-theme') || 'light'; - const next = current === 'light' ? 'dark' : 'light'; - html.setAttribute('data-theme', next); - localStorage.setItem('heatmap-theme', next); - - // Update theme button icon - const btn = document.getElementById('theme-btn'); - if (btn) { - btn.querySelector('.icon-moon').style.display = next === 'dark' ? 'none' : ''; - btn.querySelector('.icon-sun').style.display = next === 'dark' ? 
'' : 'none'; - } + toggleAndSaveTheme(); applyLineColors(); // Rebuild scroll marker with new theme colors buildScrollMarker(); } -function restoreUIState() { - // Restore theme - const savedTheme = localStorage.getItem('heatmap-theme'); - if (savedTheme) { - document.documentElement.setAttribute('data-theme', savedTheme); - const btn = document.getElementById('theme-btn'); - if (btn) { - btn.querySelector('.icon-moon').style.display = savedTheme === 'dark' ? 'none' : ''; - btn.querySelector('.icon-sun').style.display = savedTheme === 'dark' ? '' : 'none'; - } - } -} - // ============================================================================ // Utility Functions // ============================================================================ @@ -542,20 +518,23 @@ function toggleBytecode(button) { const lineId = lineDiv.id; const lineNum = lineId.replace('line-', ''); const panel = document.getElementById(`bytecode-${lineNum}`); + const wrapper = document.getElementById(`bytecode-wrapper-${lineNum}`); - if (!panel) return; + if (!panel || !wrapper) return; - const isExpanded = panel.style.display !== 'none'; + const isExpanded = panel.classList.contains('expanded'); if (isExpanded) { - panel.style.display = 'none'; + panel.classList.remove('expanded'); + wrapper.classList.remove('expanded'); button.classList.remove('expanded'); button.innerHTML = '▶'; // Right arrow } else { if (!panel.dataset.populated) { populateBytecodePanel(panel, button); } - panel.style.display = 'block'; + panel.classList.add('expanded'); + wrapper.classList.add('expanded'); button.classList.add('expanded'); button.innerHTML = '▼'; // Down arrow } @@ -598,10 +577,12 @@ function populateBytecodePanel(panel, button) { else if (specPct >= 33) specClass = 'medium'; // Build specialization summary + const instruction_word = instructions.length === 1 ? 'instruction' : 'instructions'; + const sample_word = totalSamples === 1 ? 'sample' : 'samples'; let html = `
${specPct}% specialized - (${specializedCount}/${instructions.length} instructions, ${specializedSamples.toLocaleString()}/${totalSamples.toLocaleString()} samples) + (${specializedCount}/${instructions.length} ${instruction_word}, ${specializedSamples.toLocaleString()}/${totalSamples.toLocaleString()} ${sample_word})
`; html += '
' + diff --git a/Lib/profiling/sampling/_heatmap_assets/heatmap_index.js b/Lib/profiling/sampling/_heatmap_assets/heatmap_index.js index 8eb6af0db53..db4b5485056 100644 --- a/Lib/profiling/sampling/_heatmap_assets/heatmap_index.js +++ b/Lib/profiling/sampling/_heatmap_assets/heatmap_index.js @@ -19,35 +19,10 @@ function applyHeatmapBarColors() { // ============================================================================ function toggleTheme() { - const html = document.documentElement; - const current = html.getAttribute('data-theme') || 'light'; - const next = current === 'light' ? 'dark' : 'light'; - html.setAttribute('data-theme', next); - localStorage.setItem('heatmap-theme', next); - - // Update theme button icon - const btn = document.getElementById('theme-btn'); - if (btn) { - btn.querySelector('.icon-moon').style.display = next === 'dark' ? 'none' : ''; - btn.querySelector('.icon-sun').style.display = next === 'dark' ? '' : 'none'; - } - + toggleAndSaveTheme(); applyHeatmapBarColors(); } -function restoreUIState() { - // Restore theme - const savedTheme = localStorage.getItem('heatmap-theme'); - if (savedTheme) { - document.documentElement.setAttribute('data-theme', savedTheme); - const btn = document.getElementById('theme-btn'); - if (btn) { - btn.querySelector('.icon-moon').style.display = savedTheme === 'dark' ? 'none' : ''; - btn.querySelector('.icon-sun').style.display = savedTheme === 'dark' ? 
'' : 'none'; - } - } -} - // ============================================================================ // Type Section Toggle (stdlib, project, etc) // ============================================================================ diff --git a/Lib/profiling/sampling/_heatmap_assets/heatmap_index_template.html b/Lib/profiling/sampling/_heatmap_assets/heatmap_index_template.html index 3620f8efb80..8d04149abe3 100644 --- a/Lib/profiling/sampling/_heatmap_assets/heatmap_index_template.html +++ b/Lib/profiling/sampling/_heatmap_assets/heatmap_index_template.html @@ -1,5 +1,5 @@ - + diff --git a/Lib/profiling/sampling/_heatmap_assets/heatmap_pyfile_template.html b/Lib/profiling/sampling/_heatmap_assets/heatmap_pyfile_template.html index 91b629b2628..2a9c07647e6 100644 --- a/Lib/profiling/sampling/_heatmap_assets/heatmap_pyfile_template.html +++ b/Lib/profiling/sampling/_heatmap_assets/heatmap_pyfile_template.html @@ -1,5 +1,5 @@ - + diff --git a/Lib/profiling/sampling/_heatmap_assets/heatmap_shared.js b/Lib/profiling/sampling/_heatmap_assets/heatmap_shared.js index 7fcd720d45d..84b13ca0a96 100644 --- a/Lib/profiling/sampling/_heatmap_assets/heatmap_shared.js +++ b/Lib/profiling/sampling/_heatmap_assets/heatmap_shared.js @@ -39,6 +39,42 @@ function intensityToColor(intensity) { return rootStyle.getPropertyValue(`--heat-${level}`).trim(); } +// ============================================================================ +// Theme Support +// ============================================================================ + +// Get the preferred theme from localStorage or browser preference +function getPreferredTheme() { + const saved = localStorage.getItem('heatmap-theme'); + if (saved) return saved; + return window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light'; +} + +// Apply theme and update UI. Returns the applied theme. 
+function applyTheme(theme) { + document.documentElement.setAttribute('data-theme', theme); + const btn = document.getElementById('theme-btn'); + if (btn) { + btn.querySelector('.icon-moon').style.display = theme === 'dark' ? 'none' : ''; + btn.querySelector('.icon-sun').style.display = theme === 'dark' ? '' : 'none'; + } + return theme; +} + +// Toggle theme and save preference. Returns the new theme. +function toggleAndSaveTheme() { + const current = document.documentElement.getAttribute('data-theme') || 'light'; + const next = current === 'light' ? 'dark' : 'light'; + applyTheme(next); + localStorage.setItem('heatmap-theme', next); + return next; +} + +// Restore theme from localStorage, or use browser preference +function restoreUIState() { + applyTheme(getPreferredTheme()); +} + // ============================================================================ // Favicon (Reuse logo image as favicon) // ============================================================================ diff --git a/Lib/profiling/sampling/_shared_assets/base.css b/Lib/profiling/sampling/_shared_assets/base.css index 39bdd52e943..cb59a0f77c5 100644 --- a/Lib/profiling/sampling/_shared_assets/base.css +++ b/Lib/profiling/sampling/_shared_assets/base.css @@ -124,15 +124,15 @@ --header-gradient: linear-gradient(135deg, #21262d 0%, #30363d 100%); - /* Dark mode heat palette - muted colors that provide sufficient contrast with light text */ - --heat-1: rgba(74, 123, 167, 0.35); - --heat-2: rgba(90, 159, 168, 0.38); - --heat-3: rgba(106, 181, 181, 0.40); - --heat-4: rgba(126, 196, 136, 0.42); - --heat-5: rgba(160, 216, 120, 0.45); - --heat-6: rgba(196, 222, 106, 0.48); - --heat-7: rgba(244, 212, 77, 0.50); - --heat-8: rgba(255, 107, 53, 0.55); + /* Dark mode heat palette - cool to warm gradient for visualization */ + --heat-1: rgba(90, 123, 167, 1); + --heat-2: rgba(106, 148, 168, 1); + --heat-3: rgba(122, 172, 172, 1); + --heat-4: rgba(142, 196, 152, 1); + --heat-5: rgba(168, 216, 136, 1); + 
--heat-6: rgba(200, 222, 122, 1); + --heat-7: rgba(244, 212, 93, 1); + --heat-8: rgba(255, 122, 69, 1); /* Code view specific - dark mode */ --code-bg: #0d1117; diff --git a/Lib/profiling/sampling/binary_collector.py b/Lib/profiling/sampling/binary_collector.py new file mode 100644 index 00000000000..64afe632fae --- /dev/null +++ b/Lib/profiling/sampling/binary_collector.py @@ -0,0 +1,120 @@ +"""Thin Python wrapper around C binary writer for profiling data.""" + +import time + +import _remote_debugging + +from .collector import Collector + +# Compression type constants (must match binary_io.h) +COMPRESSION_NONE = 0 +COMPRESSION_ZSTD = 1 + + +def _resolve_compression(compression): + """Resolve compression type from string or int. + + Args: + compression: 'auto', 'zstd', 'none', or int (0/1) + + Returns: + int: Compression type constant + """ + if isinstance(compression, int): + return compression + + compression = compression.lower() + if compression == 'none': + return COMPRESSION_NONE + elif compression == 'zstd': + return COMPRESSION_ZSTD + elif compression == 'auto': + # Auto: use zstd if available, otherwise none + if _remote_debugging.zstd_available(): + return COMPRESSION_ZSTD + return COMPRESSION_NONE + else: + raise ValueError(f"Unknown compression type: {compression}") + + +class BinaryCollector(Collector): + """High-performance binary collector using C implementation. + + This collector writes profiling data directly to a binary file format + with optional zstd compression. All I/O is performed in C for maximum + throughput. + + The binary format uses string/frame deduplication and varint encoding + for efficient storage. + """ + + def __init__(self, filename, sample_interval_usec, *, skip_idle=False, + compression='auto'): + """Create a new binary collector. 
+ + Args: + filename: Path to output binary file + sample_interval_usec: Sampling interval in microseconds + skip_idle: If True, skip idle threads (not used in binary format) + compression: 'auto', 'zstd', 'none', or int (0=none, 1=zstd) + """ + self.filename = filename + self.sample_interval_usec = sample_interval_usec + self.skip_idle = skip_idle + + compression_type = _resolve_compression(compression) + start_time_us = int(time.monotonic() * 1_000_000) + self._writer = _remote_debugging.BinaryWriter( + filename, sample_interval_usec, start_time_us, compression=compression_type + ) + + def collect(self, stack_frames, timestamp_us=None): + """Collect profiling data from stack frames. + + This passes stack_frames directly to the C writer which handles + all encoding and buffering. + + Args: + stack_frames: List of InterpreterInfo objects from _remote_debugging + timestamp_us: Optional timestamp in microseconds. If not provided, + uses time.monotonic() to generate one. + """ + if timestamp_us is None: + timestamp_us = int(time.monotonic() * 1_000_000) + self._writer.write_sample(stack_frames, timestamp_us) + + def collect_failed_sample(self): + """Record a failed sample attempt (no-op for binary format).""" + pass + + def export(self, filename=None): + """Finalize and close the binary file. + + Args: + filename: Ignored (binary files are written incrementally) + """ + self._writer.finalize() + + @property + def total_samples(self): + return self._writer.total_samples + + def get_stats(self): + """Get encoding statistics. + + Returns: + Dict with encoding statistics including repeat/full/suffix/pop-push + record counts, frames written/saved, and compression ratio. 
+ """ + return self._writer.get_stats() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit - finalize unless there was an error.""" + if exc_type is None: + self._writer.finalize() + else: + self._writer.close() + return False diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py new file mode 100644 index 00000000000..50c96668cc5 --- /dev/null +++ b/Lib/profiling/sampling/binary_reader.py @@ -0,0 +1,128 @@ +"""Thin Python wrapper around C binary reader for profiling data.""" + + +class BinaryReader: + """High-performance binary reader using C implementation. + + This reader uses memory-mapped I/O (on Unix) for fast replay of + profiling data from binary files. + + Use as a context manager: + with BinaryReader('profile.bin') as reader: + info = reader.get_info() + reader.replay_samples(collector, progress_callback) + """ + + def __init__(self, filename): + """Create a new binary reader. + + Args: + filename: Path to input binary file + """ + self.filename = filename + self._reader = None + + def __enter__(self): + import _remote_debugging + self._reader = _remote_debugging.BinaryReader(self.filename) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._reader is not None: + self._reader.close() + self._reader = None + return False + + def get_info(self): + """Get metadata about the binary file. + + Returns: + dict: File metadata including: + - sample_count: Number of samples in the file + - sample_interval_us: Sampling interval in microseconds + - start_time_us: Start timestamp in microseconds + - string_count: Number of unique strings + - frame_count: Number of unique frames + - compression: Compression type used + """ + if self._reader is None: + raise RuntimeError("Reader not open. 
Use as context manager.") + return self._reader.get_info() + + def replay_samples(self, collector, progress_callback=None): + """Replay samples from binary file through a collector. + + This allows converting binary profiling data to other formats + (e.g., flamegraph, pstats) by replaying through the appropriate + collector. + + Args: + collector: A Collector instance with a collect() method + progress_callback: Optional callable(current, total) for progress + + Returns: + int: Number of samples replayed + """ + if self._reader is None: + raise RuntimeError("Reader not open. Use as context manager.") + return self._reader.replay(collector, progress_callback) + + @property + def sample_count(self): + if self._reader is None: + raise RuntimeError("Reader not open. Use as context manager.") + return self._reader.get_info()['sample_count'] + + def get_stats(self): + """Get reconstruction statistics from replay. + + Returns: + dict: Statistics about record types decoded and samples + reconstructed during replay. + """ + if self._reader is None: + raise RuntimeError("Reader not open. Use as context manager.") + return self._reader.get_stats() + + +def convert_binary_to_format(input_file, output_file, output_format, + sample_interval_usec=None, progress_callback=None): + """Convert a binary profiling file to another format. + + Args: + input_file: Path to input binary file + output_file: Path to output file + output_format: Target format ('flamegraph', 'collapsed', 'pstats', etc.) 
+ sample_interval_usec: Override sample interval (uses file's if None) + progress_callback: Optional callable(current, total) for progress + + Returns: + int: Number of samples converted + """ + from .gecko_collector import GeckoCollector + from .stack_collector import FlamegraphCollector, CollapsedStackCollector + from .pstats_collector import PstatsCollector + + with BinaryReader(input_file) as reader: + info = reader.get_info() + interval = sample_interval_usec or info['sample_interval_us'] + + # Create appropriate collector based on format + if output_format == 'flamegraph': + collector = FlamegraphCollector(interval) + elif output_format == 'collapsed': + collector = CollapsedStackCollector(interval) + elif output_format == 'pstats': + collector = PstatsCollector(interval) + elif output_format == 'gecko': + collector = GeckoCollector(interval) + else: + raise ValueError(f"Unknown output format: {output_format}") + + # Replay samples through collector + count = reader.replay_samples(collector, progress_callback) + + # Export to target format + collector.export(output_file) + + return count diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py index e1ff3758c0d..0403c75c80f 100644 --- a/Lib/profiling/sampling/cli.py +++ b/Lib/profiling/sampling/cli.py @@ -2,6 +2,7 @@ import argparse import importlib.util +import locale import os import selectors import socket @@ -10,11 +11,14 @@ import sys import time from contextlib import nullcontext -from .sample import sample, sample_live +from .errors import SamplingUnknownProcessError, SamplingModuleNotFoundError, SamplingScriptNotFoundError +from .sample import sample, sample_live, _is_process_running from .pstats_collector import PstatsCollector from .stack_collector import CollapsedStackCollector, FlamegraphCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector +from .binary_collector import BinaryCollector +from .binary_reader import BinaryReader from
.constants import ( PROFILING_MODE_ALL, PROFILING_MODE_WALL, @@ -74,6 +78,7 @@ FORMAT_EXTENSIONS = { "flamegraph": "html", "gecko": "json", "heatmap": "html", + "binary": "bin", } COLLECTOR_MAP = { @@ -82,6 +87,7 @@ COLLECTOR_MAP = { "flamegraph": FlamegraphCollector, "gecko": GeckoCollector, "heatmap": HeatmapCollector, + "binary": BinaryCollector, } def _setup_child_monitor(args, parent_pid): @@ -179,7 +185,7 @@ def _parse_mode(mode_string): def _check_process_died(process): """Check if process died and raise an error with stderr if available.""" if process.poll() is None: - return # Process still running + return # Process died - try to get stderr for error message stderr_msg = "" @@ -364,7 +370,7 @@ def _add_mode_options(parser): ) -def _add_format_options(parser): +def _add_format_options(parser, include_compression=True, include_binary=True): """Add output format options to a parser.""" output_group = parser.add_argument_group("Output options") format_group = output_group.add_mutually_exclusive_group() @@ -403,8 +409,24 @@ def _add_format_options(parser): dest="format", help="Generate interactive HTML heatmap visualization with line-level sample counts", ) + if include_binary: + format_group.add_argument( + "--binary", + action="store_const", + const="binary", + dest="format", + help="Generate high-performance binary format (use 'replay' command to convert)", + ) parser.set_defaults(format="pstats") + if include_compression: + output_group.add_argument( + "--compression", + choices=["auto", "zstd", "none"], + default="auto", + help="Compression for binary format: auto (use zstd if available), zstd, none", + ) + output_group.add_argument( "-o", "--output", @@ -459,15 +481,18 @@ def _sort_to_mode(sort_choice): return sort_map.get(sort_choice, SORT_MODE_NSAMPLES) -def _create_collector(format_type, interval, skip_idle, opcodes=False): +def _create_collector(format_type, interval, skip_idle, opcodes=False, + output_file=None, compression='auto'): """Create the 
appropriate collector based on format type. Args: - format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap') + format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap', 'binary') interval: Sampling interval in microseconds skip_idle: Whether to skip idle samples opcodes: Whether to collect opcode information (only used by gecko format for creating interval markers in Firefox Profiler) + output_file: Output file path (required for binary format) + compression: Compression type for binary format ('auto', 'zstd', 'none') Returns: A collector instance of the appropriate type @@ -476,6 +501,13 @@ def _create_collector(format_type, interval, skip_idle, opcodes=False): if collector_class is None: raise ValueError(f"Unknown format: {format_type}") + # Binary format requires output file and compression + if format_type == "binary": + if output_file is None: + raise ValueError("Binary format requires an output file") + return collector_class(output_file, interval, skip_idle=skip_idle, + compression=compression) + # Gecko format never skips idle (it needs both GIL and CPU data) # and is the only format that uses opcodes for interval markers if format_type == "gecko": @@ -511,7 +543,12 @@ def _handle_output(collector, args, pid, mode): pid: Process ID (for generating filenames) mode: Profiling mode used """ - if args.format == "pstats": + if args.format == "binary": + # Binary format already wrote to file incrementally, just finalize + collector.export(None) + filename = collector.filename + print(f"Binary profile written to {filename} ({collector.total_samples} samples)") + elif args.format == "pstats": if args.outfile: # If outfile is a directory, generate filename inside it if os.path.isdir(args.outfile): @@ -544,6 +581,10 @@ args: Parsed command-line arguments parser: ArgumentParser instance for error reporting """ + # Replay command has no special validation needed + if
getattr(args, 'command', None) == "replay": + return + # Check if live mode is available if hasattr(args, 'live') and args.live and LiveStatsCollector is None: parser.error( @@ -556,7 +597,7 @@ def _validate_args(args, parser): parser.error("--subprocesses is incompatible with --live mode.") # Async-aware mode is incompatible with --native, --no-gc, --mode, and --all-threads - if args.async_aware: + if getattr(args, 'async_aware', False): issues = [] if args.native: issues.append("--native") @@ -573,7 +614,7 @@ def _validate_args(args, parser): ) # --async-mode requires --async-aware - if hasattr(args, 'async_mode') and args.async_mode != "running" and not args.async_aware: + if hasattr(args, 'async_mode') and args.async_mode != "running" and not getattr(args, 'async_aware', False): parser.error("--async-mode requires --async-aware to be enabled.") # Live mode is incompatible with format options @@ -601,7 +642,7 @@ def _validate_args(args, parser): return # Validate gecko mode doesn't use non-wall mode - if args.format == "gecko" and args.mode != "wall": + if args.format == "gecko" and getattr(args, 'mode', 'wall') != "wall": parser.error( "--mode option is incompatible with --gecko. " "Gecko format automatically includes both GIL-holding and CPU status analysis." @@ -609,7 +650,7 @@ def _validate_args(args, parser): # Validate --opcodes is only used with compatible formats opcodes_compatible_formats = ("live", "gecko", "flamegraph", "heatmap") - if args.opcodes and args.format not in opcodes_compatible_formats: + if getattr(args, 'opcodes', False) and args.format not in opcodes_compatible_formats: parser.error( f"--opcodes is only compatible with {', '.join('--' + f for f in opcodes_compatible_formats)}." 
) @@ -633,6 +674,16 @@ def _validate_args(args, parser): def main(): """Main entry point for the CLI.""" + # Set locale for number formatting, restore on exit + old_locale = locale.setlocale(locale.LC_ALL, None) + locale.setlocale(locale.LC_ALL, "") + try: + _main() + finally: + locale.setlocale(locale.LC_ALL, old_locale) + + +def _main(): # Create the main parser parser = argparse.ArgumentParser( description=_HELP_DESCRIPTION, @@ -721,6 +772,30 @@ Examples: _add_format_options(attach_parser) _add_pstats_options(attach_parser) + # === REPLAY COMMAND === + replay_parser = subparsers.add_parser( + "replay", + help="Replay a binary profile and convert to another format", + formatter_class=CustomFormatter, + description="""Replay a binary profile file and convert to another format + +Examples: + # Convert binary to flamegraph + `python -m profiling.sampling replay --flamegraph -o output.html profile.bin` + + # Convert binary to pstats and print to stdout + `python -m profiling.sampling replay profile.bin` + + # Convert binary to gecko format + `python -m profiling.sampling replay --gecko -o profile.json profile.bin`""", + ) + replay_parser.add_argument( + "input_file", + help="Binary profile file to replay", + ) + _add_format_options(replay_parser, include_compression=False, include_binary=False) + _add_pstats_options(replay_parser) + # Parse arguments args = parser.parse_args() @@ -731,6 +806,7 @@ Examples: command_handlers = { "run": _handle_run, "attach": _handle_attach, + "replay": _handle_replay, } # Execute the appropriate command @@ -743,6 +819,8 @@ Examples: def _handle_attach(args): """Handle the 'attach' command.""" + if not _is_process_running(args.pid): + raise SamplingUnknownProcessError(args.pid) # Check if live mode is requested if args.live: _handle_live_attach(args, args.pid) @@ -760,8 +838,16 @@ def _handle_attach(args): mode != PROFILING_MODE_WALL if mode != PROFILING_MODE_ALL else False ) + output_file = None + if args.format == "binary": + 
output_file = args.outfile or _generate_output_filename(args.format, args.pid) + # Create the appropriate collector - collector = _create_collector(args.format, args.interval, skip_idle, args.opcodes) + collector = _create_collector( + args.format, args.interval, skip_idle, args.opcodes, + output_file=output_file, + compression=getattr(args, 'compression', 'auto') + ) with _get_child_monitor_context(args, args.pid): collector = sample( @@ -792,13 +878,13 @@ def _handle_run(args): added_cwd = True try: if importlib.util.find_spec(args.target) is None: - sys.exit(f"Error: Module not found: {args.target}") + raise SamplingModuleNotFoundError(args.target) finally: if added_cwd: sys.path.remove(cwd) else: if not os.path.exists(args.target): - sys.exit(f"Error: Script not found: {args.target}") + raise SamplingScriptNotFoundError(args.target) # Check if live mode is requested if args.live: @@ -829,8 +915,16 @@ def _handle_run(args): mode != PROFILING_MODE_WALL if mode != PROFILING_MODE_ALL else False ) + output_file = None + if args.format == "binary": + output_file = args.outfile or _generate_output_filename(args.format, process.pid) + # Create the appropriate collector - collector = _create_collector(args.format, args.interval, skip_idle, args.opcodes) + collector = _create_collector( + args.format, args.interval, skip_idle, args.opcodes, + output_file=output_file, + compression=getattr(args, 'compression', 'auto') + ) with _get_child_monitor_context(args, process.pid): try: @@ -949,5 +1043,48 @@ def _handle_live_run(args): process.wait() +def _handle_replay(args): + """Handle the 'replay' command - convert binary profile to another format.""" + import os + + if not os.path.exists(args.input_file): + sys.exit(f"Error: Input file not found: {args.input_file}") + + with BinaryReader(args.input_file) as reader: + info = reader.get_info() + interval = info['sample_interval_us'] + + print(f"Replaying {info['sample_count']} samples from {args.input_file}") + print(f" Sample 
interval: {interval} us") + print(f" Compression: {'zstd' if info.get('compression_type', 0) == 1 else 'none'}") + + collector = _create_collector(args.format, interval, skip_idle=False) + + def progress_callback(current, total): + if total > 0: + pct = current / total + bar_width = 40 + filled = int(bar_width * pct) + bar = '█' * filled + '░' * (bar_width - filled) + print(f"\r [{bar}] {pct*100:5.1f}% ({current:,}/{total:,})", end="", flush=True) + + count = reader.replay_samples(collector, progress_callback) + print() + + if args.format == "pstats": + if args.outfile: + collector.export(args.outfile) + else: + sort_choice = args.sort if args.sort is not None else "nsamples" + limit = args.limit if args.limit is not None else 15 + sort_mode = _sort_to_mode(sort_choice) + collector.print_stats(sort_mode, limit, not args.no_summary, PROFILING_MODE_WALL) + else: + filename = args.outfile or _generate_output_filename(args.format, os.getpid()) + collector.export(filename) + + print(f"Replayed {count} samples") + + if __name__ == "__main__": main() diff --git a/Lib/profiling/sampling/collector.py b/Lib/profiling/sampling/collector.py index a1f6ec190f6..c70e1eefe27 100644 --- a/Lib/profiling/sampling/collector.py +++ b/Lib/profiling/sampling/collector.py @@ -44,8 +44,17 @@ def extract_lineno(location): class Collector(ABC): @abstractmethod - def collect(self, stack_frames): - """Collect profiling data from stack frames.""" + def collect(self, stack_frames, timestamps_us=None): + """Collect profiling data from stack frames. + + Args: + stack_frames: List of InterpreterInfo objects + timestamps_us: Optional list of timestamps in microseconds. If provided + (from binary replay with RLE batching), use these instead of current + time. If None, collectors should use time.monotonic() or similar. + The list may contain multiple timestamps when samples are batched + together (same stack, different times). 
+ """ def collect_failed_sample(self): """Collect data about a failed sample attempt.""" @@ -79,6 +88,17 @@ class Collector(ABC): # Phase 3: Build linear stacks from each leaf to root (optimized - no sorting!) yield from self._build_linear_stacks(leaf_task_ids, task_map, child_to_parent) + def _iter_stacks(self, stack_frames, skip_idle=False): + """Yield (frames, thread_id) for all stacks, handling both sync and async modes.""" + if stack_frames and hasattr(stack_frames[0], "awaited_by"): + for frames, thread_id, _ in self._iter_async_frames(stack_frames): + if frames: + yield frames, thread_id + else: + for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=skip_idle): + if frames: + yield frames, thread_id + def _build_task_graph(self, awaited_info_list): task_map = {} child_to_parent = {} # Maps child_id -> (selected_parent_id, parent_count) diff --git a/Lib/profiling/sampling/errors.py b/Lib/profiling/sampling/errors.py new file mode 100644 index 00000000000..0832ad2d438 --- /dev/null +++ b/Lib/profiling/sampling/errors.py @@ -0,0 +1,19 @@ +"""Custom exceptions for the sampling profiler.""" + +class SamplingProfilerError(Exception): + """Base exception for sampling profiler errors.""" + +class SamplingUnknownProcessError(SamplingProfilerError): + def __init__(self, pid): + self.pid = pid + super().__init__(f"Process with PID '{pid}' does not exist.") + +class SamplingScriptNotFoundError(SamplingProfilerError): + def __init__(self, script_path): + self.script_path = script_path + super().__init__(f"Script '{script_path}' not found.") + +class SamplingModuleNotFoundError(SamplingProfilerError): + def __init__(self, module_name): + self.module_name = module_name + super().__init__(f"Module '{module_name}' not found.") diff --git a/Lib/profiling/sampling/gecko_collector.py b/Lib/profiling/sampling/gecko_collector.py index 608a15da483..c1c9cfcf3b9 100644 --- a/Lib/profiling/sampling/gecko_collector.py +++ b/Lib/profiling/sampling/gecko_collector.py 
@@ -66,7 +66,7 @@ class GeckoCollector(Collector): self.sample_interval_usec = sample_interval_usec self.skip_idle = skip_idle self.opcodes_enabled = opcodes - self.start_time = time.time() * 1000 # milliseconds since epoch + self.start_time = time.monotonic() * 1000 # milliseconds since start # Global string table (shared across all threads) self.global_strings = ["(root)"] # Start with root @@ -103,6 +103,9 @@ class GeckoCollector(Collector): # Opcode state tracking per thread: tid -> (opcode, lineno, col_offset, funcname, filename, start_time) self.opcode_state = {} + # For binary replay: track base timestamp (first sample's timestamp) + self._replay_base_timestamp_us = None + def _track_state_transition(self, tid, condition, active_dict, inactive_dict, active_name, inactive_name, category, current_time): """Track binary state transitions and emit markers. @@ -138,18 +141,35 @@ class GeckoCollector(Collector): self._add_marker(tid, active_name, active_dict.pop(tid), current_time, category) - def collect(self, stack_frames): - """Collect a sample from stack frames.""" - current_time = (time.time() * 1000) - self.start_time + def collect(self, stack_frames, timestamps_us=None): + """Collect samples from stack frames. 
+ + Args: + stack_frames: List of interpreter/thread frame info + timestamps_us: List of timestamps in microseconds (None for live sampling) + """ + # Handle live sampling (no timestamps provided) + if timestamps_us is None: + current_time = (time.monotonic() * 1000) - self.start_time + times = [current_time] + else: + if not timestamps_us: + return + # Initialize base timestamp if needed + if self._replay_base_timestamp_us is None: + self._replay_base_timestamp_us = timestamps_us[0] + # Convert all timestamps to times (ms relative to first sample) + base = self._replay_base_timestamp_us + times = [(ts - base) / 1000 for ts in timestamps_us] + + first_time = times[0] # Update interval calculation if self.sample_count > 0 and self.last_sample_time > 0: - self.interval = ( - current_time - self.last_sample_time - ) / self.sample_count - self.last_sample_time = current_time + self.interval = (times[-1] - self.last_sample_time) / self.sample_count + self.last_sample_time = times[-1] - # Process threads and track GC per thread + # Process threads for interpreter_info in stack_frames: for thread_info in interpreter_info.threads: frames = thread_info.frame_info @@ -167,92 +187,86 @@ class GeckoCollector(Collector): on_cpu = bool(status_flags & THREAD_STATUS_ON_CPU) gil_requested = bool(status_flags & THREAD_STATUS_GIL_REQUESTED) - # Track GIL possession (Has GIL / No GIL) + # Track state transitions using first timestamp self._track_state_transition( tid, has_gil, self.has_gil_start, self.no_gil_start, - "Has GIL", "No GIL", CATEGORY_GIL, current_time + "Has GIL", "No GIL", CATEGORY_GIL, first_time ) - - # Track CPU state (On CPU / Off CPU) self._track_state_transition( tid, on_cpu, self.on_cpu_start, self.off_cpu_start, - "On CPU", "Off CPU", CATEGORY_CPU, current_time + "On CPU", "Off CPU", CATEGORY_CPU, first_time ) - # Track code type (Python Code / Native Code) - # This is tri-state: Python (has_gil), Native (on_cpu without gil), or Neither + # Track code type if 
has_gil: self._track_state_transition( tid, True, self.python_code_start, self.native_code_start, - "Python Code", "Native Code", CATEGORY_CODE_TYPE, current_time + "Python Code", "Native Code", CATEGORY_CODE_TYPE, first_time ) elif on_cpu: self._track_state_transition( tid, True, self.native_code_start, self.python_code_start, - "Native Code", "Python Code", CATEGORY_CODE_TYPE, current_time + "Native Code", "Python Code", CATEGORY_CODE_TYPE, first_time ) else: - # Thread is idle (neither has GIL nor on CPU) - close any open code markers - # This handles the third state that _track_state_transition doesn't cover if tid in self.initialized_threads: if tid in self.python_code_start: self._add_marker(tid, "Python Code", self.python_code_start.pop(tid), - current_time, CATEGORY_CODE_TYPE) + first_time, CATEGORY_CODE_TYPE) if tid in self.native_code_start: self._add_marker(tid, "Native Code", self.native_code_start.pop(tid), - current_time, CATEGORY_CODE_TYPE) + first_time, CATEGORY_CODE_TYPE) - # Track "Waiting for GIL" intervals (one-sided tracking) + # Track GIL wait if gil_requested: - self.gil_wait_start.setdefault(tid, current_time) + self.gil_wait_start.setdefault(tid, first_time) elif tid in self.gil_wait_start: self._add_marker(tid, "Waiting for GIL", self.gil_wait_start.pop(tid), - current_time, CATEGORY_GIL) + first_time, CATEGORY_GIL) - # Track exception state (Has Exception / No Exception) + # Track exception state has_exception = bool(status_flags & THREAD_STATUS_HAS_EXCEPTION) self._track_state_transition( tid, has_exception, self.exception_start, self.no_exception_start, - "Has Exception", "No Exception", CATEGORY_EXCEPTION, current_time + "Has Exception", "No Exception", CATEGORY_EXCEPTION, first_time ) - # Track GC events by detecting frames in the stack trace - # This leverages the improved GC frame tracking from commit 336366fd7ca - # which precisely identifies the thread that initiated GC collection + # Track GC events has_gc_frame = any(frame[2] == 
"" for frame in frames) if has_gc_frame: - # This thread initiated GC collection if tid not in self.gc_start_per_thread: - self.gc_start_per_thread[tid] = current_time + self.gc_start_per_thread[tid] = first_time elif tid in self.gc_start_per_thread: - # End GC marker when no more GC frames are detected self._add_marker(tid, "GC Collecting", self.gc_start_per_thread.pop(tid), - current_time, CATEGORY_GC) + first_time, CATEGORY_GC) - # Mark thread as initialized after processing all state transitions + # Mark thread as initialized self.initialized_threads.add(tid) - # Categorize: idle if neither has GIL nor on CPU + # Skip idle threads if requested is_idle = not has_gil and not on_cpu - - # Skip idle threads if skip_idle is enabled if self.skip_idle and is_idle: continue if not frames: continue - # Process the stack + # Process stack once to get stack_index stack_index = self._process_stack(thread_data, frames) - # Add sample - cache references to avoid dictionary lookups + # Add samples with timestamps samples = thread_data["samples"] - samples["stack"].append(stack_index) - samples["time"].append(current_time) - samples["eventDelay"].append(None) + samples_stack = samples["stack"] + samples_time = samples["time"] + samples_delay = samples["eventDelay"] - # Track opcode state changes for interval markers (leaf frame only) - if self.opcodes_enabled: + for t in times: + samples_stack.append(stack_index) + samples_time.append(t) + samples_delay.append(None) + + # Handle opcodes + if self.opcodes_enabled and frames: leaf_frame = frames[0] filename, location, funcname, opcode = leaf_frame if isinstance(location, tuple): @@ -264,18 +278,15 @@ class GeckoCollector(Collector): current_state = (opcode, lineno, col_offset, funcname, filename) if tid not in self.opcode_state: - # First observation - start tracking - self.opcode_state[tid] = (*current_state, current_time) + self.opcode_state[tid] = (*current_state, first_time) elif self.opcode_state[tid][:5] != current_state: 
- # State changed - emit marker for previous state prev_opcode, prev_lineno, prev_col, prev_funcname, prev_filename, prev_start = self.opcode_state[tid] self._add_opcode_interval_marker( - tid, prev_opcode, prev_lineno, prev_col, prev_funcname, prev_start, current_time + tid, prev_opcode, prev_lineno, prev_col, prev_funcname, prev_start, first_time ) - # Start tracking new state - self.opcode_state[tid] = (*current_state, current_time) + self.opcode_state[tid] = (*current_state, first_time) - self.sample_count += 1 + self.sample_count += len(times) def _create_thread(self, tid): """Create a new thread structure with processed profile format.""" diff --git a/Lib/profiling/sampling/heatmap_collector.py b/Lib/profiling/sampling/heatmap_collector.py index 5b4c89283be..bb810fa485b 100644 --- a/Lib/profiling/sampling/heatmap_collector.py +++ b/Lib/profiling/sampling/heatmap_collector.py @@ -5,6 +5,7 @@ import collections import html import importlib.resources import json +import locale import math import os import platform @@ -15,6 +16,7 @@ from pathlib import Path from typing import Dict, List, Tuple from ._css_utils import get_combined_css +from ._format_utils import fmt from .collector import normalize_location, extract_lineno from .stack_collector import StackTraceCollector @@ -343,7 +345,7 @@ class _HtmlRenderer:
{icon} {type_names[module_type]} - ({tree.count} {file_word}, {tree.samples:,} {sample_word}) + ({tree.count} {file_word}, {tree.samples:n} {sample_word})
''' @@ -390,7 +392,7 @@ class _HtmlRenderer: parts.append(f'{indent} ') parts.append(f'{indent} 📁 {html.escape(name)}') parts.append(f'{indent} ' - f'({node.count} {file_word}, {node.samples:,} {sample_word})') + f'({node.count} {file_word}, {node.samples:n} {sample_word})') parts.append(f'{indent}
') parts.append(f'{indent}