gh-138122: Allow tachyon to write and read binary output (#142730)

Pablo Galindo Salgado 2025-12-22 23:57:20 +00:00 committed by GitHub
parent 714037ba84
commit 9e51301234
30 changed files with 6554 additions and 134 deletions


@ -200,6 +200,36 @@ On most systems, attaching to another process requires appropriate permissions.
See :ref:`profiling-permissions` for platform-specific requirements.
.. _replay-command:
The ``replay`` command
----------------------
The ``replay`` command converts binary profile files to other output formats::
python -m profiling.sampling replay profile.bin
python -m profiling.sampling replay --flamegraph -o profile.html profile.bin
This command is useful when you have captured profiling data in binary format
and want to analyze it later or convert it to a visualization format. Binary
profiles can be replayed multiple times to different formats without
re-profiling.
::
# Convert binary to pstats (default, prints to stdout)
python -m profiling.sampling replay profile.bin
# Convert binary to flame graph
python -m profiling.sampling replay --flamegraph -o output.html profile.bin
# Convert binary to gecko format for Firefox Profiler
python -m profiling.sampling replay --gecko -o profile.json profile.bin
# Convert binary to heatmap
python -m profiling.sampling replay --heatmap -o my_heatmap profile.bin
Profiling in production
-----------------------
@ -1041,6 +1071,59 @@ intuitive view that shows exactly where time is spent without requiring
interpretation of hierarchical visualizations.
Binary format
-------------
Binary format (:option:`--binary`) produces a compact binary file for efficient
storage of profiling data::
python -m profiling.sampling run --binary -o profile.bin script.py
python -m profiling.sampling attach --binary -o profile.bin 12345
The :option:`--compression` option controls data compression:
- ``auto`` (default): Use zstd compression if available, otherwise no
compression
- ``zstd``: Force zstd compression (requires :mod:`compression.zstd` support)
- ``none``: Disable compression
::
python -m profiling.sampling run --binary --compression=zstd -o profile.bin script.py
To analyze binary profiles, use the :ref:`replay-command` to convert them to
other formats like flame graphs or pstats output.
Record and replay workflow
==========================
The binary format combined with the replay command enables a record-and-replay
workflow that separates data capture from analysis. Rather than generating
visualizations during profiling, you capture raw data to a compact binary file
and convert it to different formats later.
This approach has three main benefits:
- Sampling runs faster because the work of building data structures for
visualization is deferred until replay.
- A single binary capture can be converted to multiple output formats
without re-profiling: pstats for a quick overview, flame graph for visual
exploration, heatmap for line-level detail.
- Binary files are compact and easy to share with colleagues who can convert
them to their preferred format.
A typical workflow::
# Capture profile in production or during tests
python -m profiling.sampling attach --binary -o profile.bin 12345
# Later, analyze with different formats
python -m profiling.sampling replay profile.bin
python -m profiling.sampling replay --flamegraph -o profile.html profile.bin
python -m profiling.sampling replay --heatmap -o heatmap profile.bin
Live mode
=========
@ -1252,6 +1335,10 @@ Global options
Attach to and profile a running process by PID.
.. option:: replay
Convert a binary profile file to another output format.
Sampling options
----------------
@ -1335,12 +1422,22 @@ Output options
Generate HTML heatmap with line-level sample counts.
.. option:: --binary
Generate high-performance binary format for later conversion with the
``replay`` command.
.. option:: --compression <type>
Compression for binary format: ``auto`` (use zstd if available, default),
``zstd``, or ``none``.
.. option:: -o <path>, --output <path>
Output file or directory path. Default behavior varies by format:
``--pstats`` writes to stdout, ``--flamegraph`` and ``--gecko`` generate
files like ``flamegraph.PID.html``, and ``--heatmap`` creates a directory
named ``heatmap_PID``.
:option:`--pstats` writes to stdout, while other formats generate a file
named ``<format>_<PID>.<ext>`` (for example, ``flamegraph_12345.html``).
:option:`--heatmap` creates a directory named ``heatmap_<PID>``.
pstats display options


@ -1653,9 +1653,11 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_varnames));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(code));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(col_offset));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(collector));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(command));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(comment_factory));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compile_mode));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compression));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(config));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(consts));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(context));
@ -1718,7 +1720,9 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(event));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(eventmask));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_tb));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_type));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_val));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_value));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(excepthook));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exception));
@ -1974,6 +1978,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(print_file_and_line));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(priority));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_callback));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_routine));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(proto));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(protocol));
@ -2014,6 +2019,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reversed));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(rounding));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(salt));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sample_interval_us));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sched_priority));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(scheduler));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(script));
@ -2053,8 +2059,10 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(spam));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src_dir_fd));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stack_frames));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stacklevel));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start_time_us));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(statement));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stats));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(status));
@ -2095,6 +2103,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(times));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timespec));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timestamp));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timestamp_us));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timetuple));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timeunit));
_PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(top));


@ -376,9 +376,11 @@ struct _Py_global_strings {
STRUCT_FOR_ID(co_varnames)
STRUCT_FOR_ID(code)
STRUCT_FOR_ID(col_offset)
STRUCT_FOR_ID(collector)
STRUCT_FOR_ID(command)
STRUCT_FOR_ID(comment_factory)
STRUCT_FOR_ID(compile_mode)
STRUCT_FOR_ID(compression)
STRUCT_FOR_ID(config)
STRUCT_FOR_ID(consts)
STRUCT_FOR_ID(context)
@ -441,7 +443,9 @@ struct _Py_global_strings {
STRUCT_FOR_ID(event)
STRUCT_FOR_ID(eventmask)
STRUCT_FOR_ID(exc)
STRUCT_FOR_ID(exc_tb)
STRUCT_FOR_ID(exc_type)
STRUCT_FOR_ID(exc_val)
STRUCT_FOR_ID(exc_value)
STRUCT_FOR_ID(excepthook)
STRUCT_FOR_ID(exception)
@ -697,6 +701,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(print_file_and_line)
STRUCT_FOR_ID(priority)
STRUCT_FOR_ID(progress)
STRUCT_FOR_ID(progress_callback)
STRUCT_FOR_ID(progress_routine)
STRUCT_FOR_ID(proto)
STRUCT_FOR_ID(protocol)
@ -737,6 +742,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(reversed)
STRUCT_FOR_ID(rounding)
STRUCT_FOR_ID(salt)
STRUCT_FOR_ID(sample_interval_us)
STRUCT_FOR_ID(sched_priority)
STRUCT_FOR_ID(scheduler)
STRUCT_FOR_ID(script)
@ -776,8 +782,10 @@ struct _Py_global_strings {
STRUCT_FOR_ID(spam)
STRUCT_FOR_ID(src)
STRUCT_FOR_ID(src_dir_fd)
STRUCT_FOR_ID(stack_frames)
STRUCT_FOR_ID(stacklevel)
STRUCT_FOR_ID(start)
STRUCT_FOR_ID(start_time_us)
STRUCT_FOR_ID(statement)
STRUCT_FOR_ID(stats)
STRUCT_FOR_ID(status)
@ -818,6 +826,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(times)
STRUCT_FOR_ID(timespec)
STRUCT_FOR_ID(timestamp)
STRUCT_FOR_ID(timestamp_us)
STRUCT_FOR_ID(timetuple)
STRUCT_FOR_ID(timeunit)
STRUCT_FOR_ID(top)


@ -1651,9 +1651,11 @@ extern "C" {
INIT_ID(co_varnames), \
INIT_ID(code), \
INIT_ID(col_offset), \
INIT_ID(collector), \
INIT_ID(command), \
INIT_ID(comment_factory), \
INIT_ID(compile_mode), \
INIT_ID(compression), \
INIT_ID(config), \
INIT_ID(consts), \
INIT_ID(context), \
@ -1716,7 +1718,9 @@ extern "C" {
INIT_ID(event), \
INIT_ID(eventmask), \
INIT_ID(exc), \
INIT_ID(exc_tb), \
INIT_ID(exc_type), \
INIT_ID(exc_val), \
INIT_ID(exc_value), \
INIT_ID(excepthook), \
INIT_ID(exception), \
@ -1972,6 +1976,7 @@ extern "C" {
INIT_ID(print_file_and_line), \
INIT_ID(priority), \
INIT_ID(progress), \
INIT_ID(progress_callback), \
INIT_ID(progress_routine), \
INIT_ID(proto), \
INIT_ID(protocol), \
@ -2012,6 +2017,7 @@ extern "C" {
INIT_ID(reversed), \
INIT_ID(rounding), \
INIT_ID(salt), \
INIT_ID(sample_interval_us), \
INIT_ID(sched_priority), \
INIT_ID(scheduler), \
INIT_ID(script), \
@ -2051,8 +2057,10 @@ extern "C" {
INIT_ID(spam), \
INIT_ID(src), \
INIT_ID(src_dir_fd), \
INIT_ID(stack_frames), \
INIT_ID(stacklevel), \
INIT_ID(start), \
INIT_ID(start_time_us), \
INIT_ID(statement), \
INIT_ID(stats), \
INIT_ID(status), \
@ -2093,6 +2101,7 @@ extern "C" {
INIT_ID(times), \
INIT_ID(timespec), \
INIT_ID(timestamp), \
INIT_ID(timestamp_us), \
INIT_ID(timetuple), \
INIT_ID(timeunit), \
INIT_ID(top), \


@ -1284,6 +1284,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(collector);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(command);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
@ -1296,6 +1300,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(compression);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(config);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
@ -1544,10 +1552,18 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(exc_tb);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(exc_type);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(exc_val);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(exc_value);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
@ -2568,6 +2584,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(progress_callback);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(progress_routine);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
@ -2728,6 +2748,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(sample_interval_us);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(sched_priority);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
@ -2884,6 +2908,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(stack_frames);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(stacklevel);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
@ -2892,6 +2920,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(start_time_us);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(statement);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
@ -3052,6 +3084,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(timestamp_us);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));
assert(PyUnicode_GET_LENGTH(string) != 1);
string = &_Py_ID(timetuple);
_PyUnicode_InternStatic(interp, &string);
assert(_PyUnicode_CheckConsistency(string, 1));


@ -0,0 +1,489 @@
# Profiling Binary Format
The profiling module includes a binary file format for storing sampling
profiler data. This document describes the format's structure and the
design decisions behind it.
The implementation is in
[`Modules/_remote_debugging/binary_io_writer.c`](../Modules/_remote_debugging/binary_io_writer.c)
and [`Modules/_remote_debugging/binary_io_reader.c`](../Modules/_remote_debugging/binary_io_reader.c),
with declarations in
[`Modules/_remote_debugging/binary_io.h`](../Modules/_remote_debugging/binary_io.h).
## Overview
The sampling profiler can generate enormous amounts of data. A typical
profiling session sampling at 1000 Hz for 60 seconds produces 60,000 samples.
Each sample contains a full call stack, often 20-50 frames deep, and each
frame includes a filename, function name, and line number. In a text-based
format like collapsed stacks, this would mean repeating the same long file
paths and function names thousands of times.
The binary format addresses this through two key strategies:
1. **Deduplication**: Strings and frames are stored once in lookup tables,
then referenced by small integer indices. A 100-character file path that
appears in 50,000 samples is stored once, not 50,000 times.
2. **Compact encoding**: Variable-length integers (varints) encode small
values in fewer bytes. Since most indices are small (under 128), they
typically need only one byte instead of four.
Together with optional zstd compression, these techniques reduce file sizes
by 10-50x compared to text formats while also enabling faster I/O.
## File Layout
The file consists of five sections:
```
+------------------+ Offset 0
| Header | 64 bytes (fixed)
+------------------+ Offset 64
| |
| Sample Data | Variable size (optionally compressed)
| |
+------------------+ string_table_offset
| String Table | Variable size
+------------------+ frame_table_offset
| Frame Table | Variable size
+------------------+ file_size - 32
| Footer | 32 bytes (fixed)
+------------------+ file_size
```
The layout is designed for streaming writes during profiling. The profiler
cannot know in advance how many unique strings or frames will be encountered,
so these tables must be built incrementally and written at the end.
The header comes first so readers can quickly validate the file and locate
the metadata tables. The sample data follows immediately, allowing the writer
to stream samples directly to disk (or through a compression stream) without
buffering the entire dataset in memory.
The string and frame tables are placed after sample data because they grow
as new unique entries are discovered during profiling. By deferring their
output until finalization, the writer avoids the complexity of reserving
space or rewriting portions of the file.
The footer at the end contains counts needed to allocate arrays before
parsing the tables. Placing it at a fixed offset from the end (rather than
at a variable offset recorded in the header) means readers can locate it
with a single seek to `file_size - 32`, without first reading the header.
## Header
```
Offset Size Type Description
+--------+------+---------+----------------------------------------+
| 0 | 4 | uint32 | Magic number (0x54414348 = "TACH") |
| 4 | 4 | uint32 | Format version |
| 8 | 4 | bytes | Python version (major, minor, micro, |
| | | | reserved) |
| 12 | 8 | uint64 | Start timestamp (microseconds) |
| 20 | 8 | uint64 | Sample interval (microseconds) |
| 28 | 4 | uint32 | Total sample count |
| 32 | 4 | uint32 | Thread count |
| 36 | 8 | uint64 | String table offset |
| 44 | 8 | uint64 | Frame table offset |
| 52 | 4 | uint32 | Compression type (0=none, 1=zstd) |
| 56 | 8 | bytes | Reserved (zero-filled) |
+--------+------+---------+----------------------------------------+
```
The magic number `0x54414348` ("TACH" for Tachyon) identifies the file format
and also serves as an **endianness marker**. When read on a system with
different byte order than the writer, it appears as `0x48434154`. The reader
uses this to detect cross-endian files and automatically byte-swap all
multi-byte integer fields.
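
As a rough illustration, the detection logic can be sketched in Python (a
hypothetical helper, not the actual C reader):

```python
import struct

MAGIC = 0x54414348  # "TACH"

def needs_swap(header_bytes):
    """Return True if the writer's byte order differs from this host's."""
    (magic,) = struct.unpack_from("=I", header_bytes)  # native byte order
    if magic == MAGIC:
        return False
    if magic == 0x48434154:        # byte-swapped "TACH"
        return True
    raise ValueError("not a tachyon binary profile")
```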
The Python version field records the major, minor, and micro version numbers
of the Python interpreter that generated the file. This allows analysis tools
to detect version mismatches when replaying data collected on a different
Python version, which may have different internal structures or behaviors.
The header is written as zeros initially, then overwritten with actual values
during finalization. This requires the output stream to be seekable, which
is acceptable since the format targets regular files rather than pipes or
network streams.
## Sample Data
Sample data begins at offset 64 and extends to `string_table_offset`. Samples
use delta compression to minimize redundancy when consecutive samples from the
same thread have identical or similar call stacks.
### Stack Encoding Types
Each sample record begins with thread identification, then an encoding byte:
| Code | Name | Description |
|------|------|-------------|
| 0x00 | REPEAT | RLE: identical stack repeated N times |
| 0x01 | FULL | Complete stack (first sample or no match) |
| 0x02 | SUFFIX | Shares N frames from bottom of previous stack |
| 0x03 | POP_PUSH | Remove M frames from top, add N new frames |
### Record Formats
**REPEAT (0x00) - Run-Length Encoded Identical Stacks:**
```
+-----------------+-----------+----------------------------------------+
| thread_id | 8 bytes | Thread identifier (uint64, fixed) |
| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) |
| encoding | 1 byte | 0x00 (REPEAT) |
| count | varint | Number of samples in this RLE group |
| samples | varies | Interleaved: [delta: varint, status: 1]|
| | | repeated count times |
+-----------------+-----------+----------------------------------------+
```
The stack is inherited from this thread's previous sample. Each sample in the
group gets its own timestamp delta and status byte, stored as interleaved pairs
(delta1, status1, delta2, status2, ...) rather than separate arrays.
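
A decoding sketch for the interleaved pairs (using a `read_varint` helper
like the one shown in the varint section below):

```python
def read_repeat_pairs(buf, offset, count):
    """Decode `count` interleaved (timestamp_delta, status) pairs."""
    pairs = []
    for _ in range(count):
        delta, offset = read_varint(buf, offset)  # varint timestamp delta
        status = buf[offset]                      # one status byte
        offset += 1
        pairs.append((delta, status))
    return pairs, offset
```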
**FULL (0x01) - Complete Stack:**
```
+-----------------+-----------+----------------------------------------+
| thread_id | 8 bytes | Thread identifier (uint64, fixed) |
| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) |
| encoding | 1 byte | 0x01 (FULL) |
| timestamp_delta | varint | Microseconds since thread's last sample|
| status | 1 byte | Thread state flags |
| stack_depth | varint | Number of frames in call stack |
| frame_indices | varint[] | Array of frame table indices |
+-----------------+-----------+----------------------------------------+
```
Used for the first sample from a thread, or when delta encoding would not
provide savings.
**SUFFIX (0x02) - Shared Suffix Match:**
```
+-----------------+-----------+----------------------------------------+
| thread_id | 8 bytes | Thread identifier (uint64, fixed) |
| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) |
| encoding | 1 byte | 0x02 (SUFFIX) |
| timestamp_delta | varint | Microseconds since thread's last sample|
| status | 1 byte | Thread state flags |
| shared_count | varint | Frames shared from bottom of prev stack|
| new_count | varint | New frames at top of stack |
| new_frames | varint[] | Array of new_count frame indices |
+-----------------+-----------+----------------------------------------+
```
Used when a function call added frames to the top of the stack. The shared
frames from the previous stack are kept, and new frames are prepended.
**POP_PUSH (0x03) - Pop and Push:**
```
+-----------------+-----------+----------------------------------------+
| thread_id | 8 bytes | Thread identifier (uint64, fixed) |
| interpreter_id | 4 bytes | Interpreter ID (uint32, fixed) |
| encoding | 1 byte | 0x03 (POP_PUSH) |
| timestamp_delta | varint | Microseconds since thread's last sample|
| status | 1 byte | Thread state flags |
| pop_count | varint | Frames to remove from top of prev stack|
| push_count | varint | New frames to add at top |
| new_frames | varint[] | Array of push_count frame indices |
+-----------------+-----------+----------------------------------------+
```
Used when the code path changed: some frames were popped (function returns)
and new frames were pushed (different function calls).
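
To make the delta encodings concrete, here is a minimal Python sketch of how
a reader could rebuild stacks. Frame arrays are innermost-first, so index 0
is the top of the stack:

```python
def apply_suffix(prev_stack, shared_count, new_frames):
    """SUFFIX: keep `shared_count` frames from the bottom, prepend new ones."""
    shared = prev_stack[len(prev_stack) - shared_count:]
    return list(new_frames) + shared

def apply_pop_push(prev_stack, pop_count, new_frames):
    """POP_PUSH: drop `pop_count` frames from the top, then prepend new ones."""
    return list(new_frames) + prev_stack[pop_count:]

# Example with frame indices: previous stack [7, 4, 1] (innermost first).
# A new call arriving as SUFFIX with shared_count=3, new_frames=[9]:
# apply_suffix([7, 4, 1], 3, [9]) == [9, 7, 4, 1]
```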
### Thread and Interpreter Identification
Thread IDs are 64-bit values that can be large (memory addresses on some
platforms) and vary unpredictably. Using a fixed 8-byte encoding avoids
the overhead of varint encoding for large values and simplifies parsing
since the reader knows exactly where each field begins.
The interpreter ID identifies which Python sub-interpreter the thread
belongs to, allowing analysis tools to separate activity across interpreters
in processes using multiple sub-interpreters.
### Status Byte
The status byte is a bitfield encoding thread state at sample time:
| Bit | Flag | Meaning |
|-----|-----------------------|--------------------------------------------|
| 0 | THREAD_STATUS_HAS_GIL | Thread holds the GIL (Global Interpreter Lock) |
| 1 | THREAD_STATUS_ON_CPU | Thread is actively running on a CPU core |
| 2 | THREAD_STATUS_UNKNOWN | Thread state could not be determined |
| 3 | THREAD_STATUS_GIL_REQUESTED | Thread is waiting to acquire the GIL |
| 4 | THREAD_STATUS_HAS_EXCEPTION | Thread has a pending exception |
Multiple flags can be set simultaneously (e.g., a thread can hold the GIL
while also running on CPU). Analysis tools use these to filter samples or
visualize thread states over time.
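
The corresponding bit masks, as implied by the table (a sketch):

```python
THREAD_STATUS_HAS_GIL       = 1 << 0
THREAD_STATUS_ON_CPU        = 1 << 1
THREAD_STATUS_UNKNOWN       = 1 << 2
THREAD_STATUS_GIL_REQUESTED = 1 << 3
THREAD_STATUS_HAS_EXCEPTION = 1 << 4

def holds_gil_on_cpu(status):
    """True when a thread holds the GIL while also running on a CPU core."""
    return bool(status & THREAD_STATUS_HAS_GIL and status & THREAD_STATUS_ON_CPU)
```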
### Timestamp Delta Encoding
Timestamps use delta encoding rather than absolute values. Absolute
timestamps in microseconds require 8 bytes each, but consecutive samples
from the same thread are typically separated by the sampling interval
(e.g., 1000 microseconds), so the delta between them is small and fits
in 1-2 varint bytes. The writer tracks the previous timestamp for each
thread separately. The first sample from a thread encodes its delta from
the profiling start time; subsequent samples encode the delta from that
thread's previous sample. This per-thread tracking is necessary because
samples are interleaved across threads in arrival order, not grouped by
thread.
For REPEAT (RLE) records, timestamp deltas and status bytes are stored as
interleaved pairs (delta, status, delta, status, ...) - one pair per
repeated sample - allowing efficient batching while preserving the exact
timing and state of each sample.
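
A sketch of the per-thread bookkeeping on the write side:

```python
def timestamp_delta(thread_id, timestamp_us, last_seen, start_time_us):
    """Compute the delta to store for this sample and update per-thread state.

    The first sample from a thread is measured against the profiling start
    time; later samples are measured against that thread's previous sample.
    """
    prev = last_seen.get(thread_id, start_time_us)
    last_seen[thread_id] = timestamp_us
    return timestamp_us - prev
```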
### Frame Indexing
Each frame in a call stack is represented by an index into the frame table
rather than inline data. This provides massive space savings because call
stacks are highly repetitive: the same function appears in many samples
(hot functions), call stacks often share common prefixes (main -> app ->
handler -> ...), and recursive functions create repeated frame sequences.
A frame index is typically 1-2 varint bytes. Inline frame data would be
20-200+ bytes (two strings plus a line number). For a profile with 100,000
samples averaging 30 frames each, this reduces frame data from potentially
gigabytes to tens of megabytes.
Frame indices are written innermost-first (the currently executing frame
has index 0 in the array). This ordering works well with delta compression:
function calls typically add frames at the top (index 0), while shared
frames remain at the bottom.
## String Table
The string table stores deduplicated UTF-8 strings (filenames and function
names). It begins at `string_table_offset` and contains entries in order of
their assignment during writing:
```
+----------------+
| length: varint |
| data: bytes |
+----------------+ (repeated for each string)
```
Strings are stored in the order they were first encountered during writing.
The first unique filename gets index 0, the second gets index 1, and so on.
Length-prefixing (rather than null-termination) allows strings containing
null bytes and enables readers to allocate exact-sized buffers. The varint
length encoding means short strings (under 128 bytes) need only one length
byte.
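
A reader sketch, relying on the `read_varint` helper defined in the varint
section below:

```python
def read_string_table(buf, offset, string_count):
    """Parse `string_count` length-prefixed UTF-8 strings."""
    strings = []
    for _ in range(string_count):
        length, offset = read_varint(buf, offset)
        strings.append(bytes(buf[offset:offset + length]).decode("utf-8"))
        offset += length
    return strings, offset
```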
## Frame Table
The frame table stores deduplicated frame entries:
```
+----------------------+
| filename_idx: varint |
| funcname_idx: varint |
| lineno: svarint |
+----------------------+ (repeated for each frame)
```
Each unique (filename, funcname, lineno) combination gets one entry. Two
calls to the same function at different line numbers produce different
frame entries; two calls at the same line number share one entry.
Strings and frames are deduplicated separately because they have different
cardinalities and reference patterns. A codebase might have hundreds of
unique source files but thousands of unique functions. Many functions share
the same filename, so storing the filename index in each frame entry (rather
than the full string) provides an additional layer of deduplication. A frame
entry is just three varints (typically 3-6 bytes) rather than two full
strings plus a line number.
Line numbers use signed varint (zigzag encoding) rather than unsigned to
handle edge cases. Synthetic frames—generated frames that don't correspond
directly to Python source code, such as C extension boundaries or internal
interpreter frames—use line number 0 or -1 to indicate the absence of a
source location. Zigzag encoding ensures these small negative values encode
efficiently (1 becomes 1, which is one byte) rather than requiring the
maximum varint length.
## Footer
```
Offset Size Type Description
+--------+------+---------+----------------------------------------+
| 0 | 4 | uint32 | String count |
| 4 | 4 | uint32 | Frame count |
| 8 | 8 | uint64 | Total file size |
| 16 | 16 | bytes | Checksum (reserved, currently zeros) |
+--------+------+---------+----------------------------------------+
```
The string and frame counts allow readers to pre-allocate arrays of the
correct size before parsing the tables. Without these counts, readers would
need to either scan the tables twice (once to count, once to parse) or use
dynamically-growing arrays.
The file size field provides a consistency check: if the actual file size
does not match, the file may be truncated or corrupted.
The checksum field is reserved for future use. A checksum would allow
detection of corruption but adds complexity and computation cost. The
current implementation leaves this as zeros.
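
A reader can locate and validate the footer with a single seek. A simplified
sketch (byte-swapping, detected from the header magic, is omitted here):

```python
import os
import struct

def read_footer(f):
    """Read counts from the fixed-size footer at file_size - 32."""
    file_size = f.seek(0, os.SEEK_END)
    f.seek(file_size - 32)
    string_count, frame_count, total_size = struct.unpack("=IIQ", f.read(16))
    if total_size != file_size:
        raise ValueError("size mismatch: file is truncated or corrupt")
    return string_count, frame_count
```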
## Variable-Length Integer Encoding
The format uses LEB128 (Little Endian Base 128) for unsigned integers and
zigzag + LEB128 for signed integers. These encodings are widely used
(Protocol Buffers, DWARF debug info, WebAssembly) and well-understood.
### Unsigned Varint (LEB128)
Each byte stores 7 bits of data. The high bit indicates whether more bytes
follow:
```
Value Encoded bytes
0-127 [0xxxxxxx] (1 byte)
128-16383 [1xxxxxxx] [0xxxxxxx] (2 bytes)
16384+ [1xxxxxxx] [1xxxxxxx] ... (3+ bytes)
```
Most indices in profiling data are small. A profile with 1000 unique frames
needs at most 2 bytes per frame index. The common case (indices under 128)
needs only 1 byte.
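
In Python, the encoding and decoding are only a few lines (a sketch):

```python
def write_varint(value):
    """Encode an unsigned integer as LEB128."""
    out = bytearray()
    while True:
        byte = value & 0x7F
        value >>= 7
        out.append(byte | (0x80 if value else 0))  # high bit = "more follows"
        if not value:
            return bytes(out)

def read_varint(buf, offset):
    """Decode a LEB128 unsigned integer; return (value, new_offset)."""
    result = shift = 0
    while True:
        byte = buf[offset]
        offset += 1
        result |= (byte & 0x7F) << shift
        if not byte & 0x80:
            return result, offset
        shift += 7
```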
### Signed Varint (Zigzag)
Standard LEB128 encodes 1 as a very large unsigned value, requiring many
bytes. Zigzag encoding interleaves positive and negative values:
```
0 -> 0 -1 -> 1 1 -> 2 -2 -> 3 2 -> 4
```
This ensures small-magnitude values (whether positive or negative) encode
in few bytes.
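
The mapping and its inverse, sketched for 64-bit values:

```python
def zigzag_encode(n):
    """0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ..."""
    return (n << 1) ^ (n >> 63)

def zigzag_decode(z):
    return (z >> 1) ^ -(z & 1)

def read_svarint(buf, offset):
    """Decode a zigzag + LEB128 signed integer (used for line numbers)."""
    z, offset = read_varint(buf, offset)
    return zigzag_decode(z), offset
```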
## Compression
When compression is enabled, the sample data region contains a zstd stream.
The string table, frame table, and footer remain uncompressed so readers can
access metadata without decompressing the entire file. A tool that only needs
to report "this file contains 50,000 samples of 3 threads" can read the header
and footer without touching the compressed sample data. This also simplifies
the format: the header's offset fields point directly to the tables rather
than to positions within a decompressed stream.
Zstd provides an excellent balance of compression ratio and speed. Profiling
data compresses very well (often 5-10x) due to repetitive patterns: the same
small set of frame indices appears repeatedly, and delta-encoded timestamps
cluster around the sampling interval. Zstd's streaming API allows compression
without buffering the entire dataset. The writer feeds sample data through
the compressor incrementally, flushing compressed chunks to disk as they
become available.
Level 5 compression is used as a default. Lower levels (1-3) are faster but
compress less; higher levels (6+) compress more but slow down writing. Level
5 provides good compression with minimal impact on profiling overhead.
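
In Python terms, the streaming write could look roughly like this (a sketch
assuming the `compression.zstd` module from Python 3.14; `encoded_sample_chunks`
is a hypothetical producer of already-encoded sample records):

```python
from compression.zstd import ZstdCompressor

compressor = ZstdCompressor(level=5)
with open("samples.part", "wb") as out:
    for chunk in encoded_sample_chunks:        # hypothetical sample encoder output
        out.write(compressor.compress(chunk))  # may emit zero or more bytes
    out.write(compressor.flush())              # finish the zstd frame
```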
## Reading and Writing
### Writing
1. Open the output file and write 64 zero bytes as a placeholder header
2. Initialize empty string and frame dictionaries for deduplication
3. For each sample:
- Intern any new strings, assigning sequential indices
- Intern any new frames, assigning sequential indices
- Encode the sample record and write to the buffer
- Flush the buffer through compression (if enabled) when full
4. Flush remaining buffered data and finalize compression
5. Write the string table (length-prefixed strings in index order)
6. Write the frame table (varint-encoded entries in index order)
7. Write the footer with final counts
8. Seek to offset 0 and write the header with actual values
The writer maintains two dictionaries: one mapping strings to indices, one
mapping (filename_idx, funcname_idx, lineno) tuples to frame indices. These
enable O(1) lookup during interning.
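
A sketch of the interning logic:

```python
string_index = {}  # str -> index
frame_index = {}   # (filename_idx, funcname_idx, lineno) -> index

def intern_string(s):
    """Return the table index for `s`, assigning the next index if new."""
    idx = string_index.get(s)
    if idx is None:
        idx = string_index[s] = len(string_index)
        # ...queue the length-prefixed string for the string table...
    return idx

def intern_frame(filename, funcname, lineno):
    key = (intern_string(filename), intern_string(funcname), lineno)
    idx = frame_index.get(key)
    if idx is None:
        idx = frame_index[key] = len(frame_index)
    return idx
```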
### Reading
1. Read the header magic number to detect endianness (set `needs_swap` flag
if the magic appears byte-swapped)
2. Validate version and read remaining header fields (byte-swapping if needed)
3. Seek to `file_size - 32` and read the footer (byte-swapping counts if needed)
4. Allocate string array of `string_count` elements
5. Parse the string table, populating the array
6. Allocate frame array of `frame_count * 3` uint32 elements
7. Parse the frame table, populating the array
8. If compressed, decompress the sample data region
9. Iterate through samples, resolving indices to strings/frames
(byte-swapping thread_id and interpreter_id if needed)
The reader builds lookup arrays rather than dictionaries since it only needs
index-to-value mapping, not value-to-index.
## Platform Considerations
### Byte Ordering and Cross-Platform Portability
The binary format uses **native byte order** for all multi-byte integer
fields when writing. However, the reader supports **cross-endian reading**:
files written on a little-endian system (x86, ARM) can be read on a
big-endian system (s390x, PowerPC), and vice versa.
The magic number doubles as an endianness marker. When read on a system with
different byte order, it appears byte-swapped (`0x48434154` instead of
`0x54414348`). The reader detects this and automatically byte-swaps all
fixed-width integer fields during parsing.
Writers must use `memcpy()` from properly-sized integer types when writing
fixed-width integer fields. When the source variable's type differs from the
field width (e.g., `size_t` written as 4 bytes), explicit casting to the
correct type (e.g., `uint32_t`) is required before `memcpy()`. On big-endian
systems, copying from an oversized type would copy the wrong bytes—high-order
zeros instead of the actual value.
The reader tracks whether byte-swapping is needed via a `needs_swap` flag set
during header parsing. All fixed-width fields in the header, footer, and
sample data are conditionally byte-swapped using Python's internal byte-swap
functions (`_Py_bswap32`, `_Py_bswap64` from `pycore_bitutils.h`).
Variable-length integers (varints) are byte-order independent since they
encode values one byte at a time using the LEB128 scheme, so they require
no special handling for cross-endian reading.
### Memory-Mapped I/O
On Unix systems (Linux, macOS), the reader uses `mmap()` to map the file
into the process address space. The kernel handles paging data in and out
as needed, no explicit read() calls or buffer management are required,
multiple readers can share the same physical pages, and sequential access
patterns benefit from kernel read-ahead.
The implementation uses `madvise()` to hint the access pattern to the kernel:
`MADV_SEQUENTIAL` indicates the file will be read linearly, enabling
aggressive read-ahead. `MADV_WILLNEED` requests pre-faulting of pages.
On Linux, `MAP_POPULATE` pre-faults all pages at mmap time rather than on
first access, moving page fault overhead from the parsing loop to the
initial mapping for more predictable performance. For large files (over
32 MB), `MADV_HUGEPAGE` requests transparent huge pages (2 MB instead of
4 KB) to reduce TLB pressure when accessing large amounts of data.
On Windows, the implementation falls back to standard file I/O with full
file buffering. Profiling data files are typically small enough (tens to
hundreds of megabytes) that this is acceptable.
The writer uses a 512 KB buffer to batch small writes. Each sample record
is typically tens of bytes; writing these individually would incur excessive
syscall overhead. The buffer accumulates data until full, then flushes in
one write() call (or feeds through the compression stream).
## Future Considerations
The format reserves space for future extensions. The reserved bytes in
the header could hold additional metadata. The 16-byte checksum field in
the footer is currently unused. The version field allows incompatible
changes with graceful rejection. New compression types could be added
(compression_type > 1).
Any changes that alter the meaning of existing fields or the parsing logic
should increment the version number to prevent older readers from
misinterpreting new files.


@ -0,0 +1,120 @@
"""Thin Python wrapper around C binary writer for profiling data."""
import time
import _remote_debugging
from .collector import Collector
# Compression type constants (must match binary_io.h)
COMPRESSION_NONE = 0
COMPRESSION_ZSTD = 1
def _resolve_compression(compression):
"""Resolve compression type from string or int.
Args:
compression: 'auto', 'zstd', 'none', or int (0/1)
Returns:
int: Compression type constant
"""
if isinstance(compression, int):
return compression
compression = compression.lower()
if compression == 'none':
return COMPRESSION_NONE
elif compression == 'zstd':
return COMPRESSION_ZSTD
elif compression == 'auto':
# Auto: use zstd if available, otherwise none
if _remote_debugging.zstd_available():
return COMPRESSION_ZSTD
return COMPRESSION_NONE
else:
raise ValueError(f"Unknown compression type: {compression}")
class BinaryCollector(Collector):
"""High-performance binary collector using C implementation.
This collector writes profiling data directly to a binary file format
with optional zstd compression. All I/O is performed in C for maximum
throughput.
The binary format uses string/frame deduplication and varint encoding
for efficient storage.
"""
def __init__(self, filename, sample_interval_usec, *, skip_idle=False,
compression='auto'):
"""Create a new binary collector.
Args:
filename: Path to output binary file
sample_interval_usec: Sampling interval in microseconds
skip_idle: If True, skip idle threads (not used in binary format)
compression: 'auto', 'zstd', 'none', or int (0=none, 1=zstd)
"""
self.filename = filename
self.sample_interval_usec = sample_interval_usec
self.skip_idle = skip_idle
compression_type = _resolve_compression(compression)
start_time_us = int(time.monotonic() * 1_000_000)
self._writer = _remote_debugging.BinaryWriter(
filename, sample_interval_usec, start_time_us, compression=compression_type
)
def collect(self, stack_frames, timestamp_us=None):
"""Collect profiling data from stack frames.
This passes stack_frames directly to the C writer which handles
all encoding and buffering.
Args:
stack_frames: List of InterpreterInfo objects from _remote_debugging
timestamp_us: Optional timestamp in microseconds. If not provided,
uses time.monotonic() to generate one.
"""
if timestamp_us is None:
timestamp_us = int(time.monotonic() * 1_000_000)
self._writer.write_sample(stack_frames, timestamp_us)
def collect_failed_sample(self):
"""Record a failed sample attempt (no-op for binary format)."""
pass
def export(self, filename=None):
"""Finalize and close the binary file.
Args:
filename: Ignored (binary files are written incrementally)
"""
self._writer.finalize()
@property
def total_samples(self):
return self._writer.total_samples
def get_stats(self):
"""Get encoding statistics.
Returns:
Dict with encoding statistics including repeat/full/suffix/pop-push
record counts, frames written/saved, and compression ratio.
"""
return self._writer.get_stats()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit - finalize unless there was an error."""
if exc_type is None:
self._writer.finalize()
else:
self._writer.close()
return False


@ -0,0 +1,128 @@
"""Thin Python wrapper around C binary reader for profiling data."""
class BinaryReader:
"""High-performance binary reader using C implementation.
This reader uses memory-mapped I/O (on Unix) for fast replay of
profiling data from binary files.
Use as a context manager:
with BinaryReader('profile.bin') as reader:
info = reader.get_info()
reader.replay_samples(collector, progress_callback)
"""
def __init__(self, filename):
"""Create a new binary reader.
Args:
filename: Path to input binary file
"""
self.filename = filename
self._reader = None
def __enter__(self):
import _remote_debugging
self._reader = _remote_debugging.BinaryReader(self.filename)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
if self._reader is not None:
self._reader.close()
self._reader = None
return False
def get_info(self):
"""Get metadata about the binary file.
Returns:
dict: File metadata including:
- sample_count: Number of samples in the file
- sample_interval_us: Sampling interval in microseconds
- start_time_us: Start timestamp in microseconds
- string_count: Number of unique strings
- frame_count: Number of unique frames
- compression_type: Compression type used (0=none, 1=zstd)
"""
if self._reader is None:
raise RuntimeError("Reader not open. Use as context manager.")
return self._reader.get_info()
def replay_samples(self, collector, progress_callback=None):
"""Replay samples from binary file through a collector.
This allows converting binary profiling data to other formats
(e.g., flamegraph, pstats) by replaying through the appropriate
collector.
Args:
collector: A Collector instance with a collect() method
progress_callback: Optional callable(current, total) for progress
Returns:
int: Number of samples replayed
"""
if self._reader is None:
raise RuntimeError("Reader not open. Use as context manager.")
return self._reader.replay(collector, progress_callback)
@property
def sample_count(self):
if self._reader is None:
raise RuntimeError("Reader not open. Use as context manager.")
return self._reader.get_info()['sample_count']
def get_stats(self):
"""Get reconstruction statistics from replay.
Returns:
dict: Statistics about record types decoded and samples
reconstructed during replay.
"""
if self._reader is None:
raise RuntimeError("Reader not open. Use as context manager.")
return self._reader.get_stats()
def convert_binary_to_format(input_file, output_file, output_format,
sample_interval_usec=None, progress_callback=None):
"""Convert a binary profiling file to another format.
Args:
input_file: Path to input binary file
output_file: Path to output file
output_format: Target format ('flamegraph', 'collapsed', 'pstats', etc.)
sample_interval_usec: Override sample interval (uses file's if None)
progress_callback: Optional callable(current, total) for progress
Returns:
int: Number of samples converted
"""
from .gecko_collector import GeckoCollector
from .stack_collector import FlamegraphCollector, CollapsedStackCollector
from .pstats_collector import PstatsCollector
with BinaryReader(input_file) as reader:
info = reader.get_info()
interval = sample_interval_usec or info['sample_interval_us']
# Create appropriate collector based on format
if output_format == 'flamegraph':
collector = FlamegraphCollector(interval)
elif output_format == 'collapsed':
collector = CollapsedStackCollector(interval)
elif output_format == 'pstats':
collector = PstatsCollector(interval)
elif output_format == 'gecko':
collector = GeckoCollector(interval)
else:
raise ValueError(f"Unknown output format: {output_format}")
# Replay samples through collector
count = reader.replay_samples(collector, progress_callback)
# Export to target format
collector.export(output_file)
return count


@ -17,6 +17,8 @@ from .pstats_collector import PstatsCollector
from .stack_collector import CollapsedStackCollector, FlamegraphCollector
from .heatmap_collector import HeatmapCollector
from .gecko_collector import GeckoCollector
from .binary_collector import BinaryCollector
from .binary_reader import BinaryReader
from .constants import (
PROFILING_MODE_ALL,
PROFILING_MODE_WALL,
@ -76,6 +78,7 @@ FORMAT_EXTENSIONS = {
"flamegraph": "html",
"gecko": "json",
"heatmap": "html",
"binary": "bin",
}
COLLECTOR_MAP = {
@ -84,6 +87,7 @@ COLLECTOR_MAP = {
"flamegraph": FlamegraphCollector,
"gecko": GeckoCollector,
"heatmap": HeatmapCollector,
"binary": BinaryCollector,
}
def _setup_child_monitor(args, parent_pid):
@ -181,7 +185,7 @@ def _parse_mode(mode_string):
def _check_process_died(process):
"""Check if process died and raise an error with stderr if available."""
if process.poll() is None:
return # Process still running
return
# Process died - try to get stderr for error message
stderr_msg = ""
@ -366,7 +370,7 @@ def _add_mode_options(parser):
)
def _add_format_options(parser):
def _add_format_options(parser, include_compression=True, include_binary=True):
"""Add output format options to a parser."""
output_group = parser.add_argument_group("Output options")
format_group = output_group.add_mutually_exclusive_group()
@ -405,8 +409,24 @@ def _add_format_options(parser):
dest="format",
help="Generate interactive HTML heatmap visualization with line-level sample counts",
)
if include_binary:
format_group.add_argument(
"--binary",
action="store_const",
const="binary",
dest="format",
help="Generate high-performance binary format (use 'replay' command to convert)",
)
parser.set_defaults(format="pstats")
if include_compression:
output_group.add_argument(
"--compression",
choices=["auto", "zstd", "none"],
default="auto",
help="Compression for binary format: auto (use zstd if available), zstd, none",
)
output_group.add_argument(
"-o",
"--output",
@ -461,15 +481,18 @@ def _sort_to_mode(sort_choice):
return sort_map.get(sort_choice, SORT_MODE_NSAMPLES)
def _create_collector(format_type, interval, skip_idle, opcodes=False):
def _create_collector(format_type, interval, skip_idle, opcodes=False,
output_file=None, compression='auto'):
"""Create the appropriate collector based on format type.
Args:
format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap')
format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap', 'binary')
interval: Sampling interval in microseconds
skip_idle: Whether to skip idle samples
opcodes: Whether to collect opcode information (only used by gecko format
for creating interval markers in Firefox Profiler)
output_file: Output file path (required for binary format)
compression: Compression type for binary format ('auto', 'zstd', 'none')
Returns:
A collector instance of the appropriate type
@ -478,6 +501,13 @@ def _create_collector(format_type, interval, skip_idle, opcodes=False):
if collector_class is None:
raise ValueError(f"Unknown format: {format_type}")
# Binary format requires output file and compression
if format_type == "binary":
if output_file is None:
raise ValueError("Binary format requires an output file")
return collector_class(output_file, interval, skip_idle=skip_idle,
compression=compression)
# Gecko format never skips idle (it needs both GIL and CPU data)
# and is the only format that uses opcodes for interval markers
if format_type == "gecko":
@ -513,7 +543,12 @@ def _handle_output(collector, args, pid, mode):
pid: Process ID (for generating filenames)
mode: Profiling mode used
"""
if args.format == "pstats":
if args.format == "binary":
# Binary format already wrote to file incrementally, just finalize
collector.export(None)
filename = collector.filename
print(f"Binary profile written to {filename} ({collector.total_samples} samples)")
elif args.format == "pstats":
if args.outfile:
# If outfile is a directory, generate filename inside it
if os.path.isdir(args.outfile):
@ -546,6 +581,10 @@ def _validate_args(args, parser):
args: Parsed command-line arguments
parser: ArgumentParser instance for error reporting
"""
# Replay command has no special validation needed
if getattr(args, 'command', None) == "replay":
return
# Check if live mode is available
if hasattr(args, 'live') and args.live and LiveStatsCollector is None:
parser.error(
@ -558,7 +597,7 @@ def _validate_args(args, parser):
parser.error("--subprocesses is incompatible with --live mode.")
# Async-aware mode is incompatible with --native, --no-gc, --mode, and --all-threads
if args.async_aware:
if getattr(args, 'async_aware', False):
issues = []
if args.native:
issues.append("--native")
@ -575,7 +614,7 @@ def _validate_args(args, parser):
)
# --async-mode requires --async-aware
if hasattr(args, 'async_mode') and args.async_mode != "running" and not args.async_aware:
if hasattr(args, 'async_mode') and args.async_mode != "running" and not getattr(args, 'async_aware', False):
parser.error("--async-mode requires --async-aware to be enabled.")
# Live mode is incompatible with format options
@ -603,7 +642,7 @@ def _validate_args(args, parser):
return
# Validate gecko mode doesn't use non-wall mode
if args.format == "gecko" and args.mode != "wall":
if args.format == "gecko" and getattr(args, 'mode', 'wall') != "wall":
parser.error(
"--mode option is incompatible with --gecko. "
"Gecko format automatically includes both GIL-holding and CPU status analysis."
@ -611,7 +650,7 @@ def _validate_args(args, parser):
# Validate --opcodes is only used with compatible formats
opcodes_compatible_formats = ("live", "gecko", "flamegraph", "heatmap")
if args.opcodes and args.format not in opcodes_compatible_formats:
if getattr(args, 'opcodes', False) and args.format not in opcodes_compatible_formats:
parser.error(
f"--opcodes is only compatible with {', '.join('--' + f for f in opcodes_compatible_formats)}."
)
@ -733,6 +772,30 @@ Examples:
_add_format_options(attach_parser)
_add_pstats_options(attach_parser)
# === REPLAY COMMAND ===
replay_parser = subparsers.add_parser(
"replay",
help="Replay a binary profile and convert to another format",
formatter_class=CustomFormatter,
description="""Replay a binary profile file and convert to another format
Examples:
# Convert binary to flamegraph
`python -m profiling.sampling replay --flamegraph -o output.html profile.bin`
# Convert binary to pstats and print to stdout
`python -m profiling.sampling replay profile.bin`
# Convert binary to gecko format
`python -m profiling.sampling replay --gecko -o profile.json profile.bin`""",
)
replay_parser.add_argument(
"input_file",
help="Binary profile file to replay",
)
_add_format_options(replay_parser, include_compression=False, include_binary=False)
_add_pstats_options(replay_parser)
# Parse arguments
args = parser.parse_args()
@ -743,6 +806,7 @@ Examples:
command_handlers = {
"run": _handle_run,
"attach": _handle_attach,
"replay": _handle_replay,
}
# Execute the appropriate command
@ -774,8 +838,16 @@ def _handle_attach(args):
mode != PROFILING_MODE_WALL if mode != PROFILING_MODE_ALL else False
)
output_file = None
if args.format == "binary":
output_file = args.outfile or _generate_output_filename(args.format, args.pid)
# Create the appropriate collector
collector = _create_collector(args.format, args.interval, skip_idle, args.opcodes)
collector = _create_collector(
args.format, args.interval, skip_idle, args.opcodes,
output_file=output_file,
compression=getattr(args, 'compression', 'auto')
)
with _get_child_monitor_context(args, args.pid):
collector = sample(
@ -843,8 +915,16 @@ def _handle_run(args):
mode != PROFILING_MODE_WALL if mode != PROFILING_MODE_ALL else False
)
output_file = None
if args.format == "binary":
output_file = args.outfile or _generate_output_filename(args.format, process.pid)
# Create the appropriate collector
collector = _create_collector(args.format, args.interval, skip_idle, args.opcodes)
collector = _create_collector(
args.format, args.interval, skip_idle, args.opcodes,
output_file=output_file,
compression=getattr(args, 'compression', 'auto')
)
with _get_child_monitor_context(args, process.pid):
try:
@ -963,5 +1043,48 @@ def _handle_live_run(args):
process.wait()
def _handle_replay(args):
"""Handle the 'replay' command - convert binary profile to another format."""
import os
if not os.path.exists(args.input_file):
sys.exit(f"Error: Input file not found: {args.input_file}")
with BinaryReader(args.input_file) as reader:
info = reader.get_info()
interval = info['sample_interval_us']
print(f"Replaying {info['sample_count']} samples from {args.input_file}")
print(f" Sample interval: {interval} us")
print(f" Compression: {'zstd' if info.get('compression_type', 0) == 1 else 'none'}")
collector = _create_collector(args.format, interval, skip_idle=False)
def progress_callback(current, total):
if total > 0:
pct = current / total
bar_width = 40
filled = int(bar_width * pct)
bar = '█' * filled + '░' * (bar_width - filled)
print(f"\r [{bar}] {pct*100:5.1f}% ({current:,}/{total:,})", end="", flush=True)
count = reader.replay_samples(collector, progress_callback)
print()
if args.format == "pstats":
if args.outfile:
collector.export(args.outfile)
else:
sort_choice = args.sort if args.sort is not None else "nsamples"
limit = args.limit if args.limit is not None else 15
sort_mode = _sort_to_mode(sort_choice)
collector.print_stats(sort_mode, limit, not args.no_summary, PROFILING_MODE_WALL)
else:
filename = args.outfile or _generate_output_filename(args.format, os.getpid())
collector.export(filename)
print(f"Replayed {count} samples")
if __name__ == "__main__":
main()


@ -44,8 +44,17 @@ def extract_lineno(location):
class Collector(ABC):
@abstractmethod
def collect(self, stack_frames):
"""Collect profiling data from stack frames."""
def collect(self, stack_frames, timestamps_us=None):
"""Collect profiling data from stack frames.
Args:
stack_frames: List of InterpreterInfo objects
timestamps_us: Optional list of timestamps in microseconds. If provided
(from binary replay with RLE batching), use these instead of current
time. If None, collectors should use time.monotonic() or similar.
The list may contain multiple timestamps when samples are batched
together (same stack, different times).
"""
def collect_failed_sample(self):
"""Collect data about a failed sample attempt."""
@ -79,6 +88,17 @@ class Collector(ABC):
# Phase 3: Build linear stacks from each leaf to root (optimized - no sorting!)
yield from self._build_linear_stacks(leaf_task_ids, task_map, child_to_parent)
def _iter_stacks(self, stack_frames, skip_idle=False):
"""Yield (frames, thread_id) for all stacks, handling both sync and async modes."""
if stack_frames and hasattr(stack_frames[0], "awaited_by"):
for frames, thread_id, _ in self._iter_async_frames(stack_frames):
if frames:
yield frames, thread_id
else:
for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=skip_idle):
if frames:
yield frames, thread_id
def _build_task_graph(self, awaited_info_list):
task_map = {}
child_to_parent = {} # Maps child_id -> (selected_parent_id, parent_count)


@ -66,7 +66,7 @@ class GeckoCollector(Collector):
self.sample_interval_usec = sample_interval_usec
self.skip_idle = skip_idle
self.opcodes_enabled = opcodes
self.start_time = time.time() * 1000 # milliseconds since epoch
self.start_time = time.monotonic() * 1000 # milliseconds since start
# Global string table (shared across all threads)
self.global_strings = ["(root)"] # Start with root
@ -103,6 +103,9 @@ class GeckoCollector(Collector):
# Opcode state tracking per thread: tid -> (opcode, lineno, col_offset, funcname, filename, start_time)
self.opcode_state = {}
# For binary replay: track base timestamp (first sample's timestamp)
self._replay_base_timestamp_us = None
def _track_state_transition(self, tid, condition, active_dict, inactive_dict,
active_name, inactive_name, category, current_time):
"""Track binary state transitions and emit markers.
@ -138,18 +141,35 @@ class GeckoCollector(Collector):
self._add_marker(tid, active_name, active_dict.pop(tid),
current_time, category)
def collect(self, stack_frames):
"""Collect a sample from stack frames."""
current_time = (time.time() * 1000) - self.start_time
def collect(self, stack_frames, timestamps_us=None):
"""Collect samples from stack frames.
Args:
stack_frames: List of interpreter/thread frame info
timestamps_us: List of timestamps in microseconds (None for live sampling)
"""
# Handle live sampling (no timestamps provided)
if timestamps_us is None:
current_time = (time.monotonic() * 1000) - self.start_time
times = [current_time]
else:
if not timestamps_us:
return
# Initialize base timestamp if needed
if self._replay_base_timestamp_us is None:
self._replay_base_timestamp_us = timestamps_us[0]
# Convert all timestamps to times (ms relative to first sample)
base = self._replay_base_timestamp_us
times = [(ts - base) / 1000 for ts in timestamps_us]
first_time = times[0]
# Update interval calculation
if self.sample_count > 0 and self.last_sample_time > 0:
self.interval = (
current_time - self.last_sample_time
) / self.sample_count
self.last_sample_time = current_time
self.interval = (times[-1] - self.last_sample_time) / self.sample_count
self.last_sample_time = times[-1]
# Process threads and track GC per thread
# Process threads
for interpreter_info in stack_frames:
for thread_info in interpreter_info.threads:
frames = thread_info.frame_info
@ -167,92 +187,86 @@ class GeckoCollector(Collector):
on_cpu = bool(status_flags & THREAD_STATUS_ON_CPU)
gil_requested = bool(status_flags & THREAD_STATUS_GIL_REQUESTED)
# Track GIL possession (Has GIL / No GIL)
# Track state transitions using first timestamp
self._track_state_transition(
tid, has_gil, self.has_gil_start, self.no_gil_start,
"Has GIL", "No GIL", CATEGORY_GIL, current_time
"Has GIL", "No GIL", CATEGORY_GIL, first_time
)
# Track CPU state (On CPU / Off CPU)
self._track_state_transition(
tid, on_cpu, self.on_cpu_start, self.off_cpu_start,
"On CPU", "Off CPU", CATEGORY_CPU, current_time
"On CPU", "Off CPU", CATEGORY_CPU, first_time
)
# Track code type (Python Code / Native Code)
# This is tri-state: Python (has_gil), Native (on_cpu without gil), or Neither
# Track code type
if has_gil:
self._track_state_transition(
tid, True, self.python_code_start, self.native_code_start,
"Python Code", "Native Code", CATEGORY_CODE_TYPE, current_time
"Python Code", "Native Code", CATEGORY_CODE_TYPE, first_time
)
elif on_cpu:
self._track_state_transition(
tid, True, self.native_code_start, self.python_code_start,
"Native Code", "Python Code", CATEGORY_CODE_TYPE, current_time
"Native Code", "Python Code", CATEGORY_CODE_TYPE, first_time
)
else:
# Thread is idle (neither has GIL nor on CPU) - close any open code markers
# This handles the third state that _track_state_transition doesn't cover
if tid in self.initialized_threads:
if tid in self.python_code_start:
self._add_marker(tid, "Python Code", self.python_code_start.pop(tid),
current_time, CATEGORY_CODE_TYPE)
first_time, CATEGORY_CODE_TYPE)
if tid in self.native_code_start:
self._add_marker(tid, "Native Code", self.native_code_start.pop(tid),
current_time, CATEGORY_CODE_TYPE)
first_time, CATEGORY_CODE_TYPE)
# Track "Waiting for GIL" intervals (one-sided tracking)
# Track GIL wait
if gil_requested:
self.gil_wait_start.setdefault(tid, current_time)
self.gil_wait_start.setdefault(tid, first_time)
elif tid in self.gil_wait_start:
self._add_marker(tid, "Waiting for GIL", self.gil_wait_start.pop(tid),
current_time, CATEGORY_GIL)
first_time, CATEGORY_GIL)
# Track exception state (Has Exception / No Exception)
# Track exception state
has_exception = bool(status_flags & THREAD_STATUS_HAS_EXCEPTION)
self._track_state_transition(
tid, has_exception, self.exception_start, self.no_exception_start,
"Has Exception", "No Exception", CATEGORY_EXCEPTION, current_time
"Has Exception", "No Exception", CATEGORY_EXCEPTION, first_time
)
# Track GC events by detecting <GC> frames in the stack trace
# This leverages the improved GC frame tracking from commit 336366fd7ca
# which precisely identifies the thread that initiated GC collection
# Track GC events
has_gc_frame = any(frame[2] == "<GC>" for frame in frames)
if has_gc_frame:
# This thread initiated GC collection
if tid not in self.gc_start_per_thread:
self.gc_start_per_thread[tid] = current_time
self.gc_start_per_thread[tid] = first_time
elif tid in self.gc_start_per_thread:
# End GC marker when no more GC frames are detected
self._add_marker(tid, "GC Collecting", self.gc_start_per_thread.pop(tid),
current_time, CATEGORY_GC)
first_time, CATEGORY_GC)
# Mark thread as initialized after processing all state transitions
# Mark thread as initialized
self.initialized_threads.add(tid)
# Categorize: idle if neither has GIL nor on CPU
# Skip idle threads if requested
is_idle = not has_gil and not on_cpu
# Skip idle threads if skip_idle is enabled
if self.skip_idle and is_idle:
continue
if not frames:
continue
# Process the stack
# Process stack once to get stack_index
stack_index = self._process_stack(thread_data, frames)
# Add sample - cache references to avoid dictionary lookups
# Add samples with timestamps
samples = thread_data["samples"]
samples["stack"].append(stack_index)
samples["time"].append(current_time)
samples["eventDelay"].append(None)
samples_stack = samples["stack"]
samples_time = samples["time"]
samples_delay = samples["eventDelay"]
# Track opcode state changes for interval markers (leaf frame only)
if self.opcodes_enabled:
for t in times:
samples_stack.append(stack_index)
samples_time.append(t)
samples_delay.append(None)
# Handle opcodes
if self.opcodes_enabled and frames:
leaf_frame = frames[0]
filename, location, funcname, opcode = leaf_frame
if isinstance(location, tuple):
@ -264,18 +278,15 @@ class GeckoCollector(Collector):
current_state = (opcode, lineno, col_offset, funcname, filename)
if tid not in self.opcode_state:
# First observation - start tracking
self.opcode_state[tid] = (*current_state, current_time)
self.opcode_state[tid] = (*current_state, first_time)
elif self.opcode_state[tid][:5] != current_state:
# State changed - emit marker for previous state
prev_opcode, prev_lineno, prev_col, prev_funcname, prev_filename, prev_start = self.opcode_state[tid]
self._add_opcode_interval_marker(
tid, prev_opcode, prev_lineno, prev_col, prev_funcname, prev_start, current_time
tid, prev_opcode, prev_lineno, prev_col, prev_funcname, prev_start, first_time
)
# Start tracking new state
self.opcode_state[tid] = (*current_state, current_time)
self.opcode_state[tid] = (*current_state, first_time)
self.sample_count += 1
self.sample_count += len(times)
def _create_thread(self, tid):
"""Create a new thread structure with processed profile format."""

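The replay-time conversion is easy to check by hand: the first timestamp becomes
the base, and every later timestamp is expressed in milliseconds relative to it.
A sketch of the same arithmetic with made-up values::

    base = None

    def to_relative_ms(timestamps_us):
        global base
        if base is None:
            base = timestamps_us[0]            # first sample defines time zero
        return [(ts - base) / 1000 for ts in timestamps_us]

    print(to_relative_ms([5_000_000, 5_010_000]))  # [0.0, 10.0]
    print(to_relative_ms([5_050_000]))             # [50.0]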
View file

@ -521,7 +521,7 @@ class HeatmapCollector(StackTraceCollector):
}
self.stats.update(kwargs)
def process_frames(self, frames, thread_id):
def process_frames(self, frames, thread_id, weight=1):
"""Process stack frames and count samples per line.
Args:
@ -529,8 +529,9 @@ class HeatmapCollector(StackTraceCollector):
leaf-to-root order. location is (lineno, end_lineno, col_offset, end_col_offset).
opcode is None if not gathered.
thread_id: Thread ID for this stack trace
weight: Number of samples this stack represents (for batched RLE)
"""
self._total_samples += 1
self._total_samples += weight
self._seen_lines.clear()
for i, (filename, location, funcname, opcode) in enumerate(frames):
@ -548,15 +549,16 @@ class HeatmapCollector(StackTraceCollector):
self._seen_lines.add(line_key)
self._record_line_sample(filename, lineno, funcname, is_leaf=is_leaf,
count_cumulative=count_cumulative)
count_cumulative=count_cumulative, weight=weight)
if opcode is not None:
# Set opcodes_enabled flag when we first encounter opcode data
self.opcodes_enabled = True
self._record_bytecode_sample(filename, lineno, opcode,
end_lineno, col_offset, end_col_offset)
end_lineno, col_offset, end_col_offset,
weight=weight)
# Build call graph for adjacent frames
# Build call graph for adjacent frames (relationships are deduplicated anyway)
if i + 1 < len(frames):
next_frame = frames[i + 1]
next_lineno = extract_lineno(next_frame[1])
@ -578,24 +580,25 @@ class HeatmapCollector(StackTraceCollector):
return True
def _record_line_sample(self, filename, lineno, funcname, is_leaf=False,
count_cumulative=True):
count_cumulative=True, weight=1):
"""Record a sample for a specific line."""
# Track cumulative samples (all occurrences in stack)
if count_cumulative:
self.line_samples[(filename, lineno)] += 1
self.file_samples[filename][lineno] += 1
self.line_samples[(filename, lineno)] += weight
self.file_samples[filename][lineno] += weight
# Track self/leaf samples (only when at top of stack)
if is_leaf:
self.line_self_samples[(filename, lineno)] += 1
self.file_self_samples[filename][lineno] += 1
self.line_self_samples[(filename, lineno)] += weight
self.file_self_samples[filename][lineno] += weight
# Record function definition location
if funcname and (filename, funcname) not in self.function_definitions:
self.function_definitions[(filename, funcname)] = lineno
def _record_bytecode_sample(self, filename, lineno, opcode,
end_lineno=None, col_offset=None, end_col_offset=None):
end_lineno=None, col_offset=None, end_col_offset=None,
weight=1):
"""Record a sample for a specific bytecode instruction.
Args:
@ -605,6 +608,7 @@ class HeatmapCollector(StackTraceCollector):
end_lineno: End line number (may be -1 if not available)
col_offset: Column offset in UTF-8 bytes (may be -1 if not available)
end_col_offset: End column offset in UTF-8 bytes (may be -1 if not available)
weight: Number of samples this represents (for batched RLE)
"""
key = (filename, lineno)
@ -612,7 +616,7 @@ class HeatmapCollector(StackTraceCollector):
if opcode not in self.line_opcodes[key]:
self.line_opcodes[key][opcode] = {'count': 0, 'locations': set()}
self.line_opcodes[key][opcode]['count'] += 1
self.line_opcodes[key][opcode]['count'] += weight
# Store unique location info if column offset is available (not -1)
if col_offset is not None and col_offset >= 0:

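A subtlety worth spelling out: ``_seen_lines`` ensures a line is counted
cumulatively at most once per stack, even under recursion, while ``weight``
scales that single count for batched samples. A hedged sketch of the same
counting rule (names are illustrative)::

    from collections import Counter

    def count_stack(lines, counts, weight=1):
        # Count each (filename, lineno) once per stack, scaled by weight.
        seen = set()
        for key in lines:              # leaf-to-root (filename, lineno) pairs
            if key in seen:            # recursive frame: already counted
                continue
            seen.add(key)
            counts[key] += weight

    counts = Counter()
    count_stack([("a.py", 3), ("a.py", 3), ("a.py", 9)], counts, weight=4)
    print(counts)   # Counter({('a.py', 3): 4, ('a.py', 9): 4})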
View file

@ -348,7 +348,7 @@ class LiveStatsCollector(Collector):
self.failed_samples += 1
self.total_samples += 1
def collect(self, stack_frames):
def collect(self, stack_frames, timestamps_us=None):
"""Collect and display profiling data."""
if self.start_time is None:
self.start_time = time.perf_counter()

View file

@ -18,7 +18,7 @@ class PstatsCollector(Collector):
self.skip_idle = skip_idle
self._seen_locations = set()
def _process_frames(self, frames):
def _process_frames(self, frames, weight=1):
"""Process a single thread's frame stack."""
if not frames:
return
@ -32,12 +32,12 @@ class PstatsCollector(Collector):
location = (frame.filename, lineno, frame.funcname)
if location not in self._seen_locations:
self._seen_locations.add(location)
self.result[location]["cumulative_calls"] += 1
self.result[location]["cumulative_calls"] += weight
# The top frame gets counted as an inline call (directly executing)
top_lineno = extract_lineno(frames[0].location)
top_location = (frames[0].filename, top_lineno, frames[0].funcname)
self.result[top_location]["direct_calls"] += 1
self.result[top_location]["direct_calls"] += weight
# Track caller-callee relationships for call graph
for i in range(1, len(frames)):
@ -49,17 +49,12 @@ class PstatsCollector(Collector):
callee = (callee_frame.filename, callee_lineno, callee_frame.funcname)
caller = (caller_frame.filename, caller_lineno, caller_frame.funcname)
self.callers[callee][caller] += 1
self.callers[callee][caller] += weight
def collect(self, stack_frames):
if stack_frames and hasattr(stack_frames[0], "awaited_by"):
# Async frame processing
for frames, thread_id, task_id in self._iter_async_frames(stack_frames):
self._process_frames(frames)
else:
# Regular frame processing
for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=self.skip_idle):
self._process_frames(frames)
def collect(self, stack_frames, timestamps_us=None):
weight = len(timestamps_us) if timestamps_us else 1
for frames, _ in self._iter_stacks(stack_frames, skip_idle=self.skip_idle):
self._process_frames(frames, weight=weight)
def export(self, filename):
self.create_stats()

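For intuition about the weighted counters: a batch of N identical samples must
behave exactly like N individual samples, adding N direct calls to the leaf and
N cumulative calls to each distinct function on the stack. A deliberately
simplified model (the module's own bookkeeping around ``_seen_locations``
differs)::

    from collections import defaultdict

    direct = defaultdict(int)
    cumulative = defaultdict(int)

    def record(stack, weight=1):       # stack is leaf-first, e.g. ["g", "f"]
        for func in set(stack):        # each distinct function, once per sample
            cumulative[func] += weight
        direct[stack[0]] += weight     # only the leaf is directly executing

    record(["g", "f"], weight=3)       # one RLE batch of three samples
    print(dict(direct))                # {'g': 3}
    print(dict(cumulative))            # {'g': 3, 'f': 3}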
View file

@ -7,6 +7,7 @@ import time
from collections import deque
from _colorize import ANSIColors
from .binary_collector import BinaryCollector
from .constants import (
PROFILING_MODE_WALL,
PROFILING_MODE_CPU,
@ -139,6 +140,9 @@ class SampleProfiler:
if self.collect_stats:
self._print_unwinder_stats()
if isinstance(collector, BinaryCollector):
self._print_binary_stats(collector)
# Pass stats to any collector that supports set_stats()
if hasattr(collector, 'set_stats'):
collector.set_stats(self.sample_interval_usec, running_time, sample_rate, error_rate, missed_samples, mode=self.mode)
@ -264,6 +268,53 @@ class SampleProfiler:
if stale_invalidations > 0:
print(f" {ANSIColors.YELLOW}Stale cache invalidations: {stale_invalidations}{ANSIColors.RESET}")
def _print_binary_stats(self, collector):
"""Print binary I/O encoding statistics."""
try:
stats = collector.get_stats()
except (ValueError, RuntimeError):
return # Collector closed or stats unavailable
print(f" {ANSIColors.CYAN}Binary Encoding:{ANSIColors.RESET}")
repeat_records = stats.get('repeat_records', 0)
repeat_samples = stats.get('repeat_samples', 0)
full_records = stats.get('full_records', 0)
suffix_records = stats.get('suffix_records', 0)
pop_push_records = stats.get('pop_push_records', 0)
total_records = stats.get('total_records', 0)
if total_records > 0:
repeat_pct = repeat_records / total_records * 100
full_pct = full_records / total_records * 100
suffix_pct = suffix_records / total_records * 100
pop_push_pct = pop_push_records / total_records * 100
else:
repeat_pct = full_pct = suffix_pct = pop_push_pct = 0
print(f" Records: {total_records:,}")
print(f" RLE repeat: {repeat_records:,} ({ANSIColors.GREEN}{repeat_pct:.1f}%{ANSIColors.RESET}) [{repeat_samples:,} samples]")
print(f" Full stack: {full_records:,} ({full_pct:.1f}%)")
print(f" Suffix match: {suffix_records:,} ({suffix_pct:.1f}%)")
print(f" Pop-push: {pop_push_records:,} ({pop_push_pct:.1f}%)")
frames_written = stats.get('total_frames_written', 0)
frames_saved = stats.get('frames_saved', 0)
compression_pct = stats.get('frame_compression_pct', 0)
print(f" {ANSIColors.CYAN}Frame Efficiency:{ANSIColors.RESET}")
print(f" Frames written: {frames_written:,}")
print(f" Frames saved: {frames_saved:,} ({ANSIColors.GREEN}{compression_pct:.1f}%{ANSIColors.RESET})")
bytes_written = stats.get('bytes_written', 0)
if bytes_written >= 1024 * 1024:
bytes_str = f"{bytes_written / (1024 * 1024):.1f} MB"
elif bytes_written >= 1024:
bytes_str = f"{bytes_written / 1024:.1f} KB"
else:
bytes_str = f"{bytes_written} B"
print(f" Bytes (pre-zstd): {bytes_str}")
def _is_process_running(pid):
if pid <= 0:

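The byte formatting in ``_print_binary_stats`` uses the usual 1024-based
thresholds; the same logic as a standalone helper::

    def format_bytes(n):
        if n >= 1024 * 1024:
            return f"{n / (1024 * 1024):.1f} MB"
        if n >= 1024:
            return f"{n / 1024:.1f} KB"
        return f"{n} B"

    assert format_bytes(512) == "512 B"
    assert format_bytes(2048) == "2.0 KB"
    assert format_bytes(3 * 1024 * 1024) == "3.0 MB"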
View file

@ -18,21 +18,12 @@ class StackTraceCollector(Collector):
self.sample_interval_usec = sample_interval_usec
self.skip_idle = skip_idle
def collect(self, stack_frames, skip_idle=False):
if stack_frames and hasattr(stack_frames[0], "awaited_by"):
# Async-aware mode: process async task frames
for frames, thread_id, task_id in self._iter_async_frames(stack_frames):
if not frames:
continue
self.process_frames(frames, thread_id)
else:
# Sync-only mode
for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=skip_idle):
if not frames:
continue
self.process_frames(frames, thread_id)
def collect(self, stack_frames, timestamps_us=None, skip_idle=False):
weight = len(timestamps_us) if timestamps_us else 1
for frames, thread_id in self._iter_stacks(stack_frames, skip_idle=skip_idle):
self.process_frames(frames, thread_id, weight=weight)
def process_frames(self, frames, thread_id):
def process_frames(self, frames, thread_id, weight=1):
pass
@ -41,13 +32,13 @@ class CollapsedStackCollector(StackTraceCollector):
super().__init__(*args, **kwargs)
self.stack_counter = collections.Counter()
def process_frames(self, frames, thread_id):
def process_frames(self, frames, thread_id, weight=1):
# Extract only (filename, lineno, funcname) - opcode not needed for collapsed stacks
# frame is (filename, location, funcname, opcode)
call_tree = tuple(
(f[0], extract_lineno(f[1]), f[2]) for f in reversed(frames)
)
self.stack_counter[(call_tree, thread_id)] += 1
self.stack_counter[(call_tree, thread_id)] += weight
def export(self, filename):
lines = []
@ -96,23 +87,26 @@ class FlamegraphCollector(StackTraceCollector):
# Per-thread statistics
self.per_thread_stats = {} # {thread_id: {has_gil, on_cpu, gil_requested, unknown, has_exception, total, gc_samples}}
def collect(self, stack_frames, skip_idle=False):
def collect(self, stack_frames, timestamps_us=None, skip_idle=False):
"""Override to track thread status statistics before processing frames."""
# Increment sample count once per sample
self._sample_count += 1
# Weight is number of timestamps (samples with identical stack)
weight = len(timestamps_us) if timestamps_us else 1
# Increment sample count by weight
self._sample_count += weight
# Collect both aggregate and per-thread statistics using base method
status_counts, has_gc_frame, per_thread_stats = self._collect_thread_status_stats(stack_frames)
# Merge aggregate status counts
# Merge aggregate status counts (multiply by weight)
for key in status_counts:
self.thread_status_counts[key] += status_counts[key]
self.thread_status_counts[key] += status_counts[key] * weight
# Update aggregate GC frame count
if has_gc_frame:
self.samples_with_gc_frames += 1
self.samples_with_gc_frames += weight
# Merge per-thread statistics
# Merge per-thread statistics (multiply by weight)
for thread_id, stats in per_thread_stats.items():
if thread_id not in self.per_thread_stats:
self.per_thread_stats[thread_id] = {
@ -125,10 +119,10 @@ class FlamegraphCollector(StackTraceCollector):
"gc_samples": 0,
}
for key, value in stats.items():
self.per_thread_stats[thread_id][key] += value
self.per_thread_stats[thread_id][key] += value * weight
# Call parent collect to process frames
super().collect(stack_frames, skip_idle=skip_idle)
super().collect(stack_frames, timestamps_us, skip_idle=skip_idle)
def set_stats(self, sample_interval_usec, duration_sec, sample_rate,
error_rate=None, missed_samples=None, mode=None):
@ -311,7 +305,7 @@ class FlamegraphCollector(StackTraceCollector):
"opcode_mapping": opcode_mapping
}
def process_frames(self, frames, thread_id):
def process_frames(self, frames, thread_id, weight=1):
"""Process stack frames into flamegraph tree structure.
Args:
@ -319,10 +313,11 @@ class FlamegraphCollector(StackTraceCollector):
leaf-to-root order. location is (lineno, end_lineno, col_offset, end_col_offset).
opcode is None if not gathered.
thread_id: Thread ID for this stack trace
weight: Number of samples this stack represents (for batched RLE)
"""
# Reverse to root->leaf order for tree building
self._root["samples"] += 1
self._total_samples += 1
self._root["samples"] += weight
self._total_samples += weight
self._root["threads"].add(thread_id)
self._all_threads.add(thread_id)
@ -336,11 +331,11 @@ class FlamegraphCollector(StackTraceCollector):
if node is None:
node = {"samples": 0, "children": {}, "threads": set(), "opcodes": collections.Counter()}
current["children"][func] = node
node["samples"] += 1
node["samples"] += weight
node["threads"].add(thread_id)
if opcode is not None:
node["opcodes"][opcode] += 1
node["opcodes"][opcode] += weight
current = node

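The tree update above is the standard weighted trie insertion used by flame
graphs; a self-contained sketch with a simplified node shape (illustrative
names, no thread or opcode tracking)::

    def add_stack(root, funcs, weight=1):
        # Insert a root-to-leaf list of names, adding weight at each node.
        root["samples"] += weight
        node = root
        for func in funcs:
            child = node["children"].get(func)
            if child is None:
                child = {"samples": 0, "children": {}}
                node["children"][func] = child
            child["samples"] += weight
            node = child

    root = {"samples": 0, "children": {}}
    add_stack(root, ["main", "work"], weight=2)
    add_stack(root, ["main", "idle"], weight=1)
    print(root["samples"])                      # 3
    print(root["children"]["main"]["samples"])  # 3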
File diff suppressed because it is too large.

View file

@ -0,0 +1,4 @@
Add binary output format to :mod:`profiling.sampling` for compact storage of
profiling data. The new ``--binary`` option captures samples to a file that
can be converted to other formats using the ``replay`` command. Patch by
Pablo Galindo.

View file

@ -41,7 +41,7 @@
@MODULE__PICKLE_TRUE@_pickle _pickle.c
@MODULE__QUEUE_TRUE@_queue _queuemodule.c
@MODULE__RANDOM_TRUE@_random _randommodule.c
@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c _remote_debugging/subprocess.c
@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c _remote_debugging/binary_io_writer.c _remote_debugging/binary_io_reader.c _remote_debugging/subprocess.c
@MODULE__STRUCT_TRUE@_struct _struct.c
# build supports subinterpreters

View file

@ -9,15 +9,19 @@
#define Py_REMOTE_DEBUGGING_H
/* _GNU_SOURCE must be defined before any system headers */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#ifdef __cplusplus
extern "C" {
#endif
#ifndef Py_BUILD_CORE_BUILTIN
# ifndef Py_BUILD_CORE_MODULE
# define Py_BUILD_CORE_MODULE 1
# endif
#endif
#include "Python.h"
#include "internal/pycore_debug_offsets.h" // _Py_DebugOffsets
@ -205,6 +209,8 @@ typedef struct {
PyTypeObject *ThreadInfo_Type;
PyTypeObject *InterpreterInfo_Type;
PyTypeObject *AwaitedInfo_Type;
PyTypeObject *BinaryWriter_Type;
PyTypeObject *BinaryReader_Type;
} RemoteDebuggingState;
enum _ThreadState {

View file

@ -0,0 +1,638 @@
/******************************************************************************
* Python Remote Debugging Module - Binary I/O Header
*
* This header provides declarations for high-performance binary file I/O
* for profiling data with optional zstd streaming compression.
******************************************************************************/
#ifndef Py_BINARY_IO_H
#define Py_BINARY_IO_H
#ifdef __cplusplus
extern "C" {
#endif
#include "Python.h"
#include "pycore_hashtable.h"
#include <stdint.h>
#include <stdio.h>
/* ============================================================================
* BINARY FORMAT CONSTANTS
* ============================================================================ */
#define BINARY_FORMAT_MAGIC 0x54414348 /* "TACH" (Tachyon) in native byte order */
#define BINARY_FORMAT_MAGIC_SWAPPED 0x48434154 /* Byte-swapped magic for endianness detection */
#define BINARY_FORMAT_VERSION 1
/* Conditional byte-swap macros for cross-endian file reading.
* Uses Python's optimized byte-swap functions from pycore_bitutils.h */
#define SWAP16_IF(swap, x) ((swap) ? _Py_bswap16(x) : (x))
#define SWAP32_IF(swap, x) ((swap) ? _Py_bswap32(x) : (x))
#define SWAP64_IF(swap, x) ((swap) ? _Py_bswap64(x) : (x))
/* Header field offsets and sizes */
#define HDR_OFF_MAGIC 0
#define HDR_SIZE_MAGIC 4
#define HDR_OFF_VERSION (HDR_OFF_MAGIC + HDR_SIZE_MAGIC)
#define HDR_SIZE_VERSION 4
#define HDR_OFF_PY_VERSION (HDR_OFF_VERSION + HDR_SIZE_VERSION)
#define HDR_SIZE_PY_VERSION 4 /* 3 bytes: major, minor, micro + 1 reserved */
#define HDR_OFF_PY_MAJOR HDR_OFF_PY_VERSION
#define HDR_OFF_PY_MINOR (HDR_OFF_PY_VERSION + 1)
#define HDR_OFF_PY_MICRO (HDR_OFF_PY_VERSION + 2)
#define HDR_OFF_START_TIME (HDR_OFF_PY_VERSION + HDR_SIZE_PY_VERSION)
#define HDR_SIZE_START_TIME 8
#define HDR_OFF_INTERVAL (HDR_OFF_START_TIME + HDR_SIZE_START_TIME)
#define HDR_SIZE_INTERVAL 8
#define HDR_OFF_SAMPLES (HDR_OFF_INTERVAL + HDR_SIZE_INTERVAL)
#define HDR_SIZE_SAMPLES 4
#define HDR_OFF_THREADS (HDR_OFF_SAMPLES + HDR_SIZE_SAMPLES)
#define HDR_SIZE_THREADS 4
#define HDR_OFF_STR_TABLE (HDR_OFF_THREADS + HDR_SIZE_THREADS)
#define HDR_SIZE_STR_TABLE 8
#define HDR_OFF_FRAME_TABLE (HDR_OFF_STR_TABLE + HDR_SIZE_STR_TABLE)
#define HDR_SIZE_FRAME_TABLE 8
#define HDR_OFF_COMPRESSION (HDR_OFF_FRAME_TABLE + HDR_SIZE_FRAME_TABLE)
#define HDR_SIZE_COMPRESSION 4
#define FILE_HEADER_SIZE (HDR_OFF_COMPRESSION + HDR_SIZE_COMPRESSION)
#define FILE_HEADER_PLACEHOLDER_SIZE 64
static_assert(FILE_HEADER_SIZE <= FILE_HEADER_PLACEHOLDER_SIZE,
"FILE_HEADER_SIZE exceeds FILE_HEADER_PLACEHOLDER_SIZE");
/* Buffer sizes: 512KB balances syscall amortization against memory use,
* and aligns well with filesystem block sizes and zstd dictionary windows */
#define WRITE_BUFFER_SIZE (512 * 1024)
#define COMPRESSED_BUFFER_SIZE (512 * 1024)
/* Compression types */
#define COMPRESSION_NONE 0
#define COMPRESSION_ZSTD 1
/* Stack encoding types for delta compression */
#define STACK_REPEAT 0x00 /* RLE: identical to previous, with count */
#define STACK_FULL 0x01 /* Full stack (first sample or no match) */
#define STACK_SUFFIX 0x02 /* Shares N frames from bottom */
#define STACK_POP_PUSH 0x03 /* Remove M frames, add N frames */
/* Maximum stack depth we'll buffer for delta encoding */
#define MAX_STACK_DEPTH 256
/* Initial capacity for RLE pending buffer */
#define INITIAL_RLE_CAPACITY 64
/* Initial capacities for dynamic arrays - sized to reduce reallocations */
#define INITIAL_STRING_CAPACITY 4096
#define INITIAL_FRAME_CAPACITY 4096
#define INITIAL_THREAD_CAPACITY 256
/* ============================================================================
* STATISTICS STRUCTURES
* ============================================================================ */
/* Writer statistics - tracks encoding efficiency */
typedef struct {
uint64_t repeat_records; /* Number of RLE repeat records written */
uint64_t repeat_samples; /* Total samples encoded via RLE */
uint64_t full_records; /* Number of full stack records */
uint64_t suffix_records; /* Number of suffix match records */
uint64_t pop_push_records; /* Number of pop-push records */
uint64_t total_frames_written;/* Total frame indices written */
uint64_t frames_saved; /* Frames avoided due to delta encoding */
uint64_t bytes_written; /* Total bytes written (before compression) */
} BinaryWriterStats;
/* Reader statistics - tracks reconstruction performance */
typedef struct {
uint64_t repeat_records; /* RLE records decoded */
uint64_t repeat_samples; /* Samples decoded from RLE */
uint64_t full_records; /* Full stack records decoded */
uint64_t suffix_records; /* Suffix match records decoded */
uint64_t pop_push_records; /* Pop-push records decoded */
uint64_t total_samples; /* Total samples reconstructed */
uint64_t stack_reconstructions; /* Number of stack array reconstructions */
} BinaryReaderStats;
/* ============================================================================
* PLATFORM ABSTRACTION
* ============================================================================ */
#if defined(__linux__) || defined(__APPLE__)
#include <sys/mman.h>
#include <unistd.h>
#include <sys/stat.h>
#include <fcntl.h>
#define USE_MMAP 1
#else
#define USE_MMAP 0
#endif
/* 64-bit file position support for files larger than 2GB.
* On POSIX: use ftello/fseeko with off_t (already 64-bit on 64-bit systems)
* On Windows: use _ftelli64/_fseeki64 with __int64 */
#if defined(_WIN32) || defined(_WIN64)
#include <io.h>
typedef __int64 file_offset_t;
#define FTELL64(fp) _ftelli64(fp)
#define FSEEK64(fp, offset, whence) _fseeki64(fp, offset, whence)
#else
/* POSIX - off_t is 64-bit on 64-bit systems, ftello/fseeko handle large files */
typedef off_t file_offset_t;
#define FTELL64(fp) ftello(fp)
#define FSEEK64(fp, offset, whence) fseeko(fp, offset, whence)
#endif
/* Forward declare zstd types if available */
#ifdef HAVE_ZSTD
#include <zstd.h>
#endif
/* Branch prediction hints - same as Objects/obmalloc.c */
#if (defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 2))) && defined(__OPTIMIZE__)
# define UNLIKELY(value) __builtin_expect((value), 0)
# define LIKELY(value) __builtin_expect((value), 1)
#else
# define UNLIKELY(value) (value)
# define LIKELY(value) (value)
#endif
/* ============================================================================
* BINARY WRITER STRUCTURES
* ============================================================================ */
/* zstd compression state (only used if HAVE_ZSTD defined) */
typedef struct {
#ifdef HAVE_ZSTD
ZSTD_CCtx *cctx; /* Modern API: CCtx and CStream are the same since v1.3.0 */
#else
void *cctx; /* Placeholder */
#endif
uint8_t *compressed_buffer;
size_t compressed_buffer_size;
} ZstdCompressor;
/* Frame entry - combines all frame data for better cache locality */
typedef struct {
uint32_t filename_idx;
uint32_t funcname_idx;
int32_t lineno;
} FrameEntry;
/* Frame key for hash table lookup */
typedef struct {
uint32_t filename_idx;
uint32_t funcname_idx;
int32_t lineno;
} FrameKey;
/* Pending RLE sample - buffered for run-length encoding */
typedef struct {
uint64_t timestamp_delta;
uint8_t status;
} PendingRLESample;
/* Thread entry - tracks per-thread state for delta encoding */
typedef struct {
uint64_t thread_id;
uint64_t prev_timestamp;
uint32_t interpreter_id;
/* Previous stack for delta encoding (frame indices, innermost first) */
uint32_t *prev_stack;
size_t prev_stack_depth;
size_t prev_stack_capacity;
/* RLE pending buffer - samples waiting to be written as a repeat group */
PendingRLESample *pending_rle;
size_t pending_rle_count;
size_t pending_rle_capacity;
int has_pending_rle; /* Flag: do we have buffered repeats? */
} ThreadEntry;
/* Main binary writer structure */
typedef struct {
FILE *fp;
char *filename;
/* Write buffer for batched I/O */
uint8_t *write_buffer;
size_t buffer_pos;
size_t buffer_size;
/* Compression */
int compression_type;
ZstdCompressor zstd;
/* Metadata */
uint64_t start_time_us;
uint64_t sample_interval_us;
uint32_t total_samples;
/* String hash table: PyObject* -> uint32_t index */
_Py_hashtable_t *string_hash;
/* String storage: array of UTF-8 encoded strings */
char **strings;
size_t *string_lengths;
size_t string_count;
size_t string_capacity;
/* Frame hash table: FrameKey* -> uint32_t index */
_Py_hashtable_t *frame_hash;
/* Frame storage: combined struct for better cache locality */
FrameEntry *frame_entries;
size_t frame_count;
size_t frame_capacity;
/* Thread timestamp tracking for delta encoding - combined for cache locality */
ThreadEntry *thread_entries;
size_t thread_count;
size_t thread_capacity;
/* Statistics */
BinaryWriterStats stats;
} BinaryWriter;
/* ============================================================================
* BINARY READER STRUCTURES
* ============================================================================ */
/* Per-thread state for stack reconstruction during replay */
typedef struct {
uint64_t thread_id;
uint32_t interpreter_id;
uint64_t prev_timestamp;
/* Reconstructed stack buffer (frame indices, innermost first) */
uint32_t *current_stack;
size_t current_stack_depth;
size_t current_stack_capacity;
} ReaderThreadState;
/* Main binary reader structure */
typedef struct {
char *filename;
#if USE_MMAP
int fd;
uint8_t *mapped_data;
size_t mapped_size;
#else
FILE *fp;
uint8_t *file_data;
size_t file_size;
#endif
/* Decompression state */
int compression_type;
/* Note: ZSTD_DCtx is not stored - created/freed during decompression */
uint8_t *decompressed_data;
size_t decompressed_size;
/* Header metadata */
uint8_t py_major;
uint8_t py_minor;
uint8_t py_micro;
int needs_swap; /* Non-zero if file was written on different-endian system */
uint64_t start_time_us;
uint64_t sample_interval_us;
uint32_t sample_count;
uint32_t thread_count;
uint64_t string_table_offset;
uint64_t frame_table_offset;
/* Parsed string table: array of Python string objects */
PyObject **strings;
uint32_t strings_count;
/* Parsed frame table: packed as [filename_idx, funcname_idx, lineno] */
uint32_t *frame_data;
uint32_t frames_count;
/* Sample data region */
uint8_t *sample_data;
size_t sample_data_size;
/* Per-thread state for stack reconstruction (used during replay) */
ReaderThreadState *thread_states;
size_t thread_state_count;
size_t thread_state_capacity;
/* Statistics */
BinaryReaderStats stats;
} BinaryReader;
/* ============================================================================
* VARINT ENCODING/DECODING (INLINE FOR PERFORMANCE)
* ============================================================================ */
/* Encode unsigned 64-bit varint (LEB128). Returns bytes written. */
static inline size_t
encode_varint_u64(uint8_t *buf, uint64_t value)
{
/* Fast path for single-byte values (0-127) - very common case */
if (value < 0x80) {
buf[0] = (uint8_t)value;
return 1;
}
size_t i = 0;
while (value >= 0x80) {
buf[i++] = (uint8_t)((value & 0x7F) | 0x80);
value >>= 7;
}
buf[i++] = (uint8_t)(value & 0x7F);
return i;
}
/* Encode unsigned 32-bit varint. Returns bytes written. */
static inline size_t
encode_varint_u32(uint8_t *buf, uint32_t value)
{
return encode_varint_u64(buf, value);
}
/* Encode signed 32-bit varint (zigzag encoding). Returns bytes written. */
static inline size_t
encode_varint_i32(uint8_t *buf, int32_t value)
{
/* Zigzag encode: map signed to unsigned */
uint32_t zigzag = ((uint32_t)value << 1) ^ (uint32_t)(value >> 31);
return encode_varint_u32(buf, zigzag);
}
/* Decode unsigned 64-bit varint (LEB128). Updates offset only on success.
* On error (overflow or incomplete), offset is NOT updated, allowing callers
* to detect errors via (offset == prev_offset) check. Sets PyErr on error. */
static inline uint64_t
decode_varint_u64(const uint8_t *data, size_t *offset, size_t max_size)
{
size_t pos = *offset;
uint64_t result = 0;
int shift = 0;
/* Fast path for single-byte varints (0-127) - most common case */
if (LIKELY(pos < max_size && (data[pos] & 0x80) == 0)) {
*offset = pos + 1;
return data[pos];
}
while (pos < max_size) {
uint8_t byte = data[pos++];
result |= (uint64_t)(byte & 0x7F) << shift;
if ((byte & 0x80) == 0) {
*offset = pos;
return result;
}
shift += 7;
if (UNLIKELY(shift >= 64)) {
PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data");
return 0;
}
}
PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data");
return 0;
}
/* Decode unsigned 32-bit varint. If value exceeds UINT32_MAX, treats as error. */
static inline uint32_t
decode_varint_u32(const uint8_t *data, size_t *offset, size_t max_size)
{
size_t saved_offset = *offset;
uint64_t value = decode_varint_u64(data, offset, max_size);
if (PyErr_Occurred()) {
return 0;
}
if (UNLIKELY(value > UINT32_MAX)) {
*offset = saved_offset;
PyErr_SetString(PyExc_ValueError, "Invalid or incomplete varint in binary data");
return 0;
}
return (uint32_t)value;
}
/* Decode signed 32-bit varint (zigzag encoding). */
static inline int32_t
decode_varint_i32(const uint8_t *data, size_t *offset, size_t max_size)
{
uint32_t zigzag = decode_varint_u32(data, offset, max_size);
if (PyErr_Occurred()) {
return 0;
}
return (int32_t)((zigzag >> 1) ^ -(int32_t)(zigzag & 1));
}
/* ============================================================================
* SHARED UTILITY FUNCTIONS
* ============================================================================ */
/* Generic array growth - returns new pointer or NULL (sets PyErr_NoMemory)
* Includes overflow checking for capacity doubling and allocation size. */
static inline void *
grow_array(void *ptr, size_t *capacity, size_t elem_size)
{
size_t old_cap = *capacity;
/* Check for overflow when doubling capacity */
if (old_cap > SIZE_MAX / 2) {
PyErr_SetString(PyExc_OverflowError, "Array capacity overflow");
return NULL;
}
size_t new_cap = old_cap * 2;
/* Check for overflow when calculating allocation size */
if (new_cap > SIZE_MAX / elem_size) {
PyErr_SetString(PyExc_OverflowError, "Array allocation size overflow");
return NULL;
}
void *new_ptr = PyMem_Realloc(ptr, new_cap * elem_size);
if (new_ptr) {
*capacity = new_cap;
} else {
PyErr_NoMemory();
}
return new_ptr;
}
static inline int
grow_array_inplace(void **ptr_addr, size_t count, size_t *capacity, size_t elem_size)
{
if (count < *capacity) {
return 0;
}
void *tmp = grow_array(*ptr_addr, capacity, elem_size);
if (tmp == NULL) {
return -1;
}
*ptr_addr = tmp;
return 0;
}
#define GROW_ARRAY(ptr, count, cap, type) \
grow_array_inplace((void**)&(ptr), (count), &(cap), sizeof(type))
/* ============================================================================
* BINARY WRITER API
* ============================================================================ */
/*
* Create a new binary writer.
*
* Arguments:
* filename: Path to output file
* sample_interval_us: Sampling interval in microseconds
* compression_type: COMPRESSION_NONE or COMPRESSION_ZSTD
* start_time_us: Start timestamp in microseconds (from time.monotonic() * 1e6)
*
* Returns:
* New BinaryWriter* on success, NULL on failure (PyErr set)
*/
BinaryWriter *binary_writer_create(
const char *filename,
uint64_t sample_interval_us,
int compression_type,
uint64_t start_time_us
);
/*
* Write a sample to the binary file.
*
* Arguments:
* writer: Writer from binary_writer_create
* stack_frames: List of InterpreterInfo struct sequences
* timestamp_us: Current timestamp in microseconds (from time.monotonic() * 1e6)
*
* Returns:
* 0 on success, -1 on failure (PyErr set)
*/
int binary_writer_write_sample(
BinaryWriter *writer,
PyObject *stack_frames,
uint64_t timestamp_us
);
/*
* Finalize and close the binary file.
* Writes string/frame tables, footer, and updates header.
*
* Arguments:
* writer: Writer to finalize
*
* Returns:
* 0 on success, -1 on failure (PyErr set)
*/
int binary_writer_finalize(BinaryWriter *writer);
/*
* Destroy a binary writer and free all resources.
* Safe to call even if writer is partially initialized.
*
* Arguments:
* writer: Writer to destroy (may be NULL)
*/
void binary_writer_destroy(BinaryWriter *writer);
/* ============================================================================
* BINARY READER API
* ============================================================================ */
/*
* Open a binary file for reading.
*
* Arguments:
* filename: Path to input file
*
* Returns:
* New BinaryReader* on success, NULL on failure (PyErr set)
*/
BinaryReader *binary_reader_open(const char *filename);
/*
* Replay samples from binary file through a collector.
*
* Arguments:
* reader: Reader from binary_reader_open
* collector: Python collector with collect() method
* progress_callback: Optional callable(current, total) or NULL
*
* Returns:
* Number of samples replayed on success, -1 on failure (PyErr set)
*/
Py_ssize_t binary_reader_replay(
BinaryReader *reader,
PyObject *collector,
PyObject *progress_callback
);
/*
* Get metadata about the binary file.
*
* Arguments:
* reader: Reader from binary_reader_open
*
* Returns:
* Dict with file metadata on success, NULL on failure (PyErr set)
*/
PyObject *binary_reader_get_info(BinaryReader *reader);
/*
* Close a binary reader and free all resources.
*
* Arguments:
* reader: Reader to close (may be NULL)
*/
void binary_reader_close(BinaryReader *reader);
/* ============================================================================
* STATISTICS FUNCTIONS
* ============================================================================ */
/*
* Get writer statistics as a Python dict.
*
* Arguments:
* writer: Writer to get stats from
*
* Returns:
* Dict with statistics on success, NULL on failure (PyErr set)
*/
PyObject *binary_writer_get_stats(BinaryWriter *writer);
/*
* Get reader statistics as a Python dict.
*
* Arguments:
* reader: Reader to get stats from
*
* Returns:
* Dict with statistics on success, NULL on failure (PyErr set)
*/
PyObject *binary_reader_get_stats(BinaryReader *reader);
/* ============================================================================
* UTILITY FUNCTIONS
* ============================================================================ */
/*
* Check if zstd compression is available.
*
* Returns:
* 1 if zstd available, 0 otherwise
*/
int binary_io_zstd_available(void);
/*
* Get the best available compression type.
*
* Returns:
* COMPRESSION_ZSTD if available, COMPRESSION_NONE otherwise
*/
int binary_io_get_best_compression(void);
#ifdef __cplusplus
}
#endif
#endif /* Py_BINARY_IO_H */

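The varint and zigzag helpers in this header are standard LEB128; a Python
round-trip of the same encodings can be handy when inspecting file bytes by
hand (this mirrors the C, it is not an exposed API)::

    def encode_varint(value):
        # Unsigned LEB128: 7 data bits per byte, high bit = continuation.
        out = bytearray()
        while value >= 0x80:
            out.append((value & 0x7F) | 0x80)
            value >>= 7
        out.append(value)
        return bytes(out)

    def decode_varint(data, pos=0):
        result = shift = 0
        while True:
            byte = data[pos]
            pos += 1
            result |= (byte & 0x7F) << shift
            if not byte & 0x80:
                return result, pos
            shift += 7

    def zigzag_encode(value):   # signed 32-bit -> unsigned
        return ((value << 1) ^ (value >> 31)) & 0xFFFFFFFF

    def zigzag_decode(zigzag):
        return (zigzag >> 1) ^ -(zigzag & 1)

    assert encode_varint(300) == b"\xac\x02"
    assert decode_varint(b"\xac\x02") == (300, 2)
    assert zigzag_decode(zigzag_encode(-1)) == -1   # -1 zigzags to 1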
File diff suppressed because it is too large.

File diff suppressed because it is too large.

View file

@ -7,6 +7,7 @@ preserve
# include "pycore_runtime.h" // _Py_ID()
#endif
#include "pycore_critical_section.h"// Py_BEGIN_CRITICAL_SECTION()
#include "pycore_long.h" // _PyLong_UnsignedLongLong_Converter()
#include "pycore_modsupport.h" // _PyArg_UnpackKeywords()
PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__,
@ -434,6 +435,659 @@ _remote_debugging_RemoteUnwinder_get_stats(PyObject *self, PyObject *Py_UNUSED(i
return return_value;
}
PyDoc_STRVAR(_remote_debugging_BinaryWriter___init____doc__,
"BinaryWriter(filename, sample_interval_us, start_time_us, *,\n"
" compression=0)\n"
"--\n"
"\n"
"High-performance binary writer for profiling data.\n"
"\n"
"Arguments:\n"
" filename: Path to output file\n"
" sample_interval_us: Sampling interval in microseconds\n"
" start_time_us: Start timestamp in microseconds (from time.monotonic() * 1e6)\n"
" compression: 0=none, 1=zstd (default: 0)\n"
"\n"
"Use as a context manager or call finalize() when done.");
static int
_remote_debugging_BinaryWriter___init___impl(BinaryWriterObject *self,
const char *filename,
unsigned long long sample_interval_us,
unsigned long long start_time_us,
int compression);
static int
_remote_debugging_BinaryWriter___init__(PyObject *self, PyObject *args, PyObject *kwargs)
{
int return_value = -1;
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
#define NUM_KEYWORDS 4
static struct {
PyGC_Head _this_is_not_used;
PyObject_VAR_HEAD
Py_hash_t ob_hash;
PyObject *ob_item[NUM_KEYWORDS];
} _kwtuple = {
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
.ob_hash = -1,
.ob_item = { &_Py_ID(filename), &_Py_ID(sample_interval_us), &_Py_ID(start_time_us), &_Py_ID(compression), },
};
#undef NUM_KEYWORDS
#define KWTUPLE (&_kwtuple.ob_base.ob_base)
#else // !Py_BUILD_CORE
# define KWTUPLE NULL
#endif // !Py_BUILD_CORE
static const char * const _keywords[] = {"filename", "sample_interval_us", "start_time_us", "compression", NULL};
static _PyArg_Parser _parser = {
.keywords = _keywords,
.fname = "BinaryWriter",
.kwtuple = KWTUPLE,
};
#undef KWTUPLE
PyObject *argsbuf[4];
PyObject * const *fastargs;
Py_ssize_t nargs = PyTuple_GET_SIZE(args);
Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 3;
const char *filename;
unsigned long long sample_interval_us;
unsigned long long start_time_us;
int compression = 0;
fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser,
/*minpos*/ 3, /*maxpos*/ 3, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
if (!fastargs) {
goto exit;
}
if (!PyUnicode_Check(fastargs[0])) {
_PyArg_BadArgument("BinaryWriter", "argument 'filename'", "str", fastargs[0]);
goto exit;
}
Py_ssize_t filename_length;
filename = PyUnicode_AsUTF8AndSize(fastargs[0], &filename_length);
if (filename == NULL) {
goto exit;
}
if (strlen(filename) != (size_t)filename_length) {
PyErr_SetString(PyExc_ValueError, "embedded null character");
goto exit;
}
if (!_PyLong_UnsignedLongLong_Converter(fastargs[1], &sample_interval_us)) {
goto exit;
}
if (!_PyLong_UnsignedLongLong_Converter(fastargs[2], &start_time_us)) {
goto exit;
}
if (!noptargs) {
goto skip_optional_kwonly;
}
compression = PyLong_AsInt(fastargs[3]);
if (compression == -1 && PyErr_Occurred()) {
goto exit;
}
skip_optional_kwonly:
return_value = _remote_debugging_BinaryWriter___init___impl((BinaryWriterObject *)self, filename, sample_interval_us, start_time_us, compression);
exit:
return return_value;
}
PyDoc_STRVAR(_remote_debugging_BinaryWriter_write_sample__doc__,
"write_sample($self, /, stack_frames, timestamp_us)\n"
"--\n"
"\n"
"Write a sample to the binary file.\n"
"\n"
"Arguments:\n"
" stack_frames: List of InterpreterInfo objects\n"
" timestamp_us: Current timestamp in microseconds (from time.monotonic() * 1e6)");
#define _REMOTE_DEBUGGING_BINARYWRITER_WRITE_SAMPLE_METHODDEF \
{"write_sample", _PyCFunction_CAST(_remote_debugging_BinaryWriter_write_sample), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryWriter_write_sample__doc__},
static PyObject *
_remote_debugging_BinaryWriter_write_sample_impl(BinaryWriterObject *self,
PyObject *stack_frames,
unsigned long long timestamp_us);
static PyObject *
_remote_debugging_BinaryWriter_write_sample(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
{
PyObject *return_value = NULL;
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
#define NUM_KEYWORDS 2
static struct {
PyGC_Head _this_is_not_used;
PyObject_VAR_HEAD
Py_hash_t ob_hash;
PyObject *ob_item[NUM_KEYWORDS];
} _kwtuple = {
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
.ob_hash = -1,
.ob_item = { &_Py_ID(stack_frames), &_Py_ID(timestamp_us), },
};
#undef NUM_KEYWORDS
#define KWTUPLE (&_kwtuple.ob_base.ob_base)
#else // !Py_BUILD_CORE
# define KWTUPLE NULL
#endif // !Py_BUILD_CORE
static const char * const _keywords[] = {"stack_frames", "timestamp_us", NULL};
static _PyArg_Parser _parser = {
.keywords = _keywords,
.fname = "write_sample",
.kwtuple = KWTUPLE,
};
#undef KWTUPLE
PyObject *argsbuf[2];
PyObject *stack_frames;
unsigned long long timestamp_us;
args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
/*minpos*/ 2, /*maxpos*/ 2, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
if (!args) {
goto exit;
}
stack_frames = args[0];
if (!_PyLong_UnsignedLongLong_Converter(args[1], &timestamp_us)) {
goto exit;
}
return_value = _remote_debugging_BinaryWriter_write_sample_impl((BinaryWriterObject *)self, stack_frames, timestamp_us);
exit:
return return_value;
}
PyDoc_STRVAR(_remote_debugging_BinaryWriter_finalize__doc__,
"finalize($self, /)\n"
"--\n"
"\n"
"Finalize and close the binary file.\n"
"\n"
"Writes string/frame tables, footer, and updates header.");
#define _REMOTE_DEBUGGING_BINARYWRITER_FINALIZE_METHODDEF \
{"finalize", (PyCFunction)_remote_debugging_BinaryWriter_finalize, METH_NOARGS, _remote_debugging_BinaryWriter_finalize__doc__},
static PyObject *
_remote_debugging_BinaryWriter_finalize_impl(BinaryWriterObject *self);
static PyObject *
_remote_debugging_BinaryWriter_finalize(PyObject *self, PyObject *Py_UNUSED(ignored))
{
return _remote_debugging_BinaryWriter_finalize_impl((BinaryWriterObject *)self);
}
PyDoc_STRVAR(_remote_debugging_BinaryWriter_close__doc__,
"close($self, /)\n"
"--\n"
"\n"
"Close the writer without finalizing (discards data).");
#define _REMOTE_DEBUGGING_BINARYWRITER_CLOSE_METHODDEF \
{"close", (PyCFunction)_remote_debugging_BinaryWriter_close, METH_NOARGS, _remote_debugging_BinaryWriter_close__doc__},
static PyObject *
_remote_debugging_BinaryWriter_close_impl(BinaryWriterObject *self);
static PyObject *
_remote_debugging_BinaryWriter_close(PyObject *self, PyObject *Py_UNUSED(ignored))
{
return _remote_debugging_BinaryWriter_close_impl((BinaryWriterObject *)self);
}
PyDoc_STRVAR(_remote_debugging_BinaryWriter___enter____doc__,
"__enter__($self, /)\n"
"--\n"
"\n"
"Enter context manager.");
#define _REMOTE_DEBUGGING_BINARYWRITER___ENTER___METHODDEF \
{"__enter__", (PyCFunction)_remote_debugging_BinaryWriter___enter__, METH_NOARGS, _remote_debugging_BinaryWriter___enter____doc__},
static PyObject *
_remote_debugging_BinaryWriter___enter___impl(BinaryWriterObject *self);
static PyObject *
_remote_debugging_BinaryWriter___enter__(PyObject *self, PyObject *Py_UNUSED(ignored))
{
return _remote_debugging_BinaryWriter___enter___impl((BinaryWriterObject *)self);
}
PyDoc_STRVAR(_remote_debugging_BinaryWriter___exit____doc__,
"__exit__($self, /, exc_type=None, exc_val=None, exc_tb=None)\n"
"--\n"
"\n"
"Exit context manager, finalizing the file.");
#define _REMOTE_DEBUGGING_BINARYWRITER___EXIT___METHODDEF \
{"__exit__", _PyCFunction_CAST(_remote_debugging_BinaryWriter___exit__), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryWriter___exit____doc__},
static PyObject *
_remote_debugging_BinaryWriter___exit___impl(BinaryWriterObject *self,
PyObject *exc_type,
PyObject *exc_val,
PyObject *exc_tb);
static PyObject *
_remote_debugging_BinaryWriter___exit__(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
{
PyObject *return_value = NULL;
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
#define NUM_KEYWORDS 3
static struct {
PyGC_Head _this_is_not_used;
PyObject_VAR_HEAD
Py_hash_t ob_hash;
PyObject *ob_item[NUM_KEYWORDS];
} _kwtuple = {
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
.ob_hash = -1,
.ob_item = { &_Py_ID(exc_type), &_Py_ID(exc_val), &_Py_ID(exc_tb), },
};
#undef NUM_KEYWORDS
#define KWTUPLE (&_kwtuple.ob_base.ob_base)
#else // !Py_BUILD_CORE
# define KWTUPLE NULL
#endif // !Py_BUILD_CORE
static const char * const _keywords[] = {"exc_type", "exc_val", "exc_tb", NULL};
static _PyArg_Parser _parser = {
.keywords = _keywords,
.fname = "__exit__",
.kwtuple = KWTUPLE,
};
#undef KWTUPLE
PyObject *argsbuf[3];
Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0;
PyObject *exc_type = Py_None;
PyObject *exc_val = Py_None;
PyObject *exc_tb = Py_None;
args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
/*minpos*/ 0, /*maxpos*/ 3, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
if (!args) {
goto exit;
}
if (!noptargs) {
goto skip_optional_pos;
}
if (args[0]) {
exc_type = args[0];
if (!--noptargs) {
goto skip_optional_pos;
}
}
if (args[1]) {
exc_val = args[1];
if (!--noptargs) {
goto skip_optional_pos;
}
}
exc_tb = args[2];
skip_optional_pos:
return_value = _remote_debugging_BinaryWriter___exit___impl((BinaryWriterObject *)self, exc_type, exc_val, exc_tb);
exit:
return return_value;
}
PyDoc_STRVAR(_remote_debugging_BinaryWriter_get_stats__doc__,
"get_stats($self, /)\n"
"--\n"
"\n"
"Get encoding statistics for the writer.\n"
"\n"
"Returns a dict with encoding statistics including repeat/full/suffix/pop-push\n"
"record counts, frames written/saved, and compression ratio.");
#define _REMOTE_DEBUGGING_BINARYWRITER_GET_STATS_METHODDEF \
{"get_stats", (PyCFunction)_remote_debugging_BinaryWriter_get_stats, METH_NOARGS, _remote_debugging_BinaryWriter_get_stats__doc__},
static PyObject *
_remote_debugging_BinaryWriter_get_stats_impl(BinaryWriterObject *self);
static PyObject *
_remote_debugging_BinaryWriter_get_stats(PyObject *self, PyObject *Py_UNUSED(ignored))
{
return _remote_debugging_BinaryWriter_get_stats_impl((BinaryWriterObject *)self);
}
PyDoc_STRVAR(_remote_debugging_BinaryReader___init____doc__,
"BinaryReader(filename)\n"
"--\n"
"\n"
"High-performance binary reader for profiling data.\n"
"\n"
"Arguments:\n"
" filename: Path to input file\n"
"\n"
"Use as a context manager or call close() when done.");
static int
_remote_debugging_BinaryReader___init___impl(BinaryReaderObject *self,
const char *filename);
static int
_remote_debugging_BinaryReader___init__(PyObject *self, PyObject *args, PyObject *kwargs)
{
int return_value = -1;
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
#define NUM_KEYWORDS 1
static struct {
PyGC_Head _this_is_not_used;
PyObject_VAR_HEAD
Py_hash_t ob_hash;
PyObject *ob_item[NUM_KEYWORDS];
} _kwtuple = {
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
.ob_hash = -1,
.ob_item = { &_Py_ID(filename), },
};
#undef NUM_KEYWORDS
#define KWTUPLE (&_kwtuple.ob_base.ob_base)
#else // !Py_BUILD_CORE
# define KWTUPLE NULL
#endif // !Py_BUILD_CORE
static const char * const _keywords[] = {"filename", NULL};
static _PyArg_Parser _parser = {
.keywords = _keywords,
.fname = "BinaryReader",
.kwtuple = KWTUPLE,
};
#undef KWTUPLE
PyObject *argsbuf[1];
PyObject * const *fastargs;
Py_ssize_t nargs = PyTuple_GET_SIZE(args);
const char *filename;
fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser,
/*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
if (!fastargs) {
goto exit;
}
if (!PyUnicode_Check(fastargs[0])) {
_PyArg_BadArgument("BinaryReader", "argument 'filename'", "str", fastargs[0]);
goto exit;
}
Py_ssize_t filename_length;
filename = PyUnicode_AsUTF8AndSize(fastargs[0], &filename_length);
if (filename == NULL) {
goto exit;
}
if (strlen(filename) != (size_t)filename_length) {
PyErr_SetString(PyExc_ValueError, "embedded null character");
goto exit;
}
return_value = _remote_debugging_BinaryReader___init___impl((BinaryReaderObject *)self, filename);
exit:
return return_value;
}
PyDoc_STRVAR(_remote_debugging_BinaryReader_replay__doc__,
"replay($self, /, collector, progress_callback=None)\n"
"--\n"
"\n"
"Replay samples through a collector.\n"
"\n"
"Arguments:\n"
" collector: Collector object with collect() method\n"
" progress_callback: Optional callable(current, total)\n"
"\n"
"Returns:\n"
" Number of samples replayed");
#define _REMOTE_DEBUGGING_BINARYREADER_REPLAY_METHODDEF \
{"replay", _PyCFunction_CAST(_remote_debugging_BinaryReader_replay), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryReader_replay__doc__},
static PyObject *
_remote_debugging_BinaryReader_replay_impl(BinaryReaderObject *self,
PyObject *collector,
PyObject *progress_callback);
static PyObject *
_remote_debugging_BinaryReader_replay(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
{
PyObject *return_value = NULL;
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
#define NUM_KEYWORDS 2
static struct {
PyGC_Head _this_is_not_used;
PyObject_VAR_HEAD
Py_hash_t ob_hash;
PyObject *ob_item[NUM_KEYWORDS];
} _kwtuple = {
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
.ob_hash = -1,
.ob_item = { &_Py_ID(collector), &_Py_ID(progress_callback), },
};
#undef NUM_KEYWORDS
#define KWTUPLE (&_kwtuple.ob_base.ob_base)
#else // !Py_BUILD_CORE
# define KWTUPLE NULL
#endif // !Py_BUILD_CORE
static const char * const _keywords[] = {"collector", "progress_callback", NULL};
static _PyArg_Parser _parser = {
.keywords = _keywords,
.fname = "replay",
.kwtuple = KWTUPLE,
};
#undef KWTUPLE
PyObject *argsbuf[2];
Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;
PyObject *collector;
PyObject *progress_callback = Py_None;
args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
/*minpos*/ 1, /*maxpos*/ 2, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
if (!args) {
goto exit;
}
collector = args[0];
if (!noptargs) {
goto skip_optional_pos;
}
progress_callback = args[1];
skip_optional_pos:
return_value = _remote_debugging_BinaryReader_replay_impl((BinaryReaderObject *)self, collector, progress_callback);
exit:
return return_value;
}
PyDoc_STRVAR(_remote_debugging_BinaryReader_get_info__doc__,
"get_info($self, /)\n"
"--\n"
"\n"
"Get metadata about the binary file.\n"
"\n"
"Returns:\n"
" Dict with file metadata");
#define _REMOTE_DEBUGGING_BINARYREADER_GET_INFO_METHODDEF \
{"get_info", (PyCFunction)_remote_debugging_BinaryReader_get_info, METH_NOARGS, _remote_debugging_BinaryReader_get_info__doc__},
static PyObject *
_remote_debugging_BinaryReader_get_info_impl(BinaryReaderObject *self);
static PyObject *
_remote_debugging_BinaryReader_get_info(PyObject *self, PyObject *Py_UNUSED(ignored))
{
return _remote_debugging_BinaryReader_get_info_impl((BinaryReaderObject *)self);
}
PyDoc_STRVAR(_remote_debugging_BinaryReader_get_stats__doc__,
"get_stats($self, /)\n"
"--\n"
"\n"
"Get reconstruction statistics from replay.\n"
"\n"
"Returns a dict with statistics about record types decoded and samples\n"
"reconstructed during replay.");
#define _REMOTE_DEBUGGING_BINARYREADER_GET_STATS_METHODDEF \
{"get_stats", (PyCFunction)_remote_debugging_BinaryReader_get_stats, METH_NOARGS, _remote_debugging_BinaryReader_get_stats__doc__},
static PyObject *
_remote_debugging_BinaryReader_get_stats_impl(BinaryReaderObject *self);
static PyObject *
_remote_debugging_BinaryReader_get_stats(PyObject *self, PyObject *Py_UNUSED(ignored))
{
return _remote_debugging_BinaryReader_get_stats_impl((BinaryReaderObject *)self);
}
PyDoc_STRVAR(_remote_debugging_BinaryReader_close__doc__,
"close($self, /)\n"
"--\n"
"\n"
"Close the reader and free resources.");
#define _REMOTE_DEBUGGING_BINARYREADER_CLOSE_METHODDEF \
{"close", (PyCFunction)_remote_debugging_BinaryReader_close, METH_NOARGS, _remote_debugging_BinaryReader_close__doc__},
static PyObject *
_remote_debugging_BinaryReader_close_impl(BinaryReaderObject *self);
static PyObject *
_remote_debugging_BinaryReader_close(PyObject *self, PyObject *Py_UNUSED(ignored))
{
return _remote_debugging_BinaryReader_close_impl((BinaryReaderObject *)self);
}
PyDoc_STRVAR(_remote_debugging_BinaryReader___enter____doc__,
"__enter__($self, /)\n"
"--\n"
"\n"
"Enter context manager.");
#define _REMOTE_DEBUGGING_BINARYREADER___ENTER___METHODDEF \
{"__enter__", (PyCFunction)_remote_debugging_BinaryReader___enter__, METH_NOARGS, _remote_debugging_BinaryReader___enter____doc__},
static PyObject *
_remote_debugging_BinaryReader___enter___impl(BinaryReaderObject *self);
static PyObject *
_remote_debugging_BinaryReader___enter__(PyObject *self, PyObject *Py_UNUSED(ignored))
{
return _remote_debugging_BinaryReader___enter___impl((BinaryReaderObject *)self);
}
PyDoc_STRVAR(_remote_debugging_BinaryReader___exit____doc__,
"__exit__($self, /, exc_type=None, exc_val=None, exc_tb=None)\n"
"--\n"
"\n"
"Exit context manager, closing the file.");
#define _REMOTE_DEBUGGING_BINARYREADER___EXIT___METHODDEF \
{"__exit__", _PyCFunction_CAST(_remote_debugging_BinaryReader___exit__), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_BinaryReader___exit____doc__},
static PyObject *
_remote_debugging_BinaryReader___exit___impl(BinaryReaderObject *self,
PyObject *exc_type,
PyObject *exc_val,
PyObject *exc_tb);
static PyObject *
_remote_debugging_BinaryReader___exit__(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
{
PyObject *return_value = NULL;
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
#define NUM_KEYWORDS 3
static struct {
PyGC_Head _this_is_not_used;
PyObject_VAR_HEAD
Py_hash_t ob_hash;
PyObject *ob_item[NUM_KEYWORDS];
} _kwtuple = {
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
.ob_hash = -1,
.ob_item = { &_Py_ID(exc_type), &_Py_ID(exc_val), &_Py_ID(exc_tb), },
};
#undef NUM_KEYWORDS
#define KWTUPLE (&_kwtuple.ob_base.ob_base)
#else // !Py_BUILD_CORE
# define KWTUPLE NULL
#endif // !Py_BUILD_CORE
static const char * const _keywords[] = {"exc_type", "exc_val", "exc_tb", NULL};
static _PyArg_Parser _parser = {
.keywords = _keywords,
.fname = "__exit__",
.kwtuple = KWTUPLE,
};
#undef KWTUPLE
PyObject *argsbuf[3];
Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0;
PyObject *exc_type = Py_None;
PyObject *exc_val = Py_None;
PyObject *exc_tb = Py_None;
args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
/*minpos*/ 0, /*maxpos*/ 3, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
if (!args) {
goto exit;
}
if (!noptargs) {
goto skip_optional_pos;
}
if (args[0]) {
exc_type = args[0];
if (!--noptargs) {
goto skip_optional_pos;
}
}
if (args[1]) {
exc_val = args[1];
if (!--noptargs) {
goto skip_optional_pos;
}
}
exc_tb = args[2];
skip_optional_pos:
return_value = _remote_debugging_BinaryReader___exit___impl((BinaryReaderObject *)self, exc_type, exc_val, exc_tb);
exit:
return return_value;
}
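The generated parser above accepts the three exception slots positionally or by keyword, each defaulting to ``None``, so on a ``BinaryReader`` instance all of the following call forms amount to a clean close (illustrative only; ``reader`` is a hypothetical instance)::

    reader.__exit__()
    reader.__exit__(None, None, None)   # what the with-statement passes on success
    reader.__exit__(exc_tb=None)        # keyword form, parsed via the kwtuple above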
PyDoc_STRVAR(_remote_debugging_zstd_available__doc__,
"zstd_available($module, /)\n"
"--\n"
"\n"
"Check if zstd compression is available.\n"
"\n"
"Returns:\n"
" True if zstd available, False otherwise");
#define _REMOTE_DEBUGGING_ZSTD_AVAILABLE_METHODDEF \
{"zstd_available", (PyCFunction)_remote_debugging_zstd_available, METH_NOARGS, _remote_debugging_zstd_available__doc__},
static PyObject *
_remote_debugging_zstd_available_impl(PyObject *module);
static PyObject *
_remote_debugging_zstd_available(PyObject *module, PyObject *Py_UNUSED(ignored))
{
return _remote_debugging_zstd_available_impl(module);
}
PyDoc_STRVAR(_remote_debugging_get_child_pids__doc__,
"get_child_pids($module, /, pid, *, recursive=True)\n"
"--\n"
@@ -582,4 +1236,4 @@ _remote_debugging_is_python_process(PyObject *module, PyObject *const *args, Py_
exit:
return return_value;
}
/*[clinic end generated code: output=dc0550ad3d6a409c input=a9049054013a1b77]*/
/*[clinic end generated code: output=036de0b06d0e34cc input=a9049054013a1b77]*/

View file

@@ -6,6 +6,20 @@
******************************************************************************/
#include "_remote_debugging.h"
#include "binary_io.h"
/* Forward declarations for clinic-generated code */
typedef struct {
PyObject_HEAD
BinaryWriter *writer;
uint32_t cached_total_samples; /* Preserved after finalize */
} BinaryWriterObject;
typedef struct {
PyObject_HEAD
BinaryReader *reader;
} BinaryReaderObject;
#include "clinic/module.c.h"
/* ============================================================================
@@ -970,6 +984,10 @@ static PyType_Spec RemoteUnwinder_spec = {
.slots = RemoteUnwinder_slots,
};
/* Forward declarations for type specs defined later */
static PyType_Spec BinaryWriter_spec;
static PyType_Spec BinaryReader_spec;
/* ============================================================================
* MODULE INITIALIZATION
* ============================================================================ */
@@ -1048,6 +1066,18 @@ _remote_debugging_exec(PyObject *m)
if (PyModule_AddType(m, st->AwaitedInfo_Type) < 0) {
return -1;
}
// Create BinaryWriter and BinaryReader types
CREATE_TYPE(m, st->BinaryWriter_Type, &BinaryWriter_spec);
if (PyModule_AddType(m, st->BinaryWriter_Type) < 0) {
return -1;
}
CREATE_TYPE(m, st->BinaryReader_Type, &BinaryReader_spec);
if (PyModule_AddType(m, st->BinaryReader_Type) < 0) {
return -1;
}
#ifdef Py_GIL_DISABLED
PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED);
#endif
@@ -1091,6 +1121,8 @@ remote_debugging_traverse(PyObject *mod, visitproc visit, void *arg)
Py_VISIT(state->ThreadInfo_Type);
Py_VISIT(state->InterpreterInfo_Type);
Py_VISIT(state->AwaitedInfo_Type);
Py_VISIT(state->BinaryWriter_Type);
Py_VISIT(state->BinaryReader_Type);
return 0;
}
@@ -1106,6 +1138,8 @@ remote_debugging_clear(PyObject *mod)
Py_CLEAR(state->ThreadInfo_Type);
Py_CLEAR(state->InterpreterInfo_Type);
Py_CLEAR(state->AwaitedInfo_Type);
Py_CLEAR(state->BinaryWriter_Type);
Py_CLEAR(state->BinaryReader_Type);
return 0;
}
@@ -1115,13 +1149,506 @@ remote_debugging_free(void *mod)
(void)remote_debugging_clear((PyObject *)mod);
}
static PyModuleDef_Slot remote_debugging_slots[] = {
{Py_mod_exec, _remote_debugging_exec},
{Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
{Py_mod_gil, Py_MOD_GIL_NOT_USED},
{0, NULL},
};
/* ============================================================================
* BINARY WRITER CLASS
* ============================================================================ */
#define BinaryWriter_CAST(op) ((BinaryWriterObject *)(op))
/*[clinic input]
class _remote_debugging.BinaryWriter "BinaryWriterObject *" "&PyBinaryWriter_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e948838b90a2003c]*/
/*[clinic input]
_remote_debugging.BinaryWriter.__init__
filename: str
sample_interval_us: unsigned_long_long
start_time_us: unsigned_long_long
*
compression: int = 0
High-performance binary writer for profiling data.
Arguments:
filename: Path to output file
sample_interval_us: Sampling interval in microseconds
start_time_us: Start timestamp in microseconds (from time.monotonic() * 1e6)
compression: 0=none, 1=zstd (default: 0)
Use as a context manager or call finalize() when done.
[clinic start generated code]*/
static int
_remote_debugging_BinaryWriter___init___impl(BinaryWriterObject *self,
const char *filename,
unsigned long long sample_interval_us,
unsigned long long start_time_us,
int compression)
/*[clinic end generated code: output=014c0306f1bacf4b input=57497fe3cb9214a6]*/
{
if (self->writer) {
binary_writer_destroy(self->writer);
}
self->writer = binary_writer_create(filename, sample_interval_us, compression, start_time_us);
if (!self->writer) {
return -1;
}
return 0;
}
/*[clinic input]
_remote_debugging.BinaryWriter.write_sample
stack_frames: object
timestamp_us: unsigned_long_long
Write a sample to the binary file.
Arguments:
stack_frames: List of InterpreterInfo objects
timestamp_us: Current timestamp in microseconds (from time.monotonic() * 1e6)
[clinic start generated code]*/
static PyObject *
_remote_debugging_BinaryWriter_write_sample_impl(BinaryWriterObject *self,
PyObject *stack_frames,
unsigned long long timestamp_us)
/*[clinic end generated code: output=24d5b86679b4128f input=dce3148417482624]*/
{
if (!self->writer) {
PyErr_SetString(PyExc_ValueError, "Writer is closed");
return NULL;
}
if (binary_writer_write_sample(self->writer, stack_frames, timestamp_us) < 0) {
return NULL;
}
Py_RETURN_NONE;
}
/*[clinic input]
_remote_debugging.BinaryWriter.finalize
Finalize and close the binary file.
Writes string/frame tables, footer, and updates header.
[clinic start generated code]*/
static PyObject *
_remote_debugging_BinaryWriter_finalize_impl(BinaryWriterObject *self)
/*[clinic end generated code: output=3534b88c6628de88 input=c02191750682f6a2]*/
{
if (!self->writer) {
PyErr_SetString(PyExc_ValueError, "Writer is already closed");
return NULL;
}
/* Save total_samples before finalizing */
self->cached_total_samples = self->writer->total_samples;
if (binary_writer_finalize(self->writer) < 0) {
return NULL;
}
binary_writer_destroy(self->writer);
self->writer = NULL;
Py_RETURN_NONE;
}
/*[clinic input]
_remote_debugging.BinaryWriter.close
Close the writer without finalizing (discards data).
[clinic start generated code]*/
static PyObject *
_remote_debugging_BinaryWriter_close_impl(BinaryWriterObject *self)
/*[clinic end generated code: output=9571bb2256fd1fd2 input=6e0da206e60daf16]*/
{
if (self->writer) {
binary_writer_destroy(self->writer);
self->writer = NULL;
}
Py_RETURN_NONE;
}
/*[clinic input]
_remote_debugging.BinaryWriter.__enter__
Enter context manager.
[clinic start generated code]*/
static PyObject *
_remote_debugging_BinaryWriter___enter___impl(BinaryWriterObject *self)
/*[clinic end generated code: output=8eb95f61daf2d120 input=8ef14ee18da561d2]*/
{
Py_INCREF(self);
return (PyObject *)self;
}
/*[clinic input]
_remote_debugging.BinaryWriter.__exit__
exc_type: object = None
exc_val: object = None
exc_tb: object = None
Exit context manager, finalizing the file.
[clinic start generated code]*/
static PyObject *
_remote_debugging_BinaryWriter___exit___impl(BinaryWriterObject *self,
PyObject *exc_type,
PyObject *exc_val,
PyObject *exc_tb)
/*[clinic end generated code: output=61831f47c72a53c6 input=12334ce1009af37f]*/
{
if (self->writer) {
/* Only finalize on normal exit (no exception) */
if (exc_type == Py_None) {
if (binary_writer_finalize(self->writer) < 0) {
binary_writer_destroy(self->writer);
self->writer = NULL;
return NULL;
}
}
binary_writer_destroy(self->writer);
self->writer = NULL;
}
Py_RETURN_FALSE;
}
/*[clinic input]
_remote_debugging.BinaryWriter.get_stats
Get encoding statistics for the writer.
Returns a dict with encoding statistics including repeat/full/suffix/pop-push
record counts, frames written/saved, and compression ratio.
[clinic start generated code]*/
static PyObject *
_remote_debugging_BinaryWriter_get_stats_impl(BinaryWriterObject *self)
/*[clinic end generated code: output=06522cd52544df89 input=82968491b53ad277]*/
{
if (!self->writer) {
PyErr_SetString(PyExc_ValueError, "Writer is closed");
return NULL;
}
return binary_writer_get_stats(self->writer);
}
static PyObject *
BinaryWriter_get_total_samples(BinaryWriterObject *self, void *closure)
{
if (!self->writer) {
/* Use cached value after finalize/close */
return PyLong_FromUnsignedLong(self->cached_total_samples);
}
return PyLong_FromUnsignedLong(self->writer->total_samples);
}
static PyGetSetDef BinaryWriter_getset[] = {
{"total_samples", (getter)BinaryWriter_get_total_samples, NULL, "Total samples written", NULL},
{NULL}
};
static PyMethodDef BinaryWriter_methods[] = {
_REMOTE_DEBUGGING_BINARYWRITER_WRITE_SAMPLE_METHODDEF
_REMOTE_DEBUGGING_BINARYWRITER_FINALIZE_METHODDEF
_REMOTE_DEBUGGING_BINARYWRITER_CLOSE_METHODDEF
_REMOTE_DEBUGGING_BINARYWRITER___ENTER___METHODDEF
_REMOTE_DEBUGGING_BINARYWRITER___EXIT___METHODDEF
_REMOTE_DEBUGGING_BINARYWRITER_GET_STATS_METHODDEF
{NULL, NULL, 0, NULL}
};
static void
BinaryWriter_dealloc(PyObject *op)
{
BinaryWriterObject *self = BinaryWriter_CAST(op);
PyTypeObject *tp = Py_TYPE(self);
if (self->writer) {
binary_writer_destroy(self->writer);
}
tp->tp_free(self);
Py_DECREF(tp);
}
static PyType_Slot BinaryWriter_slots[] = {
{Py_tp_getset, BinaryWriter_getset},
{Py_tp_methods, BinaryWriter_methods},
{Py_tp_init, _remote_debugging_BinaryWriter___init__},
{Py_tp_dealloc, BinaryWriter_dealloc},
{0, NULL}
};
static PyType_Spec BinaryWriter_spec = {
.name = "_remote_debugging.BinaryWriter",
.basicsize = sizeof(BinaryWriterObject),
.flags = (
Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_IMMUTABLETYPE
),
.slots = BinaryWriter_slots,
};
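With the type spec in place, the class is usable from Python as a context manager. A minimal sketch, assuming the module imports as ``_remote_debugging`` and that an empty capture finalizes cleanly; the ``write_sample`` call is shown only in a comment because it needs real ``InterpreterInfo`` lists from an unwinder::

    import time
    import _remote_debugging

    start_us = int(time.monotonic() * 1e6)
    with _remote_debugging.BinaryWriter("profile.bin",
                                        sample_interval_us=100,
                                        start_time_us=start_us) as writer:
        # Each captured sample would be written as:
        #   writer.write_sample(interp_info_list, int(time.monotonic() * 1e6))
        pass

    # A clean exit finalizes the file; total_samples remains readable
    # afterwards because the getter falls back to cached_total_samples.
    print(writer.total_samples)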
/* ============================================================================
* BINARY READER CLASS
* ============================================================================ */
#define BinaryReader_CAST(op) ((BinaryReaderObject *)(op))
/*[clinic input]
class _remote_debugging.BinaryReader "BinaryReaderObject *" "&PyBinaryReader_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=36400aaf6f53216d]*/
/*[clinic input]
_remote_debugging.BinaryReader.__init__
filename: str
High-performance binary reader for profiling data.
Arguments:
filename: Path to input file
Use as a context manager or call close() when done.
[clinic start generated code]*/
static int
_remote_debugging_BinaryReader___init___impl(BinaryReaderObject *self,
const char *filename)
/*[clinic end generated code: output=9699226f7ae052bb input=4201f9cc500ef2f6]*/
{
if (self->reader) {
binary_reader_close(self->reader);
}
self->reader = binary_reader_open(filename);
if (!self->reader) {
return -1;
}
return 0;
}
/*[clinic input]
_remote_debugging.BinaryReader.replay
collector: object
progress_callback: object = None
Replay samples through a collector.
Arguments:
collector: Collector object with collect() method
progress_callback: Optional callable(current, total)
Returns:
Number of samples replayed
[clinic start generated code]*/
static PyObject *
_remote_debugging_BinaryReader_replay_impl(BinaryReaderObject *self,
PyObject *collector,
PyObject *progress_callback)
/*[clinic end generated code: output=442345562574b61c input=ebb687aed3e0f4f1]*/
{
if (!self->reader) {
PyErr_SetString(PyExc_ValueError, "Reader is closed");
return NULL;
}
Py_ssize_t replayed = binary_reader_replay(self->reader, collector, progress_callback);
if (replayed < 0) {
return NULL;
}
return PyLong_FromSsize_t(replayed);
}
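``binary_reader_replay`` drives the supplied collector with reconstructed samples; the clinic docstring only promises that the collector has a ``collect()`` method and that ``progress_callback``, if given, is called as ``callable(current, total)``. A hypothetical minimal collector (the class name, ``collect`` signature, and counting logic are illustrative assumptions, not part of the module)::

    class CountingCollector:
        """Counts samples as BinaryReader.replay() feeds them back."""

        def __init__(self):
            self.samples = 0

        def collect(self, *args):
            # The exact argument shape is an assumption; we only count calls.
            self.samples += 1

    def on_progress(current, total):
        print(f"replayed {current}/{total} samples")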
/*[clinic input]
_remote_debugging.BinaryReader.get_info
Get metadata about the binary file.
Returns:
Dict with file metadata
[clinic start generated code]*/
static PyObject *
_remote_debugging_BinaryReader_get_info_impl(BinaryReaderObject *self)
/*[clinic end generated code: output=7f641fbd39147391 input=02e75e39c8a6cd1f]*/
{
if (!self->reader) {
PyErr_SetString(PyExc_ValueError, "Reader is closed");
return NULL;
}
return binary_reader_get_info(self->reader);
}
/*[clinic input]
_remote_debugging.BinaryReader.get_stats
Get reconstruction statistics from replay.
Returns a dict with statistics about record types decoded and samples
reconstructed during replay.
[clinic start generated code]*/
static PyObject *
_remote_debugging_BinaryReader_get_stats_impl(BinaryReaderObject *self)
/*[clinic end generated code: output=628b9ab5e4c4fd36 input=d8dd6654abd6c3c0]*/
{
if (!self->reader) {
PyErr_SetString(PyExc_ValueError, "Reader is closed");
return NULL;
}
return binary_reader_get_stats(self->reader);
}
/*[clinic input]
_remote_debugging.BinaryReader.close
Close the reader and free resources.
[clinic start generated code]*/
static PyObject *
_remote_debugging_BinaryReader_close_impl(BinaryReaderObject *self)
/*[clinic end generated code: output=ad0238cf5240b4f8 input=b919a66c737712d5]*/
{
if (self->reader) {
binary_reader_close(self->reader);
self->reader = NULL;
}
Py_RETURN_NONE;
}
/*[clinic input]
_remote_debugging.BinaryReader.__enter__
Enter context manager.
[clinic start generated code]*/
static PyObject *
_remote_debugging_BinaryReader___enter___impl(BinaryReaderObject *self)
/*[clinic end generated code: output=fade133538e93817 input=4794844c9efdc4f6]*/
{
Py_INCREF(self);
return (PyObject *)self;
}
/*[clinic input]
_remote_debugging.BinaryReader.__exit__
exc_type: object = None
exc_val: object = None
exc_tb: object = None
Exit context manager, closing the file.
[clinic start generated code]*/
static PyObject *
_remote_debugging_BinaryReader___exit___impl(BinaryReaderObject *self,
PyObject *exc_type,
PyObject *exc_val,
PyObject *exc_tb)
/*[clinic end generated code: output=2acdd36cfdc14e4a input=87284243d7935835]*/
{
if (self->reader) {
binary_reader_close(self->reader);
self->reader = NULL;
}
Py_RETURN_FALSE;
}
static PyObject *
BinaryReader_get_sample_count(BinaryReaderObject *self, void *closure)
{
if (!self->reader) {
return PyLong_FromLong(0);
}
return PyLong_FromUnsignedLong(self->reader->sample_count);
}
static PyObject *
BinaryReader_get_sample_interval_us(BinaryReaderObject *self, void *closure)
{
if (!self->reader) {
return PyLong_FromLong(0);
}
return PyLong_FromUnsignedLongLong(self->reader->sample_interval_us);
}
static PyGetSetDef BinaryReader_getset[] = {
{"sample_count", (getter)BinaryReader_get_sample_count, NULL, "Number of samples in file", NULL},
{"sample_interval_us", (getter)BinaryReader_get_sample_interval_us, NULL, "Sample interval in microseconds", NULL},
{NULL}
};
static PyMethodDef BinaryReader_methods[] = {
_REMOTE_DEBUGGING_BINARYREADER_REPLAY_METHODDEF
_REMOTE_DEBUGGING_BINARYREADER_GET_INFO_METHODDEF
_REMOTE_DEBUGGING_BINARYREADER_GET_STATS_METHODDEF
_REMOTE_DEBUGGING_BINARYREADER_CLOSE_METHODDEF
_REMOTE_DEBUGGING_BINARYREADER___ENTER___METHODDEF
_REMOTE_DEBUGGING_BINARYREADER___EXIT___METHODDEF
{NULL, NULL, 0, NULL}
};
static void
BinaryReader_dealloc(PyObject *op)
{
BinaryReaderObject *self = BinaryReader_CAST(op);
PyTypeObject *tp = Py_TYPE(self);
if (self->reader) {
binary_reader_close(self->reader);
}
tp->tp_free(self);
Py_DECREF(tp);
}
static PyType_Slot BinaryReader_slots[] = {
{Py_tp_getset, BinaryReader_getset},
{Py_tp_methods, BinaryReader_methods},
{Py_tp_init, _remote_debugging_BinaryReader___init__},
{Py_tp_dealloc, BinaryReader_dealloc},
{0, NULL}
};
static PyType_Spec BinaryReader_spec = {
.name = "_remote_debugging.BinaryReader",
.basicsize = sizeof(BinaryReaderObject),
.flags = (
Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_IMMUTABLETYPE
),
.slots = BinaryReader_slots,
};
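The reader mirrors the writer's context-manager protocol. A minimal sketch, reusing the hypothetical ``CountingCollector`` and ``on_progress`` from the replay sketch above and assuming ``profile.bin`` exists::

    import _remote_debugging

    with _remote_debugging.BinaryReader("profile.bin") as reader:
        print(reader.sample_count, reader.sample_interval_us)
        print(reader.get_info())                  # file metadata dict
        collector = CountingCollector()
        n = reader.replay(collector, on_progress)
        print(n, reader.get_stats())              # reconstruction statistics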
/* ============================================================================
* MODULE METHODS
* ============================================================================ */
/*[clinic input]
_remote_debugging.zstd_available
Check if zstd compression is available.
Returns:
True if zstd is available, False otherwise
[clinic start generated code]*/
static PyObject *
_remote_debugging_zstd_available_impl(PyObject *module)
/*[clinic end generated code: output=55e35a70ef280cdd input=a1b4d41bc09c7cf9]*/
{
return PyBool_FromLong(binary_io_zstd_available());
}
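Because zstd support is decided at build time (see the ``configure.ac`` hunk below), callers can use this predicate to pick the writer's ``compression`` flag; a one-line sketch::

    import _remote_debugging

    # 1 selects zstd, 0 selects no compression, per BinaryWriter.__init__.
    compression = 1 if _remote_debugging.zstd_available() else 0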
/* ============================================================================
* MODULE-LEVEL FUNCTIONS
* ============================================================================ */
@@ -1188,11 +1715,19 @@ _remote_debugging_is_python_process_impl(PyObject *module, int pid)
}
static PyMethodDef remote_debugging_methods[] = {
_REMOTE_DEBUGGING_ZSTD_AVAILABLE_METHODDEF
_REMOTE_DEBUGGING_GET_CHILD_PIDS_METHODDEF
_REMOTE_DEBUGGING_IS_PYTHON_PROCESS_METHODDEF
{NULL, NULL, 0, NULL},
};
static PyModuleDef_Slot remote_debugging_slots[] = {
{Py_mod_exec, _remote_debugging_exec},
{Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
{Py_mod_gil, Py_MOD_GIL_NOT_USED},
{0, NULL},
};
static struct PyModuleDef remote_debugging_module = {
PyModuleDef_HEAD_INIT,
.m_name = "_remote_debugging",

View file

@@ -105,10 +105,13 @@
<ClCompile Include="..\Modules\_remote_debugging\frame_cache.c" />
<ClCompile Include="..\Modules\_remote_debugging\threads.c" />
<ClCompile Include="..\Modules\_remote_debugging\asyncio.c" />
<ClCompile Include="..\Modules\_remote_debugging\binary_io_writer.c" />
<ClCompile Include="..\Modules\_remote_debugging\binary_io_reader.c" />
<ClCompile Include="..\Modules\_remote_debugging\subprocess.c" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Modules\_remote_debugging\_remote_debugging.h" />
<ClInclude Include="..\Modules\_remote_debugging\binary_io.h" />
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="..\PC\python_nt.rc" />

View file

@@ -33,6 +33,12 @@
<ClCompile Include="..\Modules\_remote_debugging\asyncio.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Modules\_remote_debugging\binary_io_writer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Modules\_remote_debugging\binary_io_reader.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Modules\_remote_debugging\subprocess.c">
<Filter>Source Files</Filter>
</ClCompile>
@@ -41,6 +47,9 @@
<ClInclude Include="..\Modules\_remote_debugging\_remote_debugging.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\Modules\_remote_debugging\binary_io.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="..\PC\python_nt.rc">

View file

@@ -313,6 +313,7 @@ MAX_SIZES = {
_abs('Modules/_hacl/*.c'): (200_000, 500),
_abs('Modules/posixmodule.c'): (20_000, 500),
_abs('Modules/termios.c'): (10_000, 800),
_abs('Modules/_remote_debugging/*.h'): (20_000, 1000),
_abs('Modules/_testcapimodule.c'): (20_000, 400),
_abs('Modules/expat/expat.h'): (10_000, 400),
_abs('Objects/stringlib/unicode_format.h'): (10_000, 400),

configure (generated, vendored)
View file

@@ -858,6 +858,8 @@ HAVE_GETHOSTBYNAME_R_3_ARG
HAVE_GETHOSTBYNAME_R_5_ARG
HAVE_GETHOSTBYNAME_R_6_ARG
LIBOBJS
REMOTE_DEBUGGING_LIBS
REMOTE_DEBUGGING_CFLAGS
LIBZSTD_LIBS
LIBZSTD_CFLAGS
LIBLZMA_LIBS
@@ -23023,6 +23025,22 @@ printf "%s\n" "yes" >&6; }
have_libzstd=yes
fi
if test "x$have_libzstd" = xyes
then :
REMOTE_DEBUGGING_CFLAGS="-DHAVE_ZSTD $LIBZSTD_CFLAGS"
REMOTE_DEBUGGING_LIBS="$LIBZSTD_LIBS"
else case e in #(
e)
REMOTE_DEBUGGING_CFLAGS=""
REMOTE_DEBUGGING_LIBS=""
;;
esac
fi
@@ -31644,8 +31662,8 @@ fi
if test "x$py_cv_module__remote_debugging" = xyes
then :
as_fn_append MODULE_BLOCK "MODULE__REMOTE_DEBUGGING_CFLAGS=$REMOTE_DEBUGGING_CFLAGS$as_nl"
as_fn_append MODULE_BLOCK "MODULE__REMOTE_DEBUGGING_LDFLAGS=$REMOTE_DEBUGGING_LIBS$as_nl"
fi

View file

@@ -5529,6 +5529,18 @@ PKG_CHECK_MODULES([LIBZSTD], [libzstd >= 1.4.5], [have_libzstd=yes], [
])
])
dnl _remote_debugging module: optional zstd compression support
dnl The module always builds, but zstd compression is only available when libzstd is found
AS_VAR_IF([have_libzstd], [yes], [
REMOTE_DEBUGGING_CFLAGS="-DHAVE_ZSTD $LIBZSTD_CFLAGS"
REMOTE_DEBUGGING_LIBS="$LIBZSTD_LIBS"
], [
REMOTE_DEBUGGING_CFLAGS=""
REMOTE_DEBUGGING_LIBS=""
])
AC_SUBST([REMOTE_DEBUGGING_CFLAGS])
AC_SUBST([REMOTE_DEBUGGING_LIBS])
dnl PY_CHECK_NETDB_FUNC(FUNCTION)
AC_DEFUN([PY_CHECK_NETDB_FUNC], [PY_CHECK_FUNC([$1], [@%:@include <netdb.h>])])
@@ -7911,7 +7923,7 @@ PY_STDLIB_MOD_SIMPLE([_pickle])
PY_STDLIB_MOD_SIMPLE([_posixsubprocess])
PY_STDLIB_MOD_SIMPLE([_queue])
PY_STDLIB_MOD_SIMPLE([_random])
PY_STDLIB_MOD_SIMPLE([_remote_debugging])
PY_STDLIB_MOD_SIMPLE([_remote_debugging], [$REMOTE_DEBUGGING_CFLAGS], [$REMOTE_DEBUGGING_LIBS])
PY_STDLIB_MOD_SIMPLE([select])
PY_STDLIB_MOD_SIMPLE([_struct])
PY_STDLIB_MOD_SIMPLE([_types])