gh-138122: Validate base frame before caching in remote debugging frame cache (#142852)

This commit is contained in:
Pablo Galindo Salgado 2025-12-17 15:12:28 +00:00 committed by GitHub
parent 2b466c47c3
commit 568a819f67
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 152 additions and 17 deletions

View file

@@ -863,3 +863,98 @@ asyncio.run(supervisor())
self.assertGreater(cpu_percentage, 90.0,
f"cpu_leaf should dominate samples in 'running' mode, "
f"got {cpu_percentage:.1f}% ({cpu_leaf_samples}/{total})")
def _generate_deep_generators_script(chain_depth=20, recurse_depth=150):
"""Generate a script with deep nested generators for stress testing."""
lines = [
'import sys',
'sys.setrecursionlimit(5000)',
'',
]
# Generate chain of yield-from functions
for i in range(chain_depth - 1):
lines.extend([
f'def deep_yield_chain_{i}(n):',
f' yield ("L{i}", n)',
f' yield from deep_yield_chain_{i + 1}(n)',
'',
])
# Last chain function calls recursive_diver
lines.extend([
f'def deep_yield_chain_{chain_depth - 1}(n):',
f' yield ("L{chain_depth - 1}", n)',
f' yield from recursive_diver(n, {chain_depth})',
'',
'def recursive_diver(n, depth):',
' yield (f"DIVE_{depth}", n)',
f' if depth < {recurse_depth}:',
' yield from recursive_diver(n, depth + 1)',
' else:',
' for i in range(5):',
' yield (f"BOTTOM_{depth}", i)',
'',
'def oscillating_generator(iterations=1000):',
' for i in range(iterations):',
' yield ("OSCILLATE", i)',
' yield from deep_yield_chain_0(i)',
'',
'def run_forever():',
' while True:',
' for _ in oscillating_generator(10):',
' pass',
'',
'_test_sock.sendall(b"working")',
'run_forever()',
])
return '\n'.join(lines)
@requires_remote_subprocess_debugging()
class TestDeepGeneratorFrameCache(unittest.TestCase):
    """Test frame cache consistency with deep oscillating generator stacks."""

    def test_all_stacks_share_same_base_frame(self):
        """Verify all sampled stacks reach the base (module-level) frame.

        When profiling deep generators that oscillate up and down the call
        stack, every sample should include the module entry point frame
        ("<module>", which calls run_forever) at the base of its call
        chain.  If the frame cache stores incomplete stacks, some samples
        will be missing this base frame, causing broken flamegraphs.
        """
        script = _generate_deep_generators_script()
        with test_subprocess(script, wait_for_working=True) as subproc:
            # Sample as fast as possible (1 usec interval, no idle skip) to
            # maximize the chance of catching the unwinder mid-oscillation,
            # the scenario that exposed the incomplete-stack caching bug.
            collector = CollapsedStackCollector(sample_interval_usec=1, skip_idle=False)
            with (
                io.StringIO() as captured_output,
                mock.patch("sys.stdout", captured_output),
            ):
                profiling.sampling.sample.sample(
                    subproc.process.pid,
                    collector,
                    duration_sec=2,
                )
            samples_with_entry_point = 0
            samples_without_entry_point = 0
            total_samples = 0
            for (call_tree, _thread_id), count in collector.stack_counter.items():
                total_samples += count
                if not call_tree:
                    # An empty stack carries no frame info; it counts toward
                    # the total but toward neither bucket.
                    continue
                # call_tree[0] is the deepest (base) frame; index [2] is the
                # function name, which must be the module entry point.
                if call_tree[0][2] == "<module>":
                    samples_with_entry_point += count
                else:
                    samples_without_entry_point += count
            self.assertGreater(total_samples, 100,
                f"Expected at least 100 samples, got {total_samples}")
            self.assertEqual(samples_without_entry_point, 0,
                f"Found {samples_without_entry_point}/{total_samples} samples "
                f"whose base frame is not '<module>'. This indicates "
                f"incomplete stacks are being returned, likely due to the "
                f"frame cache storing partial stack traces.")

View file

@@ -0,0 +1,4 @@
Fix incomplete stack traces in the Tachyon profiler's frame cache when
profiling code with deeply nested generators. The frame cache now validates
that stack traces reach the base frame before caching, preventing broken
flamegraphs. Patch by Pablo Galindo.

View file

@@ -429,7 +429,8 @@ extern int process_frame_chain(
int *stopped_at_cached_frame,
uintptr_t *frame_addrs,
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs
Py_ssize_t max_addrs,
uintptr_t *out_last_frame_addr
);
/* Frame cache functions */
@@ -447,18 +448,22 @@ extern int frame_cache_lookup_and_extend(
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs);
// Returns: 1 = stored, 0 = not stored (graceful), -1 = error
// Only stores complete stacks that reach base_frame_addr
extern int frame_cache_store(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs);
Py_ssize_t num_addrs,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited);
extern int collect_frames_with_cache(
RemoteUnwinderObject *unwinder,
uintptr_t frame_addr,
StackChunkList *chunks,
PyObject *frame_info,
uintptr_t base_frame_addr,
uintptr_t gc_frame,
uintptr_t last_profiled_frame,
uint64_t thread_id);

View file

@@ -194,6 +194,7 @@ frame_cache_lookup_and_extend(
}
// Store frame list with addresses in cache
// Only stores complete stacks that reach base_frame_addr (validation done internally)
// Returns: 1 = stored successfully, 0 = not stored (graceful degradation), -1 = error
int
frame_cache_store(
@@ -201,12 +202,25 @@ frame_cache_store(
uint64_t thread_id,
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs)
Py_ssize_t num_addrs,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited)
{
if (!unwinder->frame_cache || thread_id == 0) {
return 0;
}
// Validate we have a complete stack before caching.
// Only cache if last_frame_visited matches base_frame_addr (the sentinel
// at the bottom of the stack). Note: we use last_frame_visited rather than
// addrs[num_addrs-1] because the base frame is visited but not added to the
// addrs array (it returns frame==NULL from is_frame_valid due to
// owner==FRAME_OWNED_BY_INTERPRETER).
if (base_frame_addr != 0 && last_frame_visited != base_frame_addr) {
// Incomplete stack - don't cache (graceful degradation)
return 0;
}
// Clamp to max frames
if (num_addrs > FRAME_CACHE_MAX_FRAMES) {
num_addrs = FRAME_CACHE_MAX_FRAMES;

View file

@@ -265,7 +265,8 @@ process_frame_chain(
int *stopped_at_cached_frame,
uintptr_t *frame_addrs, // optional: C array to receive frame addresses
Py_ssize_t *num_addrs, // in/out: current count / updated count
Py_ssize_t max_addrs) // max capacity of frame_addrs array
Py_ssize_t max_addrs, // max capacity of frame_addrs array
uintptr_t *out_last_frame_addr) // optional: receives last frame address visited
{
uintptr_t frame_addr = initial_frame_addr;
uintptr_t prev_frame_addr = 0;
@@ -273,10 +274,13 @@ process_frame_chain(
const size_t MAX_FRAMES = 1024 + 512;
size_t frame_count = 0;
// Initialize output flag
// Initialize output parameters
if (stopped_at_cached_frame) {
*stopped_at_cached_frame = 0;
}
if (out_last_frame_addr) {
*out_last_frame_addr = 0;
}
// Quick check: if current_frame == last_profiled_frame, entire stack is unchanged
if (last_profiled_frame != 0 && initial_frame_addr == last_profiled_frame) {
@@ -390,6 +394,11 @@ process_frame_chain(
return -1;
}
// Set output parameter for caller (needed for cache validation)
if (out_last_frame_addr) {
*out_last_frame_addr = last_frame_addr;
}
return 0;
}
@@ -537,6 +546,7 @@ collect_frames_with_cache(
uintptr_t frame_addr,
StackChunkList *chunks,
PyObject *frame_info,
uintptr_t base_frame_addr,
uintptr_t gc_frame,
uintptr_t last_profiled_frame,
uint64_t thread_id)
@@ -551,11 +561,13 @@ collect_frames_with_cache(
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
Py_ssize_t num_addrs = 0;
Py_ssize_t frames_before = PyList_GET_SIZE(frame_info);
uintptr_t last_frame_visited = 0;
int stopped_at_cached = 0;
if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, 0, gc_frame,
if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, base_frame_addr, gc_frame,
last_profiled_frame, &stopped_at_cached,
addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES,
&last_frame_visited) < 0) {
return -1;
}
@@ -575,23 +587,28 @@ collect_frames_with_cache(
// Cache miss - continue walking from last_profiled_frame to get the rest
STATS_INC(unwinder, frame_cache_misses);
Py_ssize_t frames_before_walk = PyList_GET_SIZE(frame_info);
if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, 0, gc_frame,
0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, base_frame_addr, gc_frame,
0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES,
&last_frame_visited) < 0) {
return -1;
}
STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before_walk);
} else {
// Partial cache hit
// Partial cache hit - cache was validated when stored, so we trust it
STATS_INC(unwinder, frame_cache_partial_hits);
STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(frame_info) - frames_before_cache);
}
} else if (last_profiled_frame == 0) {
// No cache involvement (no last_profiled_frame or cache disabled)
STATS_INC(unwinder, frame_cache_misses);
} else {
if (last_profiled_frame == 0) {
// No cache involvement (no last_profiled_frame or cache disabled)
STATS_INC(unwinder, frame_cache_misses);
}
}
// Store in cache (frame_cache_store handles truncation if num_addrs > FRAME_CACHE_MAX_FRAMES)
if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs) < 0) {
// Store in cache - frame_cache_store validates internally that we have a
// complete stack (reached base_frame_addr) before actually storing
if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs,
base_frame_addr, last_frame_visited) < 0) {
return -1;
}

View file

@@ -430,7 +430,7 @@ unwind_stack_for_thread(
uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts,
unwinder->debug_offsets.thread_state.last_profiled_frame);
if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info,
gc_frame, last_profiled_frame, tid) < 0) {
base_frame_addr, gc_frame, last_profiled_frame, tid) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames");
goto error;
}
@@ -444,7 +444,7 @@ unwind_stack_for_thread(
} else {
// No caching - process entire frame chain with base_frame validation
if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info,
base_frame_addr, gc_frame, 0, NULL, NULL, NULL, 0) < 0) {
base_frame_addr, gc_frame, 0, NULL, NULL, NULL, 0, NULL) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain");
goto error;
}