gh-138122: Validate base frame before caching in remote debugging frame cache (#142852)

This commit is contained in:
Pablo Galindo Salgado 2025-12-17 15:12:28 +00:00 committed by GitHub
parent 2b466c47c3
commit 568a819f67
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 152 additions and 17 deletions

View file

@@ -863,3 +863,98 @@ asyncio.run(supervisor())
self.assertGreater(cpu_percentage, 90.0,
f"cpu_leaf should dominate samples in 'running' mode, "
f"got {cpu_percentage:.1f}% ({cpu_leaf_samples}/{total})")
def _generate_deep_generators_script(chain_depth=20, recurse_depth=150):
"""Generate a script with deep nested generators for stress testing."""
lines = [
'import sys',
'sys.setrecursionlimit(5000)',
'',
]
# Generate chain of yield-from functions
for i in range(chain_depth - 1):
lines.extend([
f'def deep_yield_chain_{i}(n):',
f' yield ("L{i}", n)',
f' yield from deep_yield_chain_{i + 1}(n)',
'',
])
# Last chain function calls recursive_diver
lines.extend([
f'def deep_yield_chain_{chain_depth - 1}(n):',
f' yield ("L{chain_depth - 1}", n)',
f' yield from recursive_diver(n, {chain_depth})',
'',
'def recursive_diver(n, depth):',
' yield (f"DIVE_{depth}", n)',
f' if depth < {recurse_depth}:',
' yield from recursive_diver(n, depth + 1)',
' else:',
' for i in range(5):',
' yield (f"BOTTOM_{depth}", i)',
'',
'def oscillating_generator(iterations=1000):',
' for i in range(iterations):',
' yield ("OSCILLATE", i)',
' yield from deep_yield_chain_0(i)',
'',
'def run_forever():',
' while True:',
' for _ in oscillating_generator(10):',
' pass',
'',
'_test_sock.sendall(b"working")',
'run_forever()',
])
return '\n'.join(lines)
@requires_remote_subprocess_debugging()
class TestDeepGeneratorFrameCache(unittest.TestCase):
    """Test frame cache consistency with deep oscillating generator stacks."""

    def test_all_stacks_share_same_base_frame(self):
        """Verify all sampled stacks reach the base (module-level) frame.

        When profiling deep generators that oscillate up and down the call
        stack, every sample should include the module entry point frame
        ("<module>", which calls run_forever) at the base of its call
        chain.  If the frame cache stores incomplete stacks, some samples
        will be missing this base frame, causing broken flamegraphs.
        """
        script = _generate_deep_generators_script()
        with test_subprocess(script, wait_for_working=True) as subproc:
            # Sample as fast as possible (1 usec interval, no idle skip) to
            # maximize the chance of catching the unwinder mid-oscillation,
            # the scenario that exposed the incomplete-stack caching bug.
            collector = CollapsedStackCollector(sample_interval_usec=1, skip_idle=False)
            with (
                io.StringIO() as captured_output,
                mock.patch("sys.stdout", captured_output),
            ):
                profiling.sampling.sample.sample(
                    subproc.process.pid,
                    collector,
                    duration_sec=2,
                )
            samples_with_entry_point = 0
            samples_without_entry_point = 0
            total_samples = 0
            for (call_tree, _thread_id), count in collector.stack_counter.items():
                total_samples += count
                if not call_tree:
                    # An empty stack carries no frame info; it counts toward
                    # the total but toward neither bucket.
                    continue
                # call_tree[0] is the deepest (base) frame; index [2] is the
                # function name, which must be the module entry point.
                if call_tree[0][2] == "<module>":
                    samples_with_entry_point += count
                else:
                    samples_without_entry_point += count
            self.assertGreater(total_samples, 100,
                f"Expected at least 100 samples, got {total_samples}")
            self.assertEqual(samples_without_entry_point, 0,
                f"Found {samples_without_entry_point}/{total_samples} samples "
                f"whose base frame is not '<module>'. This indicates "
                f"incomplete stacks are being returned, likely due to the "
                f"frame cache storing partial stack traces.")

View file

@@ -0,0 +1,4 @@
Fix incomplete stack traces in the Tachyon profiler's frame cache when
profiling code with deeply nested generators. The frame cache now validates
that stack traces reach the base frame before caching, preventing broken
flamegraphs. Patch by Pablo Galindo.

View file

@@ -429,7 +429,8 @@ extern int process_frame_chain(
int *stopped_at_cached_frame,
uintptr_t *frame_addrs,
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs
Py_ssize_t max_addrs,
uintptr_t *out_last_frame_addr
);
/* Frame cache functions */
@@ -447,18 +448,22 @@ extern int frame_cache_lookup_and_extend(
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs);
// Returns: 1 = stored, 0 = not stored (graceful), -1 = error
// Only stores complete stacks that reach base_frame_addr
extern int frame_cache_store(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs);
Py_ssize_t num_addrs,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited);
extern int collect_frames_with_cache(
RemoteUnwinderObject *unwinder,
uintptr_t frame_addr,
StackChunkList *chunks,
PyObject *frame_info,
uintptr_t base_frame_addr,
uintptr_t gc_frame,
uintptr_t last_profiled_frame,
uint64_t thread_id);

View file

@@ -194,6 +194,7 @@ frame_cache_lookup_and_extend(
}
// Store frame list with addresses in cache
// Only stores complete stacks that reach base_frame_addr (validation done internally)
// Returns: 1 = stored successfully, 0 = not stored (graceful degradation), -1 = error
int
frame_cache_store(
@@ -201,12 +202,25 @@ frame_cache_store(
uint64_t thread_id,
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs)
Py_ssize_t num_addrs,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited)
{
if (!unwinder->frame_cache || thread_id == 0) {
return 0;
}
// Validate we have a complete stack before caching.
// Only cache if last_frame_visited matches base_frame_addr (the sentinel
// at the bottom of the stack). Note: we use last_frame_visited rather than
// addrs[num_addrs-1] because the base frame is visited but not added to the
// addrs array (it returns frame==NULL from is_frame_valid due to
// owner==FRAME_OWNED_BY_INTERPRETER).
if (base_frame_addr != 0 && last_frame_visited != base_frame_addr) {
// Incomplete stack - don't cache (graceful degradation)
return 0;
}
// Clamp to max frames
if (num_addrs > FRAME_CACHE_MAX_FRAMES) {
num_addrs = FRAME_CACHE_MAX_FRAMES;

View file

@@ -265,7 +265,8 @@ process_frame_chain(
int *stopped_at_cached_frame,
uintptr_t *frame_addrs, // optional: C array to receive frame addresses
Py_ssize_t *num_addrs, // in/out: current count / updated count
Py_ssize_t max_addrs) // max capacity of frame_addrs array
Py_ssize_t max_addrs, // max capacity of frame_addrs array
uintptr_t *out_last_frame_addr) // optional: receives last frame address visited
{
uintptr_t frame_addr = initial_frame_addr;
uintptr_t prev_frame_addr = 0;
@@ -273,10 +274,13 @@ process_frame_chain(
const size_t MAX_FRAMES = 1024 + 512;
size_t frame_count = 0;
// Initialize output flag
// Initialize output parameters
if (stopped_at_cached_frame) {
*stopped_at_cached_frame = 0;
}
if (out_last_frame_addr) {
*out_last_frame_addr = 0;
}
// Quick check: if current_frame == last_profiled_frame, entire stack is unchanged
if (last_profiled_frame != 0 && initial_frame_addr == last_profiled_frame) {
@@ -390,6 +394,11 @@ process_frame_chain(
return -1;
}
// Set output parameter for caller (needed for cache validation)
if (out_last_frame_addr) {
*out_last_frame_addr = last_frame_addr;
}
return 0;
}
@@ -537,6 +546,7 @@ collect_frames_with_cache(
uintptr_t frame_addr,
StackChunkList *chunks,
PyObject *frame_info,
uintptr_t base_frame_addr,
uintptr_t gc_frame,
uintptr_t last_profiled_frame,
uint64_t thread_id)
@@ -551,11 +561,13 @@ collect_frames_with_cache(
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
Py_ssize_t num_addrs = 0;
Py_ssize_t frames_before = PyList_GET_SIZE(frame_info);
uintptr_t last_frame_visited = 0;
int stopped_at_cached = 0;
if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, 0, gc_frame,
if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, base_frame_addr, gc_frame,
last_profiled_frame, &stopped_at_cached,
addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES,
&last_frame_visited) < 0) {
return -1;
}
@@ -575,23 +587,28 @@ collect_frames_with_cache(
// Cache miss - continue walking from last_profiled_frame to get the rest
STATS_INC(unwinder, frame_cache_misses);
Py_ssize_t frames_before_walk = PyList_GET_SIZE(frame_info);
if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, 0, gc_frame,
0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, base_frame_addr, gc_frame,
0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES,
&last_frame_visited) < 0) {
return -1;
}
STATS_ADD(unwinder, frames_read_from_memory, PyList_GET_SIZE(frame_info) - frames_before_walk);
} else {
// Partial cache hit
// Partial cache hit - cache was validated when stored, so we trust it
STATS_INC(unwinder, frame_cache_partial_hits);
STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(frame_info) - frames_before_cache);
}
} else if (last_profiled_frame == 0) {
// No cache involvement (no last_profiled_frame or cache disabled)
STATS_INC(unwinder, frame_cache_misses);
} else {
if (last_profiled_frame == 0) {
// No cache involvement (no last_profiled_frame or cache disabled)
STATS_INC(unwinder, frame_cache_misses);
}
}
// Store in cache (frame_cache_store handles truncation if num_addrs > FRAME_CACHE_MAX_FRAMES)
if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs) < 0) {
// Store in cache - frame_cache_store validates internally that we have a
// complete stack (reached base_frame_addr) before actually storing
if (frame_cache_store(unwinder, thread_id, frame_info, addrs, num_addrs,
base_frame_addr, last_frame_visited) < 0) {
return -1;
}

View file

@@ -430,7 +430,7 @@ unwind_stack_for_thread(
uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts,
unwinder->debug_offsets.thread_state.last_profiled_frame);
if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info,
gc_frame, last_profiled_frame, tid) < 0) {
base_frame_addr, gc_frame, last_profiled_frame, tid) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames");
goto error;
}
@@ -444,7 +444,7 @@ unwind_stack_for_thread(
} else {
// No caching - process entire frame chain with base_frame validation
if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info,
base_frame_addr, gc_frame, 0, NULL, NULL, NULL, 0) < 0) {
base_frame_addr, gc_frame, 0, NULL, NULL, NULL, 0, NULL) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain");
goto error;
}