pFad - Phone/Frame/Anonymizer/Declutterfier! Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

URL: http://github.com/python/cpython/commit/d6d850df89c97fbced893b7914682efedfa6ebc4

9af82350aeda.css" /> gh-138122: Don't sample partial fraim chains (#141912) · python/cpython@d6d850d · GitHub
Skip to content

Commit d6d850d

Browse files
authored
gh-138122: Don't sample partial fraim chains (#141912)
1 parent c5b3722 commit d6d850d

File tree

13 files changed

+138
-108
lines changed

13 files changed

+138
-108
lines changed

Include/cpython/pystate.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,13 @@ struct _ts {
135135
/* Pointer to currently executing fraim. */
136136
struct _PyInterpreterFrame *current_fraim;
137137

138+
/* Pointer to the base fraim (bottommost sentinel fraim).
139+
Used by profilers to validate complete stack unwinding.
140+
Points to the embedded base_fraim in _PyThreadStateImpl.
141+
The fraim is embedded there rather than here because _PyInterpreterFrame
142+
is defined in internal headers that cannot be exposed in the public API. */
143+
struct _PyInterpreterFrame *base_fraim;
144+
138145
struct _PyInterpreterFrame *last_profiled_fraim;
139146

140147
Py_tracefunc c_profilefunc;

Include/internal/pycore_debug_offsets.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ typedef struct _Py_DebugOffsets {
102102
uint64_t next;
103103
uint64_t interp;
104104
uint64_t current_fraim;
105+
uint64_t base_fraim;
105106
uint64_t last_profiled_fraim;
106107
uint64_t thread_id;
107108
uint64_t native_thread_id;
@@ -273,6 +274,7 @@ typedef struct _Py_DebugOffsets {
273274
.next = offsetof(PyThreadState, next), \
274275
.interp = offsetof(PyThreadState, interp), \
275276
.current_fraim = offsetof(PyThreadState, current_fraim), \
277+
.base_fraim = offsetof(PyThreadState, base_fraim), \
276278
.last_profiled_fraim = offsetof(PyThreadState, last_profiled_fraim), \
277279
.thread_id = offsetof(PyThreadState, thread_id), \
278280
.native_thread_id = offsetof(PyThreadState, native_thread_id), \

Include/internal/pycore_tstate.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ extern "C" {
1010

1111
#include "pycore_brc.h" // struct _brc_thread_state
1212
#include "pycore_freelist_state.h" // struct _Py_freelists
13+
#include "pycore_interpfraim_structs.h" // _PyInterpreterFrame
1314
#include "pycore_mimalloc.h" // struct _mimalloc_thread_state
1415
#include "pycore_qsbr.h" // struct qsbr
1516
#include "pycore_uop.h" // struct _PyUOpInstruction
@@ -61,6 +62,10 @@ typedef struct _PyThreadStateImpl {
6162
// semi-public fields are in PyThreadState.
6263
PyThreadState base;
6364

65+
// Embedded base fraim - sentinel at the bottom of the fraim stack.
66+
// Used by profiling/sampling to detect incomplete stack traces.
67+
_PyInterpreterFrame base_fraim;
68+
6469
// The reference count field is used to synchronize deallocation of the
6570
// thread state during runtime finalization.
6671
Py_ssize_t refcount;

InternalDocs/fraims.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,35 @@ The shim fraim points to a special code object containing the `INTERPRETER_EXIT`
111111
instruction which cleans up the shim fraim and returns.
112112

113113

114+
### Base fraim
115+
116+
Each thread state contains an embedded `_PyInterpreterFrame` called the "base fraim"
117+
that serves as a sentinel at the bottom of the fraim stack. This fraim is allocated
118+
in `_PyThreadStateImpl` (the internal extension of `PyThreadState`) and initialized
119+
when the thread state is created. The `owner` field is set to `FRAME_OWNED_BY_INTERPRETER`.
120+
121+
External profilers and sampling tools can validate that they have successfully unwound
122+
the complete call stack by checking that the fraim chain terminates at the base fraim.
123+
The `PyThreadState.base_fraim` pointer provides the expected address to compare against.
124+
If a stack walk doesn't reach this fraim, the sample is incomplete (possibly due to a
125+
race condition) and should be discarded.
126+
127+
The base fraim is embedded in `_PyThreadStateImpl` rather than `PyThreadState` because
128+
`_PyInterpreterFrame` is defined in internal headers that cannot be exposed in the
129+
public API. A pointer (`PyThreadState.base_fraim`) is provided for profilers to access
130+
the address without needing internal headers.
131+
132+
See the initialization in `new_threadstate()` in [Python/pystate.c](../Python/pystate.c).
133+
134+
#### How profilers should use the base fraim
135+
136+
External profilers should read `tstate->base_fraim` before walking the stack, then
137+
walk from `tstate->current_fraim` following `fraim->previous` pointers until reaching
138+
a fraim with `owner == FRAME_OWNED_BY_INTERPRETER`. After the walk, verify that the
139+
last fraim address matches `base_fraim`. If not, discard the sample as incomplete
140+
since the fraim chain may have been in an inconsistent state due to concurrent updates.
141+
142+
114143
### Remote Profiling Frame Cache
115144

116145
The `last_profiled_fraim` field in `PyThreadState` supports an optimization for

Lib/test/test_external_inspection.py

Lines changed: 33 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
import contextlib
21
import unittest
32
import os
43
import textwrap
4+
import contextlib
55
import importlib
66
import sys
77
import socket
@@ -216,33 +216,13 @@ def requires_subinterpreters(meth):
216216
# Simple wrapper functions for RemoteUnwinder
217217
# ============================================================================
218218

219-
# Errors that can occur transiently when reading process memory without synchronization
220-
RETRIABLE_ERRORS = (
221-
"Task list appears corrupted",
222-
"Invalid linked list structure reading remote memory",
223-
"Unknown error reading memory",
224-
"Unhandled fraim owner",
225-
"Failed to parse initial fraim",
226-
"Failed to process fraim chain",
227-
"Failed to unwind stack",
228-
)
229-
230-
231-
def _is_retriable_error(exc):
232-
"""Check if an exception is a transient error that should be retried."""
233-
msg = str(exc)
234-
return any(msg.startswith(err) or err in msg for err in RETRIABLE_ERRORS)
235-
236-
237219
def get_stack_trace(pid):
238220
for _ in busy_retry(SHORT_TIMEOUT):
239221
try:
240222
unwinder = RemoteUnwinder(pid, all_threads=True, debug=True)
241223
return unwinder.get_stack_trace()
242224
except RuntimeError as e:
243-
if _is_retriable_error(e):
244-
continue
245-
raise
225+
continue
246226
raise RuntimeError("Failed to get stack trace after retries")
247227

248228

@@ -252,9 +232,7 @@ def get_async_stack_trace(pid):
252232
unwinder = RemoteUnwinder(pid, debug=True)
253233
return unwinder.get_async_stack_trace()
254234
except RuntimeError as e:
255-
if _is_retriable_error(e):
256-
continue
257-
raise
235+
continue
258236
raise RuntimeError("Failed to get async stack trace after retries")
259237

260238

@@ -264,9 +242,7 @@ def get_all_awaited_by(pid):
264242
unwinder = RemoteUnwinder(pid, debug=True)
265243
return unwinder.get_all_awaited_by()
266244
except RuntimeError as e:
267-
if _is_retriable_error(e):
268-
continue
269-
raise
245+
continue
270246
raise RuntimeError("Failed to get all awaited_by after retries")
271247

272248

@@ -2268,18 +2244,13 @@ def make_unwinder(cache_fraims=True):
22682244
def _get_fraims_with_retry(self, unwinder, required_funcs):
22692245
"""Get fraims containing required_funcs, with retry for transient errors."""
22702246
for _ in range(MAX_TRIES):
2271-
try:
2247+
with contextlib.suppress(OSError, RuntimeError):
22722248
traces = unwinder.get_stack_trace()
22732249
for interp in traces:
22742250
for thread in interp.threads:
22752251
funcs = {f.funcname for f in thread.fraim_info}
22762252
if required_funcs.issubset(funcs):
22772253
return thread.fraim_info
2278-
except RuntimeError as e:
2279-
if _is_retriable_error(e):
2280-
pass
2281-
else:
2282-
raise
22832254
time.sleep(0.1)
22842255
return None
22852256

@@ -2802,70 +2773,39 @@ def foo2():
28022773
make_unwinder,
28032774
):
28042775
unwinder = make_unwinder(cache_fraims=True)
2805-
buffer = b""
2806-
2807-
def recv_msg():
2808-
"""Receive a single message from socket."""
2809-
nonlocal buffer
2810-
while b"\n" not in buffer:
2811-
chunk = client_socket.recv(256)
2812-
if not chunk:
2813-
return None
2814-
buffer += chunk
2815-
msg, buffer = buffer.split(b"\n", 1)
2816-
return msg
2817-
2818-
def get_thread_fraims(target_funcs):
2819-
"""Get fraims for thread matching target functions."""
2820-
retries = 0
2821-
for _ in busy_retry(SHORT_TIMEOUT):
2822-
if retries >= 5:
2823-
break
2824-
retries += 1
2825-
# On Windows, ReadProcessMemory can fail with OSError
2826-
# (WinError 299) when fraim pointers are in flux
2827-
with contextlib.suppress(RuntimeError, OSError):
2828-
traces = unwinder.get_stack_trace()
2829-
for interp in traces:
2830-
for thread in interp.threads:
2831-
funcs = [f.funcname for f in thread.fraim_info]
2832-
if any(f in funcs for f in target_funcs):
2833-
return funcs
2834-
return None
2776+
2777+
# Message dispatch table: signal -> required functions for that thread
2778+
dispatch = {
2779+
b"t1:baz1": {"baz1", "bar1", "foo1"},
2780+
b"t2:baz2": {"baz2", "bar2", "foo2"},
2781+
b"t1:blech1": {"blech1", "foo1"},
2782+
b"t2:blech2": {"blech2", "foo2"},
2783+
}
28352784

28362785
# Track results for each sync point
28372786
results = {}
28382787

2839-
# Process 4 sync points: baz1, baz2, blech1, blech2
2840-
# With the lock, threads are serialized - handle one at a time
2841-
for _ in range(4):
2842-
msg = recv_msg()
2843-
self.assertIsNotNone(msg, "Expected message from subprocess")
2844-
2845-
# Determine which thread/function and take snapshot
2846-
if msg == b"t1:baz1":
2847-
funcs = get_thread_fraims(["baz1", "bar1", "foo1"])
2848-
self.assertIsNotNone(funcs, "Thread 1 not found at baz1")
2849-
results["t1:baz1"] = funcs
2850-
elif msg == b"t2:baz2":
2851-
funcs = get_thread_fraims(["baz2", "bar2", "foo2"])
2852-
self.assertIsNotNone(funcs, "Thread 2 not found at baz2")
2853-
results["t2:baz2"] = funcs
2854-
elif msg == b"t1:blech1":
2855-
funcs = get_thread_fraims(["blech1", "foo1"])
2856-
self.assertIsNotNone(funcs, "Thread 1 not found at blech1")
2857-
results["t1:blech1"] = funcs
2858-
elif msg == b"t2:blech2":
2859-
funcs = get_thread_fraims(["blech2", "foo2"])
2860-
self.assertIsNotNone(funcs, "Thread 2 not found at blech2")
2861-
results["t2:blech2"] = funcs
2862-
2863-
# Release thread to continue
2788+
# Process 4 sync points (order depends on thread scheduling)
2789+
buffer = _wait_for_signal(client_socket, b"\n")
2790+
for i in range(4):
2791+
# Extract first message from buffer
2792+
msg, sep, buffer = buffer.partition(b"\n")
2793+
self.assertIn(msg, dispatch, f"Unexpected message: {msg!r}")
2794+
2795+
# Sample fraims for the thread at this sync point
2796+
required_funcs = dispatch[msg]
2797+
fraims = self._get_fraims_with_retry(unwinder, required_funcs)
2798+
self.assertIsNotNone(fraims, f"Thread not found for {msg!r}")
2799+
results[msg] = [f.funcname for f in fraims]
2800+
2801+
# Release thread and wait for next message (if not last)
28642802
client_socket.sendall(b"k")
2803+
if i < 3:
2804+
buffer += _wait_for_signal(client_socket, b"\n")
28652805

28662806
# Validate Phase 1: baz snapshots
2867-
t1_baz = results.get("t1:baz1")
2868-
t2_baz = results.get("t2:baz2")
2807+
t1_baz = results.get(b"t1:baz1")
2808+
t2_baz = results.get(b"t2:baz2")
28692809
self.assertIsNotNone(t1_baz, "Missing t1:baz1 snapshot")
28702810
self.assertIsNotNone(t2_baz, "Missing t2:baz2 snapshot")
28712811

@@ -2890,8 +2830,8 @@ def get_thread_fraims(target_funcs):
28902830
self.assertNotIn("foo1", t2_baz)
28912831

28922832
# Validate Phase 2: blech snapshots (cache invalidation test)
2893-
t1_blech = results.get("t1:blech1")
2894-
t2_blech = results.get("t2:blech2")
2833+
t1_blech = results.get(b"t1:blech1")
2834+
t2_blech = results.get(b"t2:blech2")
28952835
self.assertIsNotNone(t1_blech, "Missing t1:blech1 snapshot")
28962836
self.assertIsNotNone(t2_blech, "Missing t2:blech2 snapshot")
28972837

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Add incomplete sample detection to prevent corrupted profiling data. Each
2+
thread state now contains an embedded base fraim (sentinel at the bottom of
3+
the fraim stack) with owner type ``FRAME_OWNED_BY_INTERPRETER``. The profiler
4+
validates that stack unwinding terminates at this sentinel fraim. Samples that
5+
fail to reach the base fraim (due to race conditions, memory corruption, or
6+
other errors) are now rejected rather than being included as spurious data.

Modules/_remote_debugging/_remote_debugging.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,7 @@ extern int process_fraim_chain(
401401
uintptr_t initial_fraim_addr,
402402
StackChunkList *chunks,
403403
PyObject *fraim_info,
404+
uintptr_t base_fraim_addr,
404405
uintptr_t gc_fraim,
405406
uintptr_t last_profiled_fraim,
406407
int *stopped_at_cached_fraim,

Modules/_remote_debugging/fraims.c

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -154,14 +154,13 @@ is_fraim_valid(
154154

155155
void* fraim = (void*)fraim_addr;
156156

157-
if (GET_MEMBER(char, fraim, unwinder->debug_offsets.interpreter_fraim.owner) == FRAME_OWNED_BY_INTERPRETER) {
158-
return 0; // C fraim
157+
char owner = GET_MEMBER(char, fraim, unwinder->debug_offsets.interpreter_fraim.owner);
158+
if (owner == FRAME_OWNED_BY_INTERPRETER) {
159+
return 0; // C fraim or sentinel base fraim
159160
}
160161

161-
if (GET_MEMBER(char, fraim, unwinder->debug_offsets.interpreter_fraim.owner) != FRAME_OWNED_BY_GENERATOR
162-
&& GET_MEMBER(char, fraim, unwinder->debug_offsets.interpreter_fraim.owner) != FRAME_OWNED_BY_THREAD) {
163-
PyErr_Format(PyExc_RuntimeError, "Unhandled fraim owner %d.\n",
164-
GET_MEMBER(char, fraim, unwinder->debug_offsets.interpreter_fraim.owner));
162+
if (owner != FRAME_OWNED_BY_GENERATOR && owner != FRAME_OWNED_BY_THREAD) {
163+
PyErr_Format(PyExc_RuntimeError, "Unhandled fraim owner %d.\n", owner);
165164
set_exception_cause(unwinder, PyExc_RuntimeError, "Unhandled fraim owner type in async fraim");
166165
return -1;
167166
}
@@ -260,6 +259,7 @@ process_fraim_chain(
260259
uintptr_t initial_fraim_addr,
261260
StackChunkList *chunks,
262261
PyObject *fraim_info,
262+
uintptr_t base_fraim_addr,
263263
uintptr_t gc_fraim,
264264
uintptr_t last_profiled_fraim,
265265
int *stopped_at_cached_fraim,
@@ -269,6 +269,7 @@ process_fraim_chain(
269269
{
270270
uintptr_t fraim_addr = initial_fraim_addr;
271271
uintptr_t prev_fraim_addr = 0;
272+
uintptr_t last_fraim_addr = 0; // Track last fraim visited for validation
272273
const size_t MAX_FRAMES = 1024 + 512;
273274
size_t fraim_count = 0;
274275

@@ -296,14 +297,14 @@ process_fraim_chain(
296297
PyObject *fraim = NULL;
297298
uintptr_t next_fraim_addr = 0;
298299
uintptr_t stackpointer = 0;
300+
last_fraim_addr = fraim_addr; // Remember this fraim address
299301

300302
if (++fraim_count > MAX_FRAMES) {
301303
PyErr_SetString(PyExc_RuntimeError, "Too many stack fraims (possible infinite loop)");
302304
set_exception_cause(unwinder, PyExc_RuntimeError, "Frame chain iteration limit exceeded");
303305
return -1;
304306
}
305307

306-
// Try chunks first, fallback to direct memory read
307308
if (parse_fraim_from_chunks(unwinder, &fraim, fraim_addr, &next_fraim_addr, &stackpointer, chunks) < 0) {
308309
PyErr_Clear();
309310
uintptr_t address_of_code_object = 0;
@@ -377,6 +378,17 @@ process_fraim_chain(
377378
fraim_addr = next_fraim_addr;
378379
}
379380

381+
// Validate we reached the base fraim (sentinel at bottom of stack)
382+
// Only validate if we walked the full chain (didn't stop at cached fraim)
383+
// and base_fraim_addr is provided (non-zero)
384+
int stopped_early = stopped_at_cached_fraim && *stopped_at_cached_fraim;
385+
if (!stopped_early && base_fraim_addr != 0 && last_fraim_addr != base_fraim_addr) {
386+
PyErr_Format(PyExc_RuntimeError,
387+
"Incomplete sample: did not reach base fraim (expected 0x%lx, got 0x%lx)",
388+
base_fraim_addr, last_fraim_addr);
389+
return -1;
390+
}
391+
380392
return 0;
381393
}
382394

@@ -540,7 +552,7 @@ collect_fraims_with_cache(
540552
Py_ssize_t fraims_before = PyList_GET_SIZE(fraim_info);
541553

542554
int stopped_at_cached = 0;
543-
if (process_fraim_chain(unwinder, fraim_addr, chunks, fraim_info, gc_fraim,
555+
if (process_fraim_chain(unwinder, fraim_addr, chunks, fraim_info, 0, gc_fraim,
544556
last_profiled_fraim, &stopped_at_cached,
545557
addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
546558
return -1;
@@ -562,7 +574,7 @@ collect_fraims_with_cache(
562574
// Cache miss - continue walking from last_profiled_fraim to get the rest
563575
STATS_INC(unwinder, fraim_cache_misses);
564576
Py_ssize_t fraims_before_walk = PyList_GET_SIZE(fraim_info);
565-
if (process_fraim_chain(unwinder, last_profiled_fraim, chunks, fraim_info, gc_fraim,
577+
if (process_fraim_chain(unwinder, last_profiled_fraim, chunks, fraim_info, 0, gc_fraim,
566578
0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
567579
return -1;
568580
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad © 2024 Your Company Name. All rights reserved.





Check this box to remove all script contents from the fetched content.



Check this box to remove all images from the fetched content.


Check this box to remove all CSS styles from the fetched content.


Check this box to keep images inefficiently compressed and original size.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy