gh-124878: Fix race conditions during interpreter finalization (#130649)

The PyThreadState field gains a reference count field to avoid
issues with PyThreadState being a dangling pointer to freed memory.
The refcount starts with a value of two: one reference is owned by the
interpreter's linked list of thread states and one reference is owned by
the OS thread. The reference count is decremented when the thread state
is removed from the interpreter's linked list and before the OS thread
calls `PyThread_hang_thread()`. The thread that decrements it to zero
frees the `PyThreadState` memory.

The `holds_gil` field is moved out of the `_status` bit field, to avoid
a data race where on thread calls `PyThreadState_Clear()`, modifying the
`_status` bit field while the OS thread reads `holds_gil` when
attempting to acquire the GIL.

The `PyThreadState.state` field now has `_Py_THREAD_SHUTTING_DOWN` as a
possible value. This corresponds to the `_PyThreadState_MustExit()`
check. This avoids race conditions in the free threading build when
checking `_PyThreadState_MustExit()`.
This commit is contained in:
Sam Gross 2025-03-06 10:38:34 -05:00 committed by GitHub
parent c6dd2348ca
commit 052cb717f5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 109 additions and 81 deletions

View file

@ -6,8 +6,8 @@
#include "pycore_pyerrors.h" // _PyErr_GetRaisedException()
#include "pycore_pylifecycle.h" // _PyErr_Print()
#include "pycore_pymem.h" // _PyMem_IsPtrFreed()
#include "pycore_pystate.h" // PyThread_hang_thread()
#include "pycore_pystats.h" // _Py_PrintSpecializationStats()
#include "pycore_pythread.h" // PyThread_hang_thread()
/*
Notes about the implementation:
@ -206,7 +206,7 @@ drop_gil_impl(PyThreadState *tstate, struct _gil_runtime_state *gil)
_Py_ANNOTATE_RWLOCK_RELEASED(&gil->locked, /*is_write=*/1);
_Py_atomic_store_int_relaxed(&gil->locked, 0);
if (tstate != NULL) {
tstate->_status.holds_gil = 0;
tstate->holds_gil = 0;
}
COND_SIGNAL(gil->cond);
MUTEX_UNLOCK(gil->mutex);
@ -231,7 +231,7 @@ drop_gil(PyInterpreterState *interp, PyThreadState *tstate, int final_release)
// Check if we have the GIL before dropping it. tstate will be NULL if
// take_gil() detected that this thread has been destroyed, in which case
// we know we have the GIL.
if (tstate != NULL && !tstate->_status.holds_gil) {
if (tstate != NULL && !tstate->holds_gil) {
return;
}
#endif
@ -296,15 +296,14 @@ take_gil(PyThreadState *tstate)
thread which called Py_Finalize(), this thread cannot continue.
This code path can be reached by a daemon thread after Py_Finalize()
completes. In this case, tstate is a dangling pointer: points to
PyThreadState freed memory.
completes.
This used to call a *thread_exit API, but that was not safe as it
lacks stack unwinding and local variable destruction important to
C++. gh-87135: The best that can be done is to hang the thread as
the public APIs calling this have no error reporting mechanism (!).
*/
PyThread_hang_thread();
_PyThreadState_HangThread(tstate);
}
assert(_PyThreadState_CheckConsistency(tstate));
@ -353,7 +352,7 @@ take_gil(PyThreadState *tstate)
}
// gh-87135: hang the thread as *thread_exit() is not a safe
// API. It lacks stack unwind and local variable destruction.
PyThread_hang_thread();
_PyThreadState_HangThread(tstate);
}
assert(_PyThreadState_CheckConsistency(tstate));
@ -404,11 +403,11 @@ take_gil(PyThreadState *tstate)
/* tstate could be a dangling pointer, so don't pass it to
drop_gil(). */
drop_gil(interp, NULL, 1);
PyThread_hang_thread();
_PyThreadState_HangThread(tstate);
}
assert(_PyThreadState_CheckConsistency(tstate));
tstate->_status.holds_gil = 1;
tstate->holds_gil = 1;
_Py_unset_eval_breaker_bit(tstate, _PY_GIL_DROP_REQUEST_BIT);
update_eval_breaker_for_thread(interp, tstate);
@ -460,7 +459,7 @@ PyEval_ThreadsInitialized(void)
static inline int
current_thread_holds_gil(struct _gil_runtime_state *gil, PyThreadState *tstate)
{
int holds_gil = tstate->_status.holds_gil;
int holds_gil = tstate->holds_gil;
// holds_gil is the source of truth; check that last_holder and gil->locked
// are consistent with it.