mirror of
https://github.com/python/cpython.git
synced 2025-10-06 15:11:58 +00:00
gh-114271: Fix race in Thread.join()
(#114839)
There is a race between when `Thread._tstate_lock` is released[^1] in `Thread._wait_for_tstate_lock()` and when `Thread._stop()` asserts[^2] that it is unlocked. Consider the following execution involving threads A, B, and C: 1. A starts. 2. B joins A, blocking on its `_tstate_lock`. 3. C joins A, blocking on its `_tstate_lock`. 4. A finishes and releases its `_tstate_lock`. 5. B acquires A's `_tstate_lock` in `_wait_for_tstate_lock()`, releases it, but is swapped out before calling `_stop()`. 6. C is scheduled, acquires A's `_tstate_lock` in `_wait_for_tstate_lock()` but is swapped out before releasing it. 7. B is scheduled, calls `_stop()`, which asserts that A's `_tstate_lock` is not held. However, C holds it, so the assertion fails. The race can be reproduced[^3] by inserting sleeps at the appropriate points in the threading code. To do so, run the `repro_join_race.py` from the linked repo. There are two main parts to this PR: 1. `_tstate_lock` is replaced with an event that is attached to `PyThreadState`. The event is set by the runtime prior to the thread being cleared (in the same place that `_tstate_lock` was released). `Thread.join()` blocks waiting for the event to be set. 2. `_PyInterpreterState_WaitForThreads()` provides the ability to wait for all non-daemon threads to exit. To do so, an `is_daemon` predicate was added to `PyThreadState`. This field is set each time a thread is created. `threading._shutdown()` now calls into `_PyInterpreterState_WaitForThreads()` instead of waiting on `_tstate_lock`s. [^1]:441affc9e7/Lib/threading.py (L1201)
[^2]:441affc9e7/Lib/threading.py (L1115)
[^3]:8194653279
--------- Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com> Co-authored-by: Antoine Pitrou <antoine@python.org>
This commit is contained in:
parent
86bc40dd41
commit
33da0e844c
12 changed files with 771 additions and 643 deletions
216
Lib/threading.py
216
Lib/threading.py
|
@ -36,8 +36,11 @@ _start_joinable_thread = _thread.start_joinable_thread
|
|||
_daemon_threads_allowed = _thread.daemon_threads_allowed
|
||||
_allocate_lock = _thread.allocate_lock
|
||||
_LockType = _thread.LockType
|
||||
_set_sentinel = _thread._set_sentinel
|
||||
_thread_shutdown = _thread._shutdown
|
||||
_make_thread_handle = _thread._make_thread_handle
|
||||
_ThreadHandle = _thread._ThreadHandle
|
||||
get_ident = _thread.get_ident
|
||||
_get_main_thread_ident = _thread._get_main_thread_ident
|
||||
_is_main_interpreter = _thread._is_main_interpreter
|
||||
try:
|
||||
get_native_id = _thread.get_native_id
|
||||
|
@ -847,25 +850,6 @@ _active = {} # maps thread id to Thread object
|
|||
_limbo = {}
|
||||
_dangling = WeakSet()
|
||||
|
||||
# Set of Thread._tstate_lock locks of non-daemon threads used by _shutdown()
|
||||
# to wait until all Python thread states get deleted:
|
||||
# see Thread._set_tstate_lock().
|
||||
_shutdown_locks_lock = _allocate_lock()
|
||||
_shutdown_locks = set()
|
||||
|
||||
def _maintain_shutdown_locks():
|
||||
"""
|
||||
Drop any shutdown locks that don't correspond to running threads anymore.
|
||||
|
||||
Calling this from time to time avoids an ever-growing _shutdown_locks
|
||||
set when Thread objects are not joined explicitly. See bpo-37788.
|
||||
|
||||
This must be called with _shutdown_locks_lock acquired.
|
||||
"""
|
||||
# If a lock was released, the corresponding thread has exited
|
||||
to_remove = [lock for lock in _shutdown_locks if not lock.locked()]
|
||||
_shutdown_locks.difference_update(to_remove)
|
||||
|
||||
|
||||
# Main class for threads
|
||||
|
||||
|
@ -930,10 +914,8 @@ class Thread:
|
|||
self._ident = None
|
||||
if _HAVE_THREAD_NATIVE_ID:
|
||||
self._native_id = None
|
||||
self._tstate_lock = None
|
||||
self._handle = None
|
||||
self._handle = _ThreadHandle()
|
||||
self._started = Event()
|
||||
self._is_stopped = False
|
||||
self._initialized = True
|
||||
# Copy of sys.stderr used by self._invoke_excepthook()
|
||||
self._stderr = _sys.stderr
|
||||
|
@ -947,28 +929,18 @@ class Thread:
|
|||
if new_ident is not None:
|
||||
# This thread is alive.
|
||||
self._ident = new_ident
|
||||
if self._handle is not None:
|
||||
assert self._handle.ident == new_ident
|
||||
# bpo-42350: If the fork happens when the thread is already stopped
|
||||
# (ex: after threading._shutdown() has been called), _tstate_lock
|
||||
# is None. Do nothing in this case.
|
||||
if self._tstate_lock is not None:
|
||||
self._tstate_lock._at_fork_reinit()
|
||||
self._tstate_lock.acquire()
|
||||
assert self._handle.ident == new_ident
|
||||
else:
|
||||
# This thread isn't alive after fork: it doesn't have a tstate
|
||||
# anymore.
|
||||
self._is_stopped = True
|
||||
self._tstate_lock = None
|
||||
self._handle = None
|
||||
# Otherwise, the thread is dead, Jim. _PyThread_AfterFork()
|
||||
# already marked our handle done.
|
||||
pass
|
||||
|
||||
def __repr__(self):
|
||||
assert self._initialized, "Thread.__init__() was not called"
|
||||
status = "initial"
|
||||
if self._started.is_set():
|
||||
status = "started"
|
||||
self.is_alive() # easy way to get ._is_stopped set when appropriate
|
||||
if self._is_stopped:
|
||||
if self._handle.is_done():
|
||||
status = "stopped"
|
||||
if self._daemonic:
|
||||
status += " daemon"
|
||||
|
@ -996,7 +968,8 @@ class Thread:
|
|||
_limbo[self] = self
|
||||
try:
|
||||
# Start joinable thread
|
||||
self._handle = _start_joinable_thread(self._bootstrap)
|
||||
_start_joinable_thread(self._bootstrap, handle=self._handle,
|
||||
daemon=self.daemon)
|
||||
except Exception:
|
||||
with _active_limbo_lock:
|
||||
del _limbo[self]
|
||||
|
@ -1047,23 +1020,9 @@ class Thread:
|
|||
def _set_native_id(self):
|
||||
self._native_id = get_native_id()
|
||||
|
||||
def _set_tstate_lock(self):
|
||||
"""
|
||||
Set a lock object which will be released by the interpreter when
|
||||
the underlying thread state (see pystate.h) gets deleted.
|
||||
"""
|
||||
self._tstate_lock = _set_sentinel()
|
||||
self._tstate_lock.acquire()
|
||||
|
||||
if not self.daemon:
|
||||
with _shutdown_locks_lock:
|
||||
_maintain_shutdown_locks()
|
||||
_shutdown_locks.add(self._tstate_lock)
|
||||
|
||||
def _bootstrap_inner(self):
|
||||
try:
|
||||
self._set_ident()
|
||||
self._set_tstate_lock()
|
||||
if _HAVE_THREAD_NATIVE_ID:
|
||||
self._set_native_id()
|
||||
self._started.set()
|
||||
|
@ -1083,33 +1042,6 @@ class Thread:
|
|||
finally:
|
||||
self._delete()
|
||||
|
||||
def _stop(self):
|
||||
# After calling ._stop(), .is_alive() returns False and .join() returns
|
||||
# immediately. ._tstate_lock must be released before calling ._stop().
|
||||
#
|
||||
# Normal case: C code at the end of the thread's life
|
||||
# (release_sentinel in _threadmodule.c) releases ._tstate_lock, and
|
||||
# that's detected by our ._wait_for_tstate_lock(), called by .join()
|
||||
# and .is_alive(). Any number of threads _may_ call ._stop()
|
||||
# simultaneously (for example, if multiple threads are blocked in
|
||||
# .join() calls), and they're not serialized. That's harmless -
|
||||
# they'll just make redundant rebindings of ._is_stopped and
|
||||
# ._tstate_lock. Obscure: we rebind ._tstate_lock last so that the
|
||||
# "assert self._is_stopped" in ._wait_for_tstate_lock() always works
|
||||
# (the assert is executed only if ._tstate_lock is None).
|
||||
#
|
||||
# Special case: _main_thread releases ._tstate_lock via this
|
||||
# module's _shutdown() function.
|
||||
lock = self._tstate_lock
|
||||
if lock is not None:
|
||||
assert not lock.locked()
|
||||
self._is_stopped = True
|
||||
self._tstate_lock = None
|
||||
if not self.daemon:
|
||||
with _shutdown_locks_lock:
|
||||
# Remove our lock and other released locks from _shutdown_locks
|
||||
_maintain_shutdown_locks()
|
||||
|
||||
def _delete(self):
|
||||
"Remove current thread from the dict of currently running threads."
|
||||
with _active_limbo_lock:
|
||||
|
@ -1150,47 +1082,12 @@ class Thread:
|
|||
if self is current_thread():
|
||||
raise RuntimeError("cannot join current thread")
|
||||
|
||||
if timeout is None:
|
||||
self._wait_for_tstate_lock()
|
||||
else:
|
||||
# the behavior of a negative timeout isn't documented, but
|
||||
# historically .join(timeout=x) for x<0 has acted as if timeout=0
|
||||
self._wait_for_tstate_lock(timeout=max(timeout, 0))
|
||||
# the behavior of a negative timeout isn't documented, but
|
||||
# historically .join(timeout=x) for x<0 has acted as if timeout=0
|
||||
if timeout is not None:
|
||||
timeout = max(timeout, 0)
|
||||
|
||||
if self._is_stopped:
|
||||
self._join_os_thread()
|
||||
|
||||
def _join_os_thread(self):
|
||||
# self._handle may be cleared post-fork
|
||||
if self._handle is not None:
|
||||
self._handle.join()
|
||||
|
||||
def _wait_for_tstate_lock(self, block=True, timeout=-1):
|
||||
# Issue #18808: wait for the thread state to be gone.
|
||||
# At the end of the thread's life, after all knowledge of the thread
|
||||
# is removed from C data structures, C code releases our _tstate_lock.
|
||||
# This method passes its arguments to _tstate_lock.acquire().
|
||||
# If the lock is acquired, the C code is done, and self._stop() is
|
||||
# called. That sets ._is_stopped to True, and ._tstate_lock to None.
|
||||
lock = self._tstate_lock
|
||||
if lock is None:
|
||||
# already determined that the C code is done
|
||||
assert self._is_stopped
|
||||
return
|
||||
|
||||
try:
|
||||
if lock.acquire(block, timeout):
|
||||
lock.release()
|
||||
self._stop()
|
||||
except:
|
||||
if lock.locked():
|
||||
# bpo-45274: lock.acquire() acquired the lock, but the function
|
||||
# was interrupted with an exception before reaching the
|
||||
# lock.release(). It can happen if a signal handler raises an
|
||||
# exception, like CTRL+C which raises KeyboardInterrupt.
|
||||
lock.release()
|
||||
self._stop()
|
||||
raise
|
||||
self._handle.join(timeout)
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
|
@ -1241,13 +1138,7 @@ class Thread:
|
|||
|
||||
"""
|
||||
assert self._initialized, "Thread.__init__() not called"
|
||||
if self._is_stopped or not self._started.is_set():
|
||||
return False
|
||||
self._wait_for_tstate_lock(False)
|
||||
if not self._is_stopped:
|
||||
return True
|
||||
self._join_os_thread()
|
||||
return False
|
||||
return self._started.is_set() and not self._handle.is_done()
|
||||
|
||||
@property
|
||||
def daemon(self):
|
||||
|
@ -1456,18 +1347,14 @@ class _MainThread(Thread):
|
|||
|
||||
def __init__(self):
|
||||
Thread.__init__(self, name="MainThread", daemon=False)
|
||||
self._set_tstate_lock()
|
||||
self._started.set()
|
||||
self._set_ident()
|
||||
self._ident = _get_main_thread_ident()
|
||||
self._handle = _make_thread_handle(self._ident)
|
||||
if _HAVE_THREAD_NATIVE_ID:
|
||||
self._set_native_id()
|
||||
with _active_limbo_lock:
|
||||
_active[self._ident] = self
|
||||
|
||||
def _join_os_thread(self):
|
||||
# No ThreadHandle for main thread
|
||||
pass
|
||||
|
||||
|
||||
# Helper thread-local instance to detect when a _DummyThread
|
||||
# is collected. Not a part of the public API.
|
||||
|
@ -1510,17 +1397,15 @@ class _DummyThread(Thread):
|
|||
daemon=_daemon_threads_allowed())
|
||||
self._started.set()
|
||||
self._set_ident()
|
||||
self._handle = _make_thread_handle(self._ident)
|
||||
if _HAVE_THREAD_NATIVE_ID:
|
||||
self._set_native_id()
|
||||
with _active_limbo_lock:
|
||||
_active[self._ident] = self
|
||||
_DeleteDummyThreadOnDel(self)
|
||||
|
||||
def _stop(self):
|
||||
pass
|
||||
|
||||
def is_alive(self):
|
||||
if not self._is_stopped and self._started.is_set():
|
||||
if not self._handle.is_done() and self._started.is_set():
|
||||
return True
|
||||
raise RuntimeError("thread is not alive")
|
||||
|
||||
|
@ -1532,7 +1417,6 @@ class _DummyThread(Thread):
|
|||
self.__class__ = _MainThread
|
||||
self._name = 'MainThread'
|
||||
self._daemonic = False
|
||||
self._set_tstate_lock()
|
||||
Thread._after_fork(self, new_ident=new_ident)
|
||||
|
||||
|
||||
|
@ -1631,12 +1515,11 @@ def _shutdown():
|
|||
"""
|
||||
Wait until the Python thread state of all non-daemon threads get deleted.
|
||||
"""
|
||||
# Obscure: other threads may be waiting to join _main_thread. That's
|
||||
# dubious, but some code does it. We can't wait for C code to release
|
||||
# the main thread's tstate_lock - that won't happen until the interpreter
|
||||
# is nearly dead. So we release it here. Note that just calling _stop()
|
||||
# isn't enough: other threads may already be waiting on _tstate_lock.
|
||||
if _main_thread._is_stopped and _is_main_interpreter():
|
||||
# Obscure: other threads may be waiting to join _main_thread. That's
|
||||
# dubious, but some code does it. We can't wait for it to be marked as done
|
||||
# normally - that won't happen until the interpreter is nearly dead. So
|
||||
# mark it done here.
|
||||
if _main_thread._handle.is_done() and _is_main_interpreter():
|
||||
# _shutdown() was already called
|
||||
return
|
||||
|
||||
|
@ -1648,42 +1531,11 @@ def _shutdown():
|
|||
for atexit_call in reversed(_threading_atexits):
|
||||
atexit_call()
|
||||
|
||||
# Main thread
|
||||
if _main_thread.ident == get_ident():
|
||||
tlock = _main_thread._tstate_lock
|
||||
# The main thread isn't finished yet, so its thread state lock can't
|
||||
# have been released.
|
||||
assert tlock is not None
|
||||
if tlock.locked():
|
||||
# It should have been released already by
|
||||
# _PyInterpreterState_SetNotRunningMain(), but there may be
|
||||
# embedders that aren't calling that yet.
|
||||
tlock.release()
|
||||
_main_thread._stop()
|
||||
else:
|
||||
# bpo-1596321: _shutdown() must be called in the main thread.
|
||||
# If the threading module was not imported by the main thread,
|
||||
# _main_thread is the thread which imported the threading module.
|
||||
# In this case, ignore _main_thread, similar behavior than for threads
|
||||
# spawned by C libraries or using _thread.start_new_thread().
|
||||
pass
|
||||
if _is_main_interpreter():
|
||||
_main_thread._handle._set_done()
|
||||
|
||||
# Join all non-deamon threads
|
||||
while True:
|
||||
with _shutdown_locks_lock:
|
||||
locks = list(_shutdown_locks)
|
||||
_shutdown_locks.clear()
|
||||
|
||||
if not locks:
|
||||
break
|
||||
|
||||
for lock in locks:
|
||||
# mimic Thread.join()
|
||||
lock.acquire()
|
||||
lock.release()
|
||||
|
||||
# new threads can be spawned while we were waiting for the other
|
||||
# threads to complete
|
||||
# Wait for all non-daemon threads to exit.
|
||||
_thread_shutdown()
|
||||
|
||||
|
||||
def main_thread():
|
||||
|
@ -1703,7 +1555,6 @@ def _after_fork():
|
|||
# Reset _active_limbo_lock, in case we forked while the lock was held
|
||||
# by another (non-forked) thread. http://bugs.python.org/issue874900
|
||||
global _active_limbo_lock, _main_thread
|
||||
global _shutdown_locks_lock, _shutdown_locks
|
||||
_active_limbo_lock = RLock()
|
||||
|
||||
# fork() only copied the current thread; clear references to others.
|
||||
|
@ -1719,10 +1570,6 @@ def _after_fork():
|
|||
|
||||
_main_thread = current
|
||||
|
||||
# reset _shutdown() locks: threads re-register their _tstate_lock below
|
||||
_shutdown_locks_lock = _allocate_lock()
|
||||
_shutdown_locks = set()
|
||||
|
||||
with _active_limbo_lock:
|
||||
# Dangling thread instances must still have their locks reset,
|
||||
# because someone may join() them.
|
||||
|
@ -1739,7 +1586,6 @@ def _after_fork():
|
|||
else:
|
||||
# All the others are already stopped.
|
||||
thread._after_fork()
|
||||
thread._stop()
|
||||
|
||||
_limbo.clear()
|
||||
_active.clear()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue